author    | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-04-05 14:08:31 +0200
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-04-11 07:46:53 +0000
commit    | 6a4cabb866f66d4128a97cdc6d9d08ce074f1247 (patch)
tree      | ab00f70a5e89278d6a0d16ff0c42578dc4d84a2d /chromium/third_party/libvpx
parent    | e733310db58160074f574c429d48f8308c0afe17 (diff)
download  | qtwebengine-chromium-6a4cabb866f66d4128a97cdc6d9d08ce074f1247.tar.gz
BASELINE: Update Chromium to 57.0.2987.144
Change-Id: I29db402ff696c71a04c4dbaec822c2e53efe0267
Reviewed-by: Peter Varga <pvarga@inf.u-szeged.hu>
Diffstat (limited to 'chromium/third_party/libvpx')
134 files changed, 8809 insertions, 4338 deletions
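Most of the hunks below touch libvpx's generated run-time CPU detection (RTCD) headers under source/config/: the `_c`/`_neon` prototypes are renamed to take `stride` instead of `dest_stride`, new NEON bindings appear for the d45/d135 predictors and the post-processing routines, and `vpx_idct32x32_135_add` is rebound from `vpx_idct32x32_1024_add_neon` to the newly added `vpx_idct32x32_135_add_neon`. For context, here is a minimal sketch of the dispatch pattern those headers implement in the cpu-detect configurations; `my_idct_c`, `my_idct_neon`, `cpu_has_neon()` and the local `tran_low_t` typedef are illustrative stand-ins, not code from this change.

```c
#include <stdint.h>

typedef int16_t tran_low_t; /* stand-in for libvpx's transform-coefficient type */

/* Plain C fallback and a (hypothetical) NEON-accelerated kernel. */
static void my_idct_c(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride; /* body elided in this sketch */
}
static void my_idct_neon(const tran_low_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
}

/* The cpu-detect headers declare one function pointer per kernel (RTCD_EXTERN);
   the fixed NEON configs instead #define the symbol straight to the _neon variant. */
static void (*my_idct)(const tran_low_t *input, uint8_t *dest, int stride);

static int cpu_has_neon(void) { return 1; } /* stand-in for the real CPU-flag query */

/* Mirrors what setup_rtcd_internal() does: bind each pointer once at startup. */
static void setup_rtcd_sketch(void) {
  my_idct = my_idct_c;                        /* safe default */
  if (cpu_has_neon()) my_idct = my_idct_neon; /* override when NEON is present */
}

int main(void) {
  setup_rtcd_sketch();
  tran_low_t coeffs[16] = { 0 };
  uint8_t dst[16] = { 0 };
  my_idct(coeffs, dst, 4); /* all later callers go through the bound pointer */
  return 0;
}
```

The `vpx_idct32x32_135_add` hunks in the vpx_dsp_rtcd.h files below fix exactly this binding: the 135-coefficient variant previously fell back to `vpx_idct32x32_1024_add_neon`, and after this update it dispatches to its own `vpx_idct32x32_135_add_neon` kernel.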
diff --git a/chromium/third_party/libvpx/BUILD.gn b/chromium/third_party/libvpx/BUILD.gn
index 0a39291205b..2c79ee0f405 100644
--- a/chromium/third_party/libvpx/BUILD.gn
+++ b/chromium/third_party/libvpx/BUILD.gn
@@ -38,9 +38,9 @@ if (is_nacl) {
   # vpx_config.asm
   if (is_ios && current_cpu == "arm") {
     os_category = current_os
-  } else if (is_posix) { # Should cover linux, mac, and the ios simulator.
+  } else if (is_posix) {  # Should cover linux, mac, and the ios simulator.
     os_category = "linux"
-  } else { # This should only match windows.
+  } else {  # This should only match windows.
     os_category = current_os
   }
   platform_include_dir =
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 390b58ebdc1..7fcfd85f425 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Tuesday November 08 2016
+Date: Monday January 09 2017
 Branch: master
-Commit: 5c64c01c7ca3780d30f140e54a30088f780ae66a
+Commit: 5b1a8ca5e846f838062becaec9ed6b5ecef306e5
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index 37850e29793..664a0fdbb2e 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -1515,15 +1515,16 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -1986,14 +1987,15 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_error_neon.c",
   "//third_party/libvpx/source/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/subpel_variance_neon.c",
@@ -2299,11 +2301,12 @@ libvpx_srcs_arm64 = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
index 5f0e862cbfa..d07bcaa3def 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
 #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
 
-void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
 
-void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
-void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
index a5c50f21727..c9d867d00cd 100644
--- a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a
 #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const 
tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define 
vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c index 5f93ebfb676..56a5348abd6 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c @@ -6,5 +6,5 @@ /* in the file PATENTS. All contributing project authors may */ /* be found in the AUTHORS file in the root of the source tree. */ #include "vpx/vpx_codec.h" -static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs"; +static const char* const cfg = "--target=armv8-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs"; const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon 
void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int 
dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void 
vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h index f7ac2dc300a..789724ffb93 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vp9_rtcd.h @@ -70,13 +70,13 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void 
vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h index 0028d86c3ed..2712530f99c 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); RTCD_EXTERN void (*vpx_d135_predictor_4x4)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d135_predictor_8x8)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ 
-128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ RTCD_EXTERN void (*vpx_d45_predictor_16x16)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +RTCD_EXTERN void (*vpx_d45_predictor_32x32)(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,53 +316,53 @@ RTCD_EXTERN void (*vpx_hadamard_8x8)(const int16_t *src_diff, int src_stride, in void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); 
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const 
tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +RTCD_EXTERN void (*vpx_post_proc_down_and_across_mb_row)(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t 
*qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c @@ -861,10 +868,18 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_convolve_avg = vpx_convolve_avg_neon; vpx_convolve_copy = vpx_convolve_copy_c; if (flags & HAS_NEON) vpx_convolve_copy = vpx_convolve_copy_neon; + vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_c; + if (flags & HAS_NEON) vpx_d135_predictor_16x16 = vpx_d135_predictor_16x16_neon; + vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d135_predictor_32x32 = vpx_d135_predictor_32x32_neon; vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_c; if (flags & HAS_NEON) vpx_d135_predictor_4x4 = vpx_d135_predictor_4x4_neon; + vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_c; + if (flags & HAS_NEON) vpx_d135_predictor_8x8 = vpx_d135_predictor_8x8_neon; vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_c; if (flags & HAS_NEON) vpx_d45_predictor_16x16 = vpx_d45_predictor_16x16_neon; + vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_c; + if (flags & HAS_NEON) vpx_d45_predictor_32x32 = vpx_d45_predictor_32x32_neon; vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_c; if (flags & HAS_NEON) vpx_d45_predictor_4x4 = vpx_d45_predictor_4x4_neon; vpx_d45_predictor_8x8 = vpx_d45_predictor_8x8_c; @@ -932,7 +947,7 @@ static void setup_rtcd_internal(void) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_c; if (flags & HAS_NEON) vpx_idct32x32_1024_add = vpx_idct32x32_1024_add_neon; vpx_idct32x32_135_add = vpx_idct32x32_135_add_c; - if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_1024_add_neon; + if (flags & HAS_NEON) vpx_idct32x32_135_add = vpx_idct32x32_135_add_neon; vpx_idct32x32_1_add = vpx_idct32x32_1_add_c; if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon; vpx_idct32x32_34_add = vpx_idct32x32_34_add_c; @@ -975,10 +990,16 @@ static void setup_rtcd_internal(void) if (flags & HAS_NEON) vpx_lpf_vertical_8 = vpx_lpf_vertical_8_neon; vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_NEON) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_neon; + vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; + if (flags & HAS_NEON) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_neon; + vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; + if (flags & HAS_NEON) vpx_mbpost_proc_down = vpx_mbpost_proc_down_neon; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_NEON) vpx_minmax_8x8 = vpx_minmax_8x8_neon; vpx_mse16x16 = vpx_mse16x16_c; if (flags & HAS_NEON) vpx_mse16x16 = vpx_mse16x16_neon; + vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_c; + if (flags & HAS_NEON) vpx_post_proc_down_and_across_mb_row = vpx_post_proc_down_and_across_mb_row_neon; vpx_sad16x16 = vpx_sad16x16_c; if (flags & HAS_NEON) vpx_sad16x16 = vpx_sad16x16_neon; vpx_sad16x16x4d = vpx_sad16x16x4d_c; diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t 
y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void 
vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, 
int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h index 206f5e5dba4..cd5726c2e6b 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h index 6aa4b73856e..89b44dc986c 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm/vpx_dsp_rtcd.h @@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void 
vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h index 5f0e862cbfa..d07bcaa3def 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vp9_rtcd.h @@ -70,12 +70,12 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add 
vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h index a5c50f21727..c9d867d00cd 100644 --- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h @@ -75,17 +75,20 @@ void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *a #define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_neon void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_neon void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_neon void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c @@ -128,7 +131,8 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_ #define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); -#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c +void 
vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_neon void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); @@ -312,52 +316,52 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coe void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); -#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, int stride); +#define vpx_idct32x32_135_add vpx_idct32x32_135_add_neon -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon -void vpx_idct4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -368,10 +372,10 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_neon -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -423,10 +427,12 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c +void vpx_mbpost_proc_across_ip_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_neon void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c +void vpx_mbpost_proc_down_neon(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_neon void 
vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -449,7 +455,8 @@ void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, #define vpx_plane_add_noise vpx_plane_add_noise_c void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); -#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c +void vpx_post_proc_down_and_across_mb_row_neon(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_neon void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vpx_quantize_b vpx_quantize_b_c diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h index f0824a37a80..8251c1b5a19 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vp9_rtcd.h @@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git 
a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h index 163cf7611a8..f537568dd91 100644 --- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h @@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t 
*input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const 
int width); @@ -1219,10 +1219,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void 
vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void 
vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t 
*input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); 
+void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h 
b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h index 3d80ce20e90..403db512b5d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h index 3dfc85323ca..8a27f964d1d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mips64el/vpx_dsp_rtcd.h @@ -271,40 +271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int 
dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h index 3d80ce20e90..403db512b5d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vp9_rtcd.h @@ -68,10 +68,10 @@ void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h index 3dfc85323ca..8a27f964d1d 100644 --- a/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/mipsel/vpx_dsp_rtcd.h @@ -271,40 
+271,40 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff) void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); #define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -313,10 +313,10 @@ int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add 
vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void 
vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const 
tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, 
int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const 
tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ 
void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t 
*output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void 
vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void 
vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int 
dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void 
vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = 
vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t 
*input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, 
int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, 
uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, 
const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h index f0824a37a80..8251c1b5a19 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vp9_rtcd.h @@ -83,10 +83,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -110,10 +110,10 @@ void vp9_highbd_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, ui void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_c -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_c -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git 
a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h index 163cf7611a8..f537568dd91 100644 --- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h @@ -901,43 +901,43 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_c -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_c -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_c -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_c -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_c -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, 
int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1177,40 +1177,40 @@ void vpx_highbd_v_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_c -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_c -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_c -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_c -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_c -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_135_add vpx_idct32x32_135_add_c -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_c -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_34_add vpx_idct32x32_34_add_c -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_c -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_c -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_12_add vpx_idct8x8_12_add_c -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_c -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_64_add vpx_idct8x8_64_add_c int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); @@ -1219,10 +1219,10 @@ 
int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_c -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h index 97666fffaf6..07f046ed1bc 100644 --- a/chromium/third_party/libvpx/source/config/vpx_version.h +++ b/chromium/third_party/libvpx/source/config/vpx_version.h @@ -1,7 +1,7 @@ #define VERSION_MAJOR 1 #define VERSION_MINOR 6 #define VERSION_PATCH 0 -#define VERSION_EXTRA "702-g5c64c01" +#define VERSION_EXTRA "903-g5b1a8ca5e" #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH)) -#define VERSION_STRING_NOSP "v1.6.0-702-g5c64c01" -#define VERSION_STRING " v1.6.0-702-g5c64c01" +#define VERSION_STRING_NOSP "v1.6.0-903-g5b1a8ca5e" +#define VERSION_STRING " v1.6.0-903-g5b1a8ca5e" diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h index 8b235e876e3..3addf41714b 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp8_rtcd.h @@ -297,7 +297,7 @@ static void setup_rtcd_internal(void) vp8_mbuverror = vp8_mbuverror_c; if (flags & HAS_SSE2) vp8_mbuverror = vp8_mbuverror_sse2; vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; + if (flags & HAS_SSE2) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_c; if (flags & HAS_SSE2) vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h index 55c229554e3..28b5da86510 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void 
vp9_highbd_mbpost_proc_across_ip_c(uint16_t *src, int pitch, int rows, int cols, int flimit); @@ -125,13 +125,13 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); RTCD_EXTERN void (*vp9_iht16x16_256_add)(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +RTCD_EXTERN void (*vp9_iht8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); #define vp9_quantize_fp vp9_quantize_fp_c diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h index 5e31286207d..58079fa420e 100644 --- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h @@ -1084,49 +1084,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, 
uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int 
dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); +RTCD_EXTERN void (*vpx_highbd_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride, int bd); -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1423,53 +1423,53 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); RTCD_EXTERN void (*vpx_highbd_v_predictor_8x8)(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_10_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct16x16_256_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void 
vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct4x4_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); 
-RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_1_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1479,11 +1479,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); RTCD_EXTERN void (*vpx_int_pro_row)(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_iwht4x4_16_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1537,11 +1537,11 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, RTCD_EXTERN void (*vpx_lpf_vertical_8_dual)(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_across_ip)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); RTCD_EXTERN void (*vpx_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols,int flimit); void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); @@ -2618,9 +2618,9 @@ static void setup_rtcd_internal(void) vpx_lpf_vertical_8_dual = vpx_lpf_vertical_8_dual_c; if (flags & HAS_SSE2) vpx_lpf_vertical_8_dual = 
vpx_lpf_vertical_8_dual_sse2; vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_across_ip = vpx_mbpost_proc_across_ip_sse2; vpx_mbpost_proc_down = vpx_mbpost_proc_down_c; - if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_xmm; + if (flags & HAS_SSE2) vpx_mbpost_proc_down = vpx_mbpost_proc_down_sse2; vpx_minmax_8x8 = vpx_minmax_8x8_c; if (flags & HAS_SSE2) vpx_minmax_8x8 = vpx_minmax_8x8_sse2; vpx_mse16x16 = vpx_mse16x16_c; diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h index c66d7913431..8dcc9eb2b99 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp8_rtcd.h @@ -169,7 +169,7 @@ int vp8_mbuverror_sse2(struct macroblock *mb); int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); int vp8_refining_search_sadx4(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); -RTCD_EXTERN int (*vp8_refining_search_sad)(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sadx4 void vp8_regular_quantize_b_c(struct block *, struct blockd *); void vp8_regular_quantize_b_sse2(struct block *, struct blockd *); @@ -240,8 +240,6 @@ static void setup_rtcd_internal(void) vp8_full_search_sad = vp8_full_search_sad_c; if (flags & HAS_SSE3) vp8_full_search_sad = vp8_full_search_sadx3; if (flags & HAS_SSE4_1) vp8_full_search_sad = vp8_full_search_sadx8; - vp8_refining_search_sad = vp8_refining_search_sad_c; - if (flags & HAS_SSE3) vp8_refining_search_sad = vp8_refining_search_sadx4; vp8_regular_quantize_b = vp8_regular_quantize_b_sse2; if (flags & HAS_SSE4_1) vp8_regular_quantize_b = vp8_regular_quantize_b_sse4_1; vp8_sixtap_predict16x16 = vp8_sixtap_predict16x16_sse2; diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h index f747ed67d01..072f858e67e 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h @@ -97,10 +97,10 @@ void vp9_highbd_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); void vp9_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd); #define vp9_highbd_iht16x16_256_add vp9_highbd_iht16x16_256_add_c -void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht4x4_16_add vp9_highbd_iht4x4_16_add_c -void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd); +void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd); #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c void vp9_highbd_mbpost_proc_across_ip_c(uint16_t 
*src, int pitch, int rows, int cols, int flimit); @@ -125,12 +125,12 @@ void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); #define vp9_iht16x16_256_add vp9_iht16x16_256_add_sse2 -void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht4x4_16_add vp9_iht4x4_16_add_sse2 -void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); -void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); +void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type); #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h index 1188bb43b56..bcb567d8ec6 100644 --- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h +++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h @@ -1091,49 +1091,49 @@ void vpx_highbd_h_predictor_4x4_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_h_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd); #define vpx_highbd_h_predictor_8x8 vpx_highbd_h_predictor_8x8_c -void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_10_add vpx_highbd_idct16x16_10_add_sse2 -void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_1_add vpx_highbd_idct16x16_1_add_c -void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct16x16_256_add vpx_highbd_idct16x16_256_add_sse2 -void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define 
vpx_highbd_idct32x32_1024_add vpx_highbd_idct32x32_1024_add_c -void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_1_add vpx_highbd_idct32x32_1_add_sse2 -void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct32x32_34_add vpx_highbd_idct32x32_34_add_c -void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_16_add vpx_highbd_idct4x4_16_add_sse2 -void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2 -void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c -void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); -void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); +void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_idct8x8_64_add vpx_highbd_idct8x8_64_add_sse2 -void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_16_add vpx_highbd_iwht4x4_16_add_c -void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd); +void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride, int bd); #define vpx_highbd_iwht4x4_1_add vpx_highbd_iwht4x4_1_add_c void vpx_highbd_lpf_horizontal_16_c(uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd); @@ -1430,58 +1430,58 @@ void vpx_highbd_v_predictor_8x8_c(uint16_t *dst, ptrdiff_t y_stride, const uint1 void vpx_highbd_v_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, 
int bd); #define vpx_highbd_v_predictor_8x8 vpx_highbd_v_predictor_8x8_sse2 -void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_10_add vpx_idct16x16_10_add_sse2 -void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_1_add vpx_idct16x16_1_add_sse2 -void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct16x16_256_add vpx_idct16x16_256_add_sse2 -void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_1024_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_135_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct32x32_1_add vpx_idct32x32_1_add_sse2 -void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t 
*dest, int stride); +void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_16_add vpx_idct4x4_16_add_sse2 -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct4x4_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct4x4_1_add vpx_idct4x4_1_add_sse2 -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int stride); -void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_1_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_idct8x8_1_add vpx_idct8x8_1_add_sse2 -void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int dest_stride); -RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, int stride); +RTCD_EXTERN void (*vpx_idct8x8_64_add)(const tran_low_t *input, uint8_t *dest, int stride); int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width); @@ -1491,11 +1491,11 @@ void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); #define vpx_int_pro_row vpx_int_pro_row_sse2 -void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); -void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride); +void vpx_iwht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride); 
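The bulk of the RTCD header churn in this commit is a mechanical rename of the stride argument from dest_stride to stride; the dispatch pattern itself does not change. Configurations that know the best variant at build time resolve each symbol with a #define (vpx_idct4x4_16_add becomes vpx_idct4x4_16_add_sse2 directly), while configurations that must probe the CPU at run time declare an RTCD_EXTERN function pointer that setup_rtcd_internal() rebinds from flags such as HAS_SSE2 or HAS_SSSE3. Below is a minimal, self-contained sketch of that pattern, with illustrative names only (HAS_SSE2_SKETCH, detect_cpu_flags and my_idct_add are not the real libvpx symbols):

#include <stdint.h>
#include <stdio.h>

#define HAS_SSE2_SKETCH 0x01

typedef void (*idct_add_fn)(const int16_t *input, uint8_t *dest, int stride);

static void my_idct_add_c(const int16_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("generic C path");
}

static void my_idct_add_sse2(const int16_t *input, uint8_t *dest, int stride) {
  (void)input; (void)dest; (void)stride;
  puts("SSE2 path");
}

/* Bound once at startup, like the RTCD_EXTERN pointers in these headers. */
static idct_add_fn my_idct_add = my_idct_add_c;

static int detect_cpu_flags(void) {
  return HAS_SSE2_SKETCH; /* stand-in for a real CPUID-based query */
}

static void setup_dispatch(void) {
  const int flags = detect_cpu_flags();
  my_idct_add = my_idct_add_c;
  if (flags & HAS_SSE2_SKETCH) my_idct_add = my_idct_add_sse2;
}

int main(void) {
  int16_t input[4] = { 0 };
  uint8_t dest[4] = { 0 };
  setup_dispatch();
  my_idct_add(input, dest, 4); /* takes the SSE2 branch in this sketch */
  return 0;
}

On x64 configurations several of these symbols collapse back to plain #defines in this commit (vp8_refining_search_sad, for instance), presumably because SSE2 can be assumed as a baseline there; the matching rtcd_defs.pl change further down moves the specialization from sse3 to sse2.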
#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_sse2 -void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride); #define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); @@ -1549,12 +1549,12 @@ void vpx_lpf_vertical_8_dual_sse2(uint8_t *s, int pitch, const uint8_t *blimit0, #define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_sse2 void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_across_ip_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_xmm +void vpx_mbpost_proc_across_ip_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_sse2 void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); -void vpx_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols,int flimit); -#define vpx_mbpost_proc_down vpx_mbpost_proc_down_xmm +void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_sse2 void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk index 09bdc5d2f70..a88f90056e4 100644 --- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk +++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk @@ -64,6 +64,9 @@ CONFIG_DIR := $(LOCAL_PATH)/ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) +ifneq ($(V),1) + qexec := @ +endif # Use the makefiles generated by upstream configure to determine which files to # build. Also set any architecture-specific flags. @@ -103,8 +106,8 @@ LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) .PRECIOUS: %.asm.S $(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm - @mkdir -p $(dir $@) - @$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ + $(qexec)mkdir -p $(dir $@) + $(qexec)$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@ # For building *_rtcd.h, which have rules in libs.mk TGT_ISA:=$(word 1, $(subst -, ,$(TOOLCHAIN))) @@ -150,15 +153,27 @@ CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \ LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) + ASM_INCLUDES := vpx_dsp/arm/idct_neon.asm.S CODEC_SRCS_ASM_NEON = $(foreach v, \ $(CODEC_SRCS_ASM_ARM_ALL),\ $(if $(findstring neon,$(v)),$(v),)) + CODEC_SRCS_ASM_NEON := $(filter-out $(addprefix %, $(ASM_INCLUDES)), \ + $(CODEC_SRCS_ASM_NEON)) CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \ $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \ $(CODEC_SRCS_ASM_NEON)) LOCAL_SRC_FILES += $(patsubst %.S, \ %.S.neon, \ $(CODEC_SRCS_ASM_NEON_ADS2GAS)) + + NEON_ASM_TARGETS = $(patsubst %.S, \ + $(ASM_CNV_PATH)/libvpx/%.S, \ + $(CODEC_SRCS_ASM_NEON)) +# add a dependency to the full path to the ads2gas output to ensure the +# includes are converted first. 
+ifneq ($(strip $(NEON_ASM_TARGETS)),)
+$(NEON_ASM_TARGETS): $(addprefix $(ASM_CNV_PATH)/libvpx/, $(ASM_INCLUDES))
+endif
 endif
 
 LOCAL_CFLAGS += \
@@ -187,7 +202,7 @@ $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
 
 rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
-ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
+ifneq ($$(findstring $(TARGET_ARCH_ABI),$$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
@@ -197,16 +212,17 @@ $(eval $(call rtcd_dep_template))
 .PHONY: clean
 clean:
 	@echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]"
-	@$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
-	@$(RM) -r $(ASM_CNV_PATH)
-	@$(RM) $(CLEAN-OBJS)
+	$(qexec)$(RM) $(CODEC_SRCS_ASM_ADS2GAS) $(CODEC_SRCS_ASM_NEON_ADS2GAS)
+	$(qexec)$(RM) -r $(ASM_CNV_PATH)
+	$(qexec)$(RM) $(CLEAN-OBJS)
 
 ifeq ($(ENABLE_SHARED),1)
+  LOCAL_CFLAGS += -fPIC
   include $(BUILD_SHARED_LIBRARY)
 else
   include $(BUILD_STATIC_LIBRARY)
 endif
 
 ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
-$(call import-module,cpufeatures)
+$(call import-module,android/cpufeatures)
 endif
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
index 65308a0bd0b..0b9663c777b 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -240,9 +240,9 @@ static void set_temporal_layer_pattern(int num_temporal_layers,
       cfg->ts_layer_id[1] = 2;
       cfg->ts_layer_id[2] = 1;
       cfg->ts_layer_id[3] = 2;
-      // Use 40/20/40 bit allocation as example.
-      cfg->ts_target_bitrate[0] = 0.4f * bitrate;
-      cfg->ts_target_bitrate[1] = 0.6f * bitrate;
+      // Use 45/20/35 bit allocation as example.
+      cfg->ts_target_bitrate[0] = 0.45f * bitrate;
+      cfg->ts_target_bitrate[1] = 0.65f * bitrate;
       cfg->ts_target_bitrate[2] = bitrate;
 
       /* 0=L, 1=GF, 2=ARF */
@@ -460,7 +460,7 @@ int main(int argc, char **argv) {
 
   // Set the number of threads per encode/spatial layer.
   // (1, 1, 1) means no encoder threading.
-  cfg[0].g_threads = 2;
+  cfg[0].g_threads = 1;
   cfg[1].g_threads = 1;
   cfg[2].g_threads = 1;
 
@@ -507,9 +507,11 @@ int main(int argc, char **argv) {
 
   /* Set NOISE_SENSITIVITY to do TEMPORAL_DENOISING */
   /* Enable denoising for the highest-resolution encoder. */
-  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 4))
+  if (vpx_codec_control(&codec[0], VP8E_SET_NOISE_SENSITIVITY, 1))
     die_codec(&codec[0], "Failed to set noise_sensitivity");
-  for (i = 1; i < NUM_ENCODERS; i++) {
+  if (vpx_codec_control(&codec[1], VP8E_SET_NOISE_SENSITIVITY, 1))
+    die_codec(&codec[1], "Failed to set noise_sensitivity");
+  for (i = 2; i < NUM_ENCODERS; i++) {
     if (vpx_codec_control(&codec[i], VP8E_SET_NOISE_SENSITIVITY, 0))
       die_codec(&codec[i], "Failed to set noise_sensitivity");
   }
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index fa2df7271b2..0e409387b3e 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -679,7 +679,7 @@ int main(int argc, const char **argv) {
     }
 #if OUTPUT_RC_STATS
     // For now, just write temporal layer streams.
-    // TODO(wonkap): do spatial by re-writing superframe.
+ // TODO(marpan): do spatial by re-writing superframe. if (svc_ctx.output_rc_stat) { for (tl = 0; tl < enc_cfg.ts_number_layers; ++tl) { char file_name[PATH_MAX]; @@ -770,7 +770,7 @@ int main(int argc, const char **argv) { cx_pkt->data.frame.sz, cx_pkt->data.frame.pts); #if OUTPUT_RC_STATS - // TODO(marpan/wonkap): Put this (to line728) in separate function. + // TODO(marpan): Put this (to line728) in separate function. if (svc_ctx.output_rc_stat) { vpx_codec_control(&codec, VP9E_GET_SVC_LAYER_ID, &layer_id); parse_superframe_index(cx_pkt->data.frame.buf, diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c index 752c1baead1..b9069808350 100644 --- a/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/examples/vpx_temporal_svc_encoder.c @@ -702,11 +702,14 @@ int main(int argc, char **argv) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); + vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 0); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_svc_extra_cfg_t svc_params; memset(&svc_params, 0, sizeof(svc_params)); vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); + vpx_codec_control(&codec, VP9E_SET_GF_CBR_BOOST_PCT, 0); + vpx_codec_control(&codec, VP9E_SET_FRAME_PARALLEL_DECODING, 0); vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0); vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, kDenoiserOff); vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1); diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk index f4f48cc1621..e0a2cc097de 100644 --- a/chromium/third_party/libvpx/source/libvpx/libs.mk +++ b/chromium/third_party/libvpx/source/libvpx/libs.mk @@ -391,7 +391,7 @@ LIBVPX_TEST_SRCS=$(addprefix test/,$(call enabled,LIBVPX_TEST_SRCS)) LIBVPX_TEST_BIN=./test_libvpx$(EXE_SFX) LIBVPX_TEST_DATA=$(addprefix $(LIBVPX_TEST_DATA_PATH)/,\ $(call enabled,LIBVPX_TEST_DATA)) -libvpx_test_data_url=http://downloads.webmproject.org/test_data/libvpx/$(1) +libvpx_test_data_url=https://storage.googleapis.com/downloads.webmproject.org/test_data/libvpx/$(1) TEST_INTRA_PRED_SPEED_BIN=./test_intra_pred_speed$(EXE_SFX) TEST_INTRA_PRED_SPEED_SRCS=$(addprefix test/,$(call enabled,TEST_INTRA_PRED_SPEED_SRCS)) @@ -405,7 +405,7 @@ CLEAN-OBJS += libvpx_test_srcs.txt $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 @echo " [DOWNLOAD] $@" $(qexec)trap 'rm -f $@' INT TERM &&\ - curl -L -o $@ $(call libvpx_test_data_url,$(@F)) + curl --retry 1 -L -o $@ $(call libvpx_test_data_url,$(@F)) testdata:: $(LIBVPX_TEST_DATA) $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ diff --git a/chromium/third_party/libvpx/source/libvpx/tools_common.h b/chromium/third_party/libvpx/source/libvpx/tools_common.h index 73ba1bc03ba..c4a48b24de0 100644 --- a/chromium/third_party/libvpx/source/libvpx/tools_common.h +++ b/chromium/third_party/libvpx/source/libvpx/tools_common.h @@ -26,11 +26,21 @@ /* MSVS uses _f{seek,tell}i64. */ #define fseeko _fseeki64 #define ftello _ftelli64 +typedef int64_t FileOffset; #elif defined(_WIN32) /* MinGW uses f{seek,tell}o64 for large files. 
*/ #define fseeko fseeko64 #define ftello ftello64 -#endif /* _WIN32 */ +typedef off64_t FileOffset; +#elif CONFIG_OS_SUPPORT +typedef off_t FileOffset; +/* Use 32-bit file operations in WebM file format when building ARM + * executables (.axf) with RVCT. */ +#else +#define fseeko fseek +#define ftello ftell +typedef long FileOffset /* NOLINT */ +#endif /* CONFIG_OS_SUPPORT */ #if CONFIG_OS_SUPPORT #if defined(_MSC_VER) @@ -42,13 +52,6 @@ #endif /* _MSC_VER */ #endif /* CONFIG_OS_SUPPORT */ -/* Use 32-bit file operations in WebM file format when building ARM - * executables (.axf) with RVCT. */ -#if !CONFIG_OS_SUPPORT -#define fseeko fseek -#define ftello ftell -#endif /* CONFIG_OS_SUPPORT */ - #define LITERALU64(hi, lo) ((((uint64_t)hi) << 32) | lo) #ifndef PATH_MAX diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c index 7612024b7d0..2de343419ac 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/filter_dspr2.c @@ -1469,6 +1469,7 @@ void vp8_filter_block2d_second_pass_8(unsigned char *RESTRICT src_ptr, unsigned char src_ptr_r2; unsigned char src_ptr_r3; unsigned char *cm = ff_cropTbl + CROP_WIDTH; + (void)output_width; vector4a = 64; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c index b79af1cc88f..d2c34425156 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/dspr2/vp8_loopfilter_filters_dspr2.c @@ -306,6 +306,7 @@ void vp8_loop_filter_horizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -498,6 +499,7 @@ void vp8_loop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -918,6 +920,7 @@ void vp8_loop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; /* loop filter designed to work using chars so that we can make maximum use * of 8 bit simd instructions. 
@@ -1612,6 +1615,7 @@ void vp8_mbloop_filter_uvhorizontal_edge_mips(unsigned char *s, int p, uint32_t mask, hev; uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *sm1, *s0, *s1, *s2, *s3, *s4, *s5, *s6; + (void)count; mask = 0; hev = 0; @@ -1915,6 +1919,7 @@ void vp8_mbloop_filter_uvvertical_edge_mips(unsigned char *s, int p, uint32_t pm1, p0, p1, p2, p3, p4, p5, p6; unsigned char *s1, *s2, *s3, *s4; uint32_t prim1, prim2, sec3, sec4, prim3, prim4; + (void)count; mask = 0; hev = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h index 713f5dffe09..96e3af6c9c1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/ppflags.h @@ -19,14 +19,7 @@ enum { VP8D_DEBLOCK = 1 << 0, VP8D_DEMACROBLOCK = 1 << 1, VP8D_ADDNOISE = 1 << 2, - VP8D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP8D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP8D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP8D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP8D_DEBUG_DRAW_MV = 1 << 7, - VP8D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP8D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP8D_MFQE = 1 << 10 + VP8D_MFQE = 1 << 3 }; typedef struct { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl index c0e95b15a0f..bc5e0579999 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/rtcd_defs.pl @@ -210,8 +210,9 @@ $vp8_full_search_sad_sse3=vp8_full_search_sadx3; $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; -specialize qw/vp8_refining_search_sad sse3/; -$vp8_refining_search_sad_sse3=vp8_refining_search_sadx4; +specialize qw/vp8_refining_search_sad sse2 msa/; +$vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; +$vp8_refining_search_sad_msa=vp8_refining_search_sadx4; add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_diamond_search_sad sse2 msa/; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h index f27b209c40e..ece64f3fb43 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/threading.h @@ -191,8 +191,47 @@ static inline int sem_destroy(sem_t *sem) { #define x86_pause_hint() #endif +#if defined(__has_feature) +#if __has_feature(thread_sanitizer) +#define USE_MUTEX_LOCK 1 +#endif +#endif + #include "vpx_util/vpx_thread.h" +static INLINE int protected_read(pthread_mutex_t *const mutex, const int *p) { + (void)mutex; +#if defined(USE_MUTEX_LOCK) + int ret; + pthread_mutex_lock(mutex); + ret = *p; + pthread_mutex_unlock(mutex); + return ret; +#endif + return *p; +} + +static INLINE void sync_read(pthread_mutex_t *const mutex, int mb_col, + const int *last_row_current_mb_col, + const int nsync) { + while (mb_col > (protected_read(mutex, last_row_current_mb_col) - nsync)) { + x86_pause_hint(); + thread_sleep(0); + } 
+} + +static INLINE void protected_write(pthread_mutex_t *mutex, int *p, int v) { + (void)mutex; +#if defined(USE_MUTEX_LOCK) + pthread_mutex_lock(mutex); + *p = v; + pthread_mutex_unlock(mutex); + return; +#endif + *p = v; +} + +#undef USE_MUTEX_LOCK #endif /* CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD */ #ifdef __cplusplus diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h index e50fafd4f94..88b1ff16bca 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/onyxd_int.h @@ -67,7 +67,8 @@ typedef struct VP8D_COMP { #if CONFIG_MULTITHREAD /* variable for threading */ - volatile int b_multithreaded_rd; + + int b_multithreaded_rd; int max_threads; int current_mb_col_main; unsigned int decoding_thread_count; @@ -76,6 +77,8 @@ typedef struct VP8D_COMP { int mt_baseline_filter_level[MAX_MB_SEGMENTS]; int sync_range; int *mt_current_mb_col; /* Each row remembers its already decoded column. */ + pthread_mutex_t *pmutex; + pthread_mutex_t mt_mutex; /* mutex for b_multithreaded_rd */ unsigned char **mt_yabove_row; /* mb_rows x width */ unsigned char **mt_uabove_row; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c index 44ca16bfdd4..9f77519882c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/decoder/threading.c @@ -50,9 +50,6 @@ static void setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, mbd->subpixel_predict8x8 = xd->subpixel_predict8x8; mbd->subpixel_predict16x16 = xd->subpixel_predict16x16; - mbd->mode_info_context = pc->mi + pc->mode_info_stride * (i + 1); - mbd->mode_info_stride = pc->mode_info_stride; - mbd->frame_type = pc->frame_type; mbd->pre = xd->pre; mbd->dst = xd->dst; @@ -251,8 +248,8 @@ static void mt_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, int start_mb_row) { - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col; + const int *last_row_current_mb_col; + int *current_mb_col; int mb_row; VP8_COMMON *pc = &pbi->common; const int nsync = pbi->sync_range; @@ -289,6 +286,9 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->up_available = (start_mb_row != 0); + xd->mode_info_context = pc->mi + pc->mode_info_stride * start_mb_row; + xd->mode_info_stride = pc->mode_info_stride; + for (mb_row = start_mb_row; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1)) { int recon_yoffset, recon_uvoffset; @@ -318,7 +318,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->left_available = 0; - xd->mb_to_top_edge = -((mb_row * 16)) << 3; + xd->mb_to_top_edge = -((mb_row * 16) << 3); xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3; if (pbi->common.filter_level) { @@ -355,14 +355,15 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride); } - for (mb_col = 0; mb_col < pc->mb_cols; mb_col++) { - *current_mb_col = mb_col - 1; + for (mb_col = 0; mb_col < pc->mb_cols; ++mb_col) { + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + 
if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &pbi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } /* Distance of MB to the various image edges. @@ -548,7 +549,7 @@ static void mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd, } /* last MB of row is ready just after extension is done */ - *current_mb_col = mb_col + nsync; + protected_write(&pbi->pmutex[mb_row], current_mb_col, mb_col + nsync); ++xd->mode_info_context; /* skip prediction column */ xd->up_available = 1; @@ -568,10 +569,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES mb_row_left_context; while (1) { - if (pbi->b_multithreaded_rd == 0) break; + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) break; if (sem_wait(&pbi->h_event_start_decoding[ithread]) == 0) { - if (pbi->b_multithreaded_rd == 0) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd) == 0) { break; } else { MACROBLOCKD *xd = &mbrd->mbd; @@ -591,6 +592,7 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { pbi->b_multithreaded_rd = 0; pbi->allocated_decoding_thread_count = 0; + pthread_mutex_init(&pbi->mt_mutex, NULL); /* limit decoding threads to the max number of token partitions */ core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; @@ -647,6 +649,16 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) { void vp8mt_de_alloc_temp_buffers(VP8D_COMP *pbi, int mb_rows) { int i; + /* De-allocate mutex */ + if (pbi->pmutex != NULL) { + for (i = 0; i < mb_rows; ++i) { + pthread_mutex_destroy(&pbi->pmutex[i]); + } + + vpx_free(pbi->pmutex); + pbi->pmutex = NULL; + } + vpx_free(pbi->mt_current_mb_col); pbi->mt_current_mb_col = NULL; @@ -712,7 +724,7 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { int i; int uv_width; - if (pbi->b_multithreaded_rd) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { vp8mt_de_alloc_temp_buffers(pbi, prev_mb_rows); /* our internal buffers are always multiples of 16 */ @@ -730,6 +742,15 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { uv_width = width >> 1; + /* Allocate mutex */ + CHECK_MEM_ERROR(pbi->pmutex, + vpx_malloc(sizeof(*pbi->pmutex) * pc->mb_rows)); + if (pbi->pmutex) { + for (i = 0; i < pc->mb_rows; ++i) { + pthread_mutex_init(&pbi->pmutex[i], NULL); + } + } + /* Allocate an int for each mb row. 
*/ CALLOC_ARRAY(pbi->mt_current_mb_col, pc->mb_rows); @@ -772,9 +793,9 @@ void vp8mt_alloc_temp_buffers(VP8D_COMP *pbi, int width, int prev_mb_rows) { void vp8_decoder_remove_threads(VP8D_COMP *pbi) { /* shutdown MB Decoding thread; */ - if (pbi->b_multithreaded_rd) { + if (protected_read(&pbi->mt_mutex, &pbi->b_multithreaded_rd)) { int i; - pbi->b_multithreaded_rd = 0; + protected_write(&pbi->mt_mutex, &pbi->b_multithreaded_rd, 0); /* allow all threads to exit */ for (i = 0; i < pbi->allocated_decoding_thread_count; ++i) { @@ -804,6 +825,7 @@ void vp8_decoder_remove_threads(VP8D_COMP *pbi) { vp8mt_de_alloc_temp_buffers(pbi, pbi->common.mb_rows); } + pthread_mutex_destroy(&pbi->mt_mutex); } void vp8mt_decode_mb_rows(VP8D_COMP *pbi, MACROBLOCKD *xd) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c index e41d513c1b7..c7ad3bfe2c9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/encodeframe.c @@ -345,8 +345,8 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD const int nsync = cpi->mt_sync_range; const int rightmost_col = cm->mb_cols + nsync; - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const int *last_row_current_mb_col; + int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; if ((cpi->b_multi_threaded != 0) && (mb_row != 0)) { last_row_current_mb_col = &cpi->mt_current_mb_col[mb_row - 1]; @@ -419,13 +419,14 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, #if CONFIG_MULTITHREAD if (cpi->b_multi_threaded != 0) { - *current_mb_col = mb_col - 1; /* set previous MB done */ + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } } #endif @@ -565,7 +566,9 @@ static void encode_mb_row(VP8_COMP *cpi, VP8_COMMON *cm, int mb_row, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); #if CONFIG_MULTITHREAD - if (cpi->b_multi_threaded != 0) *current_mb_col = rightmost_col; + if (cpi->b_multi_threaded != 0) { + protected_write(&cpi->pmutex[mb_row], current_mb_col, rightmost_col); + } #endif /* this is to account for the border */ diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c index 708002b1e67..df34997accd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ethreading.c @@ -25,11 +25,11 @@ static THREAD_FUNCTION thread_loopfilter(void *p_data) { VP8_COMMON *cm = &cpi->common; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_lpf) == 0) { /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; vp8_loopfilter_frame(cpi, cm); @@ -47,7 +47,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { ENTROPY_CONTEXT_PLANES 
mb_row_left_context; while (1) { - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; if (sem_wait(&cpi->h_event_start_encoding[ithread]) == 0) { const int nsync = cpi->mt_sync_range; @@ -65,7 +65,10 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int *totalrate = &mbri->totalrate; /* we're shutting down */ - if (cpi->b_multi_threaded == 0) break; + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded) == 0) break; + + xd->mode_info_context = cm->mi + cm->mode_info_stride * (ithread + 1); + xd->mode_info_stride = cm->mode_info_stride; for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1)) { @@ -76,8 +79,8 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride; int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride; int map_index = (mb_row * cm->mb_cols); - volatile const int *last_row_current_mb_col; - volatile int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; + const int *last_row_current_mb_col; + int *current_mb_col = &cpi->mt_current_mb_col[mb_row]; #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) vp8_writer *w = &cpi->bc[1 + (mb_row % num_part)]; @@ -103,13 +106,14 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { /* for each macroblock col in image */ for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - *current_mb_col = mb_col - 1; + if (((mb_col - 1) % nsync) == 0) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row]; + protected_write(mutex, current_mb_col, mb_col - 1); + } - if ((mb_col & (nsync - 1)) == 0) { - while (mb_col > (*last_row_current_mb_col - nsync)) { - x86_pause_hint(); - thread_sleep(0); - } + if (mb_row && !(mb_col & (nsync - 1))) { + pthread_mutex_t *mutex = &cpi->pmutex[mb_row - 1]; + sync_read(mutex, mb_col, last_row_current_mb_col, nsync); } #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING @@ -281,7 +285,7 @@ static THREAD_FUNCTION thread_encoding_proc(void *p_data) { vp8_extend_mb_row(&cm->yv12_fb[dst_fb_idx], xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8); - *current_mb_col = mb_col + nsync; + protected_write(&cpi->pmutex[mb_row], current_mb_col, mb_col + nsync); /* this is to account for the border */ xd->mode_info_context++; @@ -450,9 +454,6 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi, MACROBLOCK *x, mb->partition_info = x->pi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_context = cm->mi + x->e_mbd.mode_info_stride * (i + 1); - mbd->mode_info_stride = cm->mode_info_stride; - mbd->frame_type = cm->frame_type; mb->src = *cpi->Source; @@ -492,6 +493,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { cpi->encoding_thread_count = 0; cpi->b_lpf_running = 0; + pthread_mutex_init(&cpi->mt_mutex, NULL); + if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; @@ -551,7 +554,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if (rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { pthread_join(cpi->h_encoding_thread[ithread], 0); sem_destroy(&cpi->h_event_start_encoding[ithread]); @@ -565,6 +568,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); + pthread_mutex_destroy(&cpi->mt_mutex); + return -1; } @@ -579,7 +584,7 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { if 
(rc) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); for (--ithread; ithread >= 0; ithread--) { sem_post(&cpi->h_event_start_encoding[ithread]); sem_post(&cpi->h_event_end_encoding[ithread]); @@ -597,6 +602,8 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); + pthread_mutex_destroy(&cpi->mt_mutex); + return -2; } } @@ -605,9 +612,9 @@ int vp8cx_create_encoder_threads(VP8_COMP *cpi) { } void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { - if (cpi->b_multi_threaded) { + if (protected_read(&cpi->mt_mutex, &cpi->b_multi_threaded)) { /* shutdown other threads */ - cpi->b_multi_threaded = 0; + protected_write(&cpi->mt_mutex, &cpi->b_multi_threaded, 0); { int i; @@ -635,5 +642,6 @@ void vp8cx_remove_encoder_threads(VP8_COMP *cpi) { vpx_free(cpi->mb_row_ei); vpx_free(cpi->en_thread_data); } + pthread_mutex_destroy(&cpi->mt_mutex); } #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c index c5389594553..9717feb136b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c @@ -446,6 +446,18 @@ static void dealloc_compressor_data(VP8_COMP *cpi) { cpi->mb.pip = 0; #if CONFIG_MULTITHREAD + /* De-allocate mutex */ + if (cpi->pmutex != NULL) { + VP8_COMMON *const pc = &cpi->common; + int i; + + for (i = 0; i < pc->mb_rows; ++i) { + pthread_mutex_destroy(&cpi->pmutex[i]); + } + vpx_free(cpi->pmutex); + cpi->pmutex = NULL; + } + vpx_free(cpi->mt_current_mb_col); cpi->mt_current_mb_col = NULL; #endif @@ -1075,6 +1087,9 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { int width = cm->Width; int height = cm->Height; +#if CONFIG_MULTITHREAD + int prev_mb_rows = cm->mb_rows; +#endif if (vp8_alloc_frame_buffers(cm, width, height)) { vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, @@ -1164,6 +1179,25 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi) { } if (cpi->oxcf.multi_threaded > 1) { + int i; + + /* De-allocate and re-allocate mutex */ + if (cpi->pmutex != NULL) { + for (i = 0; i < prev_mb_rows; ++i) { + pthread_mutex_destroy(&cpi->pmutex[i]); + } + vpx_free(cpi->pmutex); + cpi->pmutex = NULL; + } + + CHECK_MEM_ERROR(cpi->pmutex, + vpx_malloc(sizeof(*cpi->pmutex) * cm->mb_rows)); + if (cpi->pmutex) { + for (i = 0; i < cm->mb_rows; ++i) { + pthread_mutex_init(&cpi->pmutex[i], NULL); + } + } + vpx_free(cpi->mt_current_mb_col); CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows)); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h index bfcc6457c19..fe775064a45 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h @@ -511,6 +511,8 @@ typedef struct VP8_COMP { #if CONFIG_MULTITHREAD /* multithread data */ + pthread_mutex_t *pmutex; + pthread_mutex_t mt_mutex; /* mutex for b_multi_threaded */ int *mt_current_mb_col; int mt_sync_range; int b_multi_threaded; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c index cbd61de90ab..dd1ea03b6b9 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht4x4_add_neon.c @@ -77,10 +77,10 @@ static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16); q10s32 = vmlal_s16(q10s32, d19s16, *d2s16); - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q10s32, 14); + d26s16 = vrshrn_n_s32(q13s32, 14); + d27s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d28s16 = vrshrn_n_s32(q10s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); @@ -125,17 +125,17 @@ static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16, q14s32 = vaddq_s32(q11s32, q12s32); q10s32 = vsubq_s32(q10s32, q12s32); - d16s16 = vqrshrn_n_s32(q13s32, 14); - d17s16 = vqrshrn_n_s32(q14s32, 14); - d18s16 = vqrshrn_n_s32(q15s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); + d16s16 = vrshrn_n_s32(q13s32, 14); + d17s16 = vrshrn_n_s32(q14s32, 14); + d18s16 = vrshrn_n_s32(q15s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); *q8s16 = vcombine_s16(d16s16, d17s16); *q9s16 = vcombine_s16(d18s16, d19s16); } -void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { uint8x8_t d26u8, d27u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; uint32x2_t d26u32, d27u32; @@ -151,7 +151,7 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C - vp9_iht4x4_16_add_c(input, dest, dest_stride, tx_type); + vp9_iht4x4_16_add_c(input, dest, stride, tx_type); return; case 1: // iadst_idct // generate constants @@ -203,11 +203,11 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, q9s16 = vrshrq_n_s16(q9s16, 4); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); - dest += dest_stride; + dest += stride; d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); - dest += dest_stride; + dest += stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); @@ -217,10 +217,10 @@ void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); - dest -= dest_stride; + dest -= stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index f7e0a6d9817..1c739861c38 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -76,10 +76,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - d8s16 
= vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d8s16 = vrshrn_n_s32(q2s32, 14); + d9s16 = vrshrn_n_s32(q3s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); q4s16 = vcombine_s16(d8s16, d9s16); q5s16 = vcombine_s16(d10s16, d11s16); @@ -93,10 +93,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q9s32 = vmlal_s16(q9s32, d22s16, d2s16); q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 14); + d14s16 = vrshrn_n_s32(q2s32, 14); + d15s16 = vrshrn_n_s32(q3s32, 14); + d12s16 = vrshrn_n_s32(q9s32, 14); + d13s16 = vrshrn_n_s32(q13s32, 14); q6s16 = vcombine_s16(d12s16, d13s16); q7s16 = vcombine_s16(d14s16, d15s16); @@ -115,10 +115,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d0s16 = vdup_n_s16(cospi_24_64); d1s16 = vdup_n_s16(cospi_8_64); - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); + d18s16 = vrshrn_n_s32(q2s32, 14); + d19s16 = vrshrn_n_s32(q3s32, 14); + d22s16 = vrshrn_n_s32(q13s32, 14); + d23s16 = vrshrn_n_s32(q15s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); *q11s16 = vcombine_s16(d22s16, d23s16); @@ -132,10 +132,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q8s32 = vmlal_s16(q8s32, d28s16, d0s16); q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); + d26s16 = vrshrn_n_s32(q2s32, 14); + d27s16 = vrshrn_n_s32(q3s32, 14); + d30s16 = vrshrn_n_s32(q8s32, 14); + d31s16 = vrshrn_n_s32(q12s32, 14); *q13s16 = vcombine_s16(d26s16, d27s16); *q15s16 = vcombine_s16(d30s16, d31s16); @@ -165,10 +165,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); + d10s16 = vrshrn_n_s32(q9s32, 14); + d11s16 = vrshrn_n_s32(q10s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -242,8 +242,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); - d22s16 = vqrshrn_n_s32(q11s32, 14); - d23s16 = vqrshrn_n_s32(q12s32, 14); + d22s16 = vrshrn_n_s32(q11s32, 14); + d23s16 = vrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); @@ -251,12 +251,12 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); - d2s16 = vqrshrn_n_s32(q1s32, 14); - d3s16 = vqrshrn_n_s32(q2s32, 14); - d24s16 = vqrshrn_n_s32(q12s32, 14); - d25s16 = vqrshrn_n_s32(q15s32, 14); - d6s16 = vqrshrn_n_s32(q3s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); + d2s16 = vrshrn_n_s32(q1s32, 14); + d3s16 = vrshrn_n_s32(q2s32, 14); + d24s16 = vrshrn_n_s32(q12s32, 14); + d25s16 = vrshrn_n_s32(q15s32, 14); + d6s16 = vrshrn_n_s32(q3s32, 14); + d7s16 = vrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = 
vdup_n_s16(cospi_10_64); @@ -291,10 +291,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); - d28s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); + d28s16 = vrshrn_n_s32(q14s32, 14); + d29s16 = vrshrn_n_s32(q15s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); @@ -305,10 +305,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d30s16 = vdup_n_s16(cospi_8_64); d31s16 = vdup_n_s16(cospi_24_64); - d18s16 = vqrshrn_n_s32(q9s32, 14); - d19s16 = vqrshrn_n_s32(q10s32, 14); - d8s16 = vqrshrn_n_s32(q4s32, 14); - d9s16 = vqrshrn_n_s32(q5s32, 14); + d18s16 = vrshrn_n_s32(q9s32, 14); + d19s16 = vrshrn_n_s32(q10s32, 14); + d8s16 = vrshrn_n_s32(q4s32, 14); + d9s16 = vrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); @@ -341,10 +341,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); - d18s16 = vqrshrn_n_s32(q14s32, 14); - d19s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); + d18s16 = vrshrn_n_s32(q14s32, 14); + d19s16 = vrshrn_n_s32(q15s32, 14); + d10s16 = vrshrn_n_s32(q5s32, 14); + d11s16 = vrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); @@ -352,10 +352,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); - d28s16 = vqrshrn_n_s32(q1s32, 14); - d29s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q7s32, 14); - d15s16 = vqrshrn_n_s32(q0s32, 14); + d28s16 = vrshrn_n_s32(q1s32, 14); + d29s16 = vrshrn_n_s32(q3s32, 14); + d14s16 = vrshrn_n_s32(q7s32, 14); + d15s16 = vrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16(cospi_16_64); @@ -374,10 +374,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); - d4s16 = vqrshrn_n_s32(q2s32, 14); - d5s16 = vqrshrn_n_s32(q3s32, 14); - d24s16 = vqrshrn_n_s32(q13s32, 14); - d25s16 = vqrshrn_n_s32(q1s32, 14); + d4s16 = vrshrn_n_s32(q2s32, 14); + d5s16 = vrshrn_n_s32(q3s32, 14); + d24s16 = vrshrn_n_s32(q13s32, 14); + d25s16 = vrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); @@ -391,10 +391,10 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); - d20s16 = vqrshrn_n_s32(q13s32, 14); - d21s16 = vqrshrn_n_s32(q1s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q0s32, 14); + d20s16 = vrshrn_n_s32(q13s32, 14); + d21s16 = vrshrn_n_s32(q1s32, 14); + d12s16 = vrshrn_n_s32(q11s32, 14); + d13s16 = vrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); @@ -406,8 +406,8 @@ static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, *q15s16 = vsubq_s16(q5s16, q4s16); } -void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, + int tx_type) { int i; uint8_t *d1, *d2; 
uint8x8_t d0u8, d1u8, d2u8, d3u8; @@ -429,7 +429,7 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C - vp9_iht8x8_64_add_c(input, dest, dest_stride, tx_type); + vp9_iht8x8_64_add_c(input, dest, stride, tx_type); return; case 1: // iadst_idct // generate IDCT constants @@ -508,13 +508,13 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, } d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; + d1 += stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); @@ -529,12 +529,12 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; + d2 += stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + d2 += stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c index 2d4839174db..f6b29265e66 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans4_dspr2.c @@ -21,8 +21,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; @@ -37,7 +37,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical vpx_idct4_rows_dspr2(input, outptr); - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal vpx_idct4_rows_dspr2(input, outptr); @@ -48,8 +48,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(outptr, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) + + dest[j * stride + i]); outptr += 4; } @@ -66,7 +66,7 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 4 + j] = out[j * 4 + i]; } } - vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 4; ++i) { @@ -80,8 +80,8 @@ void vp9_iht4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, iadst4_dspr2(temp_in, temp_out); for (j = 0; j < 4; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) 
+ + dest[j * stride + i]); } break; default: printf("vp9_short_iht4x4_add_dspr2 : Invalid tx_type\n"); break; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c index 86896f04ca5..b945e307e63 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/mips/dspr2/vp9_itrans8_dspr2.c @@ -20,8 +20,8 @@ #include "vpx_ports/mem.h" #if HAVE_DSPR2 -void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride, int tx_type) { +void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride, + int tx_type) { int i, j; DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; @@ -34,7 +34,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, switch (tx_type) { case DCT_DCT: // DCT in both horizontal and vertical idct8_rows_dspr2(input, outptr, 8); - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); break; case ADST_DCT: // ADST in vertical, DCT in horizontal idct8_rows_dspr2(input, outptr, 8); @@ -43,8 +43,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(&out[i * 8], temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; case DCT_ADST: // DCT in vertical, ADST in horizontal @@ -59,7 +59,7 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, temp_in[i * 8 + j] = out[j * 8 + i]; } } - idct8_columns_add_blk_dspr2(&temp_in[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&temp_in[0], dest, stride); break; case ADST_ADST: // ADST in both directions for (i = 0; i < 8; ++i) { @@ -74,8 +74,8 @@ void vp9_iht8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, iadst8_dspr2(temp_in, temp_out); for (j = 0; j < 8; ++j) - dest[j * dest_stride + i] = clip_pixel( - ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]); + dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) + + dest[j * stride + i]); } break; default: printf("vp9_short_iht8x8_add_dspr2 : Invalid tx_type\n"); break; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h index 6dcfa412bee..b8b647bf18d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_ppflags.h @@ -20,14 +20,7 @@ enum { VP9D_DEBLOCK = 1 << 0, VP9D_DEMACROBLOCK = 1 << 1, VP9D_ADDNOISE = 1 << 2, - VP9D_DEBUG_TXT_FRAME_INFO = 1 << 3, - VP9D_DEBUG_TXT_MBLK_MODES = 1 << 4, - VP9D_DEBUG_TXT_DC_DIFF = 1 << 5, - VP9D_DEBUG_TXT_RATE_INFO = 1 << 6, - VP9D_DEBUG_DRAW_MV = 1 << 7, - VP9D_DEBUG_CLR_BLK_MODES = 1 << 8, - VP9D_DEBUG_CLR_FRM_REF_BLKS = 1 << 9, - VP9D_MFQE = 1 << 10 + VP9D_MFQE = 1 << 3 }; typedef struct { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl index abef0676396..088b004f528 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -48,16 +48,16 @@ specialize 
qw/vp9_filter_by_weight8x8 sse2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2/; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht8x8_64_add sse2/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; @@ -66,16 +66,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; } else { - add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht4x4_16_add sse2 neon dspr2 msa/; - add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type"; + add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; specialize qw/vp9_iht8x8_64_add sse2 neon dspr2 msa/; add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; @@ -101,9 +101,9 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. 
- add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type, int bd"; add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c index 072d92e4e91..3dc88b1914e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -128,16 +128,20 @@ int vp9_cyclic_refresh_rc_bits_per_mb(const VP9_COMP *cpi, int i, CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; int bits_per_mb; int num8x8bl = cm->MBs << 2; + // Compute delta-q corresponding to qindex i. + int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); // Weight for segment prior to encoding: take the average of the target // number for the frame to be encoded and the actual from the previous frame. + // Use the target if its less. int target_refresh = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100; + double weight_segment_target = (double)(target_refresh) / num8x8bl; double weight_segment = (double)((target_refresh + cr->actual_num_seg1_blocks + cr->actual_num_seg2_blocks) >> 1) / num8x8bl; - // Compute delta-q corresponding to qindex i. - int deltaq = compute_deltaq(cpi, i, cr->rate_ratio_qdelta); + if (weight_segment_target < 7 * weight_segment / 8) + weight_segment = weight_segment_target; // Take segment weighted average for bits per mb. bits_per_mb = (int)((1.0 - weight_segment) * vp9_rc_bits_per_mb(cm->frame_type, i, @@ -383,13 +387,14 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { : vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex); // More aggressive settings for noisy content. if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium) { - consec_zero_mv_thresh = 80; + consec_zero_mv_thresh = 60; qindex_thresh = VPXMAX(vp9_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST1, cm->base_qindex), - 7 * cm->base_qindex >> 3); + cm->base_qindex); } do { int sum_map = 0; + int consec_zero_mv_thresh_block = consec_zero_mv_thresh; // Get the mi_row/mi_col corresponding to superblock index i. 
int sb_row_index = (i / sb_cols); int sb_col_index = i - sb_row_index * sb_cols; @@ -403,6 +408,9 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]); ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]); + if (cpi->noise_estimate.enabled && cpi->noise_estimate.level >= kMedium && + (xmis <= 2 || ymis <= 2)) + consec_zero_mv_thresh_block = 10; for (y = 0; y < ymis; y++) { for (x = 0; x < xmis; x++) { const int bl_index2 = bl_index + y * cm->mi_cols + x; @@ -412,7 +420,7 @@ static void cyclic_refresh_update_map(VP9_COMP *const cpi) { if (cr->map[bl_index2] == 0) { count_tot++; if (cr->last_coded_q_map[bl_index2] > qindex_thresh || - cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh) { + cpi->consec_zero_mv[bl_index2] < consec_zero_mv_thresh_block) { sum_map++; count_sel++; } @@ -468,8 +476,8 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) { } // Adjust some parameters for low resolutions at low bitrates. if (cm->width <= 352 && cm->height <= 288 && rc->avg_frame_bandwidth < 3400) { - cr->motion_thresh = 4; - cr->rate_boost_fac = 10; + cr->motion_thresh = 16; + cr->rate_boost_fac = 13; } if (cpi->svc.spatial_layer_id > 0) { cr->motion_thresh = 4; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c index 3ab05375ff7..323c053edff 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -477,8 +477,8 @@ static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) { thresholds[2] = threshold_base >> 2; thresholds[3] = threshold_base << 2; } else { - // Increase base variance threshold based on estimated noise level. - if (cpi->noise_estimate.enabled) { + // Increase base variance threshold based on estimated noise level. + if (cpi->noise_estimate.enabled && cm->width >= 640 && cm->height >= 480) { NOISE_LEVEL noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); if (noise_level == kHigh) @@ -526,6 +526,7 @@ void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) { : 1000; cpi->vbp_bsize_min = BLOCK_16X16; } + cpi->vbp_threshold_copy = cpi->vbp_thresholds[0] << 16; cpi->vbp_threshold_minmax = 15 + (q >> 3); } } @@ -742,9 +743,13 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, continue; if ((*this_mi)->sb_type == BLOCK_32X32) { - if (vt->split[i].part_variances.none.variance < (thresholds[1] >> 1)) + int64_t threshold_32x32 = (cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) + ? ((5 * thresholds[1]) >> 3) + : (thresholds[1] >> 1); + if (vt->split[i].part_variances.none.variance < threshold_32x32) x->variance_low[i + 5] = 1; - } else if (cpi->sf.short_circuit_low_temp_var == 2) { + } else if (cpi->sf.short_circuit_low_temp_var >= 2) { // For 32x16 and 16x32 blocks, the flag is set on each 16x16 block // inside. 
if ((*this_mi)->sb_type == BLOCK_16X16 || @@ -762,6 +767,93 @@ static void set_low_temp_var_flag(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, } } +static void copy_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) / 4; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + partition = partition_lookup[bsl][prev_part[start_pos]]; + subsize = get_subsize(bsize, partition); + mi = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + + if (subsize < BLOCK_8X8) { + mi->sb_type = bsize; + } else { + switch (partition) { + case PARTITION_NONE: mi->sb_type = bsize; break; + case PARTITION_HORZ: + mi->sb_type = subsize; + if (mi_row + bs < cm->mi_rows) + cm->mi_grid_visible[(mi_row + bs) * cm->mi_stride + mi_col]->sb_type = + subsize; + break; + case PARTITION_VERT: + mi->sb_type = subsize; + if (mi_col + bs < cm->mi_cols) + cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col + bs]->sb_type = + subsize; + break; + case PARTITION_SPLIT: + copy_prev_partition(cpi, subsize, mi_row, mi_col); + copy_prev_partition(cpi, subsize, mi_row + bs, mi_col); + copy_prev_partition(cpi, subsize, mi_row, mi_col + bs); + copy_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + +static void update_prev_partition(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, + int mi_col) { + VP9_COMMON *const cm = &cpi->common; + BLOCK_SIZE *prev_part = cpi->prev_partition; + int start_pos = mi_row * cm->mi_stride + mi_col; + const int bsl = b_width_log2_lookup[bsize]; + const int bs = (1 << bsl) / 4; + BLOCK_SIZE subsize; + PARTITION_TYPE partition; + const MODE_INFO *mi = NULL; + + if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; + + mi = cm->mi_grid_visible[start_pos]; + partition = partition_lookup[bsl][mi->sb_type]; + subsize = get_subsize(bsize, partition); + if (subsize < BLOCK_8X8) { + prev_part[start_pos] = bsize; + } else { + switch (partition) { + case PARTITION_NONE: prev_part[start_pos] = bsize; break; + case PARTITION_HORZ: + prev_part[start_pos] = subsize; + if (mi_row + bs < cm->mi_rows) + prev_part[start_pos + bs * cm->mi_stride] = subsize; + break; + case PARTITION_VERT: + prev_part[start_pos] = subsize; + if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; + break; + case PARTITION_SPLIT: + update_prev_partition(cpi, subsize, mi_row, mi_col); + update_prev_partition(cpi, subsize, mi_row + bs, mi_col); + update_prev_partition(cpi, subsize, mi_row, mi_col + bs); + update_prev_partition(cpi, subsize, mi_row + bs, mi_col + bs); + break; + default: assert(0); + } + } +} + static void chroma_check(VP9_COMP *cpi, MACROBLOCK *x, int bsize, unsigned int y_sad, int is_key_frame) { int i; @@ -824,6 +916,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, const int low_res = (cm->width <= 352 && cm->height <= 288); int variance4x4downsample[16]; int segment_id; + int offset = cm->mi_stride * mi_row + mi_col; set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64); segment_id = xd->mi[0]->segment_id; @@ -834,8 +927,8 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } - threshold_4x4avg = - (cpi->oxcf.speed < 8) ? 
thresholds[1] << 1 : thresholds[2] >> 1; + // For non keyframes, disable 4x4 average for low resolution when speed = 8 + threshold_4x4avg = (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : INT64_MAX; memset(x->variance_low, 0, sizeof(x->variance_low)); @@ -857,7 +950,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = NULL; - unsigned int y_sad_g, y_sad_thr; + unsigned int y_sad_g, y_sad_thr, y_sad_last; bsize = BLOCK_32X32 + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows); @@ -897,6 +990,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, mi->interp_filter = BILINEAR; y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + y_sad_last = y_sad; // Pick ref frame for partitioning, bias last frame when y_sad_g and y_sad // are close if short_circuit_low_temp_var is on. y_sad_thr = cpi->sf.short_circuit_low_temp_var ? (y_sad * 7) >> 3 : y_sad; @@ -937,6 +1031,20 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, return 0; } } + + // If the y_sad is small enough, copy the partition of the superblock in the + // last frame to current frame only if the last frame is not a keyframe. + // TODO(jianj) : tune the threshold. + if (cpi->sf.copy_partition_flag && cpi->rc.frames_since_key > 1 && + segment_id == CR_SEGMENT_ID_BASE && + cpi->prev_segment_id[offset] == CR_SEGMENT_ID_BASE && + y_sad_last < cpi->vbp_threshold_copy) { + if (cpi->prev_partition != NULL) { + copy_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); + chroma_check(cpi, x, bsize, y_sad, is_key_frame); + return 0; + } + } } else { d = VP9_VAR_OFFS; dp = 0; @@ -1131,6 +1239,11 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } + if (cm->frame_type != KEY_FRAME && cpi->sf.copy_partition_flag) { + update_prev_partition(cpi, BLOCK_64X64, mi_row, mi_col); + cpi->prev_segment_id[offset] = segment_id; + } + if (cpi->sf.short_circuit_low_temp_var) { set_low_temp_var_flag(cpi, x, xd, &vt, thresholds, ref_frame_partition, mi_col, mi_row); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c index 20ebe68197e..2cb137d8b93 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemb.c @@ -109,6 +109,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int64_t error0, error1; int16_t t0, t1; EXTRABIT e0; + unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] = + mb->token_costs[tx_size][type][ref]; int best, band, pt, i, final_eob; #if CONFIG_VP9_HIGHBITDEPTH const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd); @@ -137,7 +139,6 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, int x = qcoeff[rc]; /* Only add a trellis state for non-zero coefficients. */ if (x) { - int shortcut = 0; error0 = tokens[next][0].error; error1 = tokens[next][1].error; /* Evaluate the first possibility for this state. 
*/ @@ -148,10 +149,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if (next < default_eob) { band = band_translate[i + 1]; pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][0].token]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][pt] - [tokens[next][1].token]; + rate0 += token_costs[band][0][pt][tokens[next][0].token]; + rate1 += token_costs[band][0][pt][tokens[next][1].token]; } UPDATE_RD_COST(); /* And pick the best. */ @@ -178,12 +177,7 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) && (abs(x) * dequant_ptr[rc != 0] < - (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) - shortcut = 1; - else - shortcut = 0; - - if (shortcut) { + (abs(coeff[rc]) << shift) + dequant_ptr[rc != 0])) { sz = -(x < 0); x -= 2 * sz + 1; } else { @@ -208,13 +202,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, band = band_translate[i + 1]; if (t0 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); - rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][0].token]; + rate0 += token_costs[band][!x][pt][tokens[next][0].token]; } if (t1 != EOB_TOKEN) { pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); - rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt] - [tokens[next][1].token]; + rate1 += token_costs[band][!x][pt][tokens[next][1].token]; } } @@ -223,18 +215,17 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, best = rd_cost1 < rd_cost0; base_bits = vp9_get_cost(t0, e0, cat6_high_cost); - if (shortcut) { #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; - } else { - dx -= (dequant_ptr[rc != 0] + sz) ^ sz; - } -#else + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz; + } else { dx -= (dequant_ptr[rc != 0] + sz) ^ sz; -#endif // CONFIG_VP9_HIGHBITDEPTH - d2 = dx * dx; } +#else + dx -= (dequant_ptr[rc != 0] + sz) ^ sz; +#endif // CONFIG_VP9_HIGHBITDEPTH + d2 = dx * dx; + tokens[i][1].rate = base_bits + (best ? rate1 : rate0); tokens[i][1].error = d2 + (best ? error1 : error0); tokens[i][1].next = next; @@ -270,13 +261,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, t1 = tokens[next][1].token; /* Update the cost of each path if we're past the EOB token. 
*/ if (t0 != EOB_TOKEN) { - tokens[next][0].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t0]; + tokens[next][0].rate += token_costs[band][1][pt][t0]; tokens[next][0].token = ZERO_TOKEN; } if (t1 != EOB_TOKEN) { - tokens[next][1].rate += - mb->token_costs[tx_size][type][ref][band][1][pt][t1]; + tokens[next][1].rate += token_costs[band][1][pt][t1]; tokens[next][1].token = ZERO_TOKEN; } tokens[i][0].best_index = tokens[i][1].best_index = 0; @@ -292,8 +281,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size, error1 = tokens[next][1].error; t0 = tokens[next][0].token; t1 = tokens[next][1].token; - rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0]; - rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1]; + rate0 += token_costs[band][0][ctx][t0]; + rate1 += token_costs[band][0][ctx][t1]; UPDATE_RD_COST(); best = rd_cost1 < rd_cost0; final_eob = -1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c index 2a58003829c..432eac8da00 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c @@ -108,7 +108,7 @@ static int is_psnr_calc_enabled(VP9_COMP *cpi) { } /* clang-format off */ -static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { +const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { { LEVEL_1, 829440, 36864, 200, 400, 2, 1, 4, 8 }, { LEVEL_1_1, 2764800, 73728, 800, 1000, 2, 1, 4, 8 }, { LEVEL_2, 4608000, 122880, 1800, 1500, 2, 1, 4, 8 }, @@ -128,6 +128,16 @@ static const Vp9LevelSpec vp9_level_defs[VP9_LEVELS] = { }; /* clang-format on */ +static const char *level_fail_messages[TARGET_LEVEL_FAIL_IDS] = + { "The average bit-rate is too high.", + "The picture size is too large.", + "The luma sample rate is too large.", + "The CPB size is too large.", + "The compression ratio is too small", + "Too many column tiles are used.", + "The alt-ref distance is too small.", + "Too many reference buffers are used." 
}; + static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { switch (mode) { case NORMAL: @@ -224,8 +234,9 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) { for (i = 0; i < VP9_LEVELS; ++i) { this_level = &vp9_level_defs[i]; - if ((double)level_spec->max_luma_sample_rate * (1 + SAMPLE_RATE_GRACE_P) > - (double)this_level->max_luma_sample_rate || + if ((double)level_spec->max_luma_sample_rate > + (double)this_level->max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P) || level_spec->max_luma_picture_size > this_level->max_luma_picture_size || level_spec->average_bitrate > this_level->average_bitrate || level_spec->max_cpb_size > this_level->max_cpb_size || @@ -439,6 +450,12 @@ static void dealloc_compressor_data(VP9_COMP *cpi) { cpi->nmvsadcosts_hp[0] = NULL; cpi->nmvsadcosts_hp[1] = NULL; + vpx_free(cpi->prev_partition); + cpi->prev_partition = NULL; + + vpx_free(cpi->prev_segment_id); + cpi->prev_segment_id = NULL; + vp9_cyclic_refresh_free(cpi->cyclic_refresh); cpi->cyclic_refresh = NULL; @@ -872,6 +889,22 @@ static void init_buffer_indices(VP9_COMP *cpi) { cpi->alt_fb_idx = 2; } +static void init_level_constraint(LevelConstraint *lc) { + lc->level_index = -1; + lc->max_cpb_size = INT_MAX; + lc->max_frame_size = INT_MAX; + lc->rc_config_updated = 0; + lc->fail_flag = 0; +} + +static void set_level_constraint(LevelConstraint *ls, int8_t level_index) { + vpx_clear_system_state(); + ls->level_index = level_index; + if (level_index >= 0) { + ls->max_cpb_size = vp9_level_defs[level_index].max_cpb_size * (double)1000; + } +} + static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { VP9_COMMON *const cm = &cpi->common; @@ -887,6 +920,8 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); cm->width = oxcf->width; cm->height = oxcf->height; @@ -1403,6 +1438,8 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { cpi->target_level = oxcf->target_level; cpi->keep_level_stats = oxcf->target_level != LEVEL_MAX; + set_level_constraint(&cpi->level_constraint, + get_level_index(cpi->target_level)); if (cm->profile <= PROFILE_1) assert(cm->bit_depth == VPX_BITS_8); @@ -1679,6 +1716,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS; init_level_info(&cpi->level_info); + init_level_constraint(&cpi->level_constraint); #if CONFIG_INTERNAL_STATS cpi->b_calculate_blockiness = 1; @@ -3127,7 +3165,7 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR) && + cpi->oxcf.rc_mode == VPX_VBR || cpi->sf.copy_partition_flag) && cm->show_frame) vp9_avg_source_sad(cpi); @@ -3238,9 +3276,14 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, int frame_over_shoot_limit; int frame_under_shoot_limit; int q = 0, q_low = 0, q_high = 0; + int enable_acl; set_size_independent_vars(cpi); + enable_acl = cpi->sf.allow_acl + ? 
(cm->frame_type == KEY_FRAME) || (cm->show_frame == 0) + : 0; + do { vpx_clear_system_state(); @@ -3335,7 +3378,6 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, if (!cpi->sf.use_nonrd_pick_mode) vp9_pack_bitstream(cpi, dest, size); rc->projected_frame_size = (int)(*size) << 3; - restore_coding_context(cpi); if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1; } @@ -3505,7 +3547,22 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, ++cpi->tot_recode_hits; #endif } + + if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) + if (loop || !enable_acl) restore_coding_context(cpi); } while (loop); + + if (enable_acl) { + vp9_encode_frame(cpi); + vpx_clear_system_state(); + restore_coding_context(cpi); + vp9_pack_bitstream(cpi, dest, size); + + vp9_encode_frame(cpi); + vpx_clear_system_state(); + + restore_coding_context(cpi); + } } static int get_ref_frame_flags(const VP9_COMP *cpi) { @@ -4288,6 +4345,26 @@ static void adjust_image_stat(double y, double u, double v, double all, } #endif // CONFIG_INTERNAL_STATS +// Adjust the maximum allowable frame size for the target level. +static void level_rc_framerate(VP9_COMP *cpi, int arf_src_index) { + RATE_CONTROL *const rc = &cpi->rc; + LevelConstraint *const ls = &cpi->level_constraint; + VP9_COMMON *const cm = &cpi->common; + const double max_cpb_size = ls->max_cpb_size; + vpx_clear_system_state(); + rc->max_frame_bandwidth = VPXMIN(rc->max_frame_bandwidth, ls->max_frame_size); + if (frame_is_intra_only(cm)) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.5)); + } else if (arf_src_index > 0) { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.4)); + } else { + rc->max_frame_bandwidth = + VPXMIN(rc->max_frame_bandwidth, (int)(max_cpb_size * 0.2)); + } +} + static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { VP9_COMMON *const cm = &cpi->common; Vp9LevelInfo *const level_info = &cpi->level_info; @@ -4296,6 +4373,8 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { int i, idx; uint64_t luma_samples, dur_end; const uint32_t luma_pic_size = cm->width * cm->height; + LevelConstraint *const level_constraint = &cpi->level_constraint; + const int8_t level_index = level_constraint->level_index; double cpb_data_size; vpx_clear_system_state(); @@ -4406,6 +4485,78 @@ static void update_level_info(VP9_COMP *cpi, size_t *size, int arf_src_index) { if (level_spec->max_col_tiles < (1 << cm->log2_tile_cols)) { level_spec->max_col_tiles = (1 << cm->log2_tile_cols); } + + if (level_index >= 0 && level_constraint->fail_flag == 0) { + if (level_spec->max_luma_picture_size > + vp9_level_defs[level_index].max_luma_picture_size) { + level_constraint->fail_flag |= (1 << LUMA_PIC_SIZE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_PIC_SIZE_TOO_LARGE]); + } + + if ((double)level_spec->max_luma_sample_rate > + (double)vp9_level_defs[level_index].max_luma_sample_rate * + (1 + SAMPLE_RATE_GRACE_P)) { + level_constraint->fail_flag |= (1 << LUMA_SAMPLE_RATE_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. 
%s", + vp9_level_defs[level_index].level, + level_fail_messages[LUMA_SAMPLE_RATE_TOO_LARGE]); + } + + if (level_spec->max_col_tiles > vp9_level_defs[level_index].max_col_tiles) { + level_constraint->fail_flag |= (1 << TOO_MANY_COLUMN_TILE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_COLUMN_TILE]); + } + + if (level_spec->min_altref_distance < + vp9_level_defs[level_index].min_altref_distance) { + level_constraint->fail_flag |= (1 << ALTREF_DIST_TOO_SMALL); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[ALTREF_DIST_TOO_SMALL]); + } + + if (level_spec->max_ref_frame_buffers > + vp9_level_defs[level_index].max_ref_frame_buffers) { + level_constraint->fail_flag |= (1 << TOO_MANY_REF_BUFFER); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[TOO_MANY_REF_BUFFER]); + } + + if (level_spec->max_cpb_size > vp9_level_defs[level_index].max_cpb_size) { + level_constraint->fail_flag |= (1 << CPB_TOO_LARGE); + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Failed to encode to the target level %d. %s", + vp9_level_defs[level_index].level, + level_fail_messages[CPB_TOO_LARGE]); + } + + // Set an upper bound for the next frame size. It will be used in + // level_rc_framerate() before encoding the next frame. + cpb_data_size = 0; + for (i = 0; i < CPB_WINDOW_SIZE - 1; ++i) { + if (i >= level_stats->frame_window_buffer.len) break; + idx = (level_stats->frame_window_buffer.start + + level_stats->frame_window_buffer.len - 1 - i) % + FRAME_WINDOW_SIZE; + cpb_data_size += level_stats->frame_window_buffer.buf[idx].size; + } + cpb_data_size = cpb_data_size / 125.0; + level_constraint->max_frame_size = + (int)((vp9_level_defs[level_index].max_cpb_size - cpb_data_size) * + 1000.0); + if (level_stats->frame_window_buffer.len < CPB_WINDOW_SIZE - 1) + level_constraint->max_frame_size >>= 1; + } } int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, @@ -4633,6 +4784,10 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, set_frame_size(cpi); } + if (oxcf->pass != 1 && cpi->level_constraint.level_index >= 0 && + cpi->level_constraint.fail_flag == 0) + level_rc_framerate(cpi, arf_src_index); + if (cpi->oxcf.pass != 0 || cpi->use_svc || frame_is_intra_only(cm) == 1) { for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h index 0007e6395da..de324d3aab9 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h @@ -237,7 +237,7 @@ typedef struct VP9EncoderConfig { int max_threads; - int target_level; + unsigned int target_level; vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -341,6 +341,8 @@ typedef struct { uint8_t max_ref_frame_buffers; } Vp9LevelSpec; +extern const Vp9LevelSpec vp9_level_defs[VP9_LEVELS]; + typedef struct { int64_t ts; // timestamp uint32_t luma_samples; @@ -368,6 +370,26 @@ typedef struct { Vp9LevelSpec level_spec; } Vp9LevelInfo; +typedef enum { + BITRATE_TOO_LARGE = 0, + LUMA_PIC_SIZE_TOO_LARGE = 1, + 
LUMA_SAMPLE_RATE_TOO_LARGE = 2, + CPB_TOO_LARGE = 3, + COMPRESSION_RATIO_TOO_SMALL = 4, + TOO_MANY_COLUMN_TILE = 5, + ALTREF_DIST_TOO_SMALL = 6, + TOO_MANY_REF_BUFFER = 7, + TARGET_LEVEL_FAIL_IDS = 8 +} TARGET_LEVEL_FAIL_ID; + +typedef struct { + int8_t level_index; + uint8_t rc_config_updated; + uint8_t fail_flag; + int max_frame_size; // in bits + double max_cpb_size; // in bits +} LevelConstraint; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -594,6 +616,8 @@ typedef struct VP9_COMP { int64_t vbp_thresholds[4]; int64_t vbp_threshold_minmax; int64_t vbp_threshold_sad; + // Threshold used for partition copy + int64_t vbp_threshold_copy; BLOCK_SIZE vbp_bsize_min; // Multi-threading @@ -605,6 +629,12 @@ typedef struct VP9_COMP { int keep_level_stats; Vp9LevelInfo level_info; + + // Previous Partition Info + BLOCK_SIZE *prev_partition; + int8_t *prev_segment_id; + + LevelConstraint level_constraint; } VP9_COMP; void vp9_initialize_enc(void); @@ -760,6 +790,14 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; } +static INLINE int get_level_index(VP9_LEVEL level) { + int i; + for (i = 0; i < VP9_LEVELS; ++i) { + if (level == vp9_level_defs[i].level) return i; + } + return -1; +} + VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec); void vp9_new_framerate(VP9_COMP *cpi, double framerate); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c index 788952d3467..72e9ac77e78 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -117,8 +117,7 @@ static void output_stats(FIRSTPASS_STATS *stats, stats->intra_skip_pct, stats->intra_smooth_pct, stats->inactive_zone_rows, stats->inactive_zone_cols, stats->MVr, stats->mvr_abs, stats->MVc, stats->mvc_abs, stats->MVrv, - stats->MVcv, stats->mv_in_out_count, stats->new_mv_count, - stats->count, stats->duration); + stats->MVcv, stats->mv_in_out_count, stats->count, stats->duration); fclose(fpfile); } #endif @@ -157,7 +156,6 @@ static void zero_stats(FIRSTPASS_STATS *section) { section->MVrv = 0.0; section->MVcv = 0.0; section->mv_in_out_count = 0.0; - section->new_mv_count = 0.0; section->count = 0.0; section->duration = 1.0; section->spatial_layer_id = 0; @@ -187,7 +185,6 @@ static void accumulate_stats(FIRSTPASS_STATS *section, section->MVrv += frame->MVrv; section->MVcv += frame->MVcv; section->mv_in_out_count += frame->mv_in_out_count; - section->new_mv_count += frame->new_mv_count; section->count += frame->count; section->duration += frame->duration; } @@ -215,7 +212,6 @@ static void subtract_stats(FIRSTPASS_STATS *section, section->MVrv -= frame->MVrv; section->MVcv -= frame->MVcv; section->mv_in_out_count -= frame->mv_in_out_count; - section->new_mv_count -= frame->new_mv_count; section->count -= frame->count; section->duration -= frame->duration; } @@ -679,9 +675,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { int intra_skip_count = 0; int intra_smooth_count = 0; int image_data_start_row = INVALID_ROW; - int new_mv_count = 0; int sum_in_vectors = 0; - MV lastmv = { 0, 0 }; TWO_PASS *twopass = &cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -1144,10 +1138,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } 
#endif - // Non-zero vector, was it different from the last non zero vector? - if (!is_equal_mv(&mv, &lastmv)) ++new_mv_count; - lastmv = mv; - // Does the row vector point inwards or outwards? if (mb_row < cm->mb_rows / 2) { if (mv.row > 0) @@ -1263,7 +1253,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.MVcv = ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.new_mv_count = new_mv_count; fps.pcnt_motion = (double)mvcount / num_mbs; } else { fps.MVr = 0.0; @@ -1273,7 +1262,6 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { fps.MVrv = 0.0; fps.MVcv = 0.0; fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; fps.pcnt_motion = 0.0; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h index 6aa39cdc004..5541893dc89 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.h @@ -61,7 +61,6 @@ typedef struct { double MVrv; double MVcv; double mv_in_out_count; - double new_mv_count; double duration; double count; int64_t spatial_layer_id; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c index 2d9bcbda679..70deda84211 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_mcomp.c @@ -277,7 +277,7 @@ static INLINE const uint8_t *pre(const uint8_t *buf, int stride, int r, int c) { const uint8_t *const z = x->plane[0].src.buf; \ const int src_stride = x->plane[0].src.stride; \ const MACROBLOCKD *xd = &x->e_mbd; \ - unsigned int besterr = INT_MAX; \ + unsigned int besterr = UINT_MAX; \ unsigned int sse; \ unsigned int whichdir; \ int thismse; \ @@ -472,7 +472,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned_evenmore( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -622,7 +622,7 @@ uint32_t vp9_find_best_sub_pixel_tree_pruned( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -646,7 +646,7 @@ uint32_t vp9_find_best_sub_pixel_tree( const uint8_t *const src_address = z; const int src_stride = x->plane[0].src.stride; const MACROBLOCKD *xd = &x->e_mbd; - unsigned int besterr = INT_MAX; + unsigned int besterr = UINT_MAX; unsigned int sse; int thismse; const int y_stride = xd->plane[0].pre[0].stride; @@ -708,7 +708,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } } @@ -737,7 +737,7 @@ uint32_t vp9_find_best_sub_pixel_tree( *sse1 = sse; } } else { - cost_array[idx] = INT_MAX; + cost_array[idx] = UINT_MAX; } if (best_idx < 4 && best_idx >= 0) { @@ -771,7 +771,7 @@ uint32_t vp9_find_best_sub_pixel_tree( if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) || (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3))) - return INT_MAX; + return UINT_MAX; return besterr; } @@ -2318,11 +2318,14 @@ int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, const struct buf_2d *const what = &x->plane[0].src; 
const struct buf_2d *const in_what = &xd->plane[0].pre[0]; const MV fcenter_mv = { center_mv->row >> 3, center_mv->col >> 3 }; - unsigned int best_sad = + unsigned int best_sad = INT_MAX; + int i, j; + clamp_mv(ref_mv, x->mv_limits.col_min, x->mv_limits.col_max, + x->mv_limits.row_min, x->mv_limits.row_max); + best_sad = fn_ptr->sdaf(what->buf, what->stride, get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) + mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit); - int i, j; for (i = 0; i < search_range; ++i) { int best_site = -1; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c index 0e5d8ade4ae..2252fe16b9d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_noise_estimate.c @@ -26,21 +26,23 @@ void vp9_noise_estimate_init(NOISE_ESTIMATE *const ne, int width, int height) { ne->level = kLowLow; ne->value = 0; ne->count = 0; - ne->thresh = 90; + ne->thresh = 100; ne->last_w = 0; ne->last_h = 0; if (width * height >= 1920 * 1080) { ne->thresh = 200; } else if (width * height >= 1280 * 720) { - ne->thresh = 130; + ne->thresh = 140; } ne->num_frames_estimate = 20; } static int enable_noise_estimation(VP9_COMP *const cpi) { -// Enable noise estimation if denoising is on. +// Enable noise estimation if denoising is on, but not for low resolutions. #if CONFIG_VP9_TEMPORAL_DENOISING - if (cpi->oxcf.noise_sensitivity > 0) return 1; + if (cpi->oxcf.noise_sensitivity > 0 && cpi->common.width >= 640 && + cpi->common.height >= 360) + return 1; #endif // Only allow noise estimate under certain encoding mode. // Enabled for 1 pass CBR, speed >=5, and if resolution is same as original. @@ -50,7 +52,7 @@ static int enable_noise_estimation(VP9_COMP *const cpi) { cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.speed >= 5 && cpi->resize_state == ORIG && cpi->resize_pending == 0 && !cpi->use_svc && cpi->oxcf.content != VP9E_CONTENT_SCREEN && cpi->common.width >= 640 && - cpi->common.height >= 480) + cpi->common.height >= 360) return 1; else return 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c index 2b7ddbcd948..33f3f5a476c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -1259,16 +1259,17 @@ static void recheck_zeromv_after_denoising( [INTER_OFFSET(ZEROMV)]; this_rdc.dist = dist; this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, rate, dist); - // Switch to ZEROMV if the rdcost for ZEROMV on denoised source - // is lower than best_ref mode (on original source). + // Don't switch to ZEROMV if the rdcost for ZEROMV on denoised source + // is higher than best_ref mode (on original source). 
if (this_rdc.rdcost > best_rdc->rdcost) { this_rdc = *best_rdc; mi->mode = ctx_den->best_mode; mi->ref_frame[0] = ctx_den->best_ref_frame; mi->interp_filter = ctx_den->best_pred_filter; - if (ctx_den->best_ref_frame == INTRA_FRAME) + if (ctx_den->best_ref_frame == INTRA_FRAME) { mi->mv[0].as_int = INVALID_MV; - else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { + mi->interp_filter = SWITCHABLE_FILTERS; + } else if (ctx_den->best_ref_frame == GOLDEN_FRAME) { mi->mv[0].as_int = ctx_den->frame_mv[ctx_den->best_mode][ctx_den->best_ref_frame] .as_int; @@ -1395,6 +1396,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int perform_intra_pred = 1; int use_golden_nonzeromv = 1; int force_skip_low_temp_var = 0; + int skip_ref_find_pred[4] = { 0 }; #if CONFIG_VP9_TEMPORAL_DENOISING VP9_PICKMODE_CTX_DEN ctx_den; int64_t zero_last_cost_orig = INT64_MAX; @@ -1469,9 +1471,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, usable_ref_frame = GOLDEN_FRAME; } - if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && - (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref)) - usable_ref_frame = ALTREF_FRAME; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref) + usable_ref_frame = ALTREF_FRAME; + + if (cpi->rc.is_src_frame_alt_ref) { + skip_ref_find_pred[LAST_FRAME] = 1; + skip_ref_find_pred[GOLDEN_FRAME] = 1; + } + } // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. @@ -1490,6 +1498,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cpi->sf.short_circuit_low_temp_var) { force_skip_low_temp_var = get_force_skip_low_temp_var(&x->variance_low[0], mi_row, mi_col, bsize); + // If force_skip_low_temp_var is set, and for short circuit mode = 1 and 3, + // skip golden reference. 
+ if ((cpi->sf.short_circuit_low_temp_var == 1 || + cpi->sf.short_circuit_low_temp_var == 3) && + force_skip_low_temp_var) { + usable_ref_frame = LAST_FRAME; + } } if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && @@ -1497,9 +1512,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, use_golden_nonzeromv = 0; for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) { - find_predictors(cpi, x, ref_frame, frame_mv, const_motion, - &ref_frame_skip_mask, flag_list, tile_data, mi_row, mi_col, - yv12_mb, bsize, force_skip_low_temp_var); + if (!skip_ref_find_pred[ref_frame]) { + find_predictors(cpi, x, ref_frame, frame_mv, const_motion, + &ref_frame_skip_mask, flag_list, tile_data, mi_row, + mi_col, yv12_mb, bsize, force_skip_low_temp_var); + } } for (idx = 0; idx < RT_INTER_MODES; ++idx) { @@ -1519,6 +1536,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, ref_frame = ref_mode_set_svc[idx].ref_frame; } if (ref_frame > usable_ref_frame) continue; + if (skip_ref_find_pred[ref_frame]) continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { @@ -1558,7 +1576,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (cpi->sf.short_circuit_low_temp_var == 2 && force_skip_low_temp_var && + if (cpi->sf.short_circuit_low_temp_var >= 2 && force_skip_low_temp_var && ref_frame == LAST_FRAME && this_mode == NEWMV) { continue; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c index b5cfd5de6c6..02059a70544 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -45,7 +45,7 @@ #define FRAME_OVERHEAD_BITS 200 -// Use this macro to turn on/off use of alt-refs in one-pass mode. +// Use this macro to turn on/off use of alt-refs in one-pass vbr mode. #define USE_ALTREF_FOR_ONE_PASS 0 #if CONFIG_VP9_HIGHBITDEPTH @@ -414,7 +414,7 @@ static double get_rate_correction_factor(const VP9_COMP *cpi) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rcf = rc->rate_correction_factors[GF_ARF_STD]; else rcf = rc->rate_correction_factors[INTER_NORMAL]; @@ -440,7 +440,7 @@ static void set_rate_correction_factor(VP9_COMP *cpi, double factor) { } else { if ((cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame) && !rc->is_src_frame_alt_ref && !cpi->use_svc && - (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 20)) + (cpi->oxcf.rc_mode != VPX_CBR || cpi->oxcf.gf_cbr_boost_pct > 100)) rc->rate_correction_factors[GF_ARF_STD] = factor; else rc->rate_correction_factors[INTER_NORMAL] = factor; @@ -560,15 +560,17 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, // In CBR mode, this makes sure q is between oscillating Qs to prevent // resonance. 
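To make the oscillation guard described above concrete, a small illustration with hypothetical numbers (not from the patch): when the previous two frames used different q values and the test cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1 fires, which appears to flag opposite-direction corrections on the last two frames, the candidate q is clamped into the interval spanned by those two q values.

  /* Hypothetical values, for illustration only. */
  int q_1_frame = 30, q_2_frame = 38; /* q used on the previous two frames */
  int q = 26;                         /* candidate q for the current frame */
  q = clamp(q, VPXMIN(q_1_frame, q_2_frame), VPXMAX(q_1_frame, q_2_frame));
  /* q becomes 30, so consecutive frames stop ping-ponging between extremes. */
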
if (cpi->oxcf.rc_mode == VPX_CBR && + (!cpi->oxcf.gf_cbr_boost_pct || + !(cpi->refresh_alt_ref_frame || cpi->refresh_golden_frame)) && (cpi->rc.rc_1_frame * cpi->rc.rc_2_frame == -1) && cpi->rc.q_1_frame != cpi->rc.q_2_frame) { q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); } #if USE_ALTREF_FOR_ONE_PASS - if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && - cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref && - !cpi->rc.alt_ref_gf_group) { + if (cpi->oxcf.enable_auto_arf && cpi->oxcf.pass == 0 && + cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref && !cpi->rc.alt_ref_gf_group) { q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); } #endif @@ -1528,8 +1530,14 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { adjust_gfint_frame_constraint(cpi, rc->frames_to_key); rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; - rc->alt_ref_gf_group = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; +#if USE_ALTREF_FOR_ONE_PASS + if (cpi->oxcf.enable_auto_arf) { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + } +#endif } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -2140,20 +2148,22 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } #if USE_ALTREF_FOR_ONE_PASS - // Don't use alt-ref if there is a scene cut within the group, - // or content is not low. - if ((rc->high_source_sad_lagindex > 0 && - rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || - (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { - rc->source_alt_ref_pending = 0; - rc->alt_ref_gf_group = 0; - } else { - rc->source_alt_ref_pending = 1; - rc->alt_ref_gf_group = 1; - // If alt-ref is used for this gf group, limit the interval. - if (rc->baseline_gf_interval > 10 && - rc->baseline_gf_interval < rc->frames_to_key) - rc->baseline_gf_interval = 10; + if (cpi->oxcf.enable_auto_arf) { + // Don't use alt-ref if there is a scene cut within the group, + // or content is not low. + if ((rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. + if (rc->baseline_gf_interval > 10 && + rc->baseline_gf_interval < rc->frames_to_key) + rc->baseline_gf_interval = 10; + } } #endif target = calc_pframe_target_size_one_pass_vbr(cpi); @@ -2243,10 +2253,12 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { for (sbi_row = 0; sbi_row < sb_rows; ++sbi_row) { for (sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { // Checker-board pattern, ignore boundary. - if ((sbi_row > 0 && sbi_col > 0) && - (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && - ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || - (sbi_row % 2 != 0 && sbi_col % 2 != 0))) { + // If the partition copy is on, compute for every superblock. 
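One note on the checker-board test in the hunk that follows: the two parity clauses select interior superblocks whose row and column indices share parity, so an equivalent, sketch-only form of the sampling condition is:

  /* Sketch of an equivalent sampling test; not part of the patch. */
  const int interior = sbi_row > 0 && sbi_col > 0 &&
                       sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1;
  const int on_checkerboard = ((sbi_row + sbi_col) % 2) == 0;
  if (cpi->sf.copy_partition_flag || (interior && on_checkerboard)) {
    /* accumulate SAD for this superblock */
  }
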
+ if (cpi->sf.copy_partition_flag || + ((sbi_row > 0 && sbi_col > 0) && + (sbi_row < sb_rows - 1 && sbi_col < sb_cols - 1) && + ((sbi_row % 2 == 0 && sbi_col % 2 == 0) || + (sbi_row % 2 != 0 && sbi_col % 2 != 0)))) { num_samples++; avg_sad += cpi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, last_src_ystride); @@ -2284,7 +2296,10 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; - rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->source_alt_ref_pending = 0; +#if USE_ALTREF_FOR_ONE_PASS + if (cpi->oxcf.enable_auto_arf) rc->source_alt_ref_pending = 1; +#endif rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c index 3e1ed50a6d2..81cb431ba58 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -182,6 +182,7 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, sf->mv.subpel_iters_per_step = 1; sf->mode_skip_start = 10; sf->adaptive_pred_interp_filter = 1; + sf->allow_acl = 0; sf->intra_y_mode_mask[TX_32X32] = INTRA_DC_H_V; sf->intra_uv_mode_mask[TX_32X32] = INTRA_DC_H_V; @@ -309,6 +310,8 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->use_fast_coef_costing = 1; sf->allow_exhaustive_searches = 0; sf->exhaustive_searches_thresh = INT_MAX; + sf->allow_acl = 0; + sf->copy_partition_flag = 0; if (speed >= 1) { sf->allow_txfm_domain_distortion = 1; @@ -494,6 +497,18 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (speed >= 8) { sf->adaptive_rd_thresh = 4; + // Disabled for now until the threshold is tuned. + sf->copy_partition_flag = 0; + if (sf->copy_partition_flag) { + if (cpi->prev_partition == NULL) { + cpi->prev_partition = (BLOCK_SIZE *)vpx_calloc( + cm->mi_stride * cm->mi_rows, sizeof(BLOCK_SIZE)); + } + if (cpi->prev_segment_id == NULL) { + cpi->prev_segment_id = + (int8_t *)vpx_calloc(cm->mi_stride * cm->mi_rows, sizeof(int8_t)); + } + } sf->mv.subpel_force_stop = (content == VP9E_CONTENT_SCREEN) ? 3 : 2; if (content == VP9E_CONTENT_SCREEN) sf->lpf_pick = LPF_PICK_MINIMAL_LPF; // Only keep INTRA_DC mode for speed 8. @@ -505,7 +520,7 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, if (!cpi->use_svc && cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) { // More aggressive short circuit for speed 8. 
- sf->short_circuit_low_temp_var = 2; + sf->short_circuit_low_temp_var = 3; } sf->limit_newmv_early_exit = 0; } @@ -592,6 +607,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { sf->tx_domain_thresh = 99.0; sf->allow_quant_coeff_opt = sf->optimize_coefficients; sf->quant_opt_thresh = 99.0; + sf->allow_acl = 1; for (i = 0; i < TX_SIZES; i++) { sf->intra_y_mode_mask[i] = INTRA_ALL; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h index 6d0b9420a1d..944fe6322fb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.h @@ -244,6 +244,10 @@ typedef struct SPEED_FEATURES { int allow_quant_coeff_opt; double quant_opt_thresh; + // Enable asymptotic closed-loop encoding decision for key frame and + // alternate reference frames. + int allow_acl; + // Use transform domain distortion. Use pixel domain distortion in speed 0 // and certain situations in higher speed to improve the RD model precision. int allow_txfm_domain_distortion; @@ -452,11 +456,13 @@ typedef struct SPEED_FEATURES { int short_circuit_flat_blocks; // Skip a number of expensive mode evaluations for blocks with very low - // temporal variance. - // 1: Skip golden non-zeromv and ALL INTRA for bsize >= 32x32. + // temporal variance. If the low temporal variance flag is set for a block, + // do the following: + // 1: Skip all golden modes and ALL INTRA for bsize >= 32x32. // 2: Skip golden non-zeromv and newmv-last for bsize >= 16x16, skip ALL // INTRA for bsize >= 32x32 and vert/horz INTRA for bsize 16x16, 16x32 and // 32x16. + // 3: Same as (2), but also skip golden zeromv. int short_circuit_low_temp_var; // Limits the rd-threshold update for early exit for the newmv-last mode, @@ -469,6 +475,9 @@ typedef struct SPEED_FEATURES { // Bias to use base mv and skip 1/4 subpel search when use base mv in // enhancement layer. int base_mv_aggressive; + + // Global flag to enable partition copy from the previous frame. + int copy_partition_flag; } SPEED_FEATURES; struct VP9_COMP; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c index 2d29e268b1f..1d892dc148b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_svc_layercontext.c @@ -650,6 +650,21 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { lc->scaling_factor_num, lc->scaling_factor_den, &width, &height); + // The usage of use_base_mv assumes down-scale of 2x2. For now, turn off use + // of base motion vectors if spatial scale factors for any layers are not 2. + // TODO(marpan): Fix this to allow for use_base_mv for scale factors != 2. 
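To make the scale check in the SVC hunk below concrete (a sketch with illustrative numbers, not taken from the patch): a spatial layer is exactly half the width and height of the layer above it when scaling_factor_num / scaling_factor_den equals 1/2, which the code tests as num == den >> 1.

  /* Illustrative values only. */
  /* 1:2 scaling (the supported case): 1 == (2 >> 1), so use_base_mv stays enabled. */
  /* 2:3 scaling: 2 != (3 >> 1), so use_base_mv is turned off for this frame.      */
  if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) cpi->svc.use_base_mv = 0;
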
+ if (cpi->svc.number_spatial_layers > 1) { + int sl; + for (sl = 0; sl < cpi->svc.number_spatial_layers - 1; ++sl) { + lc = &cpi->svc.layer_context[sl * cpi->svc.number_temporal_layers + + cpi->svc.temporal_layer_id]; + if (lc->scaling_factor_num != lc->scaling_factor_den >> 1) { + cpi->svc.use_base_mv = 0; + break; + } + } + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c index a167eeb15de..344658483a1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_temporal_filter.c @@ -208,17 +208,17 @@ void vp9_highbd_temporal_filter_apply_c( } #endif // CONFIG_VP9_HIGHBITDEPTH -static int temporal_filter_find_matching_mb_c(VP9_COMP *cpi, - uint8_t *arf_frame_buf, - uint8_t *frame_ptr_buf, - int stride) { +static uint32_t temporal_filter_find_matching_mb_c(VP9_COMP *cpi, + uint8_t *arf_frame_buf, + uint8_t *frame_ptr_buf, + int stride) { MACROBLOCK *const x = &cpi->td.mb; MACROBLOCKD *const xd = &x->e_mbd; MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv; const SEARCH_METHODS old_search_method = mv_sf->search_method; int step_param; int sadpb = x->sadperbit16; - int bestsme = INT_MAX; + uint32_t bestsme = UINT_MAX; uint32_t distortion; uint32_t sse; int cost_list[5]; @@ -334,8 +334,8 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, ((mb_cols - 1 - mb_col) * 16) + (17 - 2 * VP9_INTERP_EXTEND); for (frame = 0; frame < frame_count; frame++) { - const int thresh_low = 10000; - const int thresh_high = 20000; + const uint32_t thresh_low = 10000; + const uint32_t thresh_high = 20000; if (frames[frame] == NULL) continue; @@ -346,7 +346,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, filter_weight = 2; } else { // Find best match in this frame by MC - int err = temporal_filter_find_matching_mb_c( + uint32_t err = temporal_filter_find_matching_mb_c( cpi, frames[alt_ref_index]->y_buffer + mb_y_offset, frames[frame]->y_buffer + mb_y_offset, frames[frame]->y_stride); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c index a797b2c2624..e6cea080d16 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_cx_iface.c @@ -157,6 +157,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, rc_max_quantizer, 63); RANGE_CHECK_HI(cfg, rc_min_quantizer, cfg->rc_max_quantizer); RANGE_CHECK_BOOL(extra_cfg, lossless); + RANGE_CHECK_BOOL(extra_cfg, frame_parallel_decoding_mode); RANGE_CHECK(extra_cfg, aq_mode, 0, AQ_MODE_COUNT - 2); RANGE_CHECK(extra_cfg, alt_ref_aq, 0, 1); RANGE_CHECK(extra_cfg, frame_periodic_boost, 0, 1); @@ -389,6 +390,50 @@ static int get_image_bps(const vpx_image_t *img) { return 0; } +// Modify the encoder config for the target level. +static void config_target_level(VP9EncoderConfig *oxcf) { + double max_average_bitrate; // in bits per second + int max_over_shoot_pct; + const int target_level_index = get_level_index(oxcf->target_level); + + vpx_clear_system_state(); + assert(target_level_index >= 0); + assert(target_level_index < VP9_LEVELS); + + // Maximum target bit-rate is level_limit * 80%. 
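For context on the 800.0 factor applied just below, a hedged reading: average_bitrate in the vp9_level_defs table shown earlier in this patch appears to be in kbits/s, while target_bandwidth is in bits/s, so 800 = 1000 (kbps to bps) * 0.8 (the 80% cap). Using the LEVEL_1 row of that table (average_bitrate 200) as a worked example:

  /* Worked example with the LEVEL_1 limit; units assumed as described above. */
  max_average_bitrate = 200 * 800.0; /* 160000 bits/s, i.e. 80% of 200 kbps */
  if ((double)oxcf->target_bandwidth > max_average_bitrate)
    oxcf->target_bandwidth = (int64_t)max_average_bitrate;
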
+ max_average_bitrate = + vp9_level_defs[target_level_index].average_bitrate * 800.0; + if ((double)oxcf->target_bandwidth > max_average_bitrate) + oxcf->target_bandwidth = (int64_t)(max_average_bitrate); + if (oxcf->ss_number_layers == 1 && oxcf->pass != 0) + oxcf->ss_target_bitrate[0] = (int)oxcf->target_bandwidth; + + // Adjust max over-shoot percentage. + max_over_shoot_pct = + (int)((max_average_bitrate * 1.10 - (double)oxcf->target_bandwidth) * + 100 / (double)(oxcf->target_bandwidth)); + if (oxcf->over_shoot_pct > max_over_shoot_pct) + oxcf->over_shoot_pct = max_over_shoot_pct; + + // Adjust worst allowed quantizer. + oxcf->worst_allowed_q = vp9_quantizer_to_qindex(63); + + // Adjust minimum art-ref distance. + if (oxcf->min_gf_interval < + (int)vp9_level_defs[target_level_index].min_altref_distance) + oxcf->min_gf_interval = + (int)vp9_level_defs[target_level_index].min_altref_distance; + + // Adjust maximum column tiles. + if (vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) { + while (oxcf->tile_columns > 0 && + vp9_level_defs[target_level_index].max_col_tiles < + (1 << oxcf->tile_columns)) + --oxcf->tile_columns; + } +} + static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, const vpx_codec_enc_cfg_t *cfg, const struct vp9_extracfg *extra_cfg) { @@ -532,6 +577,8 @@ static vpx_codec_err_t set_encoder_config( } else if (oxcf->ts_number_layers == 1) { oxcf->ts_rate_decimator[0] = 1; } + + if (get_level_index(oxcf->target_level) >= 0) config_target_level(oxcf); /* printf("Current VP9 Settings: \n"); printf("target_bandwidth: %d\n", oxcf->target_bandwidth); @@ -1002,6 +1049,28 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, if (cpi == NULL) return VPX_CODEC_INVALID_PARAM; + if (cpi->oxcf.pass == 2 && cpi->level_constraint.level_index >= 0 && + !cpi->level_constraint.rc_config_updated) { + SVC *const svc = &cpi->svc; + const int is_two_pass_svc = + (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1); + const VP9EncoderConfig *const oxcf = &cpi->oxcf; + TWO_PASS *const twopass = &cpi->twopass; + FIRSTPASS_STATS *stats = &twopass->total_stats; + if (is_two_pass_svc) { + const double frame_rate = 10000000.0 * stats->count / stats->duration; + vp9_update_spatial_layer_framerate(cpi, frame_rate); + twopass->bits_left = + (int64_t)(stats->duration * + svc->layer_context[svc->spatial_layer_id].target_bandwidth / + 10000000.0); + } else { + twopass->bits_left = + (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0); + } + cpi->level_constraint.rc_config_updated = 1; + } + if (img != NULL) { res = validate_img(ctx, img); if (res == VPX_CODEC_OK) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c index 88b1531d8c4..c2f80d88515 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c @@ -201,7 +201,7 @@ static vpx_codec_err_t parse_options(SvcContext *svc_ctx, const char *options) { char *input_string; char *option_name; char *option_value; - char *input_ptr; + char *input_ptr = NULL; SvcInternal_t *const si = get_svc_internal(svc_ctx); vpx_codec_err_t res = VPX_CODEC_OK; int i, alt_ref_enabled = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c new file mode 100644 index 00000000000..1fb41d29920 --- 
/dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/deblock_neon.c @@ -0,0 +1,485 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" + +extern const int16_t vpx_rv[]; + +static uint8x8_t average_k_out(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2) { + const uint8x8_t k1 = vrhadd_u8(a2, a1); + const uint8x8_t k2 = vrhadd_u8(b2, b1); + const uint8x8_t k3 = vrhadd_u8(k1, k2); + return vrhadd_u8(k3, v0); +} + +static uint8x8_t generate_mask(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t a2_v0 = vabd_u8(a2, v0); + const uint8x8_t a1_v0 = vabd_u8(a1, v0); + const uint8x8_t b1_v0 = vabd_u8(b1, v0); + const uint8x8_t b2_v0 = vabd_u8(b2, v0); + + uint8x8_t max = vmax_u8(a2_v0, a1_v0); + max = vmax_u8(b1_v0, max); + max = vmax_u8(b2_v0, max); + return vclt_u8(max, filter); +} + +static uint8x8_t generate_output(const uint8x8_t a2, const uint8x8_t a1, + const uint8x8_t v0, const uint8x8_t b1, + const uint8x8_t b2, const uint8x8_t filter) { + const uint8x8_t k_out = average_k_out(a2, a1, v0, b1, b2); + const uint8x8_t mask = generate_mask(a2, a1, v0, b1, b2, filter); + + return vbsl_u8(mask, k_out, v0); +} + +// Same functions but for uint8x16_t. +static uint8x16_t average_k_outq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2) { + const uint8x16_t k1 = vrhaddq_u8(a2, a1); + const uint8x16_t k2 = vrhaddq_u8(b2, b1); + const uint8x16_t k3 = vrhaddq_u8(k1, k2); + return vrhaddq_u8(k3, v0); +} + +static uint8x16_t generate_maskq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, const uint8x16_t filter) { + const uint8x16_t a2_v0 = vabdq_u8(a2, v0); + const uint8x16_t a1_v0 = vabdq_u8(a1, v0); + const uint8x16_t b1_v0 = vabdq_u8(b1, v0); + const uint8x16_t b2_v0 = vabdq_u8(b2, v0); + + uint8x16_t max = vmaxq_u8(a2_v0, a1_v0); + max = vmaxq_u8(b1_v0, max); + max = vmaxq_u8(b2_v0, max); + return vcltq_u8(max, filter); +} + +static uint8x16_t generate_outputq(const uint8x16_t a2, const uint8x16_t a1, + const uint8x16_t v0, const uint8x16_t b1, + const uint8x16_t b2, + const uint8x16_t filter) { + const uint8x16_t k_out = average_k_outq(a2, a1, v0, b1, b2); + const uint8x16_t mask = generate_maskq(a2, a1, v0, b1, b2, filter); + + return vbslq_u8(mask, k_out, v0); +} + +void vpx_post_proc_down_and_across_mb_row_neon(uint8_t *src_ptr, + uint8_t *dst_ptr, int src_stride, + int dst_stride, int cols, + uint8_t *f, int size) { + uint8_t *src, *dst; + int row; + int col; + + // Process a stripe of macroblocks. The stripe will be a multiple of 16 (for + // Y) or 8 (for U/V) wide (cols) and the height (size) will be 16 (for Y) or 8 + // (for U/V). + assert((size == 8 || size == 16) && cols % 8 == 0); + + // While columns of length 16 can be processed, load them. 
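Before the vectorized loops that follow, it may help to see what one output pixel of this deblock filter computes. A rough scalar equivalent of generate_output() above (a readability sketch, not part of the patch; vrhadd_u8 is a rounding average, (x + y + 1) >> 1):

  #include <stdlib.h> /* abs() */

  static uint8_t scalar_generate_output(uint8_t a2, uint8_t a1, uint8_t v0,
                                        uint8_t b1, uint8_t b2, uint8_t f) {
    /* Rounding averages of the two neighbours on each side and the centre. */
    const uint8_t k1 = (a2 + a1 + 1) >> 1;
    const uint8_t k2 = (b2 + b1 + 1) >> 1;
    const uint8_t k3 = (k1 + k2 + 1) >> 1;
    const uint8_t k_out = (k3 + v0 + 1) >> 1;
    /* Use the filtered value only if every neighbour is within the threshold. */
    int max_diff = abs(a2 - v0);
    if (abs(a1 - v0) > max_diff) max_diff = abs(a1 - v0);
    if (abs(b1 - v0) > max_diff) max_diff = abs(b1 - v0);
    if (abs(b2 - v0) > max_diff) max_diff = abs(b2 - v0);
    return (max_diff < f) ? k_out : v0;
  }
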
+ for (col = 0; col < cols - 8; col += 16) { + uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1q_u8(src); + src += src_stride; + a1 = vld1q_u8(src); + src += src_stride; + a2 = vld1q_u8(src); + src += src_stride; + a3 = vld1q_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x16_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x16_t filterq = vld1q_u8(f + col); + + a4 = vld1q_u8(src); + src += src_stride; + a5 = vld1q_u8(src); + src += src_stride; + a6 = vld1q_u8(src); + src += src_stride; + a7 = vld1q_u8(src); + src += src_stride; + + v_out_0 = generate_outputq(a0, a1, a2, a3, a4, filterq); + v_out_1 = generate_outputq(a1, a2, a3, a4, a5, filterq); + v_out_2 = generate_outputq(a2, a3, a4, a5, a6, filterq); + v_out_3 = generate_outputq(a3, a4, a5, a6, a7, filterq); + + vst1q_u8(dst, v_out_0); + dst += dst_stride; + vst1q_u8(dst, v_out_1); + dst += dst_stride; + vst1q_u8(dst, v_out_2); + dst += dst_stride; + vst1q_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + src_ptr += 16; + dst_ptr += 16; + } + + // Clean up any left over column of length 8. + if (col != cols) { + uint8x8_t a0, a1, a2, a3, a4, a5, a6, a7; + src = src_ptr - 2 * src_stride; + dst = dst_ptr; + + a0 = vld1_u8(src); + src += src_stride; + a1 = vld1_u8(src); + src += src_stride; + a2 = vld1_u8(src); + src += src_stride; + a3 = vld1_u8(src); + src += src_stride; + + for (row = 0; row < size; row += 4) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3; + const uint8x8_t filter = vld1_u8(f + col); + + a4 = vld1_u8(src); + src += src_stride; + a5 = vld1_u8(src); + src += src_stride; + a6 = vld1_u8(src); + src += src_stride; + a7 = vld1_u8(src); + src += src_stride; + + v_out_0 = generate_output(a0, a1, a2, a3, a4, filter); + v_out_1 = generate_output(a1, a2, a3, a4, a5, filter); + v_out_2 = generate_output(a2, a3, a4, a5, a6, filter); + v_out_3 = generate_output(a3, a4, a5, a6, a7, filter); + + vst1_u8(dst, v_out_0); + dst += dst_stride; + vst1_u8(dst, v_out_1); + dst += dst_stride; + vst1_u8(dst, v_out_2); + dst += dst_stride; + vst1_u8(dst, v_out_3); + dst += dst_stride; + + // Rotate over to the next slot. + a0 = a4; + a1 = a5; + a2 = a6; + a3 = a7; + } + + // Not strictly necessary but makes resetting dst_ptr easier. + dst_ptr += 8; + } + + dst_ptr -= cols; + + for (row = 0; row < size; row += 8) { + uint8x8_t a0, a1, a2, a3; + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + src = dst_ptr; + dst = dst_ptr; + + // Load 8 values, transpose 4 of them, and discard 2 because they will be + // reloaded later. + load_and_transpose_u8_4x8(src, dst_stride, &a0, &a1, &a2, &a3); + a3 = a1; + a2 = a1 = a0; // Extend left border. + + src += 2; + + for (col = 0; col < cols; col += 8) { + uint8x8_t v_out_0, v_out_1, v_out_2, v_out_3, v_out_4, v_out_5, v_out_6, + v_out_7; + // Although the filter is meant to be applied vertically and is instead + // being applied horizontally here it's OK because it's set in blocks of 8 + // (or 16). + const uint8x8_t filter = vld1_u8(f + col); + + load_and_transpose_u8_8x8(src, dst_stride, &b0, &b1, &b2, &b3, &b4, &b5, + &b6, &b7); + + if (col + 8 == cols) { + // Last row. Extend border (b5). 
+ b6 = b7 = b5; + } + + v_out_0 = generate_output(a0, a1, a2, a3, b0, filter); + v_out_1 = generate_output(a1, a2, a3, b0, b1, filter); + v_out_2 = generate_output(a2, a3, b0, b1, b2, filter); + v_out_3 = generate_output(a3, b0, b1, b2, b3, filter); + v_out_4 = generate_output(b0, b1, b2, b3, b4, filter); + v_out_5 = generate_output(b1, b2, b3, b4, b5, filter); + v_out_6 = generate_output(b2, b3, b4, b5, b6, filter); + v_out_7 = generate_output(b3, b4, b5, b6, b7, filter); + + transpose_and_store_u8_8x8(dst, dst_stride, v_out_0, v_out_1, v_out_2, + v_out_3, v_out_4, v_out_5, v_out_6, v_out_7); + + a0 = b4; + a1 = b5; + a2 = b6; + a3 = b7; + + src += 8; + dst += 8; + } + + dst_ptr += 8 * dst_stride; + } +} + +// sum += x; +// sumsq += x * y; +static void accumulate_sum_sumsq(const int16x4_t x, const int32x4_t xy, + int16x4_t *const sum, int32x4_t *const sumsq) { + const int16x4_t zero = vdup_n_s16(0); + const int32x4_t zeroq = vdupq_n_s32(0); + + // Add in the first set because vext doesn't work with '0'. + *sum = vadd_s16(*sum, x); + *sumsq = vaddq_s32(*sumsq, xy); + + // Shift x and xy to the right and sum. vext requires an immediate. + *sum = vadd_s16(*sum, vext_s16(zero, x, 1)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 1)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 2)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 2)); + + *sum = vadd_s16(*sum, vext_s16(zero, x, 3)); + *sumsq = vaddq_s32(*sumsq, vextq_s32(zeroq, xy, 3)); +} + +// Generate mask based on (sumsq * 15 - sum * sum < flimit) +static uint16x4_t calculate_mask(const int16x4_t sum, const int32x4_t sumsq, + const int32x4_t f, const int32x4_t fifteen) { + const int32x4_t a = vmulq_s32(sumsq, fifteen); + const int32x4_t b = vmlsl_s16(a, sum, sum); + const uint32x4_t mask32 = vcltq_s32(b, f); + return vmovn_u32(mask32); +} + +static uint8x8_t combine_mask(const int16x4_t sum_low, const int16x4_t sum_high, + const int32x4_t sumsq_low, + const int32x4_t sumsq_high, const int32x4_t f) { + const int32x4_t fifteen = vdupq_n_s32(15); + const uint16x4_t mask16_low = calculate_mask(sum_low, sumsq_low, f, fifteen); + const uint16x4_t mask16_high = + calculate_mask(sum_high, sumsq_high, f, fifteen); + return vmovn_u16(vcombine_u16(mask16_low, mask16_high)); +} + +// Apply filter of (8 + sum + s[c]) >> 4. +static uint8x8_t filter_pixels(const int16x8_t sum, const uint8x8_t s) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + + return vqrshrun_n_s16(sum_s, 4); +} + +void vpx_mbpost_proc_across_ip_neon(uint8_t *src, int pitch, int rows, int cols, + int flimit) { + int row, col; + const int32x4_t f = vdupq_n_s32(flimit); + + assert(cols % 8 == 0); + + for (row = 0; row < rows; ++row) { + // Sum the first 8 elements, which are extended from s[0]. + // sumsq gets primed with +16. + int sumsq = src[0] * src[0] * 9 + 16; + int sum = src[0] * 9; + + uint8x8_t left_context, s, right_context; + int16x4_t sum_low, sum_high; + int32x4_t sumsq_low, sumsq_high; + + // Sum (+square) the next 6 elements. + // Skip [0] because it's included above. + for (col = 1; col <= 6; ++col) { + sumsq += src[col] * src[col]; + sum += src[col]; + } + + // Prime the sums. Later the loop uses the _high values to prime the new + // vectors. + sumsq_high = vdupq_n_s32(sumsq); + sum_high = vdup_n_s16(sum); + + // Manually extend the left border. 
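A compact scalar description of what this row filter computes per pixel may help when reading the vector code below (a sketch, not part of the patch): sum and sumsq are running totals over a 15-pixel horizontal window centred on column c, with the row borders replicated, and a pixel is only smoothed where that window has low variance.

  /* Per-pixel behaviour in scalar form (sketch); the filter works in place. */
  /* sum   = sum of src[c - 7 .. c + 7] (borders replicated)                 */
  /* sumsq = sum of their squares                                            */
  if (sumsq * 15 - sum * sum < flimit)  /* low local variance */
    src[c] = (8 + sum + src[c]) >> 4;   /* average of 16 values, rounded */
  /* otherwise src[c] is left unchanged */
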
+ left_context = vdup_n_u8(src[0]); + + for (col = 0; col < cols; col += 8) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(src + col); + + if (col + 8 == cols) { + // Last row. Extend border. + right_context = vdup_n_u8(src[col + 7]); + } else { + right_context = vld1_u8(src + col + 7); + } + + x = vreinterpretq_s16_u16(vsubl_u8(right_context, left_context)); + y = vreinterpretq_s16_u16(vaddl_u8(right_context, left_context)); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + // Catch up to the last sum'd value. + sum_low = vdup_lane_s16(sum_high, 3); + sumsq_low = vdupq_lane_s32(vget_high_s32(sumsq_high), 1); + + accumulate_sum_sumsq(vget_low_s16(x), xy_low, &sum_low, &sumsq_low); + + // Need to do this sequentially because we need the max value from + // sum_low. + sum_high = vdup_lane_s16(sum_low, 3); + sumsq_high = vdupq_lane_s32(vget_high_s32(sumsq_low), 1); + + accumulate_sum_sumsq(vget_high_s16(x), xy_high, &sum_high, &sumsq_high); + + mask = combine_mask(sum_low, sum_high, sumsq_low, sumsq_high, f); + + output = filter_pixels(vcombine_s16(sum_low, sum_high), s); + output = vbsl_u8(mask, output, s); + + vst1_u8(src + col, output); + + left_context = s; + } + + src += pitch; + } +} + +// Apply filter of (vpx_rv + sum + s[c]) >> 4. +static uint8x8_t filter_pixels_rv(const int16x8_t sum, const uint8x8_t s, + const int16x8_t rv) { + const int16x8_t s16 = vreinterpretq_s16_u16(vmovl_u8(s)); + const int16x8_t sum_s = vaddq_s16(sum, s16); + const int16x8_t rounded = vaddq_s16(sum_s, rv); + + return vqshrun_n_s16(rounded, 4); +} + +void vpx_mbpost_proc_down_neon(uint8_t *dst, int pitch, int rows, int cols, + int flimit) { + int row, col, i; + const int32x4_t f = vdupq_n_s32(flimit); + uint8x8_t below_context = vdup_n_u8(0); + + // 8 columns are processed at a time. + // If rows is less than 8 the bottom border extension fails. + assert(cols % 8 == 0); + assert(rows >= 8); + + // Load and keep the first 8 values in memory. Process a vertical stripe that + // is 8 wide. + for (col = 0; col < cols; col += 8) { + uint8x8_t s, above_context[8]; + int16x8_t sum, sum_tmp; + int32x4_t sumsq_low, sumsq_high; + + // Load and extend the top border. + s = vld1_u8(dst); + for (i = 0; i < 8; i++) { + above_context[i] = s; + } + + sum_tmp = vreinterpretq_s16_u16(vmovl_u8(s)); + + // sum * 9 + sum = vmulq_n_s16(sum_tmp, 9); + + // (sum * 9) * sum == sum * sum * 9 + sumsq_low = vmull_s16(vget_low_s16(sum), vget_low_s16(sum_tmp)); + sumsq_high = vmull_s16(vget_high_s16(sum), vget_high_s16(sum_tmp)); + + // Load and discard the next 6 values to prime sum and sumsq. + for (i = 1; i <= 6; ++i) { + const uint8x8_t a = vld1_u8(dst + i * pitch); + const int16x8_t b = vreinterpretq_s16_u16(vmovl_u8(a)); + sum = vaddq_s16(sum, b); + + sumsq_low = vmlal_s16(sumsq_low, vget_low_s16(b), vget_low_s16(b)); + sumsq_high = vmlal_s16(sumsq_high, vget_high_s16(b), vget_high_s16(b)); + } + + for (row = 0; row < rows; ++row) { + uint8x8_t mask, output; + int16x8_t x, y; + int32x4_t xy_low, xy_high; + + s = vld1_u8(dst + row * pitch); + + // Extend the bottom border. 
+ if (row + 7 < rows) { + below_context = vld1_u8(dst + (row + 7) * pitch); + } + + x = vreinterpretq_s16_u16(vsubl_u8(below_context, above_context[0])); + y = vreinterpretq_s16_u16(vaddl_u8(below_context, above_context[0])); + xy_low = vmull_s16(vget_low_s16(x), vget_low_s16(y)); + xy_high = vmull_s16(vget_high_s16(x), vget_high_s16(y)); + + sum = vaddq_s16(sum, x); + + sumsq_low = vaddq_s32(sumsq_low, xy_low); + sumsq_high = vaddq_s32(sumsq_high, xy_high); + + mask = combine_mask(vget_low_s16(sum), vget_high_s16(sum), sumsq_low, + sumsq_high, f); + + output = filter_pixels_rv(sum, s, vld1q_s16(vpx_rv + (row & 127))); + output = vbsl_u8(mask, output, s); + + vst1_u8(dst + row * pitch, output); + + above_context[0] = above_context[1]; + above_context[1] = above_context[2]; + above_context[2] = above_context[3]; + above_context[3] = above_context[4]; + above_context[4] = above_context[5]; + above_context[5] = above_context[6]; + above_context[6] = above_context[7]; + above_context[7] = s; + } + + dst += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c new file mode 100644 index 00000000000..26fa3e216bb --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct4x4_1_add_kernel1(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a0, a1)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. + const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; +} + +// res is in reverse row order +static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x4_t a0 = vld1_u16(*dest); + const uint16x4_t a1 = vld1_u16(*dest + stride); + const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0)); + // Note: In some profile tests, res is quite close to +/-32767. + // We use saturating addition. 
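To summarize the clipping sequence used by the highbd add kernels in this new file (a scalar sketch, not part of the patch): add the signed residual to the reconstructed sample, then clip the result to [0, (1 << bd) - 1], which is what the saturating add, vminq_s16 and vqshluq_n_s16 steps achieve in vector form.

  static uint16_t highbd_clip_add(uint16_t pixel, int16_t res, int bd) {
    int v = pixel + res;                      /* residual may be negative */
    if (v < 0) v = 0;
    if (v > (1 << bd) - 1) v = (1 << bd) - 1; /* clip to the bit-depth range */
    return (uint16_t)v;
  }
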
+ const int16x8_t b = vqaddq_s16(res, a); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1_u16(*dest, vget_high_u16(d)); + *dest += stride; + vst1_u16(*dest, vget_low_u16(d)); + *dest += stride; +} + +void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); + highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); +} + +static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0); + b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0); + b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1); + b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1); + b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); + b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); + b0 = vrshrq_n_s32(b0, 14); + b1 = vrshrq_n_s32(b1, 14); + b2 = vrshrq_n_s32(b2, 14); + b3 = vrshrq_n_s32(b3, 14); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, + int32x4_t *const a0, + int32x4_t *const a1, + int32x4_t *const a2, + int32x4_t *const a3) { + int32x4_t b0, b1, b2, b3; + int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11; + + transpose_s32_4x4(a0, a1, a2, a3); + b0 = vaddq_s32(*a0, *a2); + b1 = vsubq_s32(*a0, *a2); + c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0); + c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0); + c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0); + c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0); + c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1); + c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1); + c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1); + c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1); + c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1); + c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1); + c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1); + c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1); + c4 = vsubq_s64(c4, c8); + c5 = vsubq_s64(c5, c9); + c6 = vaddq_s64(c6, c10); + c7 = vaddq_s64(c7, c11); + b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14)); + b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14)); + b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14)); + b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14)); + *a0 = vaddq_s32(b0, b3); + *a1 = vaddq_s32(b1, b2); + *a2 = vsubq_s32(b1, b2); + *a3 = vsubq_s32(b0, b3); +} + +void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int32x4_t c0 = vld1q_s32(input); + int32x4_t c1 = vld1q_s32(input + 4); + 
int32x4_t c2 = vld1q_s32(input + 8); + int32x4_t c3 = vld1q_s32(input + 12); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int16x8_t a0, a1; + + if (bd == 8) { + const int16x4_t cospis = vld1_s16(kCospi); + + // Rows + a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); + a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + } else { + const int32x4_t cospis = vld1q_s32(kCospi32); + + if (bd == 10) { + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); + } else { + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); + } + a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); + a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); + } + + highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); + highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c new file mode 100644 index 00000000000..c1c0f645d18 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -0,0 +1,614 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const int16x8_t c = vminq_s16(b, max); + const uint16x8_t d = vqshluq_n_s16(c, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); +} + +static INLINE void idct8x8_12_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + step1[4] = vrshrq_n_s32(step1[4], 14); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + step1[7] = vrshrq_n_s32(step1[7], 14); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + step2[1] = vrshrq_n_s32(step2[1], 14); + step2[2] = vrshrq_n_s32(step2[2], 14); + step2[3] = vrshrq_n_s32(step2[3], 14); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = 
vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h; + int32x2_t step1l[2], step1h[2]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_4x4(io0, io1, io2, io3); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[1], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[1], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = 
vrshrn_n_s64(t64[3], 14); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void highbd_add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint16_t *dest, + const int stride, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + const uint16_t *dst = dest; + uint16x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + int16x8_t d0_s16, d1_s16, d2_s16, d3_s16, d4_s16, d5_s16, d6_s16, d7_s16; + + d0 = vld1q_u16(dst); + dst += stride; + d1 = vld1q_u16(dst); + dst += stride; + d2 = vld1q_u16(dst); + dst += stride; + d3 = vld1q_u16(dst); + dst += stride; + d4 = vld1q_u16(dst); + dst += stride; + d5 = vld1q_u16(dst); + dst += stride; + d6 = vld1q_u16(dst); + dst += stride; + d7 = vld1q_u16(dst); + + d0_s16 = vqaddq_s16(a0, vreinterpretq_s16_u16(d0)); + d1_s16 = vqaddq_s16(a1, vreinterpretq_s16_u16(d1)); + d2_s16 = vqaddq_s16(a2, vreinterpretq_s16_u16(d2)); + d3_s16 = vqaddq_s16(a3, vreinterpretq_s16_u16(d3)); + d4_s16 = vqaddq_s16(a4, vreinterpretq_s16_u16(d4)); + d5_s16 = vqaddq_s16(a5, vreinterpretq_s16_u16(d5)); + d6_s16 = vqaddq_s16(a6, vreinterpretq_s16_u16(d6)); + d7_s16 = vqaddq_s16(a7, vreinterpretq_s16_u16(d7)); + + d0_s16 = vminq_s16(d0_s16, max); + d1_s16 = vminq_s16(d1_s16, max); + d2_s16 = vminq_s16(d2_s16, max); + d3_s16 = vminq_s16(d3_s16, max); + d4_s16 = vminq_s16(d4_s16, max); + d5_s16 = vminq_s16(d5_s16, max); + d6_s16 = vminq_s16(d6_s16, max); + d7_s16 = vminq_s16(d7_s16, max); + d0_u16 = vqshluq_n_s16(d0_s16, 0); + d1_u16 = vqshluq_n_s16(d1_s16, 0); + d2_u16 = vqshluq_n_s16(d2_s16, 0); + d3_u16 = vqshluq_n_s16(d3_s16, 0); + d4_u16 = vqshluq_n_s16(d4_s16, 0); + d5_u16 = vqshluq_n_s16(d5_s16, 0); + d6_u16 = vqshluq_n_s16(d6_s16, 0); + d7_u16 = vqshluq_n_s16(d7_s16, 0); + + vst1q_u16(dest, d0_u16); + dest += stride; + vst1q_u16(dest, d1_u16); + dest += stride; + vst1q_u16(dest, d2_u16); + dest += stride; + vst1q_u16(dest, d3_u16); + dest += stride; + vst1q_u16(dest, d4_u16); + dest += stride; + vst1q_u16(dest, d5_u16); + dest += stride; + vst1q_u16(dest, d6_u16); + dest += stride; + vst1q_u16(dest, d7_u16); +} + +void vpx_highbd_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 8); + int32x4_t a2 = vld1q_s32(input + 16); + int32x4_t a3 = vld1q_s32(input + 24); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t b0 = vmovn_s32(a0); + int16x4_t b1 = vmovn_s32(a1); + int16x4_t b2 = vmovn_s32(a2); + int16x4_t b3 = vmovn_s32(a3); + int16x4_t b4, b5, b6, b7; + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &b0, &b1, &b2, &b3, &b4, + &b5, &b6, &b7); + 
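+ // Pass 1 runs the first 1-D transform on the four rows holding the (at
+ // most 12) nonzero coefficients; pass 2 runs the second 1-D transform over
+ // the columns to produce the full 8x8 result, which is then rounded by 5
+ // bits below.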
idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, b0, b1, b2, b3, b4, b5, + b6, b7, &c0, &c1, &c2, &c3, &c4, &c5, &c6, &c7); + c0 = vrshrq_n_s16(c0, 5); + c1 = vrshrq_n_s16(c1, 5); + c2 = vrshrq_n_s16(c2, 5); + c3 = vrshrq_n_s16(c3, 5); + c4 = vrshrq_n_s16(c4, 5); + c5 = vrshrq_n_s16(c5, 5); + c6 = vrshrq_n_s16(c6, 5); + c7 = vrshrq_n_s16(c7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + int32x4_t a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15; + + if (bd == 10) { + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd10(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } else { + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a8, &a9, + &a10, &a11); + idct8x8_12_half1d_bd12(cospis0, cospis1, &a4, &a5, &a6, &a7, &a12, &a13, + &a14, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} + +static INLINE void idct8x8_64_half1d_bd10( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x4_t step1[8], step2[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + step1[4] = vmulq_lane_s32(*io1, vget_high_s32(cospis1), 1); + step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); + step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); + step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); + + step1[4] = vmlsq_lane_s32(step1[4], *io7, vget_low_s32(cospis1), 0); + step1[5] = vmlaq_lane_s32(step1[5], *io5, vget_low_s32(cospis1), 1); + step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); + step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); + + step1[4] = vrshrq_n_s32(step1[4], 14); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + step1[7] = vrshrq_n_s32(step1[7], 14); + + // stage 2 + step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); + step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); + step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); + + step2[0] = vmlaq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[1] = vmlsq_lane_s32(step2[1], *io4, vget_high_s32(cospis0), 0); + step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); + step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); + + step2[0] = vrshrq_n_s32(step2[0], 14); + step2[1] = vrshrq_n_s32(step2[1], 14); + step2[2] = vrshrq_n_s32(step2[2], 14); + step2[3] = vrshrq_n_s32(step2[3], 14); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], 
step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); + step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); + step1[5] = vrshrq_n_s32(step1[5], 14); + step1[6] = vrshrq_n_s32(step1[6], 14); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_half1d_bd12( + const int32x4_t cospis0, const int32x4_t cospis1, int32x4_t *const io0, + int32x4_t *const io1, int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, int32x4_t *const io6, + int32x4_t *const io7) { + int32x2_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int32x2_t step1l[4], step1h[4]; + int32x4_t step1[8], step2[8]; + int64x2_t t64[8]; + int32x2_t t32[8]; + + transpose_s32_8x4(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s32(*io1); + input_1h = vget_high_s32(*io1); + input_3l = vget_low_s32(*io3); + input_3h = vget_high_s32(*io3); + input_5l = vget_low_s32(*io5); + input_5h = vget_high_s32(*io5); + input_7l = vget_low_s32(*io7); + input_7h = vget_high_s32(*io7); + step1l[0] = vget_low_s32(*io0); + step1h[0] = vget_high_s32(*io0); + step1l[1] = vget_low_s32(*io2); + step1h[1] = vget_high_s32(*io2); + step1l[2] = vget_low_s32(*io4); + step1h[2] = vget_high_s32(*io4); + step1l[3] = vget_low_s32(*io6); + step1h[3] = vget_high_s32(*io6); + + t64[0] = vmull_lane_s32(input_1l, vget_high_s32(cospis1), 1); + t64[1] = vmull_lane_s32(input_1h, vget_high_s32(cospis1), 1); + t64[2] = vmull_lane_s32(input_3l, vget_high_s32(cospis1), 0); + t64[3] = vmull_lane_s32(input_3h, vget_high_s32(cospis1), 0); + t64[4] = vmull_lane_s32(input_3l, vget_low_s32(cospis1), 1); + t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); + t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); + t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); + t64[0] = vmlsl_lane_s32(t64[0], input_7l, vget_low_s32(cospis1), 0); + t64[1] = vmlsl_lane_s32(t64[1], input_7h, vget_low_s32(cospis1), 0); + t64[2] = vmlal_lane_s32(t64[2], input_5l, vget_low_s32(cospis1), 1); + t64[3] = vmlal_lane_s32(t64[3], input_5h, vget_low_s32(cospis1), 1); + t64[4] = vmlsl_lane_s32(t64[4], input_5l, vget_high_s32(cospis1), 0); + t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); + t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); + t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step1[4] = vcombine_s32(t32[0], t32[1]); + step1[5] = vcombine_s32(t32[2], t32[3]); + step1[6] = vcombine_s32(t32[4], t32[5]); + step1[7] = 
vcombine_s32(t32[6], t32[7]); + + // stage 2 + t64[2] = vmull_lane_s32(step1l[0], vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(step1h[0], vget_high_s32(cospis0), 0); + t64[4] = vmull_lane_s32(step1l[1], vget_high_s32(cospis0), 1); + t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); + t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); + t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); + t64[0] = vmlal_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[1] = vmlal_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[2] = vmlsl_lane_s32(t64[2], step1l[2], vget_high_s32(cospis0), 0); + t64[3] = vmlsl_lane_s32(t64[3], step1h[2], vget_high_s32(cospis0), 0); + t64[4] = vmlsl_lane_s32(t64[4], step1l[3], vget_low_s32(cospis0), 1); + t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); + t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); + t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + t32[4] = vrshrn_n_s64(t64[4], 14); + t32[5] = vrshrn_n_s64(t64[5], 14); + t32[6] = vrshrn_n_s64(t64[6], 14); + t32[7] = vrshrn_n_s64(t64[7], 14); + step2[0] = vcombine_s32(t32[0], t32[1]); + step2[1] = vcombine_s32(t32[2], t32[3]); + step2[2] = vcombine_s32(t32[4], t32[5]); + step2[3] = vcombine_s32(t32[6], t32[7]); + + step2[4] = vaddq_s32(step1[4], step1[5]); + step2[5] = vsubq_s32(step1[4], step1[5]); + step2[6] = vsubq_s32(step1[7], step1[6]); + step2[7] = vaddq_s32(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s32(step2[0], step2[3]); + step1[1] = vaddq_s32(step2[1], step2[2]); + step1[2] = vsubq_s32(step2[1], step2[2]); + step1[3] = vsubq_s32(step2[0], step2[3]); + + t64[2] = vmull_lane_s32(vget_low_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[3] = vmull_lane_s32(vget_high_s32(step2[6]), vget_high_s32(cospis0), 0); + t64[0] = + vmlsl_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[1] = vmlsl_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t64[2] = + vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); + t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), + vget_high_s32(cospis0), 0); + t32[0] = vrshrn_n_s64(t64[0], 14); + t32[1] = vrshrn_n_s64(t64[1], 14); + t32[2] = vrshrn_n_s64(t64[2], 14); + t32[3] = vrshrn_n_s64(t64[3], 14); + step1[5] = vcombine_s32(t32[0], t32[1]); + step1[6] = vcombine_s32(t32[2], t32[3]); + + // stage 4 + *io0 = vaddq_s32(step1[0], step2[7]); + *io1 = vaddq_s32(step1[1], step1[6]); + *io2 = vaddq_s32(step1[2], step1[5]); + *io3 = vaddq_s32(step1[3], step2[4]); + *io4 = vsubq_s32(step1[3], step2[4]); + *io5 = vsubq_s32(step1[2], step1[5]); + *io6 = vsubq_s32(step1[1], step1[6]); + *io7 = vsubq_s32(step1[0], step2[7]); +} + +void vpx_highbd_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int32x4_t a0 = vld1q_s32(input); + int32x4_t a1 = vld1q_s32(input + 4); + int32x4_t a2 = vld1q_s32(input + 8); + int32x4_t a3 = vld1q_s32(input + 12); + int32x4_t a4 = vld1q_s32(input + 16); + int32x4_t a5 = vld1q_s32(input + 20); + int32x4_t a6 = vld1q_s32(input + 24); + int32x4_t a7 = vld1q_s32(input + 28); + int32x4_t a8 = vld1q_s32(input + 32); + int32x4_t a9 = vld1q_s32(input + 36); + int32x4_t a10 = vld1q_s32(input + 40); + int32x4_t a11 = 
vld1q_s32(input + 44); + int32x4_t a12 = vld1q_s32(input + 48); + int32x4_t a13 = vld1q_s32(input + 52); + int32x4_t a14 = vld1q_s32(input + 56); + int32x4_t a15 = vld1q_s32(input + 60); + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + + if (bd == 8) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t b0 = vcombine_s16(vmovn_s32(a0), vmovn_s32(a1)); + int16x8_t b1 = vcombine_s16(vmovn_s32(a2), vmovn_s32(a3)); + int16x8_t b2 = vcombine_s16(vmovn_s32(a4), vmovn_s32(a5)); + int16x8_t b3 = vcombine_s16(vmovn_s32(a6), vmovn_s32(a7)); + int16x8_t b4 = vcombine_s16(vmovn_s32(a8), vmovn_s32(a9)); + int16x8_t b5 = vcombine_s16(vmovn_s32(a10), vmovn_s32(a11)); + int16x8_t b6 = vcombine_s16(vmovn_s32(a12), vmovn_s32(a13)); + int16x8_t b7 = vcombine_s16(vmovn_s32(a14), vmovn_s32(a15)); + + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + idct8x8_64_1d_bd8(cospis0, cospis1, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + + c0 = vrshrq_n_s16(b0, 5); + c1 = vrshrq_n_s16(b1, 5); + c2 = vrshrq_n_s16(b2, 5); + c3 = vrshrq_n_s16(b3, 5); + c4 = vrshrq_n_s16(b4, 5); + c5 = vrshrq_n_s16(b5, 5); + c6 = vrshrq_n_s16(b6, 5); + c7 = vrshrq_n_s16(b7, 5); + } else { + const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 + const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 + + if (bd == 10) { + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd10(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } else { + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, + &a6, &a7); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a8, &a9, &a10, &a11, &a12, &a13, + &a14, &a15); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a0, &a8, &a1, &a9, &a2, &a10, + &a3, &a11); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a4, &a12, &a5, &a13, &a6, &a14, + &a7, &a15); + } + c0 = vcombine_s16(vrshrn_n_s32(a0, 5), vrshrn_n_s32(a4, 5)); + c1 = vcombine_s16(vrshrn_n_s32(a8, 5), vrshrn_n_s32(a12, 5)); + c2 = vcombine_s16(vrshrn_n_s32(a1, 5), vrshrn_n_s32(a5, 5)); + c3 = vcombine_s16(vrshrn_n_s32(a9, 5), vrshrn_n_s32(a13, 5)); + c4 = vcombine_s16(vrshrn_n_s32(a2, 5), vrshrn_n_s32(a6, 5)); + c5 = vcombine_s16(vrshrn_n_s32(a10, 5), vrshrn_n_s32(a14, 5)); + c6 = vcombine_s16(vrshrn_n_s32(a3, 5), vrshrn_n_s32(a7, 5)); + c7 = vcombine_s16(vrshrn_n_s32(a11, 5), vrshrn_n_s32(a15, 5)); + } + highbd_add8x8(c0, c1, c2, c3, c4, c5, c6, c7, dest, stride, bd); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c new file mode 100644 index 00000000000..6f7e5da7627 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_intrapred_neon.c @@ -0,0 +1,1078 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +//------------------------------------------------------------------------------ +// DC 4x4 + +static INLINE uint16x4_t dc_sum_4(const uint16_t *ref) { + const uint16x4_t ref_u16 = vld1_u16(ref); + const uint16x4_t p0 = vpadd_u16(ref_u16, ref_u16); + return vpadd_u16(p0, p0); +} + +static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x4_t dc_dup = vdup_lane_u16(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t a = vld1_u16(above); + const uint16x4_t l = vld1_u16(left); + uint16x4_t sum; + uint16x4_t dc; + (void)bd; + sum = vadd_u16(a, l); + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 3); + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(left); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)above; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_4(above); + const uint16x4_t dc = vrshr_n_u16(sum, 2); + (void)left; + (void)bd; + dc_store_4x4(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_4x4(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 8x8 + +static INLINE uint16x4_t dc_sum_8(const uint16_t *ref) { + const uint16x8_t ref_u16 = vld1q_u16(ref); + uint16x4_t sum = vadd_u16(vget_low_u16(ref_u16), vget_high_u16(ref_u16)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_8x8(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + const uint16x8_t dc_dup = vdupq_lane_u16(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t above_u16 = vld1q_u16(above); + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x8_t p0 = vaddq_u16(above_u16, left_u16); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint16x4_t dc; + (void)bd; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vrshr_n_u16(sum, 4); + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(left); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)above; + (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_8(above); + const uint16x4_t dc = vrshr_n_u16(sum, 3); + (void)left; 
+ (void)bd; + dc_store_8x8(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_8x8(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 16x16 + +static INLINE uint16x4_t dc_sum_16(const uint16_t *ref) { + const uint16x8x2_t ref_u16 = vld2q_u16(ref); + const uint16x8_t p0 = vaddq_u16(ref_u16.val[0], ref_u16.val[1]); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} + +static INLINE void dc_store_16x16(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + for (i = 0; i < 16; ++i, dst += stride) { + vst2q_u16(dst, dc_dup); + } +} + +void vpx_highbd_dc_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t a = vld2q_u16(above); + const uint16x8x2_t l = vld2q_u16(left); + const uint16x8_t pa = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pl = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum; + uint16x4_t dc; + (void)bd; + pal1 = vpadd_u16(pal1, pal1); + sum = vpaddl_u16(pal1); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(left); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)above; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t sum = dc_sum_16(above); + const uint16x4_t dc = vrshr_n_u16(sum, 4); + (void)left; + (void)bd; + dc_store_16x16(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_16x16(dst, stride, dc); +} + +//------------------------------------------------------------------------------ +// DC 32x32 + +static INLINE uint32x2_t dc_sum_32(const uint16_t *ref) { + const uint16x8x4_t r = vld4q_u16(ref); + const uint16x8_t p0 = vaddq_u16(r.val[0], r.val[1]); + const uint16x8_t p1 = vaddq_u16(r.val[2], r.val[3]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpaddl_u16(sum); +} + +static INLINE void dc_store_32x32(uint16_t *dst, ptrdiff_t stride, + const uint16x4_t dc) { + uint16x8x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u16(dc, 0); + + for (i = 0; i < 32; ++i) { + vst2q_u16(dst, dc_dup); + dst += 16; + vst2q_u16(dst, dc_dup); + dst += stride - 16; + } +} + +void vpx_highbd_dc_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x4_t a = vld4q_u16(above); + const uint16x8x4_t l = vld4q_u16(left); + const uint16x8_t pa0 = vaddq_u16(a.val[0], a.val[1]); + const uint16x8_t pa1 = vaddq_u16(a.val[2], 
a.val[3]); + const uint16x8_t pl0 = vaddq_u16(l.val[0], l.val[1]); + const uint16x8_t pl1 = vaddq_u16(l.val[2], l.val[3]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal0 = vaddq_u16(pa, pl); + const uint16x4_t pal1 = vadd_u16(vget_low_u16(pal0), vget_high_u16(pal0)); + uint32x2_t sum = vpaddl_u16(pal1); + uint16x4_t dc; + (void)bd; + sum = vpadd_u32(sum, sum); + dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 6)); + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_left_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(left); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)above; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_top_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint32x2_t sum = dc_sum_32(above); + const uint16x4_t dc = vreinterpret_u16_u32(vrshr_n_u32(sum, 5)); + (void)left; + (void)bd; + dc_store_32x32(dst, stride, dc); +} + +void vpx_highbd_dc_128_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t dc = vdup_n_u16(1 << (bd - 1)); + (void)above; + (void)left; + dc_store_32x32(dst, stride, dc); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d45_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t ABCDEFGH = vld1q_u16(above); + const uint16x8_t BCDEFGH0 = vld1q_u16(above + 1); + const uint16x8_t CDEFGH00 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(ABCDEFGH, CDEFGH00); + const uint16x8_t avg2 = vrhaddq_u16(avg1, BCDEFGH0); + const uint16x4_t avg2_low = vget_low_u16(avg2); + const uint16x4_t avg2_high = vget_high_u16(avg2); + const uint16x4_t r1 = vext_u16(avg2_low, avg2_high, 1); + const uint16x4_t r2 = vext_u16(avg2_low, avg2_high, 2); + const uint16x4_t r3 = vext_u16(avg2_low, avg2_high, 3); + (void)left; + (void)bd; + vst1_u16(dst, avg2_low); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, r3); + vst1q_lane_u16(dst + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row) { + *row = vextq_u16(*row, above_right, 1); + vst1q_u16(*dst, *row); + *dst += stride; +} + +void vpx_highbd_d45_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0 = vld1q_u16(above); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0), 3); + const uint16x8_t A1 = vld1q_u16(above + 1); + const uint16x8_t A2 = vld1q_u16(above + 2); + const uint16x8_t avg1 = vhaddq_u16(A0, A2); + uint16x8_t row = vrhaddq_u16(avg1, A1); + (void)left; + (void)bd; + + vst1q_u16(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1q_u16(dst, above_right); +} + +static INLINE void d45_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t above_right, uint16x8_t *row_0, + uint16x8_t *row_1) { + 
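+ // Shift the 16-wide row left by one sample, pulling above_right in at the
+ // far right, then store it as two 8-lane halves and advance to the next row.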
*row_0 = vextq_u16(*row_0, *row_1, 1); + *row_1 = vextq_u16(*row_1, above_right, 1); + vst1q_u16(*dst, *row_0); + *dst += 8; + vst1q_u16(*dst, *row_1); + *dst += stride - 8; +} + +void vpx_highbd_d45_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_1), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + vst1q_u16(dst + 8, row_1); + dst += stride; + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + d45_store_16(&dst, stride, above_right, &row_0, &row_1); + vst1q_u16(dst, above_right); + vst1q_u16(dst + 8, above_right); +} + +void vpx_highbd_d45_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t A0_0 = vld1q_u16(above); + const uint16x8_t A0_1 = vld1q_u16(above + 8); + const uint16x8_t A0_2 = vld1q_u16(above + 16); + const uint16x8_t A0_3 = vld1q_u16(above + 24); + const uint16x8_t above_right = vdupq_lane_u16(vget_high_u16(A0_3), 3); + const uint16x8_t A1_0 = vld1q_u16(above + 1); + const uint16x8_t A1_1 = vld1q_u16(above + 9); + const uint16x8_t A1_2 = vld1q_u16(above + 17); + const uint16x8_t A1_3 = vld1q_u16(above + 25); + const uint16x8_t A2_0 = vld1q_u16(above + 2); + const uint16x8_t A2_1 = vld1q_u16(above + 10); + const uint16x8_t A2_2 = vld1q_u16(above + 18); + const uint16x8_t A2_3 = vld1q_u16(above + 26); + const uint16x8_t avg_0 = vhaddq_u16(A0_0, A2_0); + const uint16x8_t avg_1 = vhaddq_u16(A0_1, A2_1); + const uint16x8_t avg_2 = vhaddq_u16(A0_2, A2_2); + const uint16x8_t avg_3 = vhaddq_u16(A0_3, A2_3); + uint16x8_t row_0 = vrhaddq_u16(avg_0, A1_0); + uint16x8_t row_1 = vrhaddq_u16(avg_1, A1_1); + uint16x8_t row_2 = vrhaddq_u16(avg_2, A1_2); + uint16x8_t row_3 = vrhaddq_u16(avg_3, A1_3); + int i; + (void)left; + (void)bd; + + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst += stride - 24; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, above_right, 1); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, 
row_3); + dst += stride - 24; + } + + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); + dst += 8; + vst1q_u16(dst, above_right); +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_d135_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123___ = vld1q_u16(above - 1); + const uint16x4_t L0123 = vld1_u16(left); + const uint16x4_t L3210 = vrev64_u16(L0123); + const uint16x8_t L____3210 = vcombine_u16(L0123, L3210); + const uint16x8_t L3210XA012 = vcombine_u16(L3210, vget_low_u16(XA0123___)); + const uint16x8_t L210XA0123 = vextq_u16(L____3210, XA0123___, 5); + const uint16x8_t L10XA0123_ = vextq_u16(L____3210, XA0123___, 6); + const uint16x8_t avg1 = vhaddq_u16(L3210XA012, L10XA0123_); + const uint16x8_t avg2 = vrhaddq_u16(avg1, L210XA0123); + const uint16x4_t row_0 = vget_low_u16(avg2); + const uint16x4_t row_1 = vget_high_u16(avg2); + const uint16x4_t r0 = vext_u16(row_0, row_1, 3); + const uint16x4_t r1 = vext_u16(row_0, row_1, 2); + const uint16x4_t r2 = vext_u16(row_0, row_1, 1); + (void)bd; + vst1_u16(dst, r0); + dst += stride; + vst1_u16(dst, r1); + dst += stride; + vst1_u16(dst, r2); + dst += stride; + vst1_u16(dst, row_0); +} + +void vpx_highbd_d135_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A1234567_ = vld1q_u16(above + 1); + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_0 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t avg_1 = vhaddq_u16(XA0123456, A1234567_); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, L6543210X); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, A01234567); + const uint16x8_t r0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r1 = vextq_u16(row_0, row_1, 6); + const uint16x8_t r2 = vextq_u16(row_0, row_1, 5); + const uint16x8_t r3 = vextq_u16(row_0, row_1, 4); + const uint16x8_t r4 = vextq_u16(row_0, row_1, 3); + const uint16x8_t r5 = vextq_u16(row_0, row_1, 2); + const uint16x8_t r6 = vextq_u16(row_0, row_1, 1); + (void)bd; + vst1q_u16(dst, r0); + dst += stride; + vst1q_u16(dst, r1); + dst += stride; + vst1q_u16(dst, r2); + dst += stride; + vst1q_u16(dst, r3); + dst += stride; + vst1q_u16(dst, r4); + dst += stride; + vst1q_u16(dst, r5); + dst += stride; + vst1q_u16(dst, r6); + dst += stride; + vst1q_u16(dst, row_0); +} + +static INLINE void d135_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row_0, + const uint16x8_t row_1) { + vst1q_u16(*dst, row_0); + *dst += 8; + vst1q_u16(*dst, row_1); + *dst += stride - 8; +} + +void vpx_highbd_d135_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t L01234567 = vld1q_u16(left); + const uint16x8_t L89abcdef = vld1q_u16(left + 8); + const uint16x4_t L3210 = vrev64_u16(vget_low_u16(L01234567)); + const uint16x4_t L7654 = vrev64_u16(vget_high_u16(L01234567)); + const uint16x4_t Lba98 = 
vrev64_u16(vget_low_u16(L89abcdef)); + const uint16x4_t Lfedc = vrev64_u16(vget_high_u16(L89abcdef)); + const uint16x8_t L76543210 = vcombine_u16(L7654, L3210); + const uint16x8_t Lfedcba98 = vcombine_u16(Lfedc, Lba98); + const uint16x8_t Ledcba987 = vextq_u16(Lfedcba98, L76543210, 1); + const uint16x8_t Ldcba9876 = vextq_u16(Lfedcba98, L76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(Lfedcba98, Ldcba9876); + const uint16x8_t row_0 = vrhaddq_u16(avg_0, Ledcba987); + + const uint16x8_t XA0123456 = vld1q_u16(above - 1); + const uint16x8_t L6543210X = vextq_u16(L76543210, XA0123456, 1); + const uint16x8_t L543210XA0 = vextq_u16(L76543210, XA0123456, 2); + const uint16x8_t avg_1 = vhaddq_u16(L76543210, L543210XA0); + const uint16x8_t row_1 = vrhaddq_u16(avg_1, L6543210X); + + const uint16x8_t A01234567 = vld1q_u16(above); + const uint16x8_t A12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_2 = vhaddq_u16(XA0123456, A12345678); + const uint16x8_t row_2 = vrhaddq_u16(avg_2, A01234567); + + const uint16x8_t A789abcde = vld1q_u16(above + 7); + const uint16x8_t A89abcdef = vld1q_u16(above + 8); + const uint16x8_t A9abcdef_ = vld1q_u16(above + 9); + const uint16x8_t avg_3 = vhaddq_u16(A789abcde, A9abcdef_); + const uint16x8_t row_3 = vrhaddq_u16(avg_3, A89abcdef); + + const uint16x8_t r0_0 = vextq_u16(row_1, row_2, 7); + const uint16x8_t r0_1 = vextq_u16(row_2, row_3, 7); + const uint16x8_t r1_0 = vextq_u16(row_1, row_2, 6); + const uint16x8_t r1_1 = vextq_u16(row_2, row_3, 6); + const uint16x8_t r2_0 = vextq_u16(row_1, row_2, 5); + const uint16x8_t r2_1 = vextq_u16(row_2, row_3, 5); + const uint16x8_t r3_0 = vextq_u16(row_1, row_2, 4); + const uint16x8_t r3_1 = vextq_u16(row_2, row_3, 4); + const uint16x8_t r4_0 = vextq_u16(row_1, row_2, 3); + const uint16x8_t r4_1 = vextq_u16(row_2, row_3, 3); + const uint16x8_t r5_0 = vextq_u16(row_1, row_2, 2); + const uint16x8_t r5_1 = vextq_u16(row_2, row_3, 2); + const uint16x8_t r6_0 = vextq_u16(row_1, row_2, 1); + const uint16x8_t r6_1 = vextq_u16(row_2, row_3, 1); + const uint16x8_t r8_0 = vextq_u16(row_0, row_1, 7); + const uint16x8_t r9_0 = vextq_u16(row_0, row_1, 6); + const uint16x8_t ra_0 = vextq_u16(row_0, row_1, 5); + const uint16x8_t rb_0 = vextq_u16(row_0, row_1, 4); + const uint16x8_t rc_0 = vextq_u16(row_0, row_1, 3); + const uint16x8_t rd_0 = vextq_u16(row_0, row_1, 2); + const uint16x8_t re_0 = vextq_u16(row_0, row_1, 1); + (void)bd; + + d135_store_16(&dst, stride, r0_0, r0_1); + d135_store_16(&dst, stride, r1_0, r1_1); + d135_store_16(&dst, stride, r2_0, r2_1); + d135_store_16(&dst, stride, r3_0, r3_1); + d135_store_16(&dst, stride, r4_0, r4_1); + d135_store_16(&dst, stride, r5_0, r5_1); + d135_store_16(&dst, stride, r6_0, r6_1); + d135_store_16(&dst, stride, row_1, row_2); + d135_store_16(&dst, stride, r8_0, r0_0); + d135_store_16(&dst, stride, r9_0, r1_0); + d135_store_16(&dst, stride, ra_0, r2_0); + d135_store_16(&dst, stride, rb_0, r3_0); + d135_store_16(&dst, stride, rc_0, r4_0); + d135_store_16(&dst, stride, rd_0, r5_0); + d135_store_16(&dst, stride, re_0, r6_0); + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); +} + +void vpx_highbd_d135_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t LL01234567 = vld1q_u16(left + 16); + const uint16x8_t LL89abcdef = vld1q_u16(left + 24); + const uint16x4_t LL3210 = vrev64_u16(vget_low_u16(LL01234567)); + const uint16x4_t LL7654 = vrev64_u16(vget_high_u16(LL01234567)); + const uint16x4_t LLba98 = 
vrev64_u16(vget_low_u16(LL89abcdef)); + const uint16x4_t LLfedc = vrev64_u16(vget_high_u16(LL89abcdef)); + const uint16x8_t LL76543210 = vcombine_u16(LL7654, LL3210); + const uint16x8_t LLfedcba98 = vcombine_u16(LLfedc, LLba98); + const uint16x8_t LLedcba987 = vextq_u16(LLfedcba98, LL76543210, 1); + const uint16x8_t LLdcba9876 = vextq_u16(LLfedcba98, LL76543210, 2); + const uint16x8_t avg_0 = vhaddq_u16(LLfedcba98, LLdcba9876); + uint16x8_t row_0 = vrhaddq_u16(avg_0, LLedcba987); + + const uint16x8_t LU01234567 = vld1q_u16(left); + const uint16x8_t LU89abcdef = vld1q_u16(left + 8); + const uint16x4_t LU3210 = vrev64_u16(vget_low_u16(LU01234567)); + const uint16x4_t LU7654 = vrev64_u16(vget_high_u16(LU01234567)); + const uint16x4_t LUba98 = vrev64_u16(vget_low_u16(LU89abcdef)); + const uint16x4_t LUfedc = vrev64_u16(vget_high_u16(LU89abcdef)); + const uint16x8_t LU76543210 = vcombine_u16(LU7654, LU3210); + const uint16x8_t LUfedcba98 = vcombine_u16(LUfedc, LUba98); + const uint16x8_t LL6543210Uf = vextq_u16(LL76543210, LUfedcba98, 1); + const uint16x8_t LL543210Ufe = vextq_u16(LL76543210, LUfedcba98, 2); + const uint16x8_t avg_1 = vhaddq_u16(LL76543210, LL543210Ufe); + uint16x8_t row_1 = vrhaddq_u16(avg_1, LL6543210Uf); + + const uint16x8_t LUedcba987 = vextq_u16(LUfedcba98, LU76543210, 1); + const uint16x8_t LUdcba9876 = vextq_u16(LUfedcba98, LU76543210, 2); + const uint16x8_t avg_2 = vhaddq_u16(LUfedcba98, LUdcba9876); + uint16x8_t row_2 = vrhaddq_u16(avg_2, LUedcba987); + + const uint16x8_t XAL0123456 = vld1q_u16(above - 1); + const uint16x8_t LU6543210X = vextq_u16(LU76543210, XAL0123456, 1); + const uint16x8_t LU543210XA0 = vextq_u16(LU76543210, XAL0123456, 2); + const uint16x8_t avg_3 = vhaddq_u16(LU76543210, LU543210XA0); + uint16x8_t row_3 = vrhaddq_u16(avg_3, LU6543210X); + + const uint16x8_t AL01234567 = vld1q_u16(above); + const uint16x8_t AL12345678 = vld1q_u16(above + 1); + const uint16x8_t avg_4 = vhaddq_u16(XAL0123456, AL12345678); + uint16x8_t row_4 = vrhaddq_u16(avg_4, AL01234567); + + const uint16x8_t AL789abcde = vld1q_u16(above + 7); + const uint16x8_t AL89abcdef = vld1q_u16(above + 8); + const uint16x8_t AL9abcdefg = vld1q_u16(above + 9); + const uint16x8_t avg_5 = vhaddq_u16(AL789abcde, AL9abcdefg); + uint16x8_t row_5 = vrhaddq_u16(avg_5, AL89abcdef); + + const uint16x8_t ALfR0123456 = vld1q_u16(above + 15); + const uint16x8_t AR01234567 = vld1q_u16(above + 16); + const uint16x8_t AR12345678 = vld1q_u16(above + 17); + const uint16x8_t avg_6 = vhaddq_u16(ALfR0123456, AR12345678); + uint16x8_t row_6 = vrhaddq_u16(avg_6, AR01234567); + + const uint16x8_t AR789abcde = vld1q_u16(above + 23); + const uint16x8_t AR89abcdef = vld1q_u16(above + 24); + const uint16x8_t AR9abcdef_ = vld1q_u16(above + 25); + const uint16x8_t avg_7 = vhaddq_u16(AR789abcde, AR9abcdef_); + uint16x8_t row_7 = vrhaddq_u16(avg_7, AR89abcdef); + int i, j; + (void)bd; + + dst += 31 * stride; + for (i = 0; i < 4; ++i) { + for (j = 0; j < 8; ++j) { + vst1q_u16(dst, row_0); + dst += 8; + vst1q_u16(dst, row_1); + dst += 8; + vst1q_u16(dst, row_2); + dst += 8; + vst1q_u16(dst, row_3); + dst -= stride + 24; + row_0 = vextq_u16(row_0, row_1, 1); + row_1 = vextq_u16(row_1, row_2, 1); + row_2 = vextq_u16(row_2, row_3, 1); + row_3 = vextq_u16(row_3, row_4, 1); + row_4 = vextq_u16(row_4, row_4, 1); + } + row_4 = row_5; + row_5 = row_6; + row_6 = row_7; + } +} + +//------------------------------------------------------------------------------ + +void vpx_highbd_v_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + 
const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t row = vld1_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 4; i++, dst += stride) { + vst1_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t row = vld1q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 8; i++, dst += stride) { + vst1q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row = vld2q_u16(above); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 16; i++, dst += stride) { + vst2q_u16(dst, row); + } +} + +void vpx_highbd_v_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8x2_t row0 = vld2q_u16(above); + const uint16x8x2_t row1 = vld2q_u16(above + 16); + int i; + (void)left; + (void)bd; + + for (i = 0; i < 32; i++) { + vst2q_u16(dst, row0); + dst += 16; + vst2q_u16(dst, row1); + dst += stride - 16; + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x4_t left_u16 = vld1_u16(left); + uint16x4_t row; + (void)above; + (void)bd; + + row = vdup_lane_u16(left_u16, 0); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 1); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 2); + vst1_u16(dst, row); + dst += stride; + row = vdup_lane_u16(left_u16, 3); + vst1_u16(dst, row); +} + +void vpx_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const uint16x8_t left_u16 = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16); + const uint16x4_t left_high = vget_high_u16(left_u16); + uint16x8_t row; + (void)above; + (void)bd; + + row = vdupq_lane_u16(left_low, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_low, 3); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 0); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 1); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 2); + vst1q_u16(dst, row); + dst += stride; + row = vdupq_lane_u16(left_high, 3); + vst1q_u16(dst, row); +} + +static INLINE void h_store_16(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 8; +} + +void vpx_highbd_h_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 2; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_16(&dst, stride, row); + row = 
vdupq_lane_u16(left_low, 3); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_16(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_16(&dst, stride, row); + } +} + +static INLINE void h_store_32(uint16_t **dst, const ptrdiff_t stride, + const uint16x8_t row) { + // Note: vst1q is faster than vst2q + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += 8; + vst1q_u16(*dst, row); + *dst += stride - 24; +} + +void vpx_highbd_h_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + int i; + (void)above; + (void)bd; + + for (i = 0; i < 4; i++, left += 8) { + const uint16x8_t left_u16q = vld1q_u16(left); + const uint16x4_t left_low = vget_low_u16(left_u16q); + const uint16x4_t left_high = vget_high_u16(left_u16q); + uint16x8_t row; + + row = vdupq_lane_u16(left_low, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_low, 3); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 0); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 1); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 2); + h_store_32(&dst, stride, row); + row = vdupq_lane_u16(left_high, 3); + h_store_32(&dst, stride, row); + } +} + +// ----------------------------------------------------------------------------- + +void vpx_highbd_tm_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x4_t above_s16d = vld1_s16((const int16_t *)above); + const int16x8_t above_s16 = vcombine_s16(above_s16d, above_s16d); + const int16x4_t left_s16 = vld1_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x8_t sum; + uint16x8_t row; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1_u16(dst, vget_low_u16(row)); + dst += stride; + vst1_u16(dst, vget_high_u16(row)); +} + +static INLINE void tm_8_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub, + const int16x8_t max) { + uint16x8_t row; + int16x8_t sum = vaddq_s16(left_dup, sub); + sum = vminq_s16(sum, max); + row = vqshluq_n_s16(sum, 0); + vst1q_u16(*dst, row); + *dst += stride; +} + +void vpx_highbd_tm_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above_s16 = vld1q_s16((const int16_t *)above); + const int16x8_t left_s16 = vld1q_s16((const int16_t *)left); + const int16x8_t sub = vsubq_s16(above_s16, top_left); + int16x4_t 
left_s16d; + int16x8_t left_dup; + int i; + + left_s16d = vget_low_s16(left_s16); + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub, max); + } +} + +static INLINE void tm_16_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t max) { + uint16x8_t row0, row1; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += stride - 8; +} + +void vpx_highbd_tm_predictor_16x16_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 2; j++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1, max); + } + } +} + +static INLINE void tm_32_kernel(uint16_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3, const int16x8_t max) { + uint16x8_t row0, row1, row2, row3; + int16x8_t sum0 = vaddq_s16(left_dup, sub0); + int16x8_t sum1 = vaddq_s16(left_dup, sub1); + int16x8_t sum2 = vaddq_s16(left_dup, sub2); + int16x8_t sum3 = vaddq_s16(left_dup, sub3); + sum0 = vminq_s16(sum0, max); + sum1 = vminq_s16(sum1, max); + sum2 = vminq_s16(sum2, max); + sum3 = vminq_s16(sum3, max); + row0 = vqshluq_n_s16(sum0, 0); + row1 = vqshluq_n_s16(sum1, 0); + row2 = vqshluq_n_s16(sum2, 0); + row3 = vqshluq_n_s16(sum3, 0); + vst1q_u16(*dst, row0); + *dst += 8; + vst1q_u16(*dst, row1); + *dst += 8; + vst1q_u16(*dst, row2); + *dst += 8; + vst1q_u16(*dst, row3); + *dst += stride - 24; +} + +void vpx_highbd_tm_predictor_32x32_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const int16x8_t max = vmovq_n_s16((1 << bd) - 1); + const int16x8_t top_left = vld1q_dup_s16((const int16_t *)(above - 1)); + const int16x8_t above0 = vld1q_s16((const int16_t *)above); + const int16x8_t above1 = vld1q_s16((const int16_t *)(above + 8)); + const int16x8_t above2 = vld1q_s16((const int16_t *)(above + 16)); + const int16x8_t above3 = 
vld1q_s16((const int16_t *)(above + 24)); + const int16x8_t sub0 = vsubq_s16(above0, top_left); + const int16x8_t sub1 = vsubq_s16(above1, top_left); + const int16x8_t sub2 = vsubq_s16(above2, top_left); + const int16x8_t sub3 = vsubq_s16(above3, top_left); + int16x8_t left_dup; + int i, j; + + for (i = 0; i < 4; i++, left += 8) { + const int16x8_t left_s16q = vld1q_s16((const int16_t *)left); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (j = 0; j < 2; j++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3, max); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm index e3c0c5210d2..d648840df40 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct16x16_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index f1e49ff5178..968bc5cc3ab 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -11,49 +11,66 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += dest_stride; +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, 
const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; - } + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm index 5e64cea0ae7..ea6b099d3bb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm @@ -8,8 +8,14 @@ ; be found in the AUTHORS file in the root of the source tree. 
; + INCLUDE vpx_dsp/arm/idct_neon.asm.S + EXPORT |vpx_idct16x16_256_add_neon_pass1| EXPORT |vpx_idct16x16_256_add_neon_pass2| + IF CONFIG_VP9_HIGHBITDEPTH + EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low| + EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low| + ENDIF EXPORT |vpx_idct16x16_10_add_neon_pass1| EXPORT |vpx_idct16x16_10_add_neon_pass2| ARM @@ -36,12 +42,10 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) +;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output) ; -; r0 int16_t input +; r0 const int16_t *input ; r1 int16_t *output -; r2 int output_stride) ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -60,6 +64,7 @@ vld2.s16 {q1,q2}, [r0]! vmov.s16 q15, q1 +idct16x16_256_add_neon_pass1 ; cospi_28_64 = 3196 movw r3, #0x0c7c @@ -100,12 +105,12 @@ vdup.16 d3, r12 ; duplicate cospi_20_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 + vrshrn.s32 d8, q2, #14 ; >> 14 + vrshrn.s32 d9, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q5, #14 ; >> 14 - vqrshrn.s32 d15, q6, #14 ; >> 14 + vrshrn.s32 d14, q5, #14 ; >> 14 + vrshrn.s32 d15, q6, #14 ; >> 14 ; preloading to avoid stall ; cospi_16_64 = 11585 @@ -131,12 +136,12 @@ vmlal.s16 q15, d23, d2 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q2, #14 ; >> 14 - vqrshrn.s32 d11, q3, #14 ; >> 14 + vrshrn.s32 d10, q2, #14 ; >> 14 + vrshrn.s32 d11, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q15, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q15, #14 ; >> 14 ; stage 4 vdup.16 d30, r3 ; cospi_16_64 @@ -164,12 +169,12 @@ vsub.s32 q1, q11, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d16, q3, #14 ; >> 14 - vqrshrn.s32 d17, q12, #14 ; >> 14 + vrshrn.s32 d16, q3, #14 ; >> 14 + vrshrn.s32 d17, q12, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d18, q13, #14 ; >> 14 - vqrshrn.s32 d19, q1, #14 ; >> 14 + vrshrn.s32 d18, q13, #14 ; >> 14 + vrshrn.s32 d19, q1, #14 ; >> 14 ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; ; step1[2] * cospi_8_64 @@ -189,12 +194,12 @@ vmlsl.s16 q13, d29, d31 ; dct_const_round_shift(temp2) - vqrshrn.s32 d22, q0, #14 ; >> 14 - vqrshrn.s32 d23, q1, #14 ; >> 14 + vrshrn.s32 d22, q0, #14 ; >> 14 + vrshrn.s32 d23, q1, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d20, q12, #14 ; >> 14 - vqrshrn.s32 d21, q13, #14 ; >> 14 + vrshrn.s32 d20, q12, #14 ; >> 14 + vrshrn.s32 d21, q13, #14 ; >> 14 vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; @@ -229,15 +234,15 @@ vadd.s32 q10, q10, q12 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q6, #14 ; >> 14 - vqrshrn.s32 d11, q13, #14 ; >> 14 + vrshrn.s32 d10, q6, #14 ; >> 14 + vrshrn.s32 d11, q13, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q10, #14 ; >> 14 ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; + vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; @@ -247,46 +252,54 @@ vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - 
step1[7]; ; store the data - vst1.64 {d16}, [r1], r2 - vst1.64 {d17}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 + vst1.64 {q8-q9}, [r1]! + vst1.64 {q10-q11}, [r1]! + vst1.64 {q12-q13}, [r1]! + vst1.64 {q14-q15}, [r1] bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass1| -;void vpx_idct16x16_256_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) + IF CONFIG_VP9_HIGHBITDEPTH +;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input, +; int16_t *output) +; +; r0 const tran_low_t *input +; r1 int16_t *output + +|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 + vmov.s16 q15, q1 + + b idct16x16_256_add_neon_pass1 + ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low| + ENDIF ; CONFIG_VP9_HIGHBITDEPTH + +;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, +; int16_t *output, +; int16_t *pass1_output, +; int16_t skip_adding, +; uint8_t *dest, +; int stride) ; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) +; r0 const int16_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output +; r3 int16_t skip_adding +; r4 uint8_t *dest +; r5 int stride ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 ; registers and use them as buffer during calculation. |vpx_idct16x16_256_add_neon_pass2| PROC - push {r3-r9} - ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 vld2.s16 {q8,q9}, [r0]! @@ -299,6 +312,9 @@ vld2.s16 {q0,q1}, [r0]! 
vmov.s16 q15, q0; +idct16x16_256_add_neon_pass2 + push {r3-r9} + ; cospi_30_64 = 1606 movw r3, #0x0646 @@ -339,12 +355,12 @@ vdup.16 d31, r12 ; duplicate cospi_18_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d0, q2, #14 ; >> 14 - vqrshrn.s32 d1, q3, #14 ; >> 14 + vrshrn.s32 d0, q2, #14 ; >> 14 + vrshrn.s32 d1, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d14, q1, #14 ; >> 14 - vqrshrn.s32 d15, q4, #14 ; >> 14 + vrshrn.s32 d14, q1, #14 ; >> 14 + vrshrn.s32 d15, q4, #14 ; >> 14 ; preloading to avoid stall ; cospi_22_64 = 7723 @@ -373,12 +389,12 @@ vdup.16 d31, r12 ; duplicate cospi_10_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q2, #14 ; >> 14 - vqrshrn.s32 d3, q3, #14 ; >> 14 + vrshrn.s32 d2, q2, #14 ; >> 14 + vrshrn.s32 d3, q3, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q4, #14 ; >> 14 - vqrshrn.s32 d13, q5, #14 ; >> 14 + vrshrn.s32 d12, q4, #14 ; >> 14 + vrshrn.s32 d13, q5, #14 ; >> 14 ; step1[10] * cospi_22_64 vmull.s16 q11, d20, d30 @@ -407,12 +423,12 @@ vdup.16 d31, r12 ; duplicate cospi_26_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 + vrshrn.s32 d4, q11, #14 ; >> 14 + vrshrn.s32 d5, q12, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d11, q5, #14 ; >> 14 - vqrshrn.s32 d10, q4, #14 ; >> 14 + vrshrn.s32 d11, q5, #14 ; >> 14 + vrshrn.s32 d10, q4, #14 ; >> 14 ; step1[11] * cospi_6_64 vmull.s16 q10, d28, d30 @@ -434,12 +450,12 @@ vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q11, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q11, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q12, #14 ; >> 14 - vqrshrn.s32 d9, q13, #14 ; >> 14 + vrshrn.s32 d8, q12, #14 ; >> 14 + vrshrn.s32 d9, q13, #14 ; >> 14 ; stage 3 vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] @@ -480,12 +496,12 @@ vdup.16 d30, r12 ; duplicate -cospi_8_64 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q3, #14 ; >> 14 + vrshrn.s32 d12, q2, #14 ; >> 14 + vrshrn.s32 d13, q3, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q4, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 + vrshrn.s32 d2, q4, #14 ; >> 14 + vrshrn.s32 d3, q5, #14 ; >> 14 vmov.s16 q3, q11 vmov.s16 q4, q12 @@ -507,12 +523,12 @@ vmlal.s16 q9, d27, d31 ; dct_const_round_shift(temp2) - vqrshrn.s32 d4, q11, #14 ; >> 14 - vqrshrn.s32 d5, q12, #14 ; >> 14 + vrshrn.s32 d4, q11, #14 ; >> 14 + vrshrn.s32 d5, q12, #14 ; >> 14 ; dct_const_round_shift(temp1) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 + vrshrn.s32 d10, q8, #14 ; >> 14 + vrshrn.s32 d11, q9, #14 ; >> 14 ; stage 5 vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; @@ -547,12 +563,12 @@ vadd.s32 q4, q4, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 + vrshrn.s32 d4, q5, #14 ; >> 14 + vrshrn.s32 d5, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q10, #14 ; >> 14 - vqrshrn.s32 d11, q4, #14 ; >> 14 + vrshrn.s32 d10, q10, #14 ; >> 14 + vrshrn.s32 d11, q4, #14 ; >> 14 ; step1[11] * cospi_16_64 vmull.s16 q0, d22, d14 @@ -571,21 +587,21 @@ vadd.s32 q6, q6, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q4, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 + vrshrn.s32 d8, q13, 
#14 ; >> 14 + vrshrn.s32 d9, q6, #14 ; >> 14 - mov r4, #16 ; pass1Output stride + mov r4, #16 ; pass1_output stride ldr r3, [sp] ; load skip_adding cmp r3, #0 ; check if need adding dest data beq skip_adding_dest ldr r7, [sp, #28] ; dest used to save element 0-7 mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load dest_stride + ldr r8, [sp, #32] ; load stride ; stage 7 ; load the data in pass1 @@ -599,8 +615,8 @@ vadd.s16 q13, q1, q14 ; step2[1] + step2[14] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -613,8 +629,8 @@ vadd.s16 q13, q11, q4 ; step2[3] + step2[12] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -631,8 +647,8 @@ vadd.s16 q13, q1, q2 ; step2[5] + step2[10] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -645,8 +661,8 @@ vadd.s16 q13, q11, q8 ; step2[7] + step2[8] vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * dest_stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] + vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] vqmovun.s16 d12, q12 ; clip pixel vqmovun.s16 d13, q13 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data @@ -658,42 +674,42 @@ ; store the data output 8,9,10,11,12,13,14,15 vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q8, q8, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q8 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q9, q9, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q9 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q2, q2, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q2 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q3, q3, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q3 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q4, q4, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q4 ; clip pixel vst1.64 {d12}, [r9], r8 ; store 
the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q5, q5, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q5 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data vld1.64 {d13}, [r7], r8 ; load destinatoin data vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * dest_stride + i] + vaddw.u8 q14, q14, d12 ; + dest[j * stride + i] vqmovun.s16 d12, q14 ; clip pixel vst1.64 {d12}, [r9], r8 ; store the data vld1.64 {d12}, [r7], r8 ; load destinatoin data vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * dest_stride + i] + vaddw.u8 q15, q15, d13 ; + dest[j * stride + i] vqmovun.s16 d13, q15 ; clip pixel vst1.64 {d13}, [r9], r8 ; store the data b end_idct16x16_pass2 @@ -767,12 +783,41 @@ end_idct16x16_pass2 bx lr ENDP ; |vpx_idct16x16_256_add_neon_pass2| -;void |vpx_idct16x16_10_add_neon_pass1|(int16_t *input, -; int16_t *output, int output_stride) + IF CONFIG_VP9_HIGHBITDEPTH +;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, +; int16_t *output, +; int16_t *pass1_output, +; int16_t skip_adding, +; uint8_t *dest, +; int stride) ; -; r0 int16_t input +; r0 const tran_low_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output +; r3 int16_t skip_adding +; r4 uint8_t *dest +; r5 int stride + +|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC + LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 + vmov.s16 q15, q0 + + b idct16x16_256_add_neon_pass2 + ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low| + ENDIF ; CONFIG_VP9_HIGHBITDEPTH + +;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, +; int16_t *output) +; +; r0 const tran_low_t *input ; r1 int16_t *output -; r2 int output_stride) ; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -781,14 +826,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! 
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 vmov.s16 q15, q1 ; cospi_28_64*2 = 6392 @@ -846,12 +891,12 @@ end_idct16x16_pass2 vadd.s32 q10, q10, q12 ; dct_const_round_shift(temp1) - vqrshrn.s32 d11, q15, #14 ; >> 14 - vqrshrn.s32 d10, q6, #14 ; >> 14 + vrshrn.s32 d11, q15, #14 ; >> 14 + vrshrn.s32 d10, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q10, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q10, #14 ; >> 14 ; stage 6 vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; @@ -864,39 +909,21 @@ end_idct16x16_pass2 vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; ; store the data - vst1.64 {d4}, [r1], r2 - vst1.64 {d5}, [r1], r2 - vst1.64 {d18}, [r1], r2 - vst1.64 {d19}, [r1], r2 - vst1.64 {d20}, [r1], r2 - vst1.64 {d21}, [r1], r2 - vst1.64 {d22}, [r1], r2 - vst1.64 {d23}, [r1], r2 - vst1.64 {d24}, [r1], r2 - vst1.64 {d25}, [r1], r2 - vst1.64 {d26}, [r1], r2 - vst1.64 {d27}, [r1], r2 - vst1.64 {d28}, [r1], r2 - vst1.64 {d29}, [r1], r2 - vst1.64 {d30}, [r1], r2 - vst1.64 {d31}, [r1], r2 + vst1.64 {q2}, [r1]! + vst1.64 {q9-q10}, [r1]! + vst1.64 {q11-q12}, [r1]! + vst1.64 {q13-q14}, [r1]! + vst1.64 {q15}, [r1] bx lr ENDP ; |vpx_idct16x16_10_add_neon_pass1| -;void vpx_idct16x16_10_add_neon_pass2(int16_t *src, -; int16_t *output, -; int16_t *pass1Output, -; int16_t skip_adding, -; uint8_t *dest, -; int dest_stride) +;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, +; int16_t *pass1_output) ; -; r0 int16_t *src -; r1 int16_t *output, -; r2 int16_t *pass1Output, -; r3 int16_t skip_adding, -; r4 uint8_t *dest, -; r5 int dest_stride) +; r0 const tran_low_t *src +; r1 int16_t *output +; r2 int16_t *pass1_output ; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output ; will be stored back into q8-q15 registers. This function will touch q0-q7 @@ -906,14 +933,14 @@ end_idct16x16_pass2 ; TODO(hkuang): Find a better way to load the elements. ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! 
+ LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 + LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 + LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 + LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 + LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 vmov.s16 q15, q0; ; 2*cospi_30_64 = 3212 @@ -981,12 +1008,12 @@ end_idct16x16_pass2 vdup.16 d30, r12 ; duplicate -cospi_8_64 ; dct_const_round_shift(temp1) - vqrshrn.s32 d2, q12, #14 ; >> 14 - vqrshrn.s32 d3, q5, #14 ; >> 14 + vrshrn.s32 d2, q12, #14 ; >> 14 + vrshrn.s32 d3, q5, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d12, q2, #14 ; >> 14 - vqrshrn.s32 d13, q11, #14 ; >> 14 + vrshrn.s32 d12, q2, #14 ; >> 14 + vrshrn.s32 d13, q11, #14 ; >> 14 ; - step1[13] * cospi_8_64 vmull.s16 q10, d8, d30 @@ -1005,12 +1032,12 @@ end_idct16x16_pass2 vmlal.s16 q9, d9, d31 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q10, #14 ; >> 14 - vqrshrn.s32 d5, q13, #14 ; >> 14 + vrshrn.s32 d4, q10, #14 ; >> 14 + vrshrn.s32 d5, q13, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q8, #14 ; >> 14 - vqrshrn.s32 d11, q9, #14 ; >> 14 + vrshrn.s32 d10, q8, #14 ; >> 14 + vrshrn.s32 d11, q9, #14 ; >> 14 ; stage 5 vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; @@ -1045,12 +1072,12 @@ end_idct16x16_pass2 vadd.s32 q1, q4, q1 ; dct_const_round_shift(temp1) - vqrshrn.s32 d4, q5, #14 ; >> 14 - vqrshrn.s32 d5, q6, #14 ; >> 14 + vrshrn.s32 d4, q5, #14 ; >> 14 + vrshrn.s32 d5, q6, #14 ; >> 14 ; dct_const_round_shift(temp2) - vqrshrn.s32 d10, q0, #14 ; >> 14 - vqrshrn.s32 d11, q1, #14 ; >> 14 + vrshrn.s32 d10, q0, #14 ; >> 14 + vrshrn.s32 d11, q1, #14 ; >> 14 ; step1[11] * cospi_16_64 vmull.s16 q0, d22, d14 @@ -1069,14 +1096,14 @@ end_idct16x16_pass2 vadd.s32 q6, q6, q1 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d6, q10, #14 ; >> 14 - vqrshrn.s32 d7, q4, #14 ; >> 14 + vrshrn.s32 d6, q10, #14 ; >> 14 + vrshrn.s32 d7, q4, #14 ; >> 14 ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vqrshrn.s32 d8, q13, #14 ; >> 14 - vqrshrn.s32 d9, q6, #14 ; >> 14 + vrshrn.s32 d8, q13, #14 ; >> 14 + vrshrn.s32 d9, q6, #14 ; >> 14 - mov r4, #16 ; pass1Output stride + mov r4, #16 ; pass1_output stride ldr r3, [sp] ; load skip_adding ; stage 7 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index f682afc7bf6..0c891919b76 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -10,1218 +10,813 @@ #include <arm_neon.h> -#include "./vpx_config.h" -#include "vpx_dsp/arm/transpose_neon.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" -void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d16u64, d17u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, 
q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q5s32, q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void idct16x16_256_add_load_tran_low_kernel( + const tran_low_t **input, int16_t **out) { + int16x8_t s; - // stage 3 - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d18s16, d1s16); - q6s32 = vmull_s16(d19s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlal_s16(q5s32, d30s16, d0s16); - q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q5s32, 14); - d15s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - q2s32 = vmull_s16(d26s16, d2s16); - q3s32 = vmull_s16(d27s16, d2s16); - q9s32 = vmull_s16(d26s16, d3s16); - q15s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d3s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d3s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q15s32 = vmlal_s16(q15s32, d23s16, d2s16); - - d10s16 = vqrshrn_n_s32(q2s32, 14); - d11s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q15s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + s = load_tran_low_to_s16q(*input); + vst1q_s16(*out, s); + *input += 8; + *out += 8; +} - // stage 4 - d30s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d30s16); - q11s32 = vmull_s16(d17s16, d30s16); - q0s32 = vmull_s16(d24s16, d30s16); - q1s32 = vmull_s16(d25s16, d30s16); - - d30s16 = vdup_n_s16((int16_t)cospi_24_64); - d31s16 = vdup_n_s16((int16_t)cospi_8_64); - - q3s32 = vaddq_s32(q2s32, q0s32); - q12s32 = vaddq_s32(q11s32, q1s32); - q13s32 = vsubq_s32(q2s32, q0s32); - q1s32 = vsubq_s32(q11s32, q1s32); - - d16s16 = vqrshrn_n_s32(q3s32, 14); - d17s16 = vqrshrn_n_s32(q12s32, 14); - d18s16 = vqrshrn_n_s32(q13s32, 14); - d19s16 = vqrshrn_n_s32(q1s32, 14); - q8s16 = vcombine_s16(d16s16, d17s16); - q9s16 = vcombine_s16(d18s16, d19s16); - - q0s32 = 
vmull_s16(d20s16, d31s16); - q1s32 = vmull_s16(d21s16, d31s16); - q12s32 = vmull_s16(d20s16, d30s16); - q13s32 = vmull_s16(d21s16, d30s16); - - q0s32 = vmlal_s16(q0s32, d28s16, d30s16); - q1s32 = vmlal_s16(q1s32, d29s16, d30s16); - q12s32 = vmlsl_s16(q12s32, d28s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d29s16, d31s16); - - d22s16 = vqrshrn_n_s32(q0s32, 14); - d23s16 = vqrshrn_n_s32(q1s32, 14); - d20s16 = vqrshrn_n_s32(q12s32, 14); - d21s16 = vqrshrn_n_s32(q13s32, 14); - q10s16 = vcombine_s16(d20s16, d21s16); - q11s16 = vcombine_s16(d22s16, d23s16); - - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q15s16 = vaddq_s16(q6s16, q7s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); +static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input, + int16_t *out) { + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); + idct16x16_256_add_load_tran_low_kernel(&input, &out); +} +#endif // CONFIG_VP9_HIGHBITDEPTH - // stage 5 - q0s16 = vaddq_s16(q8s16, q11s16); - q1s16 = vaddq_s16(q9s16, q10s16); - q2s16 = vsubq_s16(q9s16, q10s16); - q3s16 = vsubq_s16(q8s16, q11s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q11s32 = vmull_s16(d26s16, d16s16); - q12s32 = vmull_s16(d27s16, d16s16); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - - q6s32 = vsubq_s32(q9s32, q11s32); - q13s32 = vsubq_s32(q10s32, q12s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d10s16 = vqrshrn_n_s32(q6s32, 14); - d11s16 = vqrshrn_n_s32(q13s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); +static INLINE void wrap_low_4x2(const int32x4_t *const 
t32, int16x4_t *const d0, + int16x4_t *const d1) { + *d0 = vrshrn_n_s32(t32[0], 14); + *d1 = vrshrn_n_s32(t32[1], 14); +} - // stage 6 - q8s16 = vaddq_s16(q0s16, q15s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d16u64); - out += output_stride; - vst1_u64((uint64_t *)out, d17u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); } -void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - uint8_t *d; - uint8x8_t d12u8, d13u8; - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t 
d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - uint64x1_t d24u64, d25u64, d26u64, d27u64; - int64x1_t d12s64, d13s64; - uint16x8_t q2u16, q3u16, q4u16, q5u16, q8u16; - uint16x8_t q9u16, q12u16, q13u16, q14u16, q15u16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} - // stage 3 - d12s16 = vdup_n_s16((int16_t)cospi_30_64); - d13s16 = vdup_n_s16((int16_t)cospi_2_64); - - q2s32 = vmull_s16(d16s16, d12s16); - q3s32 = vmull_s16(d17s16, d12s16); - q1s32 = vmull_s16(d16s16, d13s16); - q4s32 = vmull_s16(d17s16, d13s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d13s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d13s16); - q1s32 = vmlal_s16(q1s32, d30s16, d12s16); - q4s32 = vmlal_s16(q4s32, d31s16, d12s16); - - d0s16 = vqrshrn_n_s32(q2s32, 14); - d1s16 = vqrshrn_n_s32(q3s32, 14); - d14s16 = vqrshrn_n_s32(q1s32, 14); - d15s16 = vqrshrn_n_s32(q4s32, 14); - q0s16 = vcombine_s16(d0s16, d1s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d30s16 = vdup_n_s16((int16_t)cospi_14_64); - d31s16 = vdup_n_s16((int16_t)cospi_18_64); - - q2s32 = vmull_s16(d24s16, d30s16); - q3s32 = vmull_s16(d25s16, d30s16); - q4s32 = vmull_s16(d24s16, d31s16); - q5s32 = vmull_s16(d25s16, d31s16); - - q2s32 = vmlsl_s16(q2s32, d22s16, d31s16); - q3s32 = vmlsl_s16(q3s32, d23s16, d31s16); - q4s32 = vmlal_s16(q4s32, d22s16, d30s16); - q5s32 = vmlal_s16(q5s32, d23s16, 
d30s16); - - d2s16 = vqrshrn_n_s32(q2s32, 14); - d3s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q4s32, 14); - d13s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16((int16_t)cospi_22_64); - d31s16 = vdup_n_s16((int16_t)cospi_10_64); - - q11s32 = vmull_s16(d20s16, d30s16); - q12s32 = vmull_s16(d21s16, d30s16); - q4s32 = vmull_s16(d20s16, d31s16); - q5s32 = vmull_s16(d21s16, d31s16); - - q11s32 = vmlsl_s16(q11s32, d26s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d27s16, d31s16); - q4s32 = vmlal_s16(q4s32, d26s16, d30s16); - q5s32 = vmlal_s16(q5s32, d27s16, d30s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d11s16 = vqrshrn_n_s32(q5s32, 14); - d10s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - d30s16 = vdup_n_s16((int16_t)cospi_6_64); - d31s16 = vdup_n_s16((int16_t)cospi_26_64); - - q10s32 = vmull_s16(d28s16, d30s16); - q11s32 = vmull_s16(d29s16, d30s16); - q12s32 = vmull_s16(d28s16, d31s16); - q13s32 = vmull_s16(d29s16, d31s16); - - q10s32 = vmlsl_s16(q10s32, d18s16, d31s16); - q11s32 = vmlsl_s16(q11s32, d19s16, d31s16); - q12s32 = vmlal_s16(q12s32, d18s16, d30s16); - q13s32 = vmlal_s16(q13s32, d19s16, d30s16); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q11s32, 14); - d8s16 = vqrshrn_n_s32(q12s32, 14); - d9s16 = vqrshrn_n_s32(q13s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, + const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(s0, cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(s1, cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], s1, cospi_0_8_16_24, 1); + t32[1] = vmlal_lane_s16(t32[1], s0, cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[2]; + + idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[1] = vnegq_s32(t32[1]); + wrap_low_4x2(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + 
t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, + const int16x4_t cospi_0_8_16_24, + int16x4_t *const d0, + int16x4_t *const d1) { + int32x4_t t32[3]; + + t32[2] = vmull_lane_s16(s1, cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + t32[1] = vmlal_lane_s16(t32[2], s0, cospi_0_8_16_24, 2); + wrap_low_4x2(t32, d0, d1); +} + +static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output, + uint8_t *dest, int stride) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospi_4_12_20N_28 = vget_high_s16(cospis0); + const int16x4_t cospi_2_30_10_22 = vget_low_s16(cospis1); + const int16x4_t cospi_6_26_14_18N = vget_high_s16(cospis1); + int16x8_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0] = vld1q_s16(input); + input += 8; + in[8] = vld1q_s16(input); + input += 8; + in[1] = vld1q_s16(input); + input += 8; + in[9] = vld1q_s16(input); + input += 8; + in[2] = vld1q_s16(input); + input += 8; + in[10] = vld1q_s16(input); + input += 8; + in[3] = vld1q_s16(input); + input += 8; + in[11] = vld1q_s16(input); + input += 8; + in[4] = vld1q_s16(input); + input += 8; + in[12] = vld1q_s16(input); + input += 8; + in[5] = vld1q_s16(input); + input += 8; + in[13] = vld1q_s16(input); 
+ input += 8; + in[6] = vld1q_s16(input); + input += 8; + in[14] = vld1q_s16(input); + input += 8; + in[7] = vld1q_s16(input); + input += 8; + in[15] = vld1q_s16(input); + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s16_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], &step2[15]); + idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9], + &step2[14]); + idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11], + &step2[12]); // stage 3 - q9s16 = vsubq_s16(q0s16, q1s16); - q0s16 = vaddq_s16(q0s16, q1s16); - q10s16 = vsubq_s16(q3s16, q2s16); - q11s16 = vaddq_s16(q2s16, q3s16); - q12s16 = vaddq_s16(q4s16, q5s16); - q13s16 = vsubq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q6s16, q7s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], &step1[7]); + idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], &step1[6]); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); // stage 4 - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q2s32 = vmull_s16(d18s16, d31s16); - q3s32 = vmull_s16(d19s16, d31s16); - q4s32 = vmull_s16(d28s16, d31s16); - q5s32 = vmull_s16(d29s16, d31s16); - - q2s32 = vmlal_s16(q2s32, d28s16, d30s16); - q3s32 = vmlal_s16(q3s32, d29s16, d30s16); - q4s32 = vmlsl_s16(q4s32, d18s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d19s16, d30s16); - - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q3s32, 14); - d2s16 = vqrshrn_n_s32(q4s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - q3s16 = q11s16; - q4s16 = q12s16; - - d30s16 = vdup_n_s16(-cospi_8_64); - q11s32 = vmull_s16(d26s16, d30s16); - q12s32 = vmull_s16(d27s16, d30s16); - q8s32 = vmull_s16(d20s16, d30s16); - q9s32 = vmull_s16(d21s16, d30s16); - - q11s32 = vmlsl_s16(q11s32, d20s16, d31s16); - q12s32 = vmlsl_s16(q12s32, d21s16, d31s16); - q8s32 = vmlal_s16(q8s32, d26s16, d31s16); - q9s32 = vmlal_s16(q9s32, 
d27s16, d31s16); - - d4s16 = vqrshrn_n_s32(q11s32, 14); - d5s16 = vqrshrn_n_s32(q12s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], &step2[0]); + idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], &step2[3]); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q10s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q10s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + 
idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - if (skip_adding != 0) { - d = dest; - // load the data in pass1 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 = vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - d13s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - q12s16 = vrshrq_n_s16(q12s16, 6); - q13s16 = vrshrq_n_s16(q13s16, 6); - q12u16 = - vaddw_u8(vreinterpretq_u16_s16(q12s16), vreinterpret_u8_s64(d12s64)); - q13u16 = - vaddw_u8(vreinterpretq_u16_s16(q13s16), vreinterpret_u8_s64(d13s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); - d13u8 
= vqmovun_s16(vreinterpretq_s16_u16(q13u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d13u8)); - d += dest_stride; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - // store the data out 8,9,10,11,12,13,14,15 - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q8s16 = vrshrq_n_s16(q8s16, 6); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q9s16 = vrshrq_n_s16(q9s16, 6); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q2s16 = vrshrq_n_s16(q2s16, 6); - q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q3s16 = vrshrq_n_s16(q3s16, 6); - q3u16 = vaddw_u8(vreinterpretq_u16_s16(q3s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q3u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q4s16 = vrshrq_n_s16(q4s16, 6); - q4u16 = vaddw_u8(vreinterpretq_u16_s16(q4s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q4u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q5s16 = vrshrq_n_s16(q5s16, 6); - q5u16 = vaddw_u8(vreinterpretq_u16_s16(q5s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q5u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - dest += dest_stride; - q14s16 = vrshrq_n_s16(q14s16, 6); - q14u16 = - vaddw_u8(vreinterpretq_u16_s16(q14s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q14u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - d += dest_stride; - - d12s64 = vld1_s64((int64_t *)dest); - q15s16 = vrshrq_n_s16(q15s16, 6); - q15u16 = - vaddw_u8(vreinterpretq_u16_s16(q15s16), vreinterpret_u8_s64(d12s64)); - d12u8 = vqmovun_s16(vreinterpretq_s16_u16(q15u16)); - vst1_u64((uint64_t *)d, vreinterpret_u64_u8(d12u8)); - } else { // skip_adding_dest - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = 
vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q8s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q8s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q9s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q9s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q2s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q2s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q3s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q3s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q4s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q4s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q5s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q5s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q14s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q14s16))); - out += 12; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_low_s16(q15s16))); - out += 4; - vst1_u64((uint64_t *)out, vreinterpret_u64_s16(vget_high_s16(q15s16))); + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], 
step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); + + if (output) { + // pass 1: save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); + } else { + // pass 2: add the result to dest. + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); } } -void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, - int output_stride) { - int16x4_t d4s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - uint64x1_t d4u64, d5u64, d18u64, d19u64, d20u64, d21u64, d22u64, d23u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q6s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q15s32; - int16x8x2_t q0x2s16; - - q0x2s16 = vld2q_s16(in); - q8s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q9s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q10s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q11s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q12s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q13s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q14s16 = q0x2s16.val[0]; - in += 16; - q0x2s16 = vld2q_s16(in); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); +static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int16_t *output) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = 
vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t in[4], step1[16], step2[16], out[16]; + +// Load input (4x4) +#if CONFIG_VP9_HIGHBITDEPTH + in[0] = load_tran_low_to_s16d(input); + input += 16; + in[1] = load_tran_low_to_s16d(input); + input += 16; + in[2] = load_tran_low_to_s16d(input); + input += 16; + in[3] = load_tran_low_to_s16d(input); +#else + in[0] = vld1_s16(input); + input += 16; + in[1] = vld1_s16(input); + input += 16; + in[2] = vld1_s16(input); + input += 16; + in[3] = vld1_s16(input); +#endif // CONFIG_VP9_HIGHBITDEPTH + + // Transpose + transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulh_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulh_lane_s16(step1[8], cospid_2_30_10_22, 0); // stage 3 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q7s16 = vqrdmulhq_s16(q9s16, q1s16); + step1[0] = step2[0]; + step1[4] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulh_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; // stage 4 - q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - d4s16 = vdup_n_s16((int16_t)cospi_16_64); - - q8s16 = vqrdmulhq_s16(q8s16, q1s16); - - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - q9s32 = vmull_s16(d14s16, d4s16); - q10s32 = vmull_s16(d15s16, d4s16); - q12s32 = vmull_s16(d9s16, d4s16); - q11s32 = vmull_s16(d8s16, d4s16); - - q15s32 = vsubq_s32(q10s32, q12s32); - q6s32 = vsubq_s32(q9s32, q11s32); - q9s32 = vaddq_s32(q9s32, q11s32); - q10s32 = vaddq_s32(q10s32, q12s32); - - d11s16 = vqrshrn_n_s32(q15s32, 14); - d10s16 = vqrshrn_n_s32(q6s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q10s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); + step2[0] = step2[1] = vqrdmulh_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vadd_s16(step2[8], step2[11]); + step1[9] = vadd_s16(step2[9], step2[10]); + step1[10] = vsub_s16(step2[9], step2[10]); + step1[11] = vsub_s16(step2[8], step2[11]); + step1[12] = vsub_s16(step2[15], step2[12]); + step1[13] = vsub_s16(step2[14], step2[13]); + step1[14] = vadd_s16(step2[14], step2[13]); + step1[15] = vadd_s16(step2[15], step2[12]); // stage 6 - q2s16 = 
vaddq_s16(q8s16, q7s16); - q9s16 = vaddq_s16(q8s16, q6s16); - q10s16 = vaddq_s16(q8s16, q5s16); - q11s16 = vaddq_s16(q8s16, q4s16); - q12s16 = vsubq_s16(q8s16, q4s16); - q13s16 = vsubq_s16(q8s16, q5s16); - q14s16 = vsubq_s16(q8s16, q6s16); - q15s16 = vsubq_s16(q8s16, q7s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d20u64 = vreinterpret_u64_s16(vget_low_s16(q10s16)); - d21u64 = vreinterpret_u64_s16(vget_high_s16(q10s16)); - d22u64 = vreinterpret_u64_s16(vget_low_s16(q11s16)); - d23u64 = vreinterpret_u64_s16(vget_high_s16(q11s16)); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - // store the data - output_stride >>= 1; // output_stride / 2, out is int16_t - vst1_u64((uint64_t *)out, d4u64); - out += output_stride; - vst1_u64((uint64_t *)out, d5u64); - out += output_stride; - vst1_u64((uint64_t *)out, d18u64); - out += output_stride; - vst1_u64((uint64_t *)out, d19u64); - out += output_stride; - vst1_u64((uint64_t *)out, d20u64); - out += output_stride; - vst1_u64((uint64_t *)out, d21u64); - out += output_stride; - vst1_u64((uint64_t *)out, d22u64); - out += output_stride; - vst1_u64((uint64_t *)out, d23u64); - out += output_stride; - vst1_u64((uint64_t *)out, d24u64); - out += output_stride; - vst1_u64((uint64_t *)out, d25u64); - out += output_stride; - vst1_u64((uint64_t *)out, d26u64); - out += output_stride; - vst1_u64((uint64_t *)out, d27u64); - out += output_stride; - vst1_u64((uint64_t *)out, d28u64); - out += output_stride; - vst1_u64((uint64_t *)out, d29u64); - out += output_stride; - vst1_u64((uint64_t *)out, d30u64); - out += output_stride; - vst1_u64((uint64_t *)out, d31u64); + step2[0] = vadd_s16(step1[0], step1[7]); + step2[1] = vadd_s16(step1[1], step1[6]); + step2[2] = vadd_s16(step1[2], step1[5]); + step2[3] = vadd_s16(step1[3], step1[4]); + step2[4] = vsub_s16(step1[3], step1[4]); + step2[5] = vsub_s16(step1[2], step1[5]); + step2[6] = vsub_s16(step1[1], step1[6]); + step2[7] = vsub_s16(step1[0], step1[7]); + idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + out[0] = vadd_s16(step2[0], step2[15]); + out[1] = vadd_s16(step2[1], step2[14]); + out[2] = vadd_s16(step2[2], step2[13]); + out[3] = vadd_s16(step2[3], step2[12]); + out[4] = vadd_s16(step2[4], step2[11]); + out[5] = vadd_s16(step2[5], step2[10]); + out[6] = vadd_s16(step2[6], step2[9]); + out[7] = vadd_s16(step2[7], step2[8]); + out[8] = vsub_s16(step2[7], step2[8]); + out[9] = vsub_s16(step2[6], step2[9]); + out[10] = vsub_s16(step2[5], step2[10]); + out[11] = vsub_s16(step2[4], step2[11]); + out[12] = vsub_s16(step2[3], step2[12]); + out[13] = vsub_s16(step2[2], step2[13]); + out[14] = vsub_s16(step2[1], step2[14]); + out[15] = vsub_s16(step2[0], step2[15]); + + // pass 1: save the result into output + 
vst1_s16(output, out[0]); + output += 4; + vst1_s16(output, out[1]); + output += 4; + vst1_s16(output, out[2]); + output += 4; + vst1_s16(output, out[3]); + output += 4; + vst1_s16(output, out[4]); + output += 4; + vst1_s16(output, out[5]); + output += 4; + vst1_s16(output, out[6]); + output += 4; + vst1_s16(output, out[7]); + output += 4; + vst1_s16(output, out[8]); + output += 4; + vst1_s16(output, out[9]); + output += 4; + vst1_s16(output, out[10]); + output += 4; + vst1_s16(output, out[11]); + output += 4; + vst1_s16(output, out[12]); + output += 4; + vst1_s16(output, out[13]); + output += 4; + vst1_s16(output, out[14]); + output += 4; + vst1_s16(output, out[15]); } -void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride) { - int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d30s16, d31s16; - uint64x1_t d4u64, d5u64, d6u64, d7u64, d8u64, d9u64, d10u64, d11u64; - uint64x1_t d16u64, d17u64, d18u64, d19u64; - uint64x1_t d24u64, d25u64, d26u64, d27u64, d28u64, d29u64, d30u64, d31u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32; - int16x8x2_t q0x2s16; - (void)skip_adding; - (void)dest; - (void)dest_stride; - - q0x2s16 = vld2q_s16(src); - q8s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q9s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q10s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q11s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q12s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q13s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q14s16 = q0x2s16.val[0]; - src += 16; - q0x2s16 = vld2q_s16(src); - q15s16 = q0x2s16.val[0]; - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); +static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output, + uint8_t *dest, int stride) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x4_t ind[8]; + int16x8_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + ind[0] = vld1_s16(input); + input += 4; + ind[1] = vld1_s16(input); + input += 4; + ind[2] = vld1_s16(input); + input += 4; + ind[3] = vld1_s16(input); + input += 4; + ind[4] = vld1_s16(input); + input += 4; + ind[5] = vld1_s16(input); + input += 4; + ind[6] = vld1_s16(input); + input += 4; + ind[7] = vld1_s16(input); + + // Transpose + transpose_s16_4x8(ind[0], ind[1], ind[2], ind[3], ind[4], ind[5], ind[6], + ind[7], &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + 
step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); // stage 3 - q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2); - q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2); - q7s16 = vqrdmulhq_s16(q8s16, q6s16); - - q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2); - q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2); - q3s16 = vqrdmulhq_s16(q9s16, q15s16); - q4s16 = vqrdmulhq_s16(q9s16, q14s16); + step1[0] = step2[0]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; // stage 4 - d0s16 = vget_low_s16(q0s16); - d1s16 = vget_high_s16(q0s16); - d6s16 = vget_low_s16(q3s16); - d7s16 = vget_high_s16(q3s16); - d8s16 = vget_low_s16(q4s16); - d9s16 = vget_high_s16(q4s16); - d14s16 = vget_low_s16(q7s16); - d15s16 = vget_high_s16(q7s16); - - d30s16 = vdup_n_s16((int16_t)cospi_8_64); - d31s16 = vdup_n_s16((int16_t)cospi_24_64); - - q12s32 = vmull_s16(d14s16, d31s16); - q5s32 = vmull_s16(d15s16, d31s16); - q2s32 = vmull_s16(d0s16, d31s16); - q11s32 = vmull_s16(d1s16, d31s16); - - q12s32 = vmlsl_s16(q12s32, d0s16, d30s16); - q5s32 = vmlsl_s16(q5s32, d1s16, d30s16); - q2s32 = vmlal_s16(q2s32, d14s16, d30s16); - q11s32 = vmlal_s16(q11s32, d15s16, d30s16); - - d2s16 = vqrshrn_n_s32(q12s32, 14); - d3s16 = vqrshrn_n_s32(q5s32, 14); - d12s16 = vqrshrn_n_s32(q2s32, 14); - d13s16 = vqrshrn_n_s32(q11s32, 14); - q1s16 = vcombine_s16(d2s16, d3s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - d30s16 = vdup_n_s16(-cospi_8_64); - q10s32 = vmull_s16(d8s16, d30s16); - q13s32 = vmull_s16(d9s16, d30s16); - q8s32 = vmull_s16(d6s16, d30s16); - q9s32 = vmull_s16(d7s16, d30s16); - - q10s32 = vmlsl_s16(q10s32, d6s16, d31s16); - q13s32 = vmlsl_s16(q13s32, d7s16, d31s16); - q8s32 = vmlal_s16(q8s32, d8s16, d31s16); - q9s32 = vmlal_s16(q9s32, d9s16, d31s16); - - d4s16 = vqrshrn_n_s32(q10s32, 14); - d5s16 = vqrshrn_n_s32(q13s32, 14); - d10s16 = vqrshrn_n_s32(q8s32, 14); - d11s16 = vqrshrn_n_s32(q9s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; // stage 5 - q8s16 = vaddq_s16(q0s16, q3s16); - q9s16 = vaddq_s16(q1s16, q2s16); - q10s16 = vsubq_s16(q1s16, q2s16); - q11s16 = vsubq_s16(q0s16, q3s16); - q12s16 = vsubq_s16(q7s16, q4s16); - q13s16 = vsubq_s16(q6s16, q5s16); - q14s16 = vaddq_s16(q6s16, q5s16); - q15s16 = vaddq_s16(q7s16, q4s16); + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = 
vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); // stage 6 - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - - d14s16 = vdup_n_s16((int16_t)cospi_16_64); - q3s32 = vmull_s16(d26s16, d14s16); - q4s32 = vmull_s16(d27s16, d14s16); - q0s32 = vmull_s16(d20s16, d14s16); - q1s32 = vmull_s16(d21s16, d14s16); - - q5s32 = vsubq_s32(q3s32, q0s32); - q6s32 = vsubq_s32(q4s32, q1s32); - q0s32 = vaddq_s32(q3s32, q0s32); - q4s32 = vaddq_s32(q4s32, q1s32); - - d4s16 = vqrshrn_n_s32(q5s32, 14); - d5s16 = vqrshrn_n_s32(q6s32, 14); - d10s16 = vqrshrn_n_s32(q0s32, 14); - d11s16 = vqrshrn_n_s32(q4s32, 14); - q2s16 = vcombine_s16(d4s16, d5s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q0s32 = vmull_s16(d22s16, d14s16); - q1s32 = vmull_s16(d23s16, d14s16); - q13s32 = vmull_s16(d24s16, d14s16); - q6s32 = vmull_s16(d25s16, d14s16); - - q10s32 = vsubq_s32(q13s32, q0s32); - q4s32 = vsubq_s32(q6s32, q1s32); - q13s32 = vaddq_s32(q13s32, q0s32); - q6s32 = vaddq_s32(q6s32, q1s32); - - d6s16 = vqrshrn_n_s32(q10s32, 14); - d7s16 = vqrshrn_n_s32(q4s32, 14); - d8s16 = vqrshrn_n_s32(q13s32, 14); - d9s16 = vqrshrn_n_s32(q6s32, 14); - q3s16 = vcombine_s16(d6s16, d7s16); - q4s16 = vcombine_s16(d8s16, d9s16); + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; // stage 7 - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q15s16); - q13s16 = vaddq_s16(q1s16, q14s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q14s16 = vsubq_s16(q1s16, q14s16); - q15s16 = vsubq_s16(q0s16, q15s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q10s16, q5s16); - q13s16 = vaddq_s16(q11s16, q4s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - 
vst1_u64((uint64_t *)out, d27u64); - out += 12; - q4s16 = vsubq_s16(q11s16, q4s16); - q5s16 = vsubq_s16(q10s16, q5s16); - - q0s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q1s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q12s16 = vaddq_s16(q0s16, q3s16); - q13s16 = vaddq_s16(q1s16, q2s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q2s16 = vsubq_s16(q1s16, q2s16); - q3s16 = vsubq_s16(q0s16, q3s16); - - q10s16 = vld1q_s16(pass1Output); - pass1Output += 8; - q11s16 = vld1q_s16(pass1Output); - q12s16 = vaddq_s16(q10s16, q9s16); - q13s16 = vaddq_s16(q11s16, q8s16); - d24u64 = vreinterpret_u64_s16(vget_low_s16(q12s16)); - d25u64 = vreinterpret_u64_s16(vget_high_s16(q12s16)); - d26u64 = vreinterpret_u64_s16(vget_low_s16(q13s16)); - d27u64 = vreinterpret_u64_s16(vget_high_s16(q13s16)); - vst1_u64((uint64_t *)out, d24u64); - out += 4; - vst1_u64((uint64_t *)out, d25u64); - out += 12; - vst1_u64((uint64_t *)out, d26u64); - out += 4; - vst1_u64((uint64_t *)out, d27u64); - out += 12; - q8s16 = vsubq_s16(q11s16, q8s16); - q9s16 = vsubq_s16(q10s16, q9s16); - - d4u64 = vreinterpret_u64_s16(vget_low_s16(q2s16)); - d5u64 = vreinterpret_u64_s16(vget_high_s16(q2s16)); - d6u64 = vreinterpret_u64_s16(vget_low_s16(q3s16)); - d7u64 = vreinterpret_u64_s16(vget_high_s16(q3s16)); - d8u64 = vreinterpret_u64_s16(vget_low_s16(q4s16)); - d9u64 = vreinterpret_u64_s16(vget_high_s16(q4s16)); - d10u64 = vreinterpret_u64_s16(vget_low_s16(q5s16)); - d11u64 = vreinterpret_u64_s16(vget_high_s16(q5s16)); - d16u64 = vreinterpret_u64_s16(vget_low_s16(q8s16)); - d17u64 = vreinterpret_u64_s16(vget_high_s16(q8s16)); - d18u64 = vreinterpret_u64_s16(vget_low_s16(q9s16)); - d19u64 = vreinterpret_u64_s16(vget_high_s16(q9s16)); - d28u64 = vreinterpret_u64_s16(vget_low_s16(q14s16)); - d29u64 = vreinterpret_u64_s16(vget_high_s16(q14s16)); - d30u64 = vreinterpret_u64_s16(vget_low_s16(q15s16)); - d31u64 = vreinterpret_u64_s16(vget_high_s16(q15s16)); - - vst1_u64((uint64_t *)out, d16u64); - out += 4; - vst1_u64((uint64_t *)out, d17u64); - out += 12; - vst1_u64((uint64_t *)out, d18u64); - out += 4; - vst1_u64((uint64_t *)out, d19u64); - out += 12; - vst1_u64((uint64_t *)out, d4u64); - out += 4; - vst1_u64((uint64_t *)out, d5u64); - out += 12; - vst1_u64((uint64_t *)out, d6u64); - out += 4; - vst1_u64((uint64_t *)out, d7u64); - out += 12; - vst1_u64((uint64_t *)out, d8u64); - out += 4; - vst1_u64((uint64_t *)out, d9u64); - out += 12; - vst1_u64((uint64_t *)out, d10u64); - out += 4; - vst1_u64((uint64_t *)out, d11u64); - out += 12; - vst1_u64((uint64_t *)out, d28u64); - out += 4; - vst1_u64((uint64_t *)out, d29u64); - out += 12; - vst1_u64((uint64_t *)out, d30u64); - out += 4; - vst1_u64((uint64_t *)out, d31u64); + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], 
step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); + + if (output) { + // pass 1: save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); + } else { + // pass 2: add the result to dest. + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); + } +} + +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + +#if CONFIG_VP9_HIGHBITDEPTH + int16_t pass1_input[16 * 16]; + idct16x16_256_add_load_tran_low(input, pass1_input); +#else + const int16_t *pass1_input = input; +#endif // CONFIG_VP9_HIGHBITDEPTH + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride); + + // Parallel idct on the lower 8 rows + idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest, + stride); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride); + + // Parallel idct to get the right 8 columns + idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride); +} + +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride); + + // Parallel idct to get the right 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c index bdbbf519332..47366bcb7d6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c @@ -11,16 +11,29 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/vpx_dsp_common.h" -void 
vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); +void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output); void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); -void vpx_idct16x16_10_add_neon_pass1(const int16_t *input, int16_t *output, - int output_stride); -void vpx_idct16x16_10_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1Output, int16_t skip_adding, - uint8_t *dest, int dest_stride); + int16_t *pass1_output, + int16_t skip_adding, uint8_t *dest, + int stride); +#if CONFIG_VP9_HIGHBITDEPTH +void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input, + int16_t *output); +void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, + int16_t *output, + int16_t *pass1_output, + int16_t skip_adding, + uint8_t *dest, int stride); +#else +#define vpx_idct16x16_256_add_neon_pass1_tran_low \ + vpx_idct16x16_256_add_neon_pass1 +#define vpx_idct16x16_256_add_neon_pass2_tran_low \ + vpx_idct16x16_256_add_neon_pass2 +#endif + +void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); +void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, + int16_t *pass1_output); #if HAVE_NEON_ASM /* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ @@ -28,8 +41,8 @@ extern void vpx_push_neon(int64_t *store); extern void vpx_pop_neon(int64_t *store); #endif // HAVE_NEON_ASM -void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif @@ -44,47 +57,47 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output, + pass1_output, 0, dest, stride); /* Parallel idct on the lower 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(input + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2(input + 8 * 16 + 1, row_idct_output + 8, - pass1_output, 0, dest, dest_stride); + vpx_idct16x16_256_add_neon_pass2_tran_low( + input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride); /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. 
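Editorial note, not part of the libvpx change: the old two-pass wrappers above split each row pass into even and odd coefficients, with pass 2 called on input + 1. A minimal sketch of how the removed pass-2 code selects the odd-index coefficients, based on the vld2q_s16 / .val[0] pattern visible in the deleted code; the helper name is hypothetical.

/* Editorial sketch: with row_plus_one = row + 1, vld2q_s16 de-interleaves
 * adjacent pairs, so .val[0] holds elements 1, 3, 5, ..., 15 of the row. */
static INLINE int16x8_t load_odd_coeffs_sketch(const int16_t *row_plus_one) {
  const int16x8x2_t pair = vld2q_s16(row_plus_one);
  return pair.val[0];
}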
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); + pass1_output, 1, dest, stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); + dest + 8, stride); #if HAVE_NEON_ASM // restore d8-d15 register values. @@ -92,8 +105,8 @@ void vpx_idct16x16_256_add_neon(const int16_t *input, uint8_t *dest, #endif } -void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { #if HAVE_NEON_ASM int64_t store_reg[8]; #endif @@ -108,38 +121,37 @@ void vpx_idct16x16_10_add_neon(const int16_t *input, uint8_t *dest, /* Parallel idct on the upper 8 rows */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output, 8); + vpx_idct16x16_10_add_neon_pass1(input, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7 // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output, 0, - dest, dest_stride); + vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output); /* Skip Parallel idct on the lower 8 rows as they are all 0s */ /* Parallel idct on the left 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, dest_stride); + pass1_output, 1, dest, stride); /* Parallel idct on the right 8 columns */ // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output, 8); + vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines // with result in pass1(pass1_output) to calculate final result in stage 7. // Then add the result to the destination data. 
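Editorial note, not part of the libvpx change: in the column pass the residual is rounded by 2^6 and added to the destination pixels. The new code delegates this to idct16x16_add8x1() (defined in idct_neon.h, not shown in this diff); a minimal sketch of what that step presumably does, mirroring the vrshrq_n_s16 / vaddw_u8 / vqmovun_s16 sequence in the removed code. The function name below is hypothetical.

static INLINE void add8x1_sketch(const int16x8_t res, uint8_t **dest,
                                 const int stride) {
  /* Round the 16-bit residual by 2^6, widen-add one 8-pixel row of dest,
   * saturate back to 8 bits, store, and advance to the next row. */
  const uint8x8_t d = vld1_u8(*dest);
  const int16x8_t r = vrshrq_n_s16(res, 6);
  const uint16x8_t sum = vaddw_u8(vreinterpretq_u16_s16(r), d);
  vst1_u8(*dest, vqmovun_s16(vreinterpretq_s16_u16(sum)));
  *dest += stride;
}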
vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, row_idct_output + 8, pass1_output, 1, - dest + 8, dest_stride); + dest + 8, stride); #if HAVE_NEON_ASM // restore d8-d15 register values. diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c new file mode 100644 index 00000000000..28b94655848 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_135_add_neon.c @@ -0,0 +1,714 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" + +static INLINE void load_8x8_s16(const tran_low_t *input, int16x8_t *const in0, + int16x8_t *const in1, int16x8_t *const in2, + int16x8_t *const in3, int16x8_t *const in4, + int16x8_t *const in5, int16x8_t *const in6, + int16x8_t *const in7) { + *in0 = load_tran_low_to_s16q(input); + input += 32; + *in1 = load_tran_low_to_s16q(input); + input += 32; + *in2 = load_tran_low_to_s16q(input); + input += 32; + *in3 = load_tran_low_to_s16q(input); + input += 32; + *in4 = load_tran_low_to_s16q(input); + input += 32; + *in5 = load_tran_low_to_s16q(input); + input += 32; + *in6 = load_tran_low_to_s16q(input); + input += 32; + *in7 = load_tran_low_to_s16q(input); +} + +static INLINE void load_4x8_s16(const tran_low_t *input, int16x4_t *const in0, + int16x4_t *const in1, int16x4_t *const in2, + int16x4_t *const in3, int16x4_t *const in4, + int16x4_t *const in5, int16x4_t *const in6, + int16x4_t *const in7) { + *in0 = load_tran_low_to_s16d(input); + input += 32; + *in1 = load_tran_low_to_s16d(input); + input += 32; + *in2 = load_tran_low_to_s16d(input); + input += 32; + *in3 = load_tran_low_to_s16d(input); + input += 32; + *in4 = load_tran_low_to_s16d(input); + input += 32; + *in5 = load_tran_low_to_s16d(input); + input += 32; + *in6 = load_tran_low_to_s16d(input); + input += 32; + *in7 = load_tran_low_to_s16d(input); +} + +// Only for the first pass of the _135_ variant. Since it only uses values from +// the top left 16x16 it can safely assume all the remaining values are 0 and +// skip an awful lot of calculations. In fact, only the first 12 columns make +// the cut. None of the elements in the 13th, 14th, 15th or 16th columns are +// used so it skips any calls to input[12|13|14|15] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 12x8 to allow using SIMD. 
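Editorial note, not part of the libvpx change: the new 32x32 code below (like the 16x16 half1d code above) relies on helpers such as multiply_shift_and_narrow_s16 from idct_neon.h, which this diff does not show. A plausible sketch, assuming it follows the vmull_s16 + vqrshrn_n_s32(..., 14) idiom in the removed code, where 14 is DCT_CONST_BITS; the body below is an assumption, not the library's definition.

/* Editorial sketch, assumed implementation: widen to 32 bits, multiply by the
 * cosine constant, then round-shift by DCT_CONST_BITS (14) back to 16 bits. */
static INLINE int16x8_t multiply_shift_and_narrow_sketch(const int16x8_t a,
                                                         const int16_t c) {
  const int32x4_t lo = vmull_n_s16(vget_low_s16(a), c);
  const int32x4_t hi = vmull_n_s16(vget_high_s16(a), c);
  return vcombine_s16(vqrshrn_n_s32(lo, 14), vqrshrn_n_s32(hi, 14));
}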
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +// 0 0 2 5 10 17 25 38 47 62 83 101 121 +// 1 1 4 8 15 22 30 45 58 74 92 112 133 +// 2 3 7 12 18 28 36 52 64 82 102 118 +// 3 6 11 16 23 31 43 60 73 90 109 126 +// 4 9 14 19 29 37 50 65 78 98 116 134 +// 5 13 20 26 35 44 54 72 85 105 123 +// 6 21 27 33 42 53 63 80 94 113 132 +// 7 24 32 39 48 57 71 88 104 120 +// 8 34 40 46 56 68 81 96 111 130 +// 9 41 49 55 67 77 91 107 124 +// 10 51 59 66 76 89 99 119 131 +// 11 61 69 75 87 100 114 129 +// 12 70 79 86 97 108 122 +// 13 84 93 103 110 125 +// 14 98 106 115 127 +// 15 117 128 +static void idct32_12_neon(const tran_low_t *input, int16_t *output) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x4_t tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int16x8_t in8, in9, in10, in11; + int16x8_t s1_16, s1_18, s1_19, s1_20, s1_21, s1_23, s1_24, s1_26, s1_27, + s1_28, s1_29, s1_31; + int16x8_t s2_8, s2_10, s2_11, s2_12, s2_13, s2_15, s2_18, s2_19, s2_20, s2_21, + s2_26, s2_27, s2_28, s2_29; + int16x8_t s3_4, s3_7, s3_10, s3_11, s3_12, s3_13, s3_17, s3_18, s3_21, s3_22, + s3_25, s3_26, s3_29, s3_30; + int16x8_t s4_0, s4_2, s4_3, s4_9, s4_10, s4_13, s4_14, s4_16, s4_17, s4_18, + s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, s4_26, s4_27, s4_28, + s4_29, s4_30, s4_31; + int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, + s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, + s5_29; + int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, + s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, + s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; + int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, + s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, + s7_25, s7_26, s7_27; + + load_8x8_s16(input, &in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); + + load_4x8_s16(input + 8, &tmp0, &tmp1, &tmp2, &tmp3, &tmp4, &tmp5, &tmp6, + &tmp7); + transpose_s16_4x8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, &in8, &in9, + &in10, &in11); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); + s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); + s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); + s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + s2_18 = vsubq_s16(s1_19, s1_18); + s2_19 = vaddq_s16(s1_18, s1_19); + s2_20 = vaddq_s16(s1_20, s1_21); + s2_21 = vsubq_s16(s1_20, s1_21); + s2_26 = vsubq_s16(s1_27, s1_26); + s2_27 
= vaddq_s16(s1_26, s1_27); + s2_28 = vaddq_s16(s1_28, s1_29); + s2_29 = vsubq_s16(s1_28, s1_29); + + // stage 3 + s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s3_10 = vsubq_s16(s2_11, s2_10); + s3_11 = vaddq_s16(s2_10, s2_11); + s3_12 = vaddq_s16(s2_12, s2_13); + s3_13 = vsubq_s16(s2_12, s2_13); + + s3_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s3_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, + -cospi_4_64); + s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, + cospi_28_64); + + s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, + cospi_12_64); + s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, + cospi_20_64); + + s3_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s3_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); + s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + + s4_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s4_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, + -cospi_8_64); + s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + cospi_24_64); + + s4_16 = vaddq_s16(s1_16, s2_19); + s4_17 = vaddq_s16(s3_17, s3_18); + s4_18 = vsubq_s16(s3_17, s3_18); + s4_19 = vsubq_s16(s1_16, s2_19); + s4_20 = vsubq_s16(s1_23, s2_20); + s4_21 = vsubq_s16(s3_22, s3_21); + s4_22 = vaddq_s16(s3_21, s3_22); + s4_23 = vaddq_s16(s2_20, s1_23); + s4_24 = vaddq_s16(s1_24, s2_27); + s4_25 = vaddq_s16(s3_25, s3_26); + s4_26 = vsubq_s16(s3_25, s3_26); + s4_27 = vsubq_s16(s1_24, s2_27); + s4_28 = vsubq_s16(s1_31, s2_28); + s4_29 = vsubq_s16(s3_30, s3_29); + s4_30 = vaddq_s16(s3_29, s3_30); + s4_31 = vaddq_s16(s2_28, s1_31); + + // stage 5 + s5_0 = vaddq_s16(s4_0, s4_3); + s5_1 = vaddq_s16(s4_0, s4_2); + s5_2 = vsubq_s16(s4_0, s4_2); + s5_3 = vsubq_s16(s4_0, s4_3); + + s5_5 = sub_multiply_shift_and_narrow_s16(s3_7, s3_4, cospi_16_64); + s5_6 = add_multiply_shift_and_narrow_s16(s3_4, s3_7, cospi_16_64); + + s5_8 = vaddq_s16(s2_8, s3_11); + s5_9 = vaddq_s16(s4_9, s4_10); + s5_10 = vsubq_s16(s4_9, s4_10); + s5_11 = vsubq_s16(s2_8, s3_11); + s5_12 = vsubq_s16(s2_15, s3_12); + s5_13 = vsubq_s16(s4_14, s4_13); + s5_14 = vaddq_s16(s4_13, s4_14); + s5_15 = vaddq_s16(s2_15, s3_12); + + s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, + cospi_24_64); + s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, + cospi_8_64); + + s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, + cospi_24_64); + s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, + cospi_8_64); + + s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, + -cospi_8_64); + s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, + cospi_24_64); + + s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, + -cospi_8_64); + s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, 
-cospi_8_64, s4_26, + cospi_24_64); + + // stage 6 + s6_0 = vaddq_s16(s5_0, s3_7); + s6_1 = vaddq_s16(s5_1, s5_6); + s6_2 = vaddq_s16(s5_2, s5_5); + s6_3 = vaddq_s16(s5_3, s3_4); + s6_4 = vsubq_s16(s5_3, s3_4); + s6_5 = vsubq_s16(s5_2, s5_5); + s6_6 = vsubq_s16(s5_1, s5_6); + s6_7 = vsubq_s16(s5_0, s3_7); + + s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); + s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + + s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); + s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + + s6_16 = vaddq_s16(s4_16, s4_23); + s6_17 = vaddq_s16(s4_17, s4_22); + s6_18 = vaddq_s16(s5_18, s5_21); + s6_19 = vaddq_s16(s5_19, s5_20); + s6_20 = vsubq_s16(s5_19, s5_20); + s6_21 = vsubq_s16(s5_18, s5_21); + s6_22 = vsubq_s16(s4_17, s4_22); + s6_23 = vsubq_s16(s4_16, s4_23); + + s6_24 = vsubq_s16(s4_31, s4_24); + s6_25 = vsubq_s16(s4_30, s4_25); + s6_26 = vsubq_s16(s5_29, s5_26); + s6_27 = vsubq_s16(s5_28, s5_27); + s6_28 = vaddq_s16(s5_27, s5_28); + s6_29 = vaddq_s16(s5_26, s5_29); + s6_30 = vaddq_s16(s4_25, s4_30); + s6_31 = vaddq_s16(s4_24, s4_31); + + // stage 7 + s7_0 = vaddq_s16(s6_0, s5_15); + s7_1 = vaddq_s16(s6_1, s5_14); + s7_2 = vaddq_s16(s6_2, s6_13); + s7_3 = vaddq_s16(s6_3, s6_12); + s7_4 = vaddq_s16(s6_4, s6_11); + s7_5 = vaddq_s16(s6_5, s6_10); + s7_6 = vaddq_s16(s6_6, s5_9); + s7_7 = vaddq_s16(s6_7, s5_8); + s7_8 = vsubq_s16(s6_7, s5_8); + s7_9 = vsubq_s16(s6_6, s5_9); + s7_10 = vsubq_s16(s6_5, s6_10); + s7_11 = vsubq_s16(s6_4, s6_11); + s7_12 = vsubq_s16(s6_3, s6_12); + s7_13 = vsubq_s16(s6_2, s6_13); + s7_14 = vsubq_s16(s6_1, s5_14); + s7_15 = vsubq_s16(s6_0, s5_15); + + s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); + s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + + s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); + s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + + s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); + s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + + s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); + s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s7_0, s6_31)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_1, s6_30)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_2, s6_29)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_3, s6_28)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_4, s7_27)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_5, s7_26)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_6, s7_25)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_7, s7_24)); + output += 16; + + vst1q_s16(output, vaddq_s16(s7_8, s7_23)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_9, s7_22)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_10, s7_21)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_11, s7_20)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_12, s6_19)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_13, s6_18)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_14, s6_17)); + output += 16; + vst1q_s16(output, vaddq_s16(s7_15, s6_16)); + output += 16; + + vst1q_s16(output, vsubq_s16(s7_15, s6_16)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_14, s6_17)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_13, s6_18)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_12, s6_19)); + output += 16; + 
vst1q_s16(output, vsubq_s16(s7_11, s7_20)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_10, s7_21)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_9, s7_22)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_8, s7_23)); + output += 16; + + vst1q_s16(output, vsubq_s16(s7_7, s7_24)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_6, s7_25)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_5, s7_26)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_4, s7_27)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_3, s6_28)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_2, s6_29)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_1, s6_30)); + output += 16; + vst1q_s16(output, vsubq_s16(s7_0, s6_31)); +} + +static void idct32_16_neon(const int16_t *input, uint8_t *output, int stride) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, + in13, in14, in15; + int16x8_t s1_16, s1_17, s1_18, s1_19, s1_20, s1_21, s1_22, s1_23, s1_24, + s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, s1_31; + int16x8_t s2_8, s2_9, s2_10, s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, + s2_18, s2_19, s2_20, s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, + s2_28, s2_29, s2_30, s2_31; + int16x8_t s3_4, s3_5, s3_6, s3_7, s3_8, s3_9, s3_10, s3_11, s3_12, s3_13, + s3_14, s3_15, s3_17, s3_18, s3_21, s3_22, s3_25, s3_26, s3_29, s3_30; + int16x8_t s4_0, s4_2, s4_3, s4_4, s4_5, s4_6, s4_7, s4_9, s4_10, s4_13, s4_14, + s4_16, s4_17, s4_18, s4_19, s4_20, s4_21, s4_22, s4_23, s4_24, s4_25, + s4_26, s4_27, s4_28, s4_29, s4_30, s4_31; + int16x8_t s5_0, s5_1, s5_2, s5_3, s5_5, s5_6, s5_8, s5_9, s5_10, s5_11, s5_12, + s5_13, s5_14, s5_15, s5_18, s5_19, s5_20, s5_21, s5_26, s5_27, s5_28, + s5_29; + int16x8_t s6_0, s6_1, s6_2, s6_3, s6_4, s6_5, s6_6, s6_7, s6_10, s6_11, s6_12, + s6_13, s6_16, s6_17, s6_18, s6_19, s6_20, s6_21, s6_22, s6_23, s6_24, + s6_25, s6_26, s6_27, s6_28, s6_29, s6_30, s6_31; + int16x8_t s7_0, s7_1, s7_2, s7_3, s7_4, s7_5, s7_6, s7_7, s7_8, s7_9, s7_10, + s7_11, s7_12, s7_13, s7_14, s7_15, s7_20, s7_21, s7_22, s7_23, s7_24, + s7_25, s7_26, s7_27; + int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; + + load_and_transpose_s16_8x8(input, 16, &in0, &in1, &in2, &in3, &in4, &in5, + &in6, &in7); + + load_and_transpose_s16_8x8(input + 8, 16, &in8, &in9, &in10, &in11, &in12, + &in13, &in14, &in15); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_17 = multiply_shift_and_narrow_s16(in15, -cospi_17_64); + s1_30 = multiply_shift_and_narrow_s16(in15, cospi_15_64); + + s1_18 = multiply_shift_and_narrow_s16(in9, cospi_23_64); + s1_29 = multiply_shift_and_narrow_s16(in9, cospi_9_64); + + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_21 = multiply_shift_and_narrow_s16(in11, -cospi_21_64); + s1_26 = multiply_shift_and_narrow_s16(in11, cospi_11_64); + + s1_22 = multiply_shift_and_narrow_s16(in13, cospi_19_64); + s1_25 = multiply_shift_and_narrow_s16(in13, cospi_13_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_9 = multiply_shift_and_narrow_s16(in14, -cospi_18_64); + s2_14 = 
multiply_shift_and_narrow_s16(in14, cospi_14_64); + + s2_10 = multiply_shift_and_narrow_s16(in10, cospi_22_64); + s2_13 = multiply_shift_and_narrow_s16(in10, cospi_10_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + s2_16 = vaddq_s16(s1_16, s1_17); + s2_17 = vsubq_s16(s1_16, s1_17); + s2_18 = vsubq_s16(s1_19, s1_18); + s2_19 = vaddq_s16(s1_18, s1_19); + s2_20 = vaddq_s16(s1_20, s1_21); + s2_21 = vsubq_s16(s1_20, s1_21); + s2_22 = vsubq_s16(s1_23, s1_22); + s2_23 = vaddq_s16(s1_22, s1_23); + s2_24 = vaddq_s16(s1_24, s1_25); + s2_25 = vsubq_s16(s1_24, s1_25); + s2_26 = vsubq_s16(s1_27, s1_26); + s2_27 = vaddq_s16(s1_26, s1_27); + s2_28 = vaddq_s16(s1_28, s1_29); + s2_29 = vsubq_s16(s1_28, s1_29); + s2_30 = vsubq_s16(s1_31, s1_30); + s2_31 = vaddq_s16(s1_30, s1_31); + + // stage 3 + s3_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s3_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s3_5 = multiply_shift_and_narrow_s16(in12, -cospi_20_64); + s3_6 = multiply_shift_and_narrow_s16(in12, cospi_12_64); + + s3_8 = vaddq_s16(s2_8, s2_9); + s3_9 = vsubq_s16(s2_8, s2_9); + s3_10 = vsubq_s16(s2_11, s2_10); + s3_11 = vaddq_s16(s2_10, s2_11); + s3_12 = vaddq_s16(s2_12, s2_13); + s3_13 = vsubq_s16(s2_12, s2_13); + s3_14 = vsubq_s16(s2_15, s2_14); + s3_15 = vaddq_s16(s2_14, s2_15); + + s3_17 = multiply_accumulate_shift_and_narrow_s16(s2_17, -cospi_4_64, s2_30, + cospi_28_64); + s3_30 = multiply_accumulate_shift_and_narrow_s16(s2_17, cospi_28_64, s2_30, + cospi_4_64); + + s3_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_28_64, s2_29, + -cospi_4_64); + s3_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_4_64, s2_29, + cospi_28_64); + + s3_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_20_64, s2_26, + cospi_12_64); + s3_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, cospi_12_64, s2_26, + cospi_20_64); + + s3_22 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_12_64, s2_25, + -cospi_20_64); + s3_25 = multiply_accumulate_shift_and_narrow_s16(s2_22, -cospi_20_64, s2_25, + cospi_12_64); + + // stage 4 + s4_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + s4_2 = multiply_shift_and_narrow_s16(in8, cospi_24_64); + s4_3 = multiply_shift_and_narrow_s16(in8, cospi_8_64); + + s4_4 = vaddq_s16(s3_4, s3_5); + s4_5 = vsubq_s16(s3_4, s3_5); + s4_6 = vsubq_s16(s3_7, s3_6); + s4_7 = vaddq_s16(s3_6, s3_7); + + s4_9 = multiply_accumulate_shift_and_narrow_s16(s3_9, -cospi_8_64, s3_14, + cospi_24_64); + s4_14 = multiply_accumulate_shift_and_narrow_s16(s3_9, cospi_24_64, s3_14, + cospi_8_64); + + s4_10 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_24_64, s3_13, + -cospi_8_64); + s4_13 = multiply_accumulate_shift_and_narrow_s16(s3_10, -cospi_8_64, s3_13, + cospi_24_64); + + s4_16 = vaddq_s16(s2_16, s2_19); + s4_17 = vaddq_s16(s3_17, s3_18); + s4_18 = vsubq_s16(s3_17, s3_18); + s4_19 = vsubq_s16(s2_16, s2_19); + s4_20 = vsubq_s16(s2_23, s2_20); + s4_21 = vsubq_s16(s3_22, s3_21); + s4_22 = vaddq_s16(s3_21, s3_22); + s4_23 = vaddq_s16(s2_20, s2_23); + s4_24 = vaddq_s16(s2_24, s2_27); + s4_25 = vaddq_s16(s3_25, s3_26); + s4_26 = vsubq_s16(s3_25, s3_26); + s4_27 = vsubq_s16(s2_24, s2_27); + s4_28 = vsubq_s16(s2_31, s2_28); + s4_29 = vsubq_s16(s3_30, s3_29); + s4_30 = vaddq_s16(s3_29, s3_30); + s4_31 = vaddq_s16(s2_28, s2_31); + + // stage 5 + s5_0 = vaddq_s16(s4_0, s4_3); + s5_1 = vaddq_s16(s4_0, s4_2); + s5_2 = vsubq_s16(s4_0, s4_2); + s5_3 = vsubq_s16(s4_0, s4_3); + + s5_5 
= sub_multiply_shift_and_narrow_s16(s4_6, s4_5, cospi_16_64); + s5_6 = add_multiply_shift_and_narrow_s16(s4_5, s4_6, cospi_16_64); + + s5_8 = vaddq_s16(s3_8, s3_11); + s5_9 = vaddq_s16(s4_9, s4_10); + s5_10 = vsubq_s16(s4_9, s4_10); + s5_11 = vsubq_s16(s3_8, s3_11); + s5_12 = vsubq_s16(s3_15, s3_12); + s5_13 = vsubq_s16(s4_14, s4_13); + s5_14 = vaddq_s16(s4_13, s4_14); + s5_15 = vaddq_s16(s3_15, s3_12); + + s5_18 = multiply_accumulate_shift_and_narrow_s16(s4_18, -cospi_8_64, s4_29, + cospi_24_64); + s5_29 = multiply_accumulate_shift_and_narrow_s16(s4_18, cospi_24_64, s4_29, + cospi_8_64); + + s5_19 = multiply_accumulate_shift_and_narrow_s16(s4_19, -cospi_8_64, s4_28, + cospi_24_64); + s5_28 = multiply_accumulate_shift_and_narrow_s16(s4_19, cospi_24_64, s4_28, + cospi_8_64); + + s5_20 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_24_64, s4_27, + -cospi_8_64); + s5_27 = multiply_accumulate_shift_and_narrow_s16(s4_20, -cospi_8_64, s4_27, + cospi_24_64); + + s5_21 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_24_64, s4_26, + -cospi_8_64); + s5_26 = multiply_accumulate_shift_and_narrow_s16(s4_21, -cospi_8_64, s4_26, + cospi_24_64); + + // stage 6 + s6_0 = vaddq_s16(s5_0, s4_7); + s6_1 = vaddq_s16(s5_1, s5_6); + s6_2 = vaddq_s16(s5_2, s5_5); + s6_3 = vaddq_s16(s5_3, s4_4); + s6_4 = vsubq_s16(s5_3, s4_4); + s6_5 = vsubq_s16(s5_2, s5_5); + s6_6 = vsubq_s16(s5_1, s5_6); + s6_7 = vsubq_s16(s5_0, s4_7); + + s6_10 = sub_multiply_shift_and_narrow_s16(s5_13, s5_10, cospi_16_64); + s6_13 = add_multiply_shift_and_narrow_s16(s5_10, s5_13, cospi_16_64); + + s6_11 = sub_multiply_shift_and_narrow_s16(s5_12, s5_11, cospi_16_64); + s6_12 = add_multiply_shift_and_narrow_s16(s5_11, s5_12, cospi_16_64); + + s6_16 = vaddq_s16(s4_16, s4_23); + s6_17 = vaddq_s16(s4_17, s4_22); + s6_18 = vaddq_s16(s5_18, s5_21); + s6_19 = vaddq_s16(s5_19, s5_20); + s6_20 = vsubq_s16(s5_19, s5_20); + s6_21 = vsubq_s16(s5_18, s5_21); + s6_22 = vsubq_s16(s4_17, s4_22); + s6_23 = vsubq_s16(s4_16, s4_23); + s6_24 = vsubq_s16(s4_31, s4_24); + s6_25 = vsubq_s16(s4_30, s4_25); + s6_26 = vsubq_s16(s5_29, s5_26); + s6_27 = vsubq_s16(s5_28, s5_27); + s6_28 = vaddq_s16(s5_27, s5_28); + s6_29 = vaddq_s16(s5_26, s5_29); + s6_30 = vaddq_s16(s4_25, s4_30); + s6_31 = vaddq_s16(s4_24, s4_31); + + // stage 7 + s7_0 = vaddq_s16(s6_0, s5_15); + s7_1 = vaddq_s16(s6_1, s5_14); + s7_2 = vaddq_s16(s6_2, s6_13); + s7_3 = vaddq_s16(s6_3, s6_12); + s7_4 = vaddq_s16(s6_4, s6_11); + s7_5 = vaddq_s16(s6_5, s6_10); + s7_6 = vaddq_s16(s6_6, s5_9); + s7_7 = vaddq_s16(s6_7, s5_8); + s7_8 = vsubq_s16(s6_7, s5_8); + s7_9 = vsubq_s16(s6_6, s5_9); + s7_10 = vsubq_s16(s6_5, s6_10); + s7_11 = vsubq_s16(s6_4, s6_11); + s7_12 = vsubq_s16(s6_3, s6_12); + s7_13 = vsubq_s16(s6_2, s6_13); + s7_14 = vsubq_s16(s6_1, s5_14); + s7_15 = vsubq_s16(s6_0, s5_15); + + s7_20 = sub_multiply_shift_and_narrow_s16(s6_27, s6_20, cospi_16_64); + s7_27 = add_multiply_shift_and_narrow_s16(s6_20, s6_27, cospi_16_64); + + s7_21 = sub_multiply_shift_and_narrow_s16(s6_26, s6_21, cospi_16_64); + s7_26 = add_multiply_shift_and_narrow_s16(s6_21, s6_26, cospi_16_64); + + s7_22 = sub_multiply_shift_and_narrow_s16(s6_25, s6_22, cospi_16_64); + s7_25 = add_multiply_shift_and_narrow_s16(s6_22, s6_25, cospi_16_64); + + s7_23 = sub_multiply_shift_and_narrow_s16(s6_24, s6_23, cospi_16_64); + s7_24 = add_multiply_shift_and_narrow_s16(s6_23, s6_24, cospi_16_64); + + // final stage + out0 = vaddq_s16(s7_0, s6_31); + out1 = vaddq_s16(s7_1, s6_30); + out2 = vaddq_s16(s7_2, s6_29); + out3 = 
vaddq_s16(s7_3, s6_28); + out4 = vaddq_s16(s7_4, s7_27); + out5 = vaddq_s16(s7_5, s7_26); + out6 = vaddq_s16(s7_6, s7_25); + out7 = vaddq_s16(s7_7, s7_24); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, + stride); + + out0 = vaddq_s16(s7_8, s7_23); + out1 = vaddq_s16(s7_9, s7_22); + out2 = vaddq_s16(s7_10, s7_21); + out3 = vaddq_s16(s7_11, s7_20); + out4 = vaddq_s16(s7_12, s6_19); + out5 = vaddq_s16(s7_13, s6_18); + out6 = vaddq_s16(s7_14, s6_17); + out7 = vaddq_s16(s7_15, s6_16); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (8 * stride), stride); + + out0 = vsubq_s16(s7_15, s6_16); + out1 = vsubq_s16(s7_14, s6_17); + out2 = vsubq_s16(s7_13, s6_18); + out3 = vsubq_s16(s7_12, s6_19); + out4 = vsubq_s16(s7_11, s7_20); + out5 = vsubq_s16(s7_10, s7_21); + out6 = vsubq_s16(s7_9, s7_22); + out7 = vsubq_s16(s7_8, s7_23); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (16 * stride), stride); + + out0 = vsubq_s16(s7_7, s7_24); + out1 = vsubq_s16(s7_6, s7_25); + out2 = vsubq_s16(s7_5, s7_26); + out3 = vsubq_s16(s7_4, s7_27); + out4 = vsubq_s16(s7_3, s6_28); + out5 = vsubq_s16(s7_2, s6_29); + out6 = vsubq_s16(s7_1, s6_30); + out7 = vsubq_s16(s7_0, s6_31); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (24 * stride), stride); +} + +void vpx_idct32x32_135_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 16]; + int16_t *t = temp; + + idct32_12_neon(input, temp); + idct32_12_neon(input + 32 * 8, temp + 8); + + for (i = 0; i < 32; i += 8) { + idct32_16_neon(t, dest, stride); + t += (16 * 8); + dest += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 6be4b01229b..604d82abd18 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -10,127 +10,48 @@ #include <arm_neon.h> -#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, 
qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); -} - -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); -} - -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = dest_stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + int stride) { + int i; + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, dest_stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - d += dest_stride8; - } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c index ebec9df54ad..b56deeea6de 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -13,6 +13,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" // Only for the first pass of the _34_ variant. Since it only uses values from @@ -34,7 +35,7 @@ // 5 13 20 26 // 6 21 27 33 // 7 24 32 -static void idct32_6_neon(const int16_t *input, int16_t *output) { +static void idct32_6_neon(const tran_low_t *input, int16_t *output) { int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, @@ -46,8 +47,22 @@ static void idct32_6_neon(const int16_t *input, int16_t *output) { s2_31; int16x8_t s3_24, s3_25, s3_26, s3_27; - load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, - &in6, &in7); + in0 = load_tran_low_to_s16q(input); + input += 32; + in1 = load_tran_low_to_s16q(input); + input += 32; + in2 = load_tran_low_to_s16q(input); + input += 32; + in3 = load_tran_low_to_s16q(input); + input += 32; + in4 = load_tran_low_to_s16q(input); + input += 32; + in5 = load_tran_low_to_s16q(input); + input += 32; + in6 = load_tran_low_to_s16q(input); + input += 32; + in7 = load_tran_low_to_s16q(input); + transpose_s16_8x8(&in0, &in1, &in2, &in3, &in4, &in5, &in6, &in7); // stage 1 // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) @@ -503,7 +518,7 @@ static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { output + (24 * stride), stride); } -void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest, +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int i; int16_t temp[32 * 8]; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index 4eff9b970d9..de1bf978750 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -146,55 +147,101 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, q11s32 = vaddq_s32(q12s32, q11s32); q10s32 = vaddq_s32(q10s32, q15s32); - *qAs16 = vcombine_s16(vqrshrn_n_s32(q8s32, 14), vqrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vqrshrn_n_s32(q11s32, 14), 
vqrshrn_n_s32(q10s32, 14)); + *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14)); + *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14)); +} + +static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1, + int16x8_t *s2, int16x8_t *s3, int16x8_t *s4, + int16x8_t *s5, int16x8_t *s6, int16x8_t *s7) { + *s0 = vld1q_s16(in); + in += 32; + *s1 = vld1q_s16(in); + in += 32; + *s2 = vld1q_s16(in); + in += 32; + *s3 = vld1q_s16(in); + in += 32; + *s4 = vld1q_s16(in); + in += 32; + *s5 = vld1q_s16(in); + in += 32; + *s6 = vld1q_s16(in); + in += 32; + *s7 = vld1q_s16(in); +} + +static INLINE void transpose_and_store_s16_8x8(int16x8_t a0, int16x8_t a1, + int16x8_t a2, int16x8_t a3, + int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, + int16_t **out) { + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1q_s16(*out, a0); + *out += 8; + vst1q_s16(*out, a1); + *out += 8; + vst1q_s16(*out, a2); + *out += 8; + vst1q_s16(*out, a3); + *out += 8; + vst1q_s16(*out, a4); + *out += 8; + vst1q_s16(*out, a5); + *out += 8; + vst1q_s16(*out, a6); + *out += 8; + vst1q_s16(*out, a7); + *out += 8; } static INLINE void idct32_transpose_pair(const int16_t *input, int16_t *t_buf) { - const int16_t *in; int i; - const int stride = 32; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + + for (i = 0; i < 4; i++, input += 8) { + load_s16x8q(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); + } +} + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void load_s16x8q_tran_low(const tran_low_t *in, int16x8_t *s0, + int16x8_t *s1, int16x8_t *s2, + int16x8_t *s3, int16x8_t *s4, + int16x8_t *s5, int16x8_t *s6, + int16x8_t *s7) { + *s0 = load_tran_low_to_s16q(in); + in += 32; + *s1 = load_tran_low_to_s16q(in); + in += 32; + *s2 = load_tran_low_to_s16q(in); + in += 32; + *s3 = load_tran_low_to_s16q(in); + in += 32; + *s4 = load_tran_low_to_s16q(in); + in += 32; + *s5 = load_tran_low_to_s16q(in); + in += 32; + *s6 = load_tran_low_to_s16q(in); + in += 32; + *s7 = load_tran_low_to_s16q(in); +} + +static INLINE void idct32_transpose_pair_tran_low(const tran_low_t *input, + int16_t *t_buf) { + int i; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; for (i = 0; i < 4; i++, input += 8) { - in = input; - q8s16 = vld1q_s16(in); - in += stride; - q9s16 = vld1q_s16(in); - in += stride; - q10s16 = vld1q_s16(in); - in += stride; - q11s16 = vld1q_s16(in); - in += stride; - q12s16 = vld1q_s16(in); - in += stride; - q13s16 = vld1q_s16(in); - in += stride; - q14s16 = vld1q_s16(in); - in += stride; - q15s16 = vld1q_s16(in); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, - &q14s16, &q15s16); - - vst1q_s16(t_buf, q8s16); - t_buf += 8; - vst1q_s16(t_buf, q9s16); - t_buf += 8; - vst1q_s16(t_buf, q10s16); - t_buf += 8; - vst1q_s16(t_buf, q11s16); - t_buf += 8; - vst1q_s16(t_buf, q12s16); - t_buf += 8; - vst1q_s16(t_buf, q13s16); - t_buf += 8; - vst1q_s16(t_buf, q14s16); - t_buf += 8; - vst1q_s16(t_buf, q15s16); - t_buf += 8; + load_s16x8q_tran_low(input, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + transpose_and_store_s16_8x8(s0, s1, s2, s3, s4, s5, s6, s7, &t_buf); } } +#else // !CONFIG_VP9_HIGHBITDEPTH +#define idct32_transpose_pair_tran_low idct32_transpose_pair +#endif // CONFIG_VP9_HIGHBITDEPTH static INLINE void idct32_bands_end_1st_pass(int16_t *out, int16x8_t q2s16, int16x8_t q3s16, int16x8_t q6s16, @@ -383,16 
+430,21 @@ void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int16_t trans_buf[32 * 8]; int16_t pass1[32 * 32]; int16_t pass2[32 * 32]; + const int16_t *input_pass2 = pass1; // input of pass2 is the result of pass1 int16_t *out; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; for (idct32_pass_loop = 0, out = pass1; idct32_pass_loop < 2; - idct32_pass_loop++, - input = pass1, // the input of pass2 is the result of pass1 - out = pass2) { - for (i = 0; i < 4; i++, input += 32 * 8, out += 8) { // idct32_bands_loop - idct32_transpose_pair(input, trans_buf); + idct32_pass_loop++, out = pass2) { + for (i = 0; i < 4; i++, out += 8) { // idct32_bands_loop + if (idct32_pass_loop == 0) { + idct32_transpose_pair_tran_low(input, trans_buf); + input += 32 * 8; + } else { + idct32_transpose_pair(input_pass2, trans_buf); + input_pass2 += 32 * 8; + } // ----------------------------------------- // BLOCK A: 16-19,28-31 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm index cbfab361af8..d83421e9e66 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index 525aac05a84..d1eae24a222 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -9,39 +9,37 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d6u8; - uint32x2_t d2u32 = vdup_n_u32(0); - uint16x8_t q8u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 4); - - q0s16 = vdupq_n_s16(a1); +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} - // dc_only_idct_add - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); - d1 += dest_stride; - d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); - d1 += dest_stride; +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = 
WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); + const int16x8_t dc = vdupq_n_s16(a1); + uint32x2_t d = vdup_n_u32(0); - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); - d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); - d2 += dest_stride; - vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); - d2 += dest_stride; - } + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index bd4e86ded25..184d218941c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -18,11 +18,11 @@ INCLUDE vpx_dsp/arm/idct_neon.asm.S AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct4x4_16_add_neon| PROC @@ -72,16 +72,15 @@ ; do the transform on transposed rows ; stage 1 - vadd.s16 d23, d16, d18 ; (input[0] + input[2]) - vsub.s16 d24, d16, d18 ; (input[0] - input[2]) - vmull.s16 q15, d17, d22 ; input[1] * cospi_24_64 vmull.s16 q1, d17, d20 ; input[1] * cospi_8_64 ; (input[0] + input[2]) * cospi_16_64; ; (input[0] - input[2]) * cospi_16_64; - vmull.s16 q13, d23, d21 - vmull.s16 q14, d24, d21 + vmull.s16 q8, d16, d21 + vmull.s16 q14, d18, d21 + vadd.s32 q13, q8, q14 + vsub.s32 q14, q8, q14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64; ; input[1] * cospi_8_64 + input[3] * cospi_24_64; @@ -89,10 +88,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -140,10 +139,10 @@ vmlal.s16 q1, d19, d22 ; dct_const_round_shift - vqrshrn.s32 d26, q13, #14 - vqrshrn.s32 d27, q14, #14 - vqrshrn.s32 d29, q15, #14 - vqrshrn.s32 d28, q1, #14 + vrshrn.s32 d26, q13, #14 + vrshrn.s32 d27, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q1, #14 ; stage 2 ; output[0] = step[0] + step[3]; @@ -168,7 +167,7 @@ vld1.32 {d27[1]}, [r1], r2 vld1.32 {d27[0]}, [r1] ; no post-increment - ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 4) + dest[j * stride + i] vaddw.u8 q8, q8, d26 vaddw.u8 q9, q9, d27 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 8f669c90765..bff98cbc169 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -9,139 +9,56 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t 
*input, uint8_t *dest, - int dest_stride) { - uint8x8_t d26u8, d27u8; - uint32x2_t d26u32, d27u32; - uint16x8_t q8u16, q9u16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; - int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; - int16x8_t q8s16, q9s16, q13s16, q14s16; - int32x4_t q1s32, q13s32, q14s32, q15s32; - int16x4x2_t d0x2s16, d1x2s16; - int32x4x2_t q0x2s32; - uint8_t *d; - - d26u32 = d27u32 = vdup_n_u32(0); - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - d20s16 = vdup_n_s16((int16_t)cospi_8_64); - d21s16 = vdup_n_s16((int16_t)cospi_16_64); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - d22s16 = vdup_n_s16((int16_t)cospi_24_64); - - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_high_s16(q9s16); // vswp d18 d19 - d19s16 = vget_low_s16(q9s16); - - d0x2s16 = vtrn_s16(d16s16, d17s16); - d1x2s16 = vtrn_s16(d18s16, d19s16); - q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); - q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); - d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); - d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - - // do the transform on columns - // stage 1 - d23s16 = vadd_s16(d16s16, d18s16); - d24s16 = vsub_s16(d16s16, d18s16); - - q15s32 = vmull_s16(d17s16, d22s16); - q1s32 = vmull_s16(d17s16, d20s16); - q13s32 = vmull_s16(d23s16, d21s16); - q14s32 = vmull_s16(d24s16, d21s16); - - q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); - q1s32 = vmlal_s16(q1s32, d19s16, d22s16); - - d26s16 = vqrshrn_n_s32(q13s32, 14); - d27s16 = vqrshrn_n_s32(q14s32, 14); - d29s16 = vqrshrn_n_s32(q15s32, 14); - d28s16 = vqrshrn_n_s32(q1s32, 14); - q13s16 = vcombine_s16(d26s16, d27s16); - q14s16 = vcombine_s16(d28s16, d29s16); - - // stage 2 - q8s16 = vaddq_s16(q13s16, q14s16); - q9s16 = vsubq_s16(q13s16, q14s16); - - q8s16 = vrshrq_n_s16(q8s16, 4); - q9s16 = vrshrq_n_s16(q9s16, 4); - - d = dest; - d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); - d += dest_stride; - d26u32 = 
vld1_lane_u32((const uint32_t *)d, d26u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); - d += dest_stride; - d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); - - d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - - d = dest; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); - d += dest_stride; - vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); + int stride) { + const uint8_t *dst = dest; + const int16x4_t cospis = vld1_s16(kCospi); + uint32x2_t dest01_u32 = vdup_n_u32(0); + uint32x2_t dest32_u32 = vdup_n_u32(0); + int16x8_t a0, a1; + uint8x8_t d01, d32; + uint16x8_t d01_u16, d32_u16; + + assert(!((intptr_t)dest % sizeof(uint32_t))); + assert(!(stride % sizeof(uint32_t))); + + // Rows + a0 = load_tran_low_to_s16q(input); + a1 = load_tran_low_to_s16q(input + 8); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + + // Columns + a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); + idct4x4_16_kernel_bd8(cospis, &a0, &a1); + a0 = vrshrq_n_s16(a0, 4); + a1 = vrshrq_n_s16(a1, 4); + + dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 0); + dst += stride; + dest01_u32 = vld1_lane_u32((const uint32_t *)dst, dest01_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 1); + dst += stride; + dest32_u32 = vld1_lane_u32((const uint32_t *)dst, dest32_u32, 0); + + d01_u16 = + vaddw_u8(vreinterpretq_u16_s16(a0), vreinterpret_u8_u32(dest01_u32)); + d32_u16 = + vaddw_u8(vreinterpretq_u16_s16(a1), vreinterpret_u8_u32(dest32_u32)); + d01 = vqmovun_s16(vreinterpretq_s16_u16(d01_u16)); + d32 = vqmovun_s16(vreinterpretq_s16_u16(d32_u16)); + + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 0); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d01), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 1); + dest += stride; + vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d32), 0); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm index e4531c6e97f..29f678a0382 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm @@ -15,12 +15,11 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) +;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_1_add_neon| PROC ldrsh r0, [r0] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index eee41e6c6b1..7bcce913bdb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -12,51 +12,53 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void 
vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 5); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} - d1 = d2 = dest; - for (i = 0; i < 2; i++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; - d5u64 = vld1_u64((const uint64_t *)d1); - d1 += dest_stride; +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); +static INLINE void idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); - d2 += dest_stride; + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm index a5c9c927d67..2bfbcc5a52c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ 
b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm @@ -47,12 +47,12 @@ vmlsl.s16 q6, d23, d3 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d8, q2, #14 ; >> 14 - vqrshrn.s32 d9, q3, #14 ; >> 14 + vrshrn.s32 d8, q2, #14 ; >> 14 + vrshrn.s32 d9, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q5, #14 ; >> 14 - vqrshrn.s32 d11, q6, #14 ; >> 14 + vrshrn.s32 d10, q5, #14 ; >> 14 + vrshrn.s32 d11, q6, #14 ; >> 14 ; input[1] * cospi_4_64 vmull.s16 q2, d18, d1 @@ -71,15 +71,15 @@ vmlal.s16 q13, d23, d2 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d14, q2, #14 ; >> 14 - vqrshrn.s32 d15, q3, #14 ; >> 14 + vrshrn.s32 d14, q2, #14 ; >> 14 + vrshrn.s32 d15, q3, #14 ; >> 14 ; stage 2 & stage 3 - even half vdup.16 d0, r7 ; duplicate cospi_16_64 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q9, #14 ; >> 14 - vqrshrn.s32 d13, q13, #14 ; >> 14 + vrshrn.s32 d12, q9, #14 ; >> 14 + vrshrn.s32 d13, q13, #14 ; >> 14 ; input[0] * cospi_16_64 vmull.s16 q2, d16, d0 @@ -101,12 +101,12 @@ vdup.16 d1, r9 ; duplicate cospi_8_64 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d18, q2, #14 ; >> 14 - vqrshrn.s32 d19, q3, #14 ; >> 14 + vrshrn.s32 d18, q2, #14 ; >> 14 + vrshrn.s32 d19, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d22, q13, #14 ; >> 14 - vqrshrn.s32 d23, q15, #14 ; >> 14 + vrshrn.s32 d22, q13, #14 ; >> 14 + vrshrn.s32 d23, q15, #14 ; >> 14 ; input[1] * cospi_24_64 - input[3] * cospi_8_64 ; input[1] * cospi_24_64 @@ -126,12 +126,12 @@ vmlal.s16 q12, d29, d0 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d26, q2, #14 ; >> 14 - vqrshrn.s32 d27, q3, #14 ; >> 14 + vrshrn.s32 d26, q2, #14 ; >> 14 + vrshrn.s32 d27, q3, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d30, q8, #14 ; >> 14 - vqrshrn.s32 d31, q12, #14 ; >> 14 + vrshrn.s32 d30, q8, #14 ; >> 14 + vrshrn.s32 d31, q12, #14 ; >> 14 vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] @@ -164,12 +164,12 @@ vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 + vrshrn.s32 d10, q9, #14 ; >> 14 + vrshrn.s32 d11, q10, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 + vrshrn.s32 d12, q11, #14 ; >> 14 + vrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -200,11 +200,11 @@ MEND AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_64_add_neon| PROC push {r4-r9} @@ -270,7 +270,7 @@ vld1.64 {d6}, [r1], r2 vld1.64 {d7}, [r1] - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] vaddw.u8 q8, q8, d0 vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 @@ -305,11 +305,11 @@ bx lr ENDP ; |vpx_idct8x8_64_add_neon| -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) +;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride) ; ; r0 int16_t input ; r1 uint8_t *dest -; r2 int dest_stride) +; r2 int stride) |vpx_idct8x8_12_add_neon| PROC push {r4-r9} @@ -423,12 
+423,12 @@ vmlal.s16 q12, d27, d16 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d10, q9, #14 ; >> 14 - vqrshrn.s32 d11, q10, #14 ; >> 14 + vrshrn.s32 d10, q9, #14 ; >> 14 + vrshrn.s32 d11, q10, #14 ; >> 14 ; dct_const_round_shift(input_dc * cospi_16_64) - vqrshrn.s32 d12, q11, #14 ; >> 14 - vqrshrn.s32 d13, q12, #14 ; >> 14 + vrshrn.s32 d12, q11, #14 ; >> 14 + vrshrn.s32 d13, q12, #14 ; >> 14 ; stage 4 vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; @@ -469,7 +469,7 @@ vld1.64 {d6}, [r1], r2 vld1.64 {d7}, [r1] - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i] + ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] vaddw.u8 q8, q8, d0 vaddw.u8 q9, q9, d1 vaddw.u8 q10, q10, d2 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 159a6ec9891..279da67d74f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -16,431 +16,111 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d0s16, d1s16, d2s16, d3s16; - int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; - int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - - d0s16 = vdup_n_s16((int16_t)cospi_28_64); - d1s16 = vdup_n_s16((int16_t)cospi_4_64); - d2s16 = vdup_n_s16((int16_t)cospi_12_64); - d3s16 = vdup_n_s16((int16_t)cospi_20_64); - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - q2s32 = vmull_s16(d18s16, d0s16); - q3s32 = vmull_s16(d19s16, d0s16); - q5s32 = vmull_s16(d26s16, d2s16); - q6s32 = vmull_s16(d27s16, d2s16); - - q2s32 = vmlsl_s16(q2s32, d30s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d31s16, d1s16); - q5s32 = vmlsl_s16(q5s32, d22s16, d3s16); - q6s32 = vmlsl_s16(q6s32, d23s16, d3s16); - - d8s16 = vqrshrn_n_s32(q2s32, 14); - d9s16 = vqrshrn_n_s32(q3s32, 14); - d10s16 = vqrshrn_n_s32(q5s32, 14); - d11s16 = vqrshrn_n_s32(q6s32, 14); - q4s16 = vcombine_s16(d8s16, d9s16); - q5s16 = vcombine_s16(d10s16, d11s16); - - q2s32 = vmull_s16(d18s16, d1s16); - q3s32 = vmull_s16(d19s16, d1s16); - q9s32 = vmull_s16(d26s16, d3s16); - q13s32 = vmull_s16(d27s16, d3s16); - - q2s32 = vmlal_s16(q2s32, d30s16, d0s16); - q3s32 = vmlal_s16(q3s32, d31s16, d0s16); - q9s32 = vmlal_s16(q9s32, d22s16, d2s16); - q13s32 = vmlal_s16(q13s32, d23s16, d2s16); - - d14s16 = vqrshrn_n_s32(q2s32, 14); - d15s16 = vqrshrn_n_s32(q3s32, 14); - d12s16 = vqrshrn_n_s32(q9s32, 14); - d13s16 = vqrshrn_n_s32(q13s32, 
14); - q6s16 = vcombine_s16(d12s16, d13s16); - q7s16 = vcombine_s16(d14s16, d15s16); - - d0s16 = vdup_n_s16((int16_t)cospi_16_64); - - q2s32 = vmull_s16(d16s16, d0s16); - q3s32 = vmull_s16(d17s16, d0s16); - q13s32 = vmull_s16(d16s16, d0s16); - q15s32 = vmull_s16(d17s16, d0s16); - - q2s32 = vmlal_s16(q2s32, d24s16, d0s16); - q3s32 = vmlal_s16(q3s32, d25s16, d0s16); - q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); - q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - - d0s16 = vdup_n_s16((int16_t)cospi_24_64); - d1s16 = vdup_n_s16((int16_t)cospi_8_64); - - d18s16 = vqrshrn_n_s32(q2s32, 14); - d19s16 = vqrshrn_n_s32(q3s32, 14); - d22s16 = vqrshrn_n_s32(q13s32, 14); - d23s16 = vqrshrn_n_s32(q15s32, 14); - *q9s16 = vcombine_s16(d18s16, d19s16); - *q11s16 = vcombine_s16(d22s16, d23s16); - - q2s32 = vmull_s16(d20s16, d0s16); - q3s32 = vmull_s16(d21s16, d0s16); - q8s32 = vmull_s16(d20s16, d1s16); - q12s32 = vmull_s16(d21s16, d1s16); - - q2s32 = vmlsl_s16(q2s32, d28s16, d1s16); - q3s32 = vmlsl_s16(q3s32, d29s16, d1s16); - q8s32 = vmlal_s16(q8s32, d28s16, d0s16); - q12s32 = vmlal_s16(q12s32, d29s16, d0s16); - - d26s16 = vqrshrn_n_s32(q2s32, 14); - d27s16 = vqrshrn_n_s32(q3s32, 14); - d30s16 = vqrshrn_n_s32(q8s32, 14); - d31s16 = vqrshrn_n_s32(q12s32, 14); - *q13s16 = vcombine_s16(d26s16, d27s16); - *q15s16 = vcombine_s16(d30s16, d31s16); - - q0s16 = vaddq_s16(*q9s16, *q15s16); - q1s16 = vaddq_s16(*q11s16, *q13s16); - q2s16 = vsubq_s16(*q11s16, *q13s16); - q3s16 = vsubq_s16(*q9s16, *q15s16); - - *q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - *q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - *q8s16 = vaddq_s16(q0s16, q7s16); - *q9s16 = vaddq_s16(q1s16, q6s16); - *q10s16 = vaddq_s16(q2s16, q5s16); - *q11s16 = vaddq_s16(q3s16, q4s16); - *q12s16 = vsubq_s16(q3s16, q4s16); - *q13s16 = vsubq_s16(q2s16, q5s16); - *q14s16 = vsubq_s16(q1s16, q6s16); - *q15s16 = vsubq_s16(q0s16, q7s16); +static INLINE void add8x8(int16x8_t a0, int16x8_t a1, int16x8_t a2, + int16x8_t a3, int16x8_t a4, int16x8_t a5, + int16x8_t a6, int16x8_t a7, uint8_t *dest, + const int stride) { + const uint8_t *dst = dest; + uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; + uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16, d4_u16, d5_u16, d6_u16, d7_u16; + + a0 = vrshrq_n_s16(a0, 5); + a1 = vrshrq_n_s16(a1, 5); + a2 = vrshrq_n_s16(a2, 5); + a3 = vrshrq_n_s16(a3, 5); + a4 = vrshrq_n_s16(a4, 5); + a5 = vrshrq_n_s16(a5, 5); + a6 = vrshrq_n_s16(a6, 5); + a7 = vrshrq_n_s16(a7, 5); + + d0 = vld1_u8(dst); + dst += stride; + d1 = vld1_u8(dst); + dst += stride; + d2 = vld1_u8(dst); + dst += stride; + d3 = vld1_u8(dst); + dst += stride; + d4 = vld1_u8(dst); + dst += stride; + d5 = vld1_u8(dst); + dst += stride; + d6 = vld1_u8(dst); + dst += stride; + d7 = vld1_u8(dst); + + d0_u16 = 
vaddw_u8(vreinterpretq_u16_s16(a0), d0); + d1_u16 = vaddw_u8(vreinterpretq_u16_s16(a1), d1); + d2_u16 = vaddw_u8(vreinterpretq_u16_s16(a2), d2); + d3_u16 = vaddw_u8(vreinterpretq_u16_s16(a3), d3); + d4_u16 = vaddw_u8(vreinterpretq_u16_s16(a4), d4); + d5_u16 = vaddw_u8(vreinterpretq_u16_s16(a5), d5); + d6_u16 = vaddw_u8(vreinterpretq_u16_s16(a6), d6); + d7_u16 = vaddw_u8(vreinterpretq_u16_s16(a7), d7); + + d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); + d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); + d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); + d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); + d4 = vqmovun_s16(vreinterpretq_s16_u16(d4_u16)); + d5 = vqmovun_s16(vreinterpretq_s16_u16(d5_u16)); + d6 = vqmovun_s16(vreinterpretq_s16_u16(d6_u16)); + d7 = vqmovun_s16(vreinterpretq_s16_u16(d7_u16)); + + vst1_u8(dest, d0); + dest += stride; + vst1_u8(dest, d1); + dest += stride; + vst1_u8(dest, d2); + dest += stride; + vst1_u8(dest, d3); + dest += stride; + vst1_u8(dest, d4); + dest += stride; + vst1_u8(dest, d5); + dest += stride; + vst1_u8(dest, d6); + dest += stride; + vst1_u8(dest, d7); } void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - 
q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 + int16x8_t a0 = load_tran_low_to_s16q(input); + int16x8_t a1 = load_tran_low_to_s16q(input + 8); + int16x8_t a2 = load_tran_low_to_s16q(input + 16); + int16x8_t a3 = load_tran_low_to_s16q(input + 24); + int16x8_t a4 = load_tran_low_to_s16q(input + 32); + int16x8_t a5 = load_tran_low_to_s16q(input + 40); + int16x8_t a6 = load_tran_low_to_s16q(input + 48); + int16x8_t a7 = load_tran_low_to_s16q(input + 56); + + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + idct8x8_64_1d_bd8(cospis0, cospis1, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + add8x8(a0, a1, a2, a3, a4, a5, a6, a7, dest, stride); } void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, - int dest_stride) { - uint8_t *d1, *d2; - uint8x8_t d0u8, d1u8, d2u8, d3u8; - int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; - int16x4_t d26s16, d27s16, d28s16, d29s16; - uint64x1_t d0u64, d1u64, d2u64, d3u64; - int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; - int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - uint16x8_t q8u16, q9u16, q10u16, q11u16; - int32x4_t q9s32, q10s32, q11s32, q12s32; - - q8s16 = load_tran_low_to_s16(input); - q9s16 = load_tran_low_to_s16(input + 8); - q10s16 = load_tran_low_to_s16(input + 16); - q11s16 = load_tran_low_to_s16(input + 24); - q12s16 = load_tran_low_to_s16(input + 32); - q13s16 = load_tran_low_to_s16(input + 40); - q14s16 = load_tran_low_to_s16(input + 48); - q15s16 = load_tran_low_to_s16(input + 56); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - // First transform rows - // stage 1 - q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); - q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); - - q4s16 = vqrdmulhq_s16(q9s16, q0s16); - - q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); - - q7s16 = vqrdmulhq_s16(q9s16, q1s16); - - q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); - - q5s16 = vqrdmulhq_s16(q11s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); - - q6s16 = vqrdmulhq_s16(q11s16, q1s16); - - // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); - - q9s16 = vqrdmulhq_s16(q8s16, q0s16); - - q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); - - q13s16 = 
vqrdmulhq_s16(q10s16, q1s16); - - q15s16 = vqrdmulhq_s16(q10s16, q0s16); - - // stage 3 -odd half - q0s16 = vaddq_s16(q9s16, q15s16); - q1s16 = vaddq_s16(q9s16, q13s16); - q2s16 = vsubq_s16(q9s16, q13s16); - q3s16 = vsubq_s16(q9s16, q15s16); - - // stage 2 - odd half - q13s16 = vsubq_s16(q4s16, q5s16); - q4s16 = vaddq_s16(q4s16, q5s16); - q14s16 = vsubq_s16(q7s16, q6s16); - q7s16 = vaddq_s16(q7s16, q6s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - - d16s16 = vdup_n_s16((int16_t)cospi_16_64); - q9s32 = vmull_s16(d28s16, d16s16); - q10s32 = vmull_s16(d29s16, d16s16); - q11s32 = vmull_s16(d28s16, d16s16); - q12s32 = vmull_s16(d29s16, d16s16); - - q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); - q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); - q11s32 = vmlal_s16(q11s32, d26s16, d16s16); - q12s32 = vmlal_s16(q12s32, d27s16, d16s16); - - d10s16 = vqrshrn_n_s32(q9s32, 14); - d11s16 = vqrshrn_n_s32(q10s32, 14); - d12s16 = vqrshrn_n_s32(q11s32, 14); - d13s16 = vqrshrn_n_s32(q12s32, 14); - q5s16 = vcombine_s16(d10s16, d11s16); - q6s16 = vcombine_s16(d12s16, d13s16); - - // stage 4 - q8s16 = vaddq_s16(q0s16, q7s16); - q9s16 = vaddq_s16(q1s16, q6s16); - q10s16 = vaddq_s16(q2s16, q5s16); - q11s16 = vaddq_s16(q3s16, q4s16); - q12s16 = vsubq_s16(q3s16, q4s16); - q13s16 = vsubq_s16(q2s16, q5s16); - q14s16 = vsubq_s16(q1s16, q6s16); - q15s16 = vsubq_s16(q0s16, q7s16); - - transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); - - q8s16 = vrshrq_n_s16(q8s16, 5); - q9s16 = vrshrq_n_s16(q9s16, 5); - q10s16 = vrshrq_n_s16(q10s16, 5); - q11s16 = vrshrq_n_s16(q11s16, 5); - q12s16 = vrshrq_n_s16(q12s16, 5); - q13s16 = vrshrq_n_s16(q13s16, 5); - q14s16 = vrshrq_n_s16(q14s16, 5); - q15s16 = vrshrq_n_s16(q15s16, 5); - - d1 = d2 = dest; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; - - q8s16 = q12s16; - q9s16 = q13s16; - q10s16 = q14s16; - q11s16 = q15s16; - - d0u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d1u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d2u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - d3u64 = vld1_u64((uint64_t *)d1); - d1 += dest_stride; - - q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); - q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); - q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), 
vreinterpret_u8_u64(d2u64)); - q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); - - d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); - d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - d2 += dest_stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); - d2 += dest_stride; + int stride) { + const int16x8_t cospis = vld1q_s16(kCospi); + const int16x8_t cospisd = vaddq_s16(cospis, cospis); + const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 + const int16x4_t cospisd0 = vget_low_s16(cospisd); // doubled 0, 8, 16, 24 + const int16x4_t cospisd1 = vget_high_s16(cospisd); // doubled 4, 12, 20, 28 + int16x4_t a0, a1, a2, a3, a4, a5, a6, a7; + int16x8_t b0, b1, b2, b3, b4, b5, b6, b7; + + a0 = load_tran_low_to_s16d(input); + a1 = load_tran_low_to_s16d(input + 8); + a2 = load_tran_low_to_s16d(input + 16); + a3 = load_tran_low_to_s16d(input + 24); + + idct8x8_12_pass1_bd8(cospis0, cospisd0, cospisd1, &a0, &a1, &a2, &a3, &a4, + &a5, &a6, &a7); + idct8x8_12_pass2_bd8(cospis0, cospisd0, cospisd1, a0, a1, a2, a3, a4, a5, a6, + a7, &b0, &b1, &b2, &b3, &b4, &b5, &b6, &b7); + add8x8(b0, b1, b2, b3, b4, b5, b6, b7, dest, stride); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm index f39e8ddd4b4..5dd9bdc7888 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm @@ -10,8 +10,9 @@ INCLUDE ./vpx_config.asm - ; Helper function used to load tran_low_t into int16, narrowing if + ; Helper functions used to load tran_low_t into int16, narrowing if ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in ; non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth. MACRO @@ -27,4 +28,19 @@ vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! ENDIF MEND + + ; $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16X2 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld2.s32 {q0,q1}, [$src]! + vld2.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q2 + vmovn.i32 $dst2, q1 + vmovn.i32 $dst3, q3 + ELSE + vld2.s16 {$dst0,$dst1,$dst2,$dst3}, [$src]! 
+ ENDIF + MEND END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h index 5c2a53c034f..d9b85223c76 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h @@ -17,10 +17,45 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/vpx_dsp_common.h" +DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ +}; + +DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */ +}; + //------------------------------------------------------------------------------ +// Helper functions used to load tran_low_t into int16, narrowing if necessary. -// Helper function used to load tran_low_t into int16, narrowing if necessary. -static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { +static INLINE int16x8x2_t load_tran_low_to_s16x2q(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4x2_t v0 = vld2q_s32(buf); + const int32x4x2_t v1 = vld2q_s32(buf + 8); + const int16x4_t s0 = vmovn_s32(v0.val[0]); + const int16x4_t s1 = vmovn_s32(v0.val[1]); + const int16x4_t s2 = vmovn_s32(v1.val[0]); + const int16x4_t s3 = vmovn_s32(v1.val[1]); + int16x8x2_t res; + res.val[0] = vcombine_s16(s0, s2); + res.val[1] = vcombine_s16(s1, s3); + return res; +#else + return vld2q_s16(buf); +#endif +} + +static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) { #if CONFIG_VP9_HIGHBITDEPTH const int32x4_t v0 = vld1q_s32(buf); const int32x4_t v1 = vld1q_s32(buf + 4); @@ -32,6 +67,17 @@ static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { #endif } +static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + return vmovn_s32(v0); +#else + return vld1_s16(buf); +#endif +} + +//------------------------------------------------------------------------------ + // Multiply a by a_const. Saturate, shift and narrow by 14. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { @@ -85,30 +131,6 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); } -static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, - int16x8_t *a0, int16x8_t *a1, - int16x8_t *a2, int16x8_t *a3, - int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - *a0 = vld1q_s16(a); - a += a_stride; - *a1 = vld1q_s16(a); - a += a_stride; - *a2 = vld1q_s16(a); - a += a_stride; - *a3 = vld1q_s16(a); - a += a_stride; - *a4 = vld1q_s16(a); - a += a_stride; - *a5 = vld1q_s16(a); - a += a_stride; - *a6 = vld1q_s16(a); - a += a_stride; - *a7 = vld1q_s16(a); - - transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); -} - // Shift the output down by 6 and add it to the destination buffer. 
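// Illustrative scalar model (not part of libvpx) of the NEON helper
// add_and_store_u8_s16() that follows: each 16-bit residual is rounded,
// shifted down by 6 and added to the destination pixel, saturating to
// [0, 255]. The name, the fixed 8x8 shape and the inline clamp are
// assumptions made for this sketch.
static INLINE void add_and_store_u8_s16_sketch(const int16_t *a, uint8_t *b,
                                               int b_stride) {
  int r, c;
  for (r = 0; r < 8; ++r) {
    for (c = 0; c < 8; ++c) {
      const int v = b[c] + ((a[c] + 32) >> 6);  // rounding shift, then add
      b[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // saturate to 8 bits
    }
    a += 8;
    b += b_stride;
  }
}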
static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, const int16x8_t a2, const int16x8_t a3, @@ -169,4 +191,354 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, b += b_stride; vst1_u8(b, b7); } + +static INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip both sides and gcc may compile to assembly 'usat'. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc); + return vdupq_n_u8((uint8_t)t); +} + +static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, + int16x8_t *const a0, + int16x8_t *const a1) { + int16x4_t b0, b1, b2, b3; + int32x4_t c0, c1, c2, c3; + int16x8_t d0, d1; + + transpose_s16_4x4q(a0, a1); + b0 = vget_low_s16(*a0); + b1 = vget_high_s16(*a0); + b2 = vget_low_s16(*a1); + b3 = vget_high_s16(*a1); + c0 = vmull_lane_s16(b0, cospis, 2); + c2 = vmull_lane_s16(b1, cospis, 2); + c1 = vsubq_s32(c0, c2); + c0 = vaddq_s32(c0, c2); + c2 = vmull_lane_s16(b2, cospis, 3); + c3 = vmull_lane_s16(b2, cospis, 1); + c2 = vmlsl_lane_s16(c2, b3, cospis, 1); + c3 = vmlal_lane_s16(c3, b3, cospis, 3); + b0 = vrshrn_n_s32(c0, 14); + b1 = vrshrn_n_s32(c1, 14); + b2 = vrshrn_n_s32(c2, 14); + b3 = vrshrn_n_s32(c3, 14); + d0 = vcombine_s16(b0, b1); + d1 = vcombine_s16(b3, b2); + *a0 = vaddq_s16(d0, d1); + *a1 = vsubq_s16(d0, d1); +} + +static INLINE void idct8x8_12_pass1_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + int16x4_t *const io0, int16x4_t *const io1, int16x4_t *const io2, + int16x4_t *const io3, int16x4_t *const io4, int16x4_t *const io5, + int16x4_t *const io6, int16x4_t *const io7) { + int16x4_t step1[8], step2[8]; + int32x4_t t32[2]; + + transpose_s16_4x4d(io0, io1, io2, io3); + + // stage 1 + step1[4] = vqrdmulh_lane_s16(*io1, cospisd1, 3); + step1[5] = vqrdmulh_lane_s16(*io3, cospisd1, 2); + step1[6] = vqrdmulh_lane_s16(*io3, cospisd1, 1); + step1[7] = vqrdmulh_lane_s16(*io1, cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulh_lane_s16(*io0, cospisd0, 2); + step2[2] = vqrdmulh_lane_s16(*io2, cospisd0, 3); + step2[3] = vqrdmulh_lane_s16(*io2, cospisd0, 1); + + step2[4] = vadd_s16(step1[4], step1[5]); + step2[5] = vsub_s16(step1[4], step1[5]); + step2[6] = vsub_s16(step1[7], step1[6]); + step2[7] = vadd_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vadd_s16(step2[1], step2[3]); + step1[1] = vadd_s16(step2[1], step2[2]); + step1[2] = vsub_s16(step2[1], step2[2]); + step1[3] = vsub_s16(step2[1], step2[3]); + + t32[1] = vmull_lane_s16(step2[6], cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); + step1[5] = vrshrn_n_s32(t32[0], 14); + step1[6] = vrshrn_n_s32(t32[1], 14); + + // stage 4 + *io0 = vadd_s16(step1[0], step2[7]); + *io1 = vadd_s16(step1[1], step1[6]); + *io2 = vadd_s16(step1[2], step1[5]); + *io3 = vadd_s16(step1[3], step2[4]); + *io4 = vsub_s16(step1[3], step2[4]); + *io5 = vsub_s16(step1[2], step1[5]); + *io6 = vsub_s16(step1[1], step1[6]); + *io7 = vsub_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_12_pass2_bd8( + const int16x4_t cospis0, const int16x4_t cospisd0, const int16x4_t cospisd1, + const int16x4_t input0, const int16x4_t input1, const int16x4_t input2, + const int16x4_t input3, const int16x4_t input4, const int16x4_t input5, + const int16x4_t input6, const int16x4_t input7, int16x8_t *const output0, + int16x8_t *const output1, int16x8_t *const output2, + int16x8_t *const output3, int16x8_t *const output4, + int16x8_t *const output5, int16x8_t *const output6, + 
int16x8_t *const output7) { + int16x8_t in[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_4x8(input0, input1, input2, input3, input4, input5, input6, + input7, &in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[4] = vqrdmulhq_lane_s16(in[1], cospisd1, 3); + step1[5] = vqrdmulhq_lane_s16(in[3], cospisd1, 2); + step1[6] = vqrdmulhq_lane_s16(in[3], cospisd1, 1); + step1[7] = vqrdmulhq_lane_s16(in[1], cospisd1, 0); + + // stage 2 + step2[1] = vqrdmulhq_lane_s16(in[0], cospisd0, 2); + step2[2] = vqrdmulhq_lane_s16(in[2], cospisd0, 3); + step2[3] = vqrdmulhq_lane_s16(in[2], cospisd0, 1); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[1], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[1], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *output0 = vaddq_s16(step1[0], step2[7]); + *output1 = vaddq_s16(step1[1], step1[6]); + *output2 = vaddq_s16(step1[2], step1[5]); + *output3 = vaddq_s16(step1[3], step2[4]); + *output4 = vsubq_s16(step1[3], step2[4]); + *output5 = vsubq_s16(step1[2], step1[5]); + *output6 = vsubq_s16(step1[1], step1[6]); + *output7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, + const int16x4_t cospis1, + int16x8_t *const io0, int16x8_t *const io1, + int16x8_t *const io2, int16x8_t *const io3, + int16x8_t *const io4, int16x8_t *const io5, + int16x8_t *const io6, + int16x8_t *const io7) { + int16x4_t input_1l, input_1h, input_3l, input_3h, input_5l, input_5h, + input_7l, input_7h; + int16x4_t step1l[4], step1h[4]; + int16x8_t step1[8], step2[8]; + int32x4_t t32[8]; + int16x4_t t16[8]; + + transpose_s16_8x8(io0, io1, io2, io3, io4, io5, io6, io7); + + // stage 1 + input_1l = vget_low_s16(*io1); + input_1h = vget_high_s16(*io1); + input_3l = vget_low_s16(*io3); + input_3h = vget_high_s16(*io3); + input_5l = vget_low_s16(*io5); + input_5h = vget_high_s16(*io5); + input_7l = vget_low_s16(*io7); + input_7h = vget_high_s16(*io7); + step1l[0] = vget_low_s16(*io0); + step1h[0] = vget_high_s16(*io0); + step1l[1] = vget_low_s16(*io2); + step1h[1] = vget_high_s16(*io2); + step1l[2] = vget_low_s16(*io4); + step1h[2] = vget_high_s16(*io4); + step1l[3] = vget_low_s16(*io6); + step1h[3] = vget_high_s16(*io6); + + t32[0] = vmull_lane_s16(input_1l, cospis1, 3); + t32[1] = vmull_lane_s16(input_1h, cospis1, 3); + t32[2] = vmull_lane_s16(input_3l, cospis1, 2); + t32[3] = vmull_lane_s16(input_3h, cospis1, 2); + t32[4] = vmull_lane_s16(input_3l, cospis1, 1); + t32[5] = vmull_lane_s16(input_3h, cospis1, 1); + t32[6] = vmull_lane_s16(input_1l, cospis1, 0); + t32[7] = vmull_lane_s16(input_1h, cospis1, 0); + t32[0] = 
vmlsl_lane_s16(t32[0], input_7l, cospis1, 0); + t32[1] = vmlsl_lane_s16(t32[1], input_7h, cospis1, 0); + t32[2] = vmlal_lane_s16(t32[2], input_5l, cospis1, 1); + t32[3] = vmlal_lane_s16(t32[3], input_5h, cospis1, 1); + t32[4] = vmlsl_lane_s16(t32[4], input_5l, cospis1, 2); + t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); + t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); + t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + t16[4] = vrshrn_n_s32(t32[4], 14); + t16[5] = vrshrn_n_s32(t32[5], 14); + t16[6] = vrshrn_n_s32(t32[6], 14); + t16[7] = vrshrn_n_s32(t32[7], 14); + step1[4] = vcombine_s16(t16[0], t16[1]); + step1[5] = vcombine_s16(t16[2], t16[3]); + step1[6] = vcombine_s16(t16[4], t16[5]); + step1[7] = vcombine_s16(t16[6], t16[7]); + + // stage 2 + t32[2] = vmull_lane_s16(step1l[0], cospis0, 2); + t32[3] = vmull_lane_s16(step1h[0], cospis0, 2); + t32[4] = vmull_lane_s16(step1l[1], cospis0, 3); + t32[5] = vmull_lane_s16(step1h[1], cospis0, 3); + t32[6] = vmull_lane_s16(step1l[1], cospis0, 1); + t32[7] = vmull_lane_s16(step1h[1], cospis0, 1); + t32[0] = vmlal_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[1] = vmlal_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[2] = vmlsl_lane_s16(t32[2], step1l[2], cospis0, 2); + t32[3] = vmlsl_lane_s16(t32[3], step1h[2], cospis0, 2); + t32[4] = vmlsl_lane_s16(t32[4], step1l[3], cospis0, 1); + t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); + t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); + t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + t16[4] = vrshrn_n_s32(t32[4], 14); + t16[5] = vrshrn_n_s32(t32[5], 14); + t16[6] = vrshrn_n_s32(t32[6], 14); + t16[7] = vrshrn_n_s32(t32[7], 14); + step2[0] = vcombine_s16(t16[0], t16[1]); + step2[1] = vcombine_s16(t16[2], t16[3]); + step2[2] = vcombine_s16(t16[4], t16[5]); + step2[3] = vcombine_s16(t16[6], t16[7]); + + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + + // stage 3 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + + t32[2] = vmull_lane_s16(vget_low_s16(step2[6]), cospis0, 2); + t32[3] = vmull_lane_s16(vget_high_s16(step2[6]), cospis0, 2); + t32[0] = vmlsl_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + step1[5] = vcombine_s16(t16[0], t16[1]); + step1[6] = vcombine_s16(t16[2], t16[3]); + + // stage 4 + *io0 = vaddq_s16(step1[0], step2[7]); + *io1 = vaddq_s16(step1[1], step1[6]); + *io2 = vaddq_s16(step1[2], step1[5]); + *io3 = vaddq_s16(step1[3], step2[4]); + *io4 = vsubq_s16(step1[3], step2[4]); + *io5 = vsubq_s16(step1[2], step1[5]); + *io6 = vsubq_s16(step1[1], step1[6]); + *io7 = vsubq_s16(step1[0], step2[7]); +} + +static INLINE void 
idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, + int16x8_t *const d0, + int16x8_t *const d1) { + int16x4_t t16[4]; + + t16[0] = vrshrn_n_s32(t32[0], 14); + t16[1] = vrshrn_n_s32(t32[1], 14); + t16[2] = vrshrn_n_s32(t32[2], 14); + t16[3] = vrshrn_n_s32(t32[3], 14); + *d0 = vcombine_s16(t16[0], t16[1]); + *d1 = vcombine_s16(t16[2], t16[3]); +} + +static INLINE void idct_cospi_8_24_q_kernel(const int16x8_t s0, + const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int32x4_t *const t32) { + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_0_8_16_24, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_0_8_16_24, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_0_8_16_24, 1); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_0_8_16_24, 1); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_0_8_16_24, 1); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_0_8_16_24, 1); +} + +static INLINE void idct_cospi_8_24_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_8_24_neg_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[4]; + + idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t32); + t32[2] = vnegq_s32(t32[2]); + t32[3] = vnegq_s32(t32[3]); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_0_8_16_24, + int16x8_t *const d0, + int16x8_t *const d1) { + int32x4_t t32[6]; + + t32[4] = vmull_lane_s16(vget_low_s16(s1), cospi_0_8_16_24, 2); + t32[5] = vmull_lane_s16(vget_high_s16(s1), cospi_0_8_16_24, 2); + t32[0] = vmlsl_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[1] = vmlsl_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + t32[2] = vmlal_lane_s16(t32[4], vget_low_s16(s0), cospi_0_8_16_24, 2); + t32[3] = vmlal_lane_s16(t32[5], vget_high_s16(s0), cospi_0_8_16_24, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, + const int stride) { + uint8x8_t d = vld1_u8(*dest); + uint16x8_t q; + + res = vrshrq_n_s16(res, 6); + q = vaddw_u8(vreinterpretq_u16_s16(res), d); + d = vqmovun_s16(vreinterpretq_s16_u16(q)); + vst1_u8(*dest, d); + *dest += stride; +} + #endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c index e150a5302d5..fb1fa6b681d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -346,20 +346,54 @@ void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, vst1q_u8(dst, above_right); } +void vpx_d45_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t A0_0 = vld1q_u8(above); + const uint8x16_t A0_1 = vld1q_u8(above + 16); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0_1), 7); + const uint8x16_t A1_0 = vld1q_u8(above + 1); + const uint8x16_t A1_1 = vld1q_u8(above + 17); + const 
uint8x16_t A2_0 = vld1q_u8(above + 2); + const uint8x16_t A2_1 = vld1q_u8(above + 18); + const uint8x16_t avg_0 = vhaddq_u8(A0_0, A2_0); + const uint8x16_t avg_1 = vhaddq_u8(A0_1, A2_1); + uint8x16_t row_0 = vrhaddq_u8(avg_0, A1_0); + uint8x16_t row_1 = vrhaddq_u8(avg_1, A1_1); + int i; + (void)left; + + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + + for (i = 0; i < 30; ++i) { + row_0 = vextq_u8(row_0, row_1, 1); + row_1 = vextq_u8(row_1, above_right, 1); + vst1q_u8(dst, row_0); + dst += 16; + vst1q_u8(dst, row_1); + dst += stride - 16; + } + + vst1q_u8(dst, above_right); + dst += 16; + vst1q_u8(dst, row_1); +} + // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD = vld1_u8(above - 1); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL)); - const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4); - const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5); - const uint8x8_t JIXABCD0 = - vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8)); - const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD); + const uint8x8_t XA0123 = vld1_u8(above - 1); + const uint8x8_t L0123 = vld1_u8(left); + const uint8x8_t L3210 = vrev64_u8(L0123); + const uint8x8_t L3210XA012 = vext_u8(L3210, XA0123, 4); + const uint8x8_t L210XA0123 = vext_u8(L3210, XA0123, 5); + const uint8x8_t L10XA0123_ = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(L210XA0123), 8)); + const uint8x8_t avg1 = vhadd_u8(L10XA0123_, L3210XA012); + const uint8x8_t avg2 = vrhadd_u8(avg1, L210XA0123); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); @@ -374,6 +408,265 @@ void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)dst, r3, 0); } +void vpx_d135_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x8_t XA0123456 = vld1_u8(above - 1); + const uint8x8_t A01234567 = vld1_u8(above); + const uint8x8_t A1234567_ = vld1_u8(above + 1); + const uint8x8_t L01234567 = vld1_u8(left); + const uint8x8_t L76543210 = vrev64_u8(L01234567); + const uint8x8_t L6543210X = vext_u8(L76543210, XA0123456, 1); + const uint8x8_t L543210XA0 = vext_u8(L76543210, XA0123456, 2); + const uint8x16_t L76543210XA0123456 = vcombine_u8(L76543210, XA0123456); + const uint8x16_t L6543210XA01234567 = vcombine_u8(L6543210X, A01234567); + const uint8x16_t L543210XA01234567_ = vcombine_u8(L543210XA0, A1234567_); + const uint8x16_t avg = vhaddq_u8(L76543210XA0123456, L543210XA01234567_); + const uint8x16_t row = vrhaddq_u8(avg, L6543210XA01234567); + const uint8x8_t row_0 = vget_low_u8(row); + const uint8x8_t row_1 = vget_high_u8(row); + const uint8x8_t r0 = vext_u8(row_0, row_1, 7); + const uint8x8_t r1 = vext_u8(row_0, row_1, 6); + const uint8x8_t r2 = vext_u8(row_0, row_1, 5); + const uint8x8_t r3 = vext_u8(row_0, row_1, 4); + const uint8x8_t r4 = vext_u8(row_0, row_1, 3); + const uint8x8_t r5 = vext_u8(row_0, row_1, 2); + const uint8x8_t r6 = vext_u8(row_0, row_1, 1); + + vst1_u8(dst, r0); + dst += stride; + vst1_u8(dst, r1); + dst += stride; + vst1_u8(dst, r2); + dst += stride; + 
vst1_u8(dst, r3); + dst += stride; + vst1_u8(dst, r4); + dst += stride; + vst1_u8(dst, r5); + dst += stride; + vst1_u8(dst, r6); + dst += stride; + vst1_u8(dst, row_0); +} + +static INLINE void d135_store_16x8( + uint8_t **dst, const ptrdiff_t stride, const uint8x16_t row_0, + const uint8x16_t row_1, const uint8x16_t row_2, const uint8x16_t row_3, + const uint8x16_t row_4, const uint8x16_t row_5, const uint8x16_t row_6, + const uint8x16_t row_7) { + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + +void vpx_d135_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t XA0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t A0123456789abcdef = vld1q_u8(above); + const uint8x16_t A123456789abcdef_ = vld1q_u8(above + 1); + const uint8x16_t L0123456789abcdef = vld1q_u8(left); + const uint8x8_t L76543210 = vrev64_u8(vget_low_u8(L0123456789abcdef)); + const uint8x8_t Lfedcba98 = vrev64_u8(vget_high_u8(L0123456789abcdef)); + const uint8x16_t Lfedcba9876543210 = vcombine_u8(Lfedcba98, L76543210); + const uint8x16_t Ledcba9876543210X = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 1); + const uint8x16_t Ldcba9876543210XA0 = + vextq_u8(Lfedcba9876543210, XA0123456789abcde, 2); + const uint8x16_t avg_0 = vhaddq_u8(Lfedcba9876543210, Ldcba9876543210XA0); + const uint8x16_t avg_1 = vhaddq_u8(XA0123456789abcde, A123456789abcdef_); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, Ledcba9876543210X); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, A0123456789abcdef); + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_2 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_3 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_4 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_5 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_6 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_7 = vcombine_u8(vget_high_u8(row_0), vget_low_u8(row_1)); + const uint8x16_t r_8 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_9 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_a = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_b = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_c = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_d = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_e = vextq_u8(row_0, row_1, 1); + + d135_store_16x8(&dst, stride, r_0, r_1, r_2, r_3, r_4, r_5, r_6, r_7); + d135_store_16x8(&dst, stride, r_8, r_9, r_a, r_b, r_c, r_d, r_e, row_0); +} + +static INLINE void d135_store_32x2(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t row_0, + const uint8x16_t row_1, + const uint8x16_t row_2) { + uint8_t *dst2 = *dst; + vst1q_u8(dst2, row_1); + dst2 += 16; + vst1q_u8(dst2, row_2); + dst2 += 16 * stride - 16; + vst1q_u8(dst2, row_0); + dst2 += 16; + vst1q_u8(dst2, row_1); + *dst += stride; +} + +void vpx_d135_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, const uint8_t *left) { + const uint8x16_t LL0123456789abcdef = vld1q_u8(left + 16); + const uint8x16_t LU0123456789abcdef = vld1q_u8(left); + const uint8x8_t LL76543210 = vrev64_u8(vget_low_u8(LL0123456789abcdef)); + const uint8x8_t LU76543210 = vrev64_u8(vget_low_u8(LU0123456789abcdef)); + const 
uint8x8_t LLfedcba98 = vrev64_u8(vget_high_u8(LL0123456789abcdef)); + const uint8x8_t LUfedcba98 = vrev64_u8(vget_high_u8(LU0123456789abcdef)); + const uint8x16_t LLfedcba9876543210 = vcombine_u8(LLfedcba98, LL76543210); + const uint8x16_t LUfedcba9876543210 = vcombine_u8(LUfedcba98, LU76543210); + const uint8x16_t LLedcba9876543210Uf = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 1); + const uint8x16_t LLdcba9876543210Ufe = + vextq_u8(LLfedcba9876543210, LUfedcba9876543210, 2); + const uint8x16_t avg_0 = vhaddq_u8(LLfedcba9876543210, LLdcba9876543210Ufe); + const uint8x16_t row_0 = vrhaddq_u8(avg_0, LLedcba9876543210Uf); + + const uint8x16_t XAL0123456789abcde = vld1q_u8(above - 1); + const uint8x16_t LUedcba9876543210X = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 1); + const uint8x16_t LUdcba9876543210XA0 = + vextq_u8(LUfedcba9876543210, XAL0123456789abcde, 2); + const uint8x16_t avg_1 = vhaddq_u8(LUfedcba9876543210, LUdcba9876543210XA0); + const uint8x16_t row_1 = vrhaddq_u8(avg_1, LUedcba9876543210X); + + const uint8x16_t AL0123456789abcdef = vld1q_u8(above); + const uint8x16_t AL123456789abcdefg = vld1q_u8(above + 1); + const uint8x16_t ALfR0123456789abcde = vld1q_u8(above + 15); + const uint8x16_t AR0123456789abcdef = vld1q_u8(above + 16); + const uint8x16_t AR123456789abcdef_ = vld1q_u8(above + 17); + const uint8x16_t avg_2 = vhaddq_u8(XAL0123456789abcde, AL123456789abcdefg); + const uint8x16_t row_2 = vrhaddq_u8(avg_2, AL0123456789abcdef); + const uint8x16_t avg_3 = vhaddq_u8(ALfR0123456789abcde, AR123456789abcdef_); + const uint8x16_t row_3 = vrhaddq_u8(avg_3, AR0123456789abcdef); + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 15); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 15); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 15); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 14); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 14); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 14); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 13); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 13); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 13); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 12); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 12); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 12); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 11); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 11); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 11); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 10); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 10); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 10); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 9); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 9); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 9); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 8); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 8); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 8); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 7); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 7); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 7); + d135_store_32x2(&dst, stride, 
r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 6); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 6); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 6); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 5); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 5); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 5); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 4); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 4); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 4); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 3); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 3); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 3); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 2); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 2); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 2); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + { + const uint8x16_t r_0 = vextq_u8(row_0, row_1, 1); + const uint8x16_t r_1 = vextq_u8(row_1, row_2, 1); + const uint8x16_t r_2 = vextq_u8(row_2, row_3, 1); + d135_store_32x2(&dst, stride, r_0, r_1, r_2); + } + + d135_store_32x2(&dst, stride, row_0, row_1, row_2); +} + // ----------------------------------------------------------------------------- #if !HAVE_NEON_ASM @@ -483,133 +776,98 @@ void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, vst1_u8(dst, d); } +static INLINE void h_store_16x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); + *dst += stride; + vst1q_u8(*dst, row_1); + *dst += stride; + vst1q_u8(*dst, row_2); + *dst += stride; + vst1q_u8(*dst, row_3); + *dst += stride; + vst1q_u8(*dst, row_4); + *dst += stride; + vst1q_u8(*dst, row_5); + *dst += stride; + vst1q_u8(*dst, row_6); + *dst += stride; + vst1q_u8(*dst, row_7); + *dst += stride; +} + void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x16_t left_u8q = vld1q_u8(left); - uint8x8_t left_u8d = vget_low_u8(left_u8q); - uint8x16_t d; - int i; (void)above; - for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) { - d = vdupq_lane_u8(left_u8d, 0); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 1); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 2); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 3); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 4); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 5); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 6); - vst1q_u8(dst, d); - dst += stride; - d = vdupq_lane_u8(left_u8d, 7); - vst1q_u8(dst, d); - dst += stride; - } + h_store_16x8(&dst, stride, vget_low_u8(left_u8q)); + h_store_16x8(&dst, stride, vget_high_u8(left_u8q)); +} + +static INLINE void h_store_32x8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t left) { + const uint8x16_t row_0 = vdupq_lane_u8(left, 
0); + const uint8x16_t row_1 = vdupq_lane_u8(left, 1); + const uint8x16_t row_2 = vdupq_lane_u8(left, 2); + const uint8x16_t row_3 = vdupq_lane_u8(left, 3); + const uint8x16_t row_4 = vdupq_lane_u8(left, 4); + const uint8x16_t row_5 = vdupq_lane_u8(left, 5); + const uint8x16_t row_6 = vdupq_lane_u8(left, 6); + const uint8x16_t row_7 = vdupq_lane_u8(left, 7); + + vst1q_u8(*dst, row_0); // Note clang-3.8 produced poor code w/vst2q_u8 + *dst += 16; + vst1q_u8(*dst, row_0); + *dst += stride - 16; + vst1q_u8(*dst, row_1); + *dst += 16; + vst1q_u8(*dst, row_1); + *dst += stride - 16; + vst1q_u8(*dst, row_2); + *dst += 16; + vst1q_u8(*dst, row_2); + *dst += stride - 16; + vst1q_u8(*dst, row_3); + *dst += 16; + vst1q_u8(*dst, row_3); + *dst += stride - 16; + vst1q_u8(*dst, row_4); + *dst += 16; + vst1q_u8(*dst, row_4); + *dst += stride - 16; + vst1q_u8(*dst, row_5); + *dst += 16; + vst1q_u8(*dst, row_5); + *dst += stride - 16; + vst1q_u8(*dst, row_6); + *dst += 16; + vst1q_u8(*dst, row_6); + *dst += stride - 16; + vst1q_u8(*dst, row_7); + *dst += 16; + vst1q_u8(*dst, row_7); + *dst += stride - 16; } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x16_t d; int i; (void)above; for (i = 0; i < 2; i++, left += 16) { const uint8x16_t left_u8 = vld1q_u8(left); - const uint8x8_t left_low = vget_low_u8(left_u8); - const uint8x8_t left_high = vget_high_u8(left_u8); - d = vdupq_lane_u8(left_low, 0); - vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8 - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 1); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 2); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 3); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 4); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 5); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 6); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_low, 7); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - - d = vdupq_lane_u8(left_high, 0); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 1); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 2); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 3); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 4); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 5); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 6); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; - d = vdupq_lane_u8(left_high, 7); - vst1q_u8(dst, d); - dst += 16; - vst1q_u8(dst, d); - dst += stride - 16; + h_store_32x8(&dst, stride, vget_low_u8(left_u8)); + h_store_32x8(&dst, stride, vget_high_u8(left_u8)); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm index 5cd9170aea7..907e9183804 100644 
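The h_store_16x8()/h_store_32x8() refactor above only changes how the rows are written out; the prediction itself is unchanged. For reference, a scalar sketch of what vpx_h_predictor_32x32_neon computes, assuming the usual <stddef.h>/<stdint.h> includes (the helper name is hypothetical, not part of the patch):

static void h_predictor_32x32_sketch(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *left) {
  int r, c;
  // Every output row is the corresponding left-column pixel replicated 32x.
  for (r = 0; r < 32; ++r) {
    for (c = 0; c < 32; ++c) dst[c] = left[r];
    dst += stride;
  }
}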
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm @@ -11,6 +11,7 @@ EXPORT |vpx_lpf_horizontal_4_neon| EXPORT |vpx_lpf_vertical_4_neon| EXPORT |vpx_lpf_horizontal_4_dual_neon| + EXPORT |vpx_lpf_vertical_4_dual_neon| ARM AREA ||.text||, CODE, READONLY, ALIGN=2 @@ -54,7 +55,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon + bl filter4_8 vst1.u8 {d4}, [r2@64], r1 ; store op1 vst1.u8 {d5}, [r3@64], r1 ; store op0 @@ -114,7 +115,7 @@ vtrn.8 d7, d16 vtrn.8 d17, d18 - bl vpx_loop_filter_neon + bl filter4_8 sub r0, r0, #2 @@ -131,7 +132,7 @@ pop {pc} ENDP ; |vpx_lpf_vertical_4_neon| -; void vpx_loop_filter_neon(); +; void filter4_8(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. The function does not use ; registers d8-d15. @@ -155,7 +156,7 @@ ; d5 op0 ; d6 oq0 ; d7 oq1 -|vpx_loop_filter_neon| PROC +|filter4_8| PROC ; filter_mask vabd.u8 d19, d3, d4 ; m1 = abs(p3 - p2) vabd.u8 d20, d4, d5 ; m2 = abs(p2 - p1) @@ -245,7 +246,7 @@ veor d7, d20, d18 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon| + ENDP ; |filter4_8| ;void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p, ; const uint8_t *blimit0, @@ -300,7 +301,7 @@ sub r2, r2, r1, lsl #1 sub r3, r3, r1, lsl #1 - bl vpx_loop_filter_neon_16 + bl filter4_16 vst1.u8 {q5}, [r2@64], r1 ; store op1 vst1.u8 {q6}, [r3@64], r1 ; store op0 @@ -312,7 +313,122 @@ pop {pc} ENDP ; |vpx_lpf_horizontal_4_dual_neon| -; void vpx_loop_filter_neon_16(); +;void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, +; const uint8_t *blimit0, +; const uint8_t *limit0, +; const uint8_t *thresh0, +; const uint8_t *blimit1, +; const uint8_t *limit1, +; const uint8_t *thresh1) +; r0 uint8_t *s, +; r1 int p, +; r2 const uint8_t *blimit0, +; r3 const uint8_t *limit0, +; sp const uint8_t *thresh0, +; sp+4 const uint8_t *blimit1, +; sp+8 const uint8_t *limit1, +; sp+12 const uint8_t *thresh1, + +|vpx_lpf_vertical_4_dual_neon| PROC + push {lr} + + ldr r12, [sp, #4] ; load thresh0 + vld1.8 {d0}, [r2] ; load blimit0 to first half q + vld1.8 {d2}, [r3] ; load limit0 to first half q + + ldr r2, [sp, #8] ; load blimit1 + + vld1.8 {d4}, [r12] ; load thresh0 to first half q + + ldr r3, [sp, #12] ; load limit1 + ldr r12, [sp, #16] ; load thresh1 + vld1.8 {d1}, [r2] ; load blimit1 to 2nd half q + + sub r2, r0, #4 ; s[-4] + + vld1.8 {d3}, [r3] ; load limit1 to 2nd half q + vld1.8 {d5}, [r12] ; load thresh1 to 2nd half q + + vpush {d8-d15} ; save neon registers + + vld1.u8 {d6}, [r2], r1 ; 00 01 02 03 04 05 06 07 + vld1.u8 {d8}, [r2], r1 ; 10 11 12 13 14 15 16 17 + vld1.u8 {d10}, [r2], r1 ; 20 21 22 23 24 25 26 27 + vld1.u8 {d12}, [r2], r1 ; 30 31 32 33 34 35 36 37 + vld1.u8 {d14}, [r2], r1 ; 40 41 42 43 44 45 46 47 + vld1.u8 {d16}, [r2], r1 ; 50 51 52 53 54 55 56 57 + vld1.u8 {d18}, [r2], r1 ; 60 61 62 63 64 65 66 67 + vld1.u8 {d20}, [r2], r1 ; 70 71 72 73 74 75 76 77 + vld1.u8 {d7}, [r2], r1 ; 80 81 82 83 84 85 86 87 + vld1.u8 {d9}, [r2], r1 ; 90 91 92 93 94 95 96 97 + vld1.u8 {d11}, [r2], r1 ; A0 A1 A2 A3 A4 A5 A6 A7 + vld1.u8 {d13}, [r2], r1 ; B0 B1 B2 B3 B4 B5 B6 B7 + vld1.u8 {d15}, [r2], r1 ; C0 C1 C2 C3 C4 C5 C6 C7 + vld1.u8 {d17}, [r2], r1 ; D0 D1 D2 D3 D4 D5 D6 D7 + vld1.u8 {d19}, [r2], r1 ; E0 E1 E2 E3 E4 E5 E6 E7 + vld1.u8 {d21}, [r2] ; F0 F1 F2 F3 F4 F5 F6 F7 + + vtrn.8 q3, q4 ; q3 : 00 10 02 12 04 14 06 16 80 90 82 92 84 94 86 96 + ; q4 : 01 11 03 13 05 
15 07 17 81 91 83 93 85 95 87 97 + vtrn.8 q5, q6 ; q5 : 20 30 22 32 24 34 26 36 A0 B0 A2 B2 A4 B4 A6 B6 + ; q6 : 21 31 23 33 25 35 27 37 A1 B1 A3 B3 A5 B5 A7 B7 + vtrn.8 q7, q8 ; q7 : 40 50 42 52 44 54 46 56 C0 D0 C2 D2 C4 D4 C6 D6 + ; q8 : 41 51 43 53 45 55 47 57 C1 D1 C3 D3 C5 D5 C7 D7 + vtrn.8 q9, q10 ; q9 : 60 70 62 72 64 74 66 76 E0 F0 E2 F2 E4 F4 E6 F6 + ; q10: 61 71 63 73 65 75 67 77 E1 F1 E3 F3 E5 F5 E7 F7 + + vtrn.16 q3, q5 ; q3 : 00 10 20 30 04 14 24 34 80 90 A0 B0 84 94 A4 B4 + ; q5 : 02 12 22 32 06 16 26 36 82 92 A2 B2 86 96 A6 B6 + vtrn.16 q4, q6 ; q4 : 01 11 21 31 05 15 25 35 81 91 A1 B1 85 95 A5 B5 + ; q6 : 03 13 23 33 07 17 27 37 83 93 A3 B3 87 97 A7 B7 + vtrn.16 q7, q9 ; q7 : 40 50 60 70 44 54 64 74 C0 D0 E0 F0 C4 D4 E4 F4 + ; q9 : 42 52 62 72 46 56 66 76 C2 D2 E2 F2 C6 D6 E6 F6 + vtrn.16 q8, q10 ; q8 : 41 51 61 71 45 55 65 75 C1 D1 E1 F1 C5 D5 E5 F5 + ; q10: 43 53 63 73 47 57 67 77 C3 D3 E3 F3 C7 D7 E7 F7 + + vtrn.32 q3, q7 ; q3 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 + ; q7 : 04 14 24 34 44 54 64 74 84 94 A4 B4 C4 D4 E4 F4 + vtrn.32 q5, q9 ; q5 : 02 12 22 32 42 52 62 72 82 92 A2 B2 C2 D2 E2 F2 + ; q9 : 06 16 26 36 46 56 66 76 86 96 A6 B6 C6 D6 E6 F6 + vtrn.32 q4, q8 ; q4 : 01 11 21 31 41 51 61 71 81 91 A1 B1 C1 D1 E1 F1 + ; q8 : 05 15 25 35 45 55 65 75 85 95 A5 B5 C5 D5 E5 F5 + vtrn.32 q6, q10 ; q6 : 03 13 23 33 43 53 63 73 83 93 A3 B3 C3 D3 E3 F3 + ; q10: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 + + bl filter4_16 + + sub r0, #2 + + vmov d0, d11 + vmov d1, d13 + vmov d2, d15 + vmov d3, d17 + vmov d11, d12 + vmov d12, d14 + vmov d13, d16 + vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 + vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 + vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 + vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 + vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 + vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 + vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 + vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0], r1 + vst4.8 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 + vst4.8 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 + vst4.8 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 + vst4.8 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 + vst4.8 {d0[4], d1[4], d2[4], d3[4]}, [r0], r1 + vst4.8 {d0[5], d1[5], d2[5], d3[5]}, [r0], r1 + vst4.8 {d0[6], d1[6], d2[6], d3[6]}, [r0], r1 + vst4.8 {d0[7], d1[7], d2[7], d3[7]}, [r0] + + vpop {d8-d15} ; restore neon registers + + pop {pc} + ENDP ; |vpx_lpf_vertical_4_dual_neon| + +; void filter4_16(); ; This is a helper function for the loopfilters. The invidual functions do the ; necessary load, transpose (if necessary) and store. This function uses ; registers d8-d15, so the calling function must save those registers. @@ -335,7 +451,7 @@ ; q6 op0 ; q7 oq0 ; q8 oq1 -|vpx_loop_filter_neon_16| PROC +|filter4_16| PROC ; filter_mask vabd.u8 q11, q3, q4 ; m1 = abs(p3 - p2) @@ -428,6 +544,6 @@ veor q8, q12, q10 ; *oq1 = u^0x80 bx lr - ENDP ; |vpx_loop_filter_neon_16| + ENDP ; |filter4_16| END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c deleted file mode 100644 index ced5aef0ab2..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vpx_dsp_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, - const uint8_t *limit0, const uint8_t *thresh0, - const uint8_t *blimit1, const uint8_t *limit1, - const uint8_t *thresh1) { - vpx_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0); - vpx_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1); -} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h index 445add29689..8366ce50b87 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h @@ -21,7 +21,7 @@ // // b0.val[0]: 00 01 02 03 16 17 18 19 // b0.val[1]: 04 05 06 07 20 21 22 23 -static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { +static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) { int16x8x2_t b0; b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), vreinterpret_s16_s32(vget_low_s32(a1))); @@ -30,7 +30,23 @@ static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { return b0; } -static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) { +static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) { + int32x4x2_t b0; + b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1)); + b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1)); + return b0; +} + +static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { + int64x2x2_t b0; + b0.val[0] = vcombine_s64(vreinterpret_s64_s32(vget_low_s32(a0)), + vreinterpret_s64_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s64(vreinterpret_s64_s32(vget_high_s32(a0)), + vreinterpret_s64_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) { uint8x16x2_t b0; b0.val[0] = vcombine_u8(vreinterpret_u8_u32(vget_low_u32(a0)), vreinterpret_u8_u32(vget_low_u32(a1))); @@ -110,6 +126,37 @@ static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, *a3 = vreinterpret_s16_s32(c1.val[1]); } +static INLINE void transpose_s16_4x4q(int16x8_t *a0, int16x8_t *a1) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const int32x4x2_t b0 = + vtrnq_s32(vreinterpretq_s32_s16(*a0), vreinterpretq_s32_s16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const int32x4_t c0 = + vcombine_s32(vget_low_s32(b0.val[0]), vget_low_s32(b0.val[1])); + const int32x4_t c1 = + vcombine_s32(vget_high_s32(b0.val[0]), vget_high_s32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const int16x8x2_t d0 = + vtrnq_s16(vreinterpretq_s16_s32(c0), vreinterpretq_s16_s32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { // Swap 32 bit elements. Goes from: // a0: 00 01 02 03 10 11 12 13 @@ -141,6 +188,211 @@ static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { *a1 = d0.val[1]; } +static INLINE void transpose_u8_4x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, const uint8x8_t a4, + const uint8x8_t a5, const uint8x8_t a6, + const uint8x8_t a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 XX XX XX XX + // a1: 10 11 12 13 XX XX XX XX + // a2: 20 21 22 23 XX XX XX XX + // a3; 30 31 32 33 XX XX XX XX + // a4: 40 41 42 43 XX XX XX XX + // a5: 50 51 52 53 XX XX XX XX + // a6: 60 61 62 63 XX XX XX XX + // a7: 70 71 72 73 XX XX XX XX + // to: + // b0.val[0]: 00 01 02 03 40 41 42 43 + // b1.val[0]: 10 11 12 13 50 51 52 53 + // b2.val[0]: 20 21 22 23 60 61 62 63 + // b3.val[0]: 30 31 32 33 70 71 72 73 + + const uint32x2x2_t b0 = + vtrn_u32(vreinterpret_u32_u8(*a0), vreinterpret_u32_u8(a4)); + const uint32x2x2_t b1 = + vtrn_u32(vreinterpret_u32_u8(*a1), vreinterpret_u32_u8(a5)); + const uint32x2x2_t b2 = + vtrn_u32(vreinterpret_u32_u8(*a2), vreinterpret_u32_u8(a6)); + const uint32x2x2_t b3 = + vtrn_u32(vreinterpret_u32_u8(*a3), vreinterpret_u32_u8(a7)); + + // Swap 16 bit elements resulting in: + // c0.val[0]: 00 01 20 21 40 41 60 61 + // c0.val[1]: 02 03 22 23 42 43 62 63 + // c1.val[0]: 10 11 30 31 50 51 70 71 + // c1.val[1]: 12 13 32 33 52 53 72 73 + + const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]), + vreinterpret_u16_u32(b2.val[0])); + const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]), + vreinterpret_u16_u32(b3.val[0])); + + // Swap 8 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 01 11 21 31 41 51 61 71 + // d1.val[0]: 02 12 22 32 42 52 62 72 + // d1.val[1]: 03 13 23 33 43 53 63 73 + + const uint8x8x2_t d0 = + vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0])); + const uint8x8x2_t d1 = + vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1])); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; + *a2 = d1.val[0]; + *a3 = d1.val[1]; +} + +static INLINE void transpose_s32_4x4(int32x4_t *a0, int32x4_t *a1, + int32x4_t *a2, int32x4_t *a3) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + + *a0 = c0.val[0]; + *a1 = c1.val[0]; + *a2 = c0.val[1]; + *a3 = c1.val[1]; +} + +static INLINE void transpose_s16_4x8(const int16x4_t a0, const int16x4_t a1, + const int16x4_t a2, const int16x4_t a3, + const int16x4_t a4, const int16x4_t a5, + const int16x4_t a6, const int16x4_t a7, + int16x8_t *const o0, int16x8_t *const o1, + int16x8_t *const o2, int16x8_t *const o3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int16x4x2_t b0 = vtrn_s16(a0, a1); + const int16x4x2_t b1 = vtrn_s16(a2, a3); + const int16x4x2_t b2 = vtrn_s16(a4, a5); + const int16x4x2_t b3 = vtrn_s16(a6, a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + const int32x2x2_t c2 = vtrn_s32(vreinterpret_s32_s16(b2.val[0]), + vreinterpret_s32_s16(b3.val[0])); + const int32x2x2_t c3 = vtrn_s32(vreinterpret_s32_s16(b2.val[1]), + vreinterpret_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // o0: 00 10 20 30 40 50 60 70 + // o1: 01 11 21 31 41 51 61 71 + // o2: 02 12 22 32 42 52 62 72 + // o3: 03 13 23 33 43 53 63 73 + + *o0 = vcombine_s16(vreinterpret_s16_s32(c0.val[0]), + vreinterpret_s16_s32(c2.val[0])); + *o1 = vcombine_s16(vreinterpret_s16_s32(c1.val[0]), + vreinterpret_s16_s32(c3.val[0])); + *o2 = vcombine_s16(vreinterpret_s16_s32(c0.val[1]), + vreinterpret_s16_s32(c2.val[1])); + *o3 = vcombine_s16(vreinterpret_s16_s32(c1.val[1]), + vreinterpret_s16_s32(c3.val[1])); +} + +static INLINE void transpose_s32_4x8(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // a4: 40 41 42 43 + // a5: 50 51 52 53 + // a6: 60 61 62 63 + // a7: 70 71 72 73 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + // b2.val[0]: 40 50 42 52 + // b2.val[1]: 41 51 43 53 + // b3.val[0]: 60 70 62 72 + // b3.val[1]: 61 71 63 73 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a1); + const int32x4x2_t b1 = vtrnq_s32(*a2, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a5); + const int32x4x2_t b3 = vtrnq_s32(*a6, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 40 50 60 70 + // c2.val[1]: 42 52 62 72 + // c3.val[0]: 41 51 61 71 + // c3.val[1]: 43 53 63 73 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b1.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b1.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b2.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b2.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c2.val[0]); + *a2 = vreinterpretq_s32_s64(c1.val[0]); + *a3 = vreinterpretq_s32_s64(c3.val[0]); + *a4 = vreinterpretq_s32_s64(c0.val[1]); + *a5 = vreinterpretq_s32_s64(c2.val[1]); + *a6 = vreinterpretq_s32_s64(c1.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3) { // Swap 8 bit elements. Goes from: @@ -207,6 +459,59 @@ static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, *a3 = vreinterpretq_u16_u32(c1.val[1]); } +static INLINE void transpose_s32_8x4(int32x4_t *const a0, int32x4_t *const a1, + int32x4_t *const a2, int32x4_t *const a3, + int32x4_t *const a4, int32x4_t *const a5, + int32x4_t *const a6, int32x4_t *const a7) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 + // a1: 04 05 06 07 + // a2: 10 11 12 13 + // a3: 14 15 16 17 + // a4: 20 21 22 23 + // a5: 24 25 26 27 + // a6: 30 31 32 33 + // a7: 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 04 14 06 16 + // b1.val[1]: 05 15 07 17 + // b2.val[0]: 20 30 22 32 + // b2.val[1]: 21 31 23 33 + // b3.val[0]: 24 34 26 36 + // b3.val[1]: 25 35 27 37 + + const int32x4x2_t b0 = vtrnq_s32(*a0, *a2); + const int32x4x2_t b1 = vtrnq_s32(*a1, *a3); + const int32x4x2_t b2 = vtrnq_s32(*a4, *a6); + const int32x4x2_t b3 = vtrnq_s32(*a5, *a7); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + // c2.val[0]: 04 14 24 34 + // c2.val[1]: 06 16 26 36 + // c3.val[0]: 05 15 25 35 + // c3.val[1]: 07 17 27 37 + + const int64x2x2_t c0 = vpx_vtrnq_s64(b0.val[0], b2.val[0]); + const int64x2x2_t c1 = vpx_vtrnq_s64(b0.val[1], b2.val[1]); + const int64x2x2_t c2 = vpx_vtrnq_s64(b1.val[0], b3.val[0]); + const int64x2x2_t c3 = vpx_vtrnq_s64(b1.val[1], b3.val[1]); + + *a0 = vreinterpretq_s32_s64(c0.val[0]); + *a1 = vreinterpretq_s32_s64(c1.val[0]); + *a2 = vreinterpretq_s32_s64(c0.val[1]); + *a3 = vreinterpretq_s32_s64(c1.val[1]); + *a4 = vreinterpretq_s32_s64(c2.val[0]); + *a5 = vreinterpretq_s32_s64(c3.val[0]); + *a6 = vreinterpretq_s32_s64(c2.val[1]); + *a7 = vreinterpretq_s32_s64(c3.val[1]); +} + // Note: Using 'd' registers or 'q' registers has almost identical speed. We use // 'q' registers here to save some instructions. 
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -319,10 +624,10 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, // d2.val[1]: 06 16 26 36 46 56 66 76 // d3.val[0]: 03 13 23 33 43 53 63 73 // d3.val[1]: 07 17 27 37 47 57 67 77 - const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); - const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); - const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); - const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]); + const int16x8x2_t d0 = vpx_vtrnq_s64_to_s16(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64_to_s16(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64_to_s16(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64_to_s16(c1.val[1], c3.val[1]); *a0 = d0.val[0]; *a1 = d1.val[0]; @@ -758,14 +1063,14 @@ static INLINE void transpose_u8_16x16( // e6.val[1]: 0E 1E 2E 3E 4E 5E 6E 7E 8E 9E AE BE CE DE EE FE // e7.val[0]: 07 17 27 37 47 57 67 77 87 97 A7 B7 C7 D7 E7 F7 // e7.val[1]: 0F 1F 2F 3F 4F 5F 6F 7F 8F 9F AF BF CF DF EF FF - const uint8x16x2_t e0 = vpx_vtrnq_u64(d0.val[0], d4.val[0]); - const uint8x16x2_t e1 = vpx_vtrnq_u64(d2.val[0], d6.val[0]); - const uint8x16x2_t e2 = vpx_vtrnq_u64(d1.val[0], d5.val[0]); - const uint8x16x2_t e3 = vpx_vtrnq_u64(d3.val[0], d7.val[0]); - const uint8x16x2_t e4 = vpx_vtrnq_u64(d0.val[1], d4.val[1]); - const uint8x16x2_t e5 = vpx_vtrnq_u64(d2.val[1], d6.val[1]); - const uint8x16x2_t e6 = vpx_vtrnq_u64(d1.val[1], d5.val[1]); - const uint8x16x2_t e7 = vpx_vtrnq_u64(d3.val[1], d7.val[1]); + const uint8x16x2_t e0 = vpx_vtrnq_u64_to_u8(d0.val[0], d4.val[0]); + const uint8x16x2_t e1 = vpx_vtrnq_u64_to_u8(d2.val[0], d6.val[0]); + const uint8x16x2_t e2 = vpx_vtrnq_u64_to_u8(d1.val[0], d5.val[0]); + const uint8x16x2_t e3 = vpx_vtrnq_u64_to_u8(d3.val[0], d7.val[0]); + const uint8x16x2_t e4 = vpx_vtrnq_u64_to_u8(d0.val[1], d4.val[1]); + const uint8x16x2_t e5 = vpx_vtrnq_u64_to_u8(d2.val[1], d6.val[1]); + const uint8x16x2_t e6 = vpx_vtrnq_u64_to_u8(d1.val[1], d5.val[1]); + const uint8x16x2_t e7 = vpx_vtrnq_u64_to_u8(d3.val[1], d7.val[1]); // Output: // o0 : 00 10 20 30 40 50 60 70 80 90 A0 B0 C0 D0 E0 F0 @@ -802,4 +1107,101 @@ static INLINE void transpose_u8_16x16( *o15 = e7.val[1]; } +static INLINE void load_and_transpose_u8_4x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3) { + uint8x8_t a4, a5, a6, a7; + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + a4 = vld1_u8(a); + a += a_stride; + a5 = vld1_u8(a); + a += a_stride; + a6 = vld1_u8(a); + a += a_stride; + a7 = vld1_u8(a); + + transpose_u8_4x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void load_and_transpose_u8_8x8(const uint8_t *a, + const int a_stride, uint8x8_t *a0, + uint8x8_t *a1, uint8x8_t *a2, + uint8x8_t *a3, uint8x8_t *a4, + uint8x8_t *a5, uint8x8_t *a6, + uint8x8_t *a7) { + *a0 = vld1_u8(a); + a += a_stride; + *a1 = vld1_u8(a); + a += a_stride; + *a2 = vld1_u8(a); + a += a_stride; + *a3 = vld1_u8(a); + a += a_stride; + *a4 = vld1_u8(a); + a += a_stride; + *a5 = vld1_u8(a); + a += a_stride; + *a6 = vld1_u8(a); + a += a_stride; + *a7 = vld1_u8(a); + + transpose_u8_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +static INLINE void transpose_and_store_u8_8x8(uint8_t *a, const int a_stride, + uint8x8_t a0, uint8x8_t a1, + uint8x8_t a2, uint8x8_t a3, + uint8x8_t a4, uint8x8_t a5, + uint8x8_t a6, 
uint8x8_t a7) { + transpose_u8_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + + vst1_u8(a, a0); + a += a_stride; + vst1_u8(a, a1); + a += a_stride; + vst1_u8(a, a2); + a += a_stride; + vst1_u8(a, a3); + a += a_stride; + vst1_u8(a, a4); + a += a_stride; + vst1_u8(a, a5); + a += a_stride; + vst1_u8(a, a6); + a += a_stride; + vst1_u8(a, a7); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, + const int a_stride, int16x8_t *a0, + int16x8_t *a1, int16x8_t *a2, + int16x8_t *a3, int16x8_t *a4, + int16x8_t *a5, int16x8_t *a6, + int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} #endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c index 589b124e26a..6c27484979a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c @@ -117,7 +117,7 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, unsigned char d[16]; for (r = 0; r < rows; r++) { - int sumsq = 0; + int sumsq = 16; int sum = 0; for (i = -8; i < 0; i++) s[i] = s[0]; @@ -156,14 +156,12 @@ void vpx_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; for (c = 0; c < cols; c++) { unsigned char *s = &dst[c]; int sumsq = 0; int sum = 0; unsigned char d[16]; - const int16_t *rv2 = rv3 + ((c * 17) & 127); for (i = -8; i < 0; i++) s[i * pitch] = s[0]; @@ -183,7 +181,7 @@ void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, d[r & 15] = s[0]; if (sumsq * 15 - sum * sum < flimit) { - d[r & 15] = (rv2[r & 127] + sum + s[0]) >> 4; + d[r & 15] = (vpx_rv[(r & 127) + (c & 7)] + sum + s[0]) >> 4; } if (r >= 8) s[-8 * pitch] = d[(r - 8) & 15]; s += pitch; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c index f3f543ddfe8..0f9aff1892a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c @@ -67,7 +67,7 @@ void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { +void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -84,10 +84,10 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1); - dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1); - dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1); - dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1); + dest[stride * 0] = clip_pixel_add(dest[stride * 0], a1); + dest[stride * 1] = clip_pixel_add(dest[stride * 1], e1); + dest[stride * 2] = clip_pixel_add(dest[stride * 2], e1); + dest[stride * 3] = clip_pixel_add(dest[stride * 3], e1); 
ip++; dest++; } @@ -138,8 +138,7 @@ void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } -void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); @@ -152,7 +151,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, dest[1] = clip_pixel_add(dest[1], a1); dest[2] = clip_pixel_add(dest[2], a1); dest[3] = clip_pixel_add(dest[3], a1); - dest += dest_stride; + dest += stride; } } @@ -1324,7 +1323,7 @@ void vpx_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, - int dest_stride, int bd) { + int stride, int bd) { int i; tran_high_t a1, e1; tran_low_t tmp[4]; @@ -1343,14 +1342,10 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, for (i = 0; i < 4; i++) { e1 = ip[0] >> 1; a1 = ip[0] - e1; - dest[dest_stride * 0] = - highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd); - dest[dest_stride * 1] = - highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd); - dest[dest_stride * 2] = - highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd); - dest[dest_stride * 3] = - highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd); + dest[stride * 0] = highbd_clip_pixel_add(dest[stride * 0], a1, bd); + dest[stride * 1] = highbd_clip_pixel_add(dest[stride * 1], e1, bd); + dest[stride * 2] = highbd_clip_pixel_add(dest[stride * 2], e1, bd); + dest[stride * 3] = highbd_clip_pixel_add(dest[stride * 3], e1, bd); ip++; dest++; } @@ -1413,7 +1408,7 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, } void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, - int dest_stride, int bd) { + int stride, int bd) { int i; tran_high_t a1; tran_low_t out = @@ -1428,7 +1423,7 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, dest[1] = highbd_clip_pixel_add(dest[1], a1, bd); dest[2] = highbd_clip_pixel_add(dest[2], a1, bd); dest[3] = highbd_clip_pixel_add(dest[3], a1, bd); - dest += dest_stride; + dest += stride; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c index 31812299c34..b4ed6ee850a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -403,8 +403,11 @@ void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { int x, y; - uint32_t tp1, tp2, tn1; - uint32_t tp3, tp4, tn2; + uint32_t tp1, tp2, tn1, tp3, tp4, tn2; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c index f6812c7d049..8d35b6394e2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/convolve8_dspr2.c @@ -1307,6 +1307,7 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, assert(y_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); assert(((const 
int32_t *)filter_y)[1] != 0x800000); + (void)x_step_q4; /* bit positon for extract from acc */ __asm__ __volatile__("wrdsp %[pos], 1 \n\t" @@ -1398,6 +1399,10 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h) { int x, y; + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c index cc633c6698d..e33ea740a9e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c @@ -459,7 +459,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, flimit_vec = __msa_fill_w(flimit); for (row = rows; row--;) { - int32_t sum_sq = 0; + int32_t sum_sq; int32_t sum = 0; src0 = (v16u8)__msa_fill_b(src_dup[0]); ST8x1_UB(src0, (src_dup - 8)); @@ -474,7 +474,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, ILVRL_B2_UH(zero, src, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); - sum_sq = HADD_SW_S32(src_r_w); + sum_sq = HADD_SW_S32(src_r_w) + 16; sum_h = __msa_hadd_u_h(src, src); sum = HADD_UH_U32(sum_h); { @@ -573,7 +573,6 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, int32_t cols, int32_t flimit) { int32_t row, col, cnt, i; - const int16_t *rv3 = &vpx_rv[63 & rand()]; v4i32 flimit_vec; v16u8 dst7, dst8, dst_r_b, dst_l_b; v16i8 mask; @@ -601,7 +600,7 @@ void vpx_mbpost_proc_down_msa(uint8_t *dst_ptr, int32_t pitch, int32_t rows, dst = LD_UB(dst_tmp); for (cnt = (col << 4), i = 0; i < 16; ++cnt) { - rv2[i] = rv3 + ((cnt * 17) & 127); + rv2[i] = vpx_rv + (i & 7); ++i; } for (cnt = -8; cnt < 0; ++cnt) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c index 3e29d0ac39f..835e10e125c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred16_dspr2.c @@ -15,6 +15,7 @@ void vpx_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c index 9f51d50c752..dce03a2b2a0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred4_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c index eac79d51000..16e7fc55079 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/intrapred8_dspr2.c @@ -14,6 +14,7 @@ void vpx_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; + (void)above; __asm__ __volatile__( "lb %[tmp1], (%[left]) \n\t" diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h index edd54aec5e2..27881f0db6c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_dspr2.h @@ -57,18 +57,15 @@ extern "C" { out; \ }) -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output); -void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst4_dspr2(const int16_t *input, int16_t *output); void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride); +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst8_dspr2(const int16_t *input, int16_t *output); void idct16_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows); -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride); +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride); void iadst16_dspr2(const int16_t *input, int16_t *output); #endif // #if HAVE_DSPR2 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c index 0ec0c2059f4..44ba65c7ac8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans16_dspr2.c @@ -389,7 +389,7 @@ void idct16_rows_dspr2(const int16_t *input, int16_t *output, } } -void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { +void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int i; int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int step1_8, step1_9, step1_10, step1_11; @@ -712,14 +712,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_1], %[step1_6] \n\t" "add %[load6], %[load6], %[step1_14] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_2], %[step1_5] \n\t" @@ -731,14 +731,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_3], %[step1_4] \n\t" "add %[load6], %[load6], %[step1_12] \n\t" "sb %[load5], 
0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_3], %[step1_4] \n\t" @@ -750,14 +750,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_2], %[step1_5] \n\t" "add %[load6], %[load6], %[step1_10] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "sub %[load5], %[step1_1], %[step1_6] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" @@ -769,14 +769,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_0], %[step1_7] \n\t" "add %[load6], %[load6], %[step1_8] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_0], %[step1_7] \n\t" @@ -788,14 +788,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_1], %[step1_6] \n\t" "sub %[load6], %[load6], %[step1_9] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "sub %[load5], %[step1_2], %[step1_5] \n\t" @@ -807,14 +807,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "sub %[load6], %[step1_3], %[step1_4] \n\t" "sub %[load6], %[load6], %[step1_11] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_3], %[step1_4] \n\t" @@ -826,14 +826,14 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) 
{ "add %[load6], %[step1_2], %[step1_5] \n\t" "sub %[load6], %[load6], %[step1_13] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" "add %[load8], %[load8], %[load6] \n\t" "lbux %[load6], %[load8](%[cm]) \n\t" "sb %[load6], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load7], 0(%[dest_pix]) \n\t" "add %[load5], %[step1_1], %[step1_6] \n\t" @@ -845,7 +845,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { "add %[load6], %[step1_0], %[step1_7] \n\t" "sub %[load6], %[load6], %[step1_15] \n\t" "sb %[load5], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[load8], 0(%[dest_pix]) \n\t" "addi %[load6], %[load6], 32 \n\t" "sra %[load6], %[load6], 6 \n\t" @@ -856,7 +856,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { : [load5] "=&r"(load5), [load6] "=&r"(load6), [load7] "=&r"(load7), [load8] "=&r"(load8), [dest_pix] "+r"(dest_pix) : - [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), @@ -869,7 +869,7 @@ void idct16_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int dest_stride) { } void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); uint32_t pos = 45; @@ -880,11 +880,11 @@ void vpx_idct16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, idct16_rows_dspr2(input, out, 16); // Then transform columns and add to dest - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[16 * 16]); int16_t *outptr = out; uint32_t i; @@ -924,11 +924,11 @@ void vpx_idct16x16_10_add_dspr2(const int16_t *input, uint8_t *dest, } // Then transform columns - idct16_cols_add_blk_dspr2(out, dest, dest_stride); + idct16_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -975,13 +975,54 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb 
%[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 16; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -1005,13 +1046,13 @@ void vpx_idct16x16_1_add_dspr2(const int16_t *input, uint8_t *dest, "sw %[vector_2], 4(%[dest]) \n\t" "sw %[vector_3], 8(%[dest]) \n\t" "sw %[vector_4], 12(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c index ce25d55c9c0..3f043b48baf 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_cols_dspr2.c @@ -13,26 +13,25 @@ #include "vpx_dsp/txfm_common.h" #if HAVE_DSPR2 -void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19; - int16_t step1_20, step1_21, step1_22, step1_23, step1_24, step1_25, step1_26; - int16_t step1_27, step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t step3_22, step3_23, step3_24, step3_25, step3_26, step3_27; - int16_t step3_28, step3_29, step3_30, step3_31; +void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { + int step1_0, step1_1, step1_2, 
step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int i, temp21; + int i; uint8_t *dest_pix, *dest_pix1; const int const_2_power_13 = 8192; uint8_t *cm = vpx_ff_cropTbl; @@ -49,7 +48,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, for (i = 0; i < 32; ++i) { dest_pix = dest + i; - dest_pix1 = dest + i + 31 * dest_stride; + dest_pix1 = dest + i + 31 * stride; __asm__ __volatile__( "lh %[load1], 2(%[input]) \n\t" @@ -103,9 +102,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -163,9 +162,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -223,9 +222,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] 
"r"(cospi_21_64), @@ -279,9 +278,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -335,9 +334,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -391,9 +390,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -434,116 +433,154 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] "r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], 
%[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, %[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub 
$ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add %[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), 
+ [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -580,9 +617,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64)); @@ -638,96 +675,137 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; + __asm__ __volatile__( + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] 
"r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); // stage 7 - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" + "add %[step1_0], %[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); - - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), 
[step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] \n\t" - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -738,14 +816,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_1], %[step2_30] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu 
%[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_2], %[step2_29] \n\t" @@ -755,18 +833,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_3], %[step2_28] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_0] "r"(step1_0), + : [cm] "r"(cm), [stride] "r"(stride), [step1_0] "r"(step1_0), [step1_1] "r"(step1_1), [step1_2] "r"(step1_2), [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), [step2_29] "r"(step2_29), [step2_30] "r"(step2_30), @@ -782,29 +860,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -815,14 +893,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_5], %[step1_26] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_6], %[step1_25] \n\t" @@ -832,18 +910,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], 
%[step1_7], %[step1_24] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_4] "r"(step1_4), + : [cm] "r"(cm), [stride] "r"(stride), [step1_4] "r"(step1_4), [step1_5] "r"(step1_5), [step1_6] "r"(step1_6), [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), [step1_25] "r"(step1_25), [step1_26] "r"(step1_26), @@ -859,29 +937,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -892,14 +970,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_9], %[step1_22] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_10], %[step1_21] \n\t" @@ -909,18 +987,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_11], %[step1_20] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi 
%[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), [step1_8] "r"(step1_8), + : [cm] "r"(cm), [stride] "r"(stride), [step1_8] "r"(step1_8), [step1_9] "r"(step1_9), [step1_10] "r"(step1_10), [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), [step1_21] "r"(step1_21), [step1_22] "r"(step1_22), @@ -936,29 +1014,29 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); __asm__ __volatile__( "lbu %[temp2], 0(%[dest_pix]) \n\t" @@ -969,14 +1047,14 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_13], %[step2_18] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" "add %[temp3], %[temp3], %[temp1] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix]) \n\t" "add %[temp0], %[step1_14], %[step2_17] \n\t" @@ -986,7 +1064,7 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "lbux %[temp0], %[temp2](%[cm]) \n\t" "add %[temp1], %[step1_15], %[step2_16] \n\t" "sb %[temp0], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix]) \n\t" "addi %[temp1], %[temp1], 32 \n\t" "sra %[temp1], %[temp1], 6 \n\t" @@ -996,11 +1074,11 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] 
"=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix] "+r"(dest_pix) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step1_12] "r"(step1_12), [step1_13] "r"(step1_13), - [step1_14] "r"(step1_14), [step1_15] "r"(step1_15), - [step2_16] "r"(step2_16), [step2_17] "r"(step2_17), - [step2_18] "r"(step2_18), [step2_19] "r"(step2_19)); + : [cm] "r"(cm), [stride] "r"(stride), [step1_12] "r"(step1_12), + [step1_13] "r"(step1_13), [step1_14] "r"(step1_14), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [step2_17] "r"(step2_17), [step2_18] "r"(step2_18), + [step2_19] "r"(step2_19)); step3_12 = ROUND_POWER_OF_TWO((step1_15 - step2_16), 6); step3_13 = ROUND_POWER_OF_TWO((step1_14 - step2_17), 6); @@ -1012,18 +1090,18 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, "add %[temp2], %[temp2], %[step3_15] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_14] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" "sb %[temp1], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp2], 0(%[dest_pix1]) \n\t" "add %[temp2], %[temp2], %[step3_13] \n\t" "lbux %[temp0], %[temp2](%[cm]) \n\t" "sb %[temp0], 0(%[dest_pix1]) \n\t" - "subu %[dest_pix1], %[dest_pix1], %[dest_stride] \n\t" + "subu %[dest_pix1], %[dest_pix1], %[stride] \n\t" "lbu %[temp3], 0(%[dest_pix1]) \n\t" "add %[temp3], %[temp3], %[step3_12] \n\t" "lbux %[temp1], %[temp3](%[cm]) \n\t" @@ -1031,9 +1109,9 @@ void vpx_idct32_cols_add_blk_dspr2(int16_t *input, uint8_t *dest, : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [dest_pix1] "+r"(dest_pix1) - : [cm] "r"(cm), [dest_stride] "r"(dest_stride), - [step3_12] "r"(step3_12), [step3_13] "r"(step3_13), - [step3_14] "r"(step3_14), [step3_15] "r"(step3_15)); + : [cm] "r"(cm), [stride] "r"(stride), [step3_12] "r"(step3_12), + [step3_13] "r"(step3_13), [step3_14] "r"(step3_14), + [step3_15] "r"(step3_15)); input += 32; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c index d71c5ffed51..3c0468c00fa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans32_dspr2.c @@ -18,24 +18,23 @@ #if HAVE_DSPR2 static void idct32_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { - int16_t step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; - int16_t step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; - int16_t step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; - int16_t step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; - int16_t step1_28, step1_29, step1_30, step1_31; - int16_t step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - int16_t step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; - int16_t step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; - int16_t step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; - int16_t step2_28, step2_29, step2_30, step2_31; - int16_t step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; - int16_t step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; - int16_t 
step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; - int16_t step3_29, step3_30, step3_31; + int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6; + int step1_7, step1_8, step1_9, step1_10, step1_11, step1_12, step1_13; + int step1_14, step1_15, step1_16, step1_17, step1_18, step1_19, step1_20; + int step1_21, step1_22, step1_23, step1_24, step1_25, step1_26, step1_27; + int step1_28, step1_29, step1_30, step1_31; + int step2_0, step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + int step2_7, step2_8, step2_9, step2_10, step2_11, step2_12, step2_13; + int step2_14, step2_15, step2_16, step2_17, step2_18, step2_19, step2_20; + int step2_21, step2_22, step2_23, step2_24, step2_25, step2_26, step2_27; + int step2_28, step2_29, step2_30, step2_31; + int step3_8, step3_9, step3_10, step3_11, step3_12, step3_13, step3_14; + int step3_15, step3_16, step3_17, step3_18, step3_19, step3_20, step3_21; + int step3_22, step3_23, step3_24, step3_25, step3_26, step3_27, step3_28; + int step3_29, step3_30, step3_31; int temp0, temp1, temp2, temp3; int load1, load2, load3, load4; int result1, result2; - int temp21; int i; const int const_2_power_13 = 8192; const int32_t *input_int; @@ -147,9 +146,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_16] "=r"(step1_16), - [step1_17] "=r"(step1_17), [step1_30] "=r"(step1_30), - [step1_31] "=r"(step1_31) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_16] "=&r"(step1_16), [step1_17] "=&r"(step1_17), + [step1_30] "=&r"(step1_30), [step1_31] "=&r"(step1_31) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_31_64] "r"(cospi_31_64), [cospi_1_64] "r"(cospi_1_64), [cospi_4_64] "r"(cospi_4_64), [cospi_17_64] "r"(cospi_17_64), @@ -207,9 +206,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_18] "=r"(step1_18), - [step1_19] "=r"(step1_19), [step1_28] "=r"(step1_28), - [step1_29] "=r"(step1_29) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_18] "=&r"(step1_18), [step1_19] "=&r"(step1_19), + [step1_28] "=&r"(step1_28), [step1_29] "=&r"(step1_29) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_23_64] "r"(cospi_23_64), [cospi_9_64] "r"(cospi_9_64), [cospi_4_64] "r"(cospi_4_64), [cospi_7_64] "r"(cospi_7_64), @@ -267,9 +266,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_20] "=r"(step1_20), - [step1_21] "=r"(step1_21), [step1_26] "=r"(step1_26), - [step1_27] "=r"(step1_27) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_20] "=&r"(step1_20), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26), [step1_27] "=&r"(step1_27) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_27_64] "r"(cospi_27_64), [cospi_5_64] "r"(cospi_5_64), [cospi_11_64] "r"(cospi_11_64), [cospi_21_64] "r"(cospi_21_64), @@ -289,7 +288,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_19_64] \n\t" "msub $ac1, 
%[load2], %[cospi_13_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_13_64] \n\t" "madd $ac3, %[load2], %[cospi_19_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -302,7 +300,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_3_64] \n\t" "msub $ac2, %[load4], %[cospi_29_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_29_64] \n\t" "madd $ac1, %[load4], %[cospi_3_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -314,12 +311,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_12_64] \n\t" "msub $ac1, %[load2], %[cospi_20_64] \n\t" "msub $ac3, %[load1], %[cospi_20_64] \n\t" "madd $ac3, %[load2], %[cospi_12_64] \n\t" - "extp %[step1_22], $ac1, 31 \n\t" "extp %[step1_25], $ac3, 31 \n\t" "add %[step1_23], %[temp0], %[temp1] \n\t" @@ -327,9 +322,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_22] "=r"(step1_22), - [step1_23] "=r"(step1_23), [step1_24] "=r"(step1_24), - [step1_25] "=r"(step1_25) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step1_22] "=&r"(step1_22), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24), [step1_25] "=&r"(step1_25) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_19_64] "r"(cospi_19_64), [cospi_13_64] "r"(cospi_13_64), [cospi_3_64] "r"(cospi_3_64), [cospi_29_64] "r"(cospi_29_64), @@ -349,7 +344,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_30_64] \n\t" "msub $ac1, %[load2], %[cospi_2_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_2_64] \n\t" "madd $ac3, %[load2], %[cospi_30_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -362,7 +356,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_14_64] \n\t" "msub $ac2, %[load4], %[cospi_18_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_18_64] \n\t" "madd $ac1, %[load4], %[cospi_14_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -374,12 +367,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp0], %[temp1] \n\t" "sub %[load2], %[temp3], %[temp2] \n\t" - "msub $ac1, %[load1], %[cospi_8_64] \n\t" "madd $ac1, %[load2], %[cospi_24_64] \n\t" "madd $ac3, %[load1], %[cospi_24_64] \n\t" "madd $ac3, %[load2], %[cospi_8_64] \n\t" - "extp %[step2_9], $ac1, 31 \n\t" "extp %[step2_14], $ac3, 31 \n\t" "add %[step2_8], %[temp0], %[temp1] \n\t" @@ -387,9 +378,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=r"(step2_8), - [step2_9] "=r"(step2_9), [step2_14] "=r"(step2_14), - [step2_15] "=r"(step2_15) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_8] "=&r"(step2_8), + [step2_9] "=&r"(step2_9), [step2_14] "=&r"(step2_14), + [step2_15] "=&r"(step2_15) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_30_64] "r"(cospi_30_64), [cospi_2_64] "r"(cospi_2_64), [cospi_14_64] "r"(cospi_14_64), [cospi_18_64] "r"(cospi_18_64), @@ -409,7 +400,6 @@ static void 
idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_22_64] \n\t" "msub $ac1, %[load2], %[cospi_10_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_10_64] \n\t" "madd $ac3, %[load2], %[cospi_22_64] \n\t" "extp %[temp3], $ac3, 31 \n\t" @@ -422,7 +412,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_6_64] \n\t" "msub $ac2, %[load4], %[cospi_26_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_26_64] \n\t" "madd $ac1, %[load4], %[cospi_6_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -434,12 +423,10 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp1], %[temp0] \n\t" "sub %[load2], %[temp2], %[temp3] \n\t" - "msub $ac1, %[load1], %[cospi_24_64] \n\t" "msub $ac1, %[load2], %[cospi_8_64] \n\t" "madd $ac3, %[load2], %[cospi_24_64] \n\t" "msub $ac3, %[load1], %[cospi_8_64] \n\t" - "extp %[step2_10], $ac1, 31 \n\t" "extp %[step2_13], $ac3, 31 \n\t" "add %[step2_11], %[temp0], %[temp1] \n\t" @@ -447,9 +434,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step2_10] "=r"(step2_10), - [step2_11] "=r"(step2_11), [step2_12] "=r"(step2_12), - [step2_13] "=r"(step2_13) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), + [step2_10] "=&r"(step2_10), [step2_11] "=&r"(step2_11), + [step2_12] "=&r"(step2_12), [step2_13] "=&r"(step2_13) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_22_64] "r"(cospi_22_64), [cospi_10_64] "r"(cospi_10_64), [cospi_6_64] "r"(cospi_6_64), [cospi_26_64] "r"(cospi_26_64), @@ -462,21 +449,18 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[temp0], %[temp0], %[step2_9] \n\t" "add %[temp0], %[temp0], %[step2_10] \n\t" "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "sub %[temp1], %[step2_14], %[step2_13] \n\t" "add %[temp1], %[temp1], %[step2_9] \n\t" "sub %[temp1], %[temp1], %[step2_10] \n\t" "madd $ac1, %[temp1], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac2 \n\t" "mthi $zero, $ac2 \n\t" "sub %[temp0], %[step2_15], %[step2_12] \n\t" "sub %[temp0], %[temp0], %[step2_8] \n\t" "add %[temp0], %[temp0], %[step2_11] \n\t" "madd $ac2, %[temp0], %[cospi_16_64] \n\t" - "mtlo %[const_2_power_13], $ac3 \n\t" "mthi $zero, $ac3 \n\t" "sub %[temp1], %[step2_15], %[step2_12] \n\t" @@ -488,122 +472,159 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "add %[step3_9], %[step2_9], %[step2_10] \n\t" "add %[step3_14], %[step2_13], %[step2_14] \n\t" "add %[step3_15], %[step2_12], %[step2_15] \n\t" - "extp %[step3_10], $ac0, 31 \n\t" "extp %[step3_13], $ac1, 31 \n\t" "extp %[step3_11], $ac2, 31 \n\t" "extp %[step3_12], $ac3, 31 \n\t" - : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=r"(step3_8), - [step3_9] "=r"(step3_9), [step3_10] "=r"(step3_10), - [step3_11] "=r"(step3_11), [step3_12] "=r"(step3_12), - [step3_13] "=r"(step3_13), [step3_14] "=r"(step3_14), - [step3_15] "=r"(step3_15) + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [step3_8] "=&r"(step3_8), + [step3_9] "=&r"(step3_9), [step3_10] "=&r"(step3_10), + [step3_11] "=&r"(step3_11), [step3_12] "=&r"(step3_12), + [step3_13] "=&r"(step3_13), [step3_14] "=&r"(step3_14), + [step3_15] "=&r"(step3_15) : [const_2_power_13] 
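
Many output operands in these hunks change from "=r" to "=&r". The ampersand marks the operand as earlyclobber, telling the compiler the output register is written before all inputs have been consumed, so it must not share a register with any input operand. A minimal illustrative example of the constraint (not code from libvpx):

static int earlyclobber_demo(int a, int b) {
  int out;
  /* out is written by the first addu while b is still needed by the second,
   * so without the '&' the compiler could legally place out and b in the
   * same register and the second addu would read a clobbered value. */
  __asm__("addu %0, %1, %2 \n\t"
          "addu %0, %0, %2 \n\t"
          : "=&r"(out)
          : "r"(a), "r"(b));
  return out;
}
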
"r"(const_2_power_13), [step2_8] "r"(step2_8), [step2_9] "r"(step2_9), [step2_10] "r"(step2_10), [step2_11] "r"(step2_11), [step2_12] "r"(step2_12), [step2_13] "r"(step2_13), [step2_14] "r"(step2_14), [step2_15] "r"(step2_15), [cospi_16_64] "r"(cospi_16_64)); - step2_18 = step1_17 - step1_18; - step2_29 = step1_30 - step1_29; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_18], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_29], %[cospi_24_64] \n\t" - "extp %[step3_18], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_17], %[step1_18] \n\t" + "sub %[temp1], %[step1_30], %[step1_29] \n\t" + "add %[step3_17], %[step1_17], %[step1_18] \n\t" + "add %[step3_30], %[step1_30], %[step1_29] \n\t" - : [step3_18] "=r"(step3_18) - : [const_2_power_13] "r"(const_2_power_13), [step2_18] "r"(step2_18), - [step2_29] "r"(step2_29), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_18], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_29], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_18] "=&r"(step3_18), [step3_29] "=&r"(step3_29), + [step3_17] "=&r"(step3_17), [step3_30] "=&r"(step3_30) + : [const_2_power_13] "r"(const_2_power_13), [step1_17] "r"(step1_17), + [step1_18] "r"(step1_18), [step1_30] "r"(step1_30), + [step1_29] "r"(step1_29), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_18 * cospi_24_64 + step2_29 * cospi_8_64; - step3_29 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_19 = step1_16 - step1_19; - step2_28 = step1_31 - step1_28; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_19], %[cospi_8_64] \n\t" - "madd $ac0, %[step2_28], %[cospi_24_64] \n\t" - "extp %[step3_19], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_16], %[step1_19] \n\t" + "sub %[temp1], %[step1_31], %[step1_28] \n\t" + "add %[step3_16], %[step1_16], %[step1_19] \n\t" + "add %[step3_31], %[step1_31], %[step1_28] \n\t" - : [step3_19] "=r"(step3_19) - : [const_2_power_13] "r"(const_2_power_13), [step2_19] "r"(step2_19), - [step2_28] "r"(step2_28), [cospi_24_64] "r"(cospi_24_64), + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_19], $ac0, 31 \n\t" + "madd $ac1, %[temp0], %[cospi_24_64] \n\t" + "madd $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_28], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_16] "=&r"(step3_16), [step3_31] "=&r"(step3_31), + [step3_19] "=&r"(step3_19), [step3_28] "=&r"(step3_28) + : [const_2_power_13] "r"(const_2_power_13), [step1_16] "r"(step1_16), + [step1_19] "r"(step1_19), [step1_31] "r"(step1_31), + [step1_28] "r"(step1_28), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = step2_19 * cospi_24_64 + step2_28 * cospi_8_64; - step3_28 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_16 = step1_16 + step1_19; - step3_17 = step1_17 + step1_18; - step3_30 = step1_29 + step1_30; - step3_31 = step1_28 + step1_31; - - step2_20 = step1_23 - step1_20; - step2_27 = step1_24 - step1_27; - __asm__ __volatile__( "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "msub $ac0, %[step2_20], %[cospi_24_64] \n\t" - "msub $ac0, 
%[step2_27], %[cospi_8_64] \n\t" - "extp %[step3_20], $ac0, 31 \n\t" - - : [step3_20] "=r"(step3_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_24_64] "r"(cospi_24_64), + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "sub %[temp0], %[step1_23], %[step1_20] \n\t" + "sub %[temp1], %[step1_24], %[step1_27] \n\t" + "add %[step3_23], %[step1_23], %[step1_20] \n\t" + "add %[step3_24], %[step1_24], %[step1_27] \n\t" + + "msub $ac0, %[temp0], %[cospi_8_64] \n\t" + "madd $ac0, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_27], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_24_64] \n\t" + "msub $ac1, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_20], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_23] "=&r"(step3_23), [step3_24] "=&r"(step3_24), + [step3_20] "=&r"(step3_20), [step3_27] "=&r"(step3_27) + : [const_2_power_13] "r"(const_2_power_13), [step1_23] "r"(step1_23), + [step1_20] "r"(step1_20), [step1_24] "r"(step1_24), + [step1_27] "r"(step1_27), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_20 * cospi_8_64 + step2_27 * cospi_24_64; - step3_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step2_21 = step1_22 - step1_21; - step2_26 = step1_25 - step1_26; - __asm__ __volatile__( + "mtlo %[const_2_power_13], $ac0 \n\t" + "mthi $zero, $ac0 \n\t" "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" - "msub $ac1, %[step2_21], %[cospi_24_64] \n\t" - "msub $ac1, %[step2_26], %[cospi_8_64] \n\t" - "extp %[step3_21], $ac1, 31 \n\t" - - : [step3_21] "=r"(step3_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_21] "r"(step2_21), - [step2_26] "r"(step2_26), [cospi_24_64] "r"(cospi_24_64), + "sub %[temp0], %[step1_22], %[step1_21] \n\t" + "sub %[temp1], %[step1_25], %[step1_26] \n\t" + "add %[step3_22], %[step1_22], %[step1_21] \n\t" + "add %[step3_25], %[step1_25], %[step1_26] \n\t" + + "msub $ac0, %[temp0], %[cospi_24_64] \n\t" + "msub $ac0, %[temp1], %[cospi_8_64] \n\t" + "extp %[step3_21], $ac0, 31 \n\t" + "msub $ac1, %[temp0], %[cospi_8_64] \n\t" + "madd $ac1, %[temp1], %[cospi_24_64] \n\t" + "extp %[step3_26], $ac1, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), + [step3_22] "=&r"(step3_22), [step3_25] "=&r"(step3_25), + [step3_21] "=&r"(step3_21), [step3_26] "=&r"(step3_26) + : [const_2_power_13] "r"(const_2_power_13), [step1_22] "r"(step1_22), + [step1_21] "r"(step1_21), [step1_25] "r"(step1_25), + [step1_26] "r"(step1_26), [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64)); - temp21 = -step2_21 * cospi_8_64 + step2_26 * cospi_24_64; - step3_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; - - step3_22 = step1_21 + step1_22; - step3_23 = step1_20 + step1_23; - step3_24 = step1_24 + step1_27; - step3_25 = step1_25 + step1_26; - - step2_16 = step3_16 + step3_23; - step2_17 = step3_17 + step3_22; - step2_18 = step3_18 + step3_21; - step2_19 = step3_19 + step3_20; - step2_20 = step3_19 - step3_20; - step2_21 = step3_18 - step3_21; - step2_22 = step3_17 - step3_22; - step2_23 = step3_16 - step3_23; - - step2_24 = step3_31 - step3_24; - step2_25 = step3_30 - step3_25; - step2_26 = step3_29 - step3_26; - step2_27 = step3_28 - step3_27; - step2_28 = step3_28 + step3_27; - step2_29 = step3_29 + step3_26; - step2_30 = step3_30 + step3_25; - step2_31 = step3_31 + step3_24; + __asm__ __volatile__( + "add %[step2_16], %[step3_16], %[step3_23] \n\t" + "add %[step2_17], %[step3_17], %[step3_22] \n\t" + "add 
%[step2_18], %[step3_18], %[step3_21] \n\t" + "add %[step2_19], %[step3_19], %[step3_20] \n\t" + "sub %[step2_20], %[step3_19], %[step3_20] \n\t" + "sub %[step2_21], %[step3_18], %[step3_21] \n\t" + "sub %[step2_22], %[step3_17], %[step3_22] \n\t" + "sub %[step2_23], %[step3_16], %[step3_23] \n\t" + + : [step2_16] "=&r"(step2_16), [step2_17] "=&r"(step2_17), + [step2_18] "=&r"(step2_18), [step2_19] "=&r"(step2_19), + [step2_20] "=&r"(step2_20), [step2_21] "=&r"(step2_21), + [step2_22] "=&r"(step2_22), [step2_23] "=&r"(step2_23) + : [step3_16] "r"(step3_16), [step3_23] "r"(step3_23), + [step3_17] "r"(step3_17), [step3_22] "r"(step3_22), + [step3_18] "r"(step3_18), [step3_21] "r"(step3_21), + [step3_19] "r"(step3_19), [step3_20] "r"(step3_20)); + + __asm__ __volatile__( + "sub %[step2_24], %[step3_31], %[step3_24] \n\t" + "sub %[step2_25], %[step3_30], %[step3_25] \n\t" + "sub %[step2_26], %[step3_29], %[step3_26] \n\t" + "sub %[step2_27], %[step3_28], %[step3_27] \n\t" + "add %[step2_28], %[step3_28], %[step3_27] \n\t" + "add %[step2_29], %[step3_29], %[step3_26] \n\t" + "add %[step2_30], %[step3_30], %[step3_25] \n\t" + "add %[step2_31], %[step3_31], %[step3_24] \n\t" + + : [step2_24] "=&r"(step2_24), [step2_28] "=&r"(step2_28), + [step2_25] "=&r"(step2_25), [step2_29] "=&r"(step2_29), + [step2_26] "=&r"(step2_26), [step2_30] "=&r"(step2_30), + [step2_27] "=&r"(step2_27), [step2_31] "=&r"(step2_31) + : [step3_31] "r"(step3_31), [step3_24] "r"(step3_24), + [step3_30] "r"(step3_30), [step3_25] "r"(step3_25), + [step3_29] "r"(step3_29), [step3_26] "r"(step3_26), + [step3_28] "r"(step3_28), [step3_27] "r"(step3_27)); __asm__ __volatile__( "lh %[load1], 0(%[input]) \n\t" @@ -627,29 +648,25 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac3, %[load3], %[cospi_24_64] \n\t" "msub $ac3, %[load4], %[cospi_8_64] \n\t" "extp %[temp2], $ac3, 31 \n\t" - "mtlo %[const_2_power_13], $ac1 \n\t" "mthi $zero, $ac1 \n\t" "madd $ac1, %[load3], %[cospi_8_64] \n\t" "madd $ac1, %[load4], %[cospi_24_64] \n\t" "extp %[temp3], $ac1, 31 \n\t" - - "add %[step1_0], %[temp0], %[temp3] \n\t" - "add %[step1_1], %[temp1], %[temp2] \n\t" - "sub %[step1_2], %[temp1], %[temp2] \n\t" - "sub %[step1_3], %[temp0], %[temp3] \n\t" + "add %[step1_0], %[temp0], %[temp3] \n\t" + "add %[step1_1], %[temp1], %[temp2] \n\t" + "sub %[step1_2], %[temp1], %[temp2] \n\t" + "sub %[step1_3], %[temp0], %[temp3] \n\t" : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [result1] "=&r"(result1), [result2] "=&r"(result2), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=r"(step1_0), - [step1_1] "=r"(step1_1), [step1_2] "=r"(step1_2), - [step1_3] "=r"(step1_3) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_0] "=&r"(step1_0), + [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), + [step1_3] "=&r"(step1_3) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), - [cospi_16_64] "r"(cospi_16_64), [cospi_24_64] "r"(cospi_24_64), - [cospi_8_64] "r"(cospi_8_64) - - ); + [cospi_24_64] "r"(cospi_24_64), [cospi_8_64] "r"(cospi_8_64), + [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "lh %[load1], 8(%[input]) \n\t" @@ -665,7 +682,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac1, %[load1], %[cospi_28_64] \n\t" "msub $ac1, %[load2], %[cospi_4_64] \n\t" "extp %[temp0], $ac1, 31 \n\t" - "madd $ac3, %[load1], %[cospi_4_64] \n\t" "madd $ac3, %[load2], %[cospi_28_64] \n\t" "extp %[temp3], 
$ac3, 31 \n\t" @@ -678,7 +694,6 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "madd $ac2, %[load3], %[cospi_12_64] \n\t" "msub $ac2, %[load4], %[cospi_20_64] \n\t" "extp %[temp1], $ac2, 31 \n\t" - "madd $ac1, %[load3], %[cospi_20_64] \n\t" "madd $ac1, %[load4], %[cospi_12_64] \n\t" "extp %[temp2], $ac1, 31 \n\t" @@ -691,11 +706,9 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, "sub %[load1], %[temp3], %[temp2] \n\t" "sub %[load1], %[load1], %[temp0] \n\t" "add %[load1], %[load1], %[temp1] \n\t" - "sub %[load2], %[temp0], %[temp1] \n\t" "sub %[load2], %[load2], %[temp2] \n\t" "add %[load2], %[load2], %[temp3] \n\t" - "madd $ac1, %[load1], %[cospi_16_64] \n\t" "madd $ac3, %[load2], %[cospi_16_64] \n\t" @@ -706,129 +719,246 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, : [load1] "=&r"(load1), [load2] "=&r"(load2), [load3] "=&r"(load3), [load4] "=&r"(load4), [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), - [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=r"(step1_4), - [step1_5] "=r"(step1_5), [step1_6] "=r"(step1_6), - [step1_7] "=r"(step1_7) + [temp2] "=&r"(temp2), [temp3] "=&r"(temp3), [step1_4] "=&r"(step1_4), + [step1_5] "=&r"(step1_5), [step1_6] "=&r"(step1_6), + [step1_7] "=&r"(step1_7) : [const_2_power_13] "r"(const_2_power_13), [input] "r"(input), [cospi_20_64] "r"(cospi_20_64), [cospi_12_64] "r"(cospi_12_64), [cospi_4_64] "r"(cospi_4_64), [cospi_28_64] "r"(cospi_28_64), [cospi_16_64] "r"(cospi_16_64)); - step2_0 = step1_0 + step1_7; - step2_1 = step1_1 + step1_6; - step2_2 = step1_2 + step1_5; - step2_3 = step1_3 + step1_4; - step2_4 = step1_3 - step1_4; - step2_5 = step1_2 - step1_5; - step2_6 = step1_1 - step1_6; - step2_7 = step1_0 - step1_7; - - step1_0 = step2_0 + step3_15; - step1_1 = step2_1 + step3_14; - step1_2 = step2_2 + step3_13; - step1_3 = step2_3 + step3_12; - step1_4 = step2_4 + step3_11; - step1_5 = step2_5 + step3_10; - step1_6 = step2_6 + step3_9; - step1_7 = step2_7 + step3_8; - step1_8 = step2_7 - step3_8; - step1_9 = step2_6 - step3_9; - step1_10 = step2_5 - step3_10; - step1_11 = step2_4 - step3_11; - step1_12 = step2_3 - step3_12; - step1_13 = step2_2 - step3_13; - step1_14 = step2_1 - step3_14; - step1_15 = step2_0 - step3_15; - __asm__ __volatile__( - "sub %[temp0], %[step2_27], %[step2_20] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_20], $ac0, 31 \n\t" - - : [temp0] "=&r"(temp0), [step1_20] "=r"(step1_20) - : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), - [step2_27] "r"(step2_27), [cospi_16_64] "r"(cospi_16_64)); + "add %[step2_0], %[step1_0], %[step1_7] \n\t" + "add %[step2_1], %[step1_1], %[step1_6] \n\t" + "add %[step2_2], %[step1_2], %[step1_5] \n\t" + "add %[step2_3], %[step1_3], %[step1_4] \n\t" + "sub %[step2_4], %[step1_3], %[step1_4] \n\t" + "sub %[step2_5], %[step1_2], %[step1_5] \n\t" + "sub %[step2_6], %[step1_1], %[step1_6] \n\t" + "sub %[step2_7], %[step1_0], %[step1_7] \n\t" + + : [step2_0] "=&r"(step2_0), [step2_4] "=&r"(step2_4), + [step2_1] "=&r"(step2_1), [step2_5] "=&r"(step2_5), + [step2_2] "=&r"(step2_2), [step2_6] "=&r"(step2_6), + [step2_3] "=&r"(step2_3), [step2_7] "=&r"(step2_7) + : [step1_0] "r"(step1_0), [step1_7] "r"(step1_7), + [step1_1] "r"(step1_1), [step1_6] "r"(step1_6), + [step1_2] "r"(step1_2), [step1_5] "r"(step1_5), + [step1_3] "r"(step1_3), [step1_4] "r"(step1_4)); + + // stage 7 + __asm__ __volatile__( + "add %[step1_0], 
%[step2_0], %[step3_15] \n\t" + "add %[step1_1], %[step2_1], %[step3_14] \n\t" + "add %[step1_2], %[step2_2], %[step3_13] \n\t" + "add %[step1_3], %[step2_3], %[step3_12] \n\t" + "sub %[step1_12], %[step2_3], %[step3_12] \n\t" + "sub %[step1_13], %[step2_2], %[step3_13] \n\t" + "sub %[step1_14], %[step2_1], %[step3_14] \n\t" + "sub %[step1_15], %[step2_0], %[step3_15] \n\t" + + : [step1_0] "=&r"(step1_0), [step1_12] "=&r"(step1_12), + [step1_1] "=&r"(step1_1), [step1_13] "=&r"(step1_13), + [step1_2] "=&r"(step1_2), [step1_14] "=&r"(step1_14), + [step1_3] "=&r"(step1_3), [step1_15] "=&r"(step1_15) + : [step2_0] "r"(step2_0), [step3_15] "r"(step3_15), + [step2_1] "r"(step2_1), [step3_14] "r"(step3_14), + [step2_2] "r"(step2_2), [step3_13] "r"(step3_13), + [step2_3] "r"(step2_3), [step3_12] "r"(step3_12)); - temp21 = (step2_20 + step2_27) * cospi_16_64; - step1_27 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[step1_4], %[step2_4], %[step3_11] \n\t" + "add %[step1_5], %[step2_5], %[step3_10] \n\t" + "add %[step1_6], %[step2_6], %[step3_9] \n\t" + "add %[step1_7], %[step2_7], %[step3_8] \n\t" + "sub %[step1_8], %[step2_7], %[step3_8] \n\t" + "sub %[step1_9], %[step2_6], %[step3_9] \n\t" + "sub %[step1_10], %[step2_5], %[step3_10] \n\t" + "sub %[step1_11], %[step2_4], %[step3_11] \n\t" + + : [step1_4] "=&r"(step1_4), [step1_8] "=&r"(step1_8), + [step1_5] "=&r"(step1_5), [step1_9] "=&r"(step1_9), + [step1_6] "=&r"(step1_6), [step1_10] "=&r"(step1_10), + [step1_7] "=&r"(step1_7), [step1_11] "=&r"(step1_11) + : [step2_4] "r"(step2_4), [step3_11] "r"(step3_11), + [step2_5] "r"(step2_5), [step3_10] "r"(step3_10), + [step2_6] "r"(step2_6), [step3_9] "r"(step3_9), + [step2_7] "r"(step2_7), [step3_8] "r"(step3_8)); __asm__ __volatile__( - "sub %[temp0], %[step2_26], %[step2_21] \n\t" + "sub %[temp0], %[step2_27], %[step2_20] \n\t" + "add %[temp1], %[step2_27], %[step2_20] \n\t" + "sub %[temp2], %[step2_26], %[step2_21] \n\t" + "add %[temp3], %[step2_26], %[step2_21] \n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_21], $ac0, 31 \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - : [temp0] "=&r"(temp0), [step1_21] "=r"(step1_21) - : [const_2_power_13] "r"(const_2_power_13), [step2_26] "r"(step2_26), - [step2_21] "r"(step2_21), [cospi_16_64] "r"(cospi_16_64)); + "extp %[step1_20], $ac0, 31 \n\t" + "extp %[step1_27], $ac1, 31 \n\t" + "extp %[step1_21], $ac2, 31 \n\t" + "extp %[step1_26], $ac3, 31 \n\t" - temp21 = (step2_21 + step2_26) * cospi_16_64; - step1_26 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_20] "=&r"(step1_20), + [step1_27] "=&r"(step1_27), [step1_21] "=&r"(step1_21), + [step1_26] "=&r"(step1_26) + : [const_2_power_13] "r"(const_2_power_13), [step2_20] "r"(step2_20), + [step2_27] "r"(step2_27), [step2_21] "r"(step2_21), + [step2_26] "r"(step2_26), [cospi_16_64] "r"(cospi_16_64)); __asm__ __volatile__( "sub %[temp0], %[step2_25], %[step2_22] \n\t" + "add %[temp1], %[step2_25], %[step2_22] \n\t" + "sub %[temp2], %[step2_24], %[step2_23] \n\t" + "add %[temp3], %[step2_24], %[step2_23] 
\n\t" + "mtlo %[const_2_power_13], $ac0 \n\t" "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_22], $ac0, 31 \n\t" + "mtlo %[const_2_power_13], $ac1 \n\t" + "mthi $zero, $ac1 \n\t" + "mtlo %[const_2_power_13], $ac2 \n\t" + "mthi $zero, $ac2 \n\t" + "mtlo %[const_2_power_13], $ac3 \n\t" + "mthi $zero, $ac3 \n\t" - : [temp0] "=&r"(temp0), [step1_22] "=r"(step1_22) - : [const_2_power_13] "r"(const_2_power_13), [step2_25] "r"(step2_25), - [step2_22] "r"(step2_22), [cospi_16_64] "r"(cospi_16_64)); + "madd $ac0, %[temp0], %[cospi_16_64] \n\t" + "madd $ac1, %[temp1], %[cospi_16_64] \n\t" + "madd $ac2, %[temp2], %[cospi_16_64] \n\t" + "madd $ac3, %[temp3], %[cospi_16_64] \n\t" - temp21 = (step2_22 + step2_25) * cospi_16_64; - step1_25 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + "extp %[step1_22], $ac0, 31 \n\t" + "extp %[step1_25], $ac1, 31 \n\t" + "extp %[step1_23], $ac2, 31 \n\t" + "extp %[step1_24], $ac3, 31 \n\t" + + : [temp0] "=&r"(temp0), [temp1] "=&r"(temp1), [temp2] "=&r"(temp2), + [temp3] "=&r"(temp3), [step1_22] "=&r"(step1_22), + [step1_25] "=&r"(step1_25), [step1_23] "=&r"(step1_23), + [step1_24] "=&r"(step1_24) + : [const_2_power_13] "r"(const_2_power_13), [step2_22] "r"(step2_22), + [step2_25] "r"(step2_25), [step2_23] "r"(step2_23), + [step2_24] "r"(step2_24), [cospi_16_64] "r"(cospi_16_64)); + // final stage __asm__ __volatile__( - "sub %[temp0], %[step2_24], %[step2_23] \n\t" - "mtlo %[const_2_power_13], $ac0 \n\t" - "mthi $zero, $ac0 \n\t" - "madd $ac0, %[temp0], %[cospi_16_64] \n\t" - "extp %[step1_23], $ac0, 31 \n\t" + "add %[temp0], %[step1_0], %[step2_31] \n\t" + "add %[temp1], %[step1_1], %[step2_30] \n\t" + "add %[temp2], %[step1_2], %[step2_29] \n\t" + "add %[temp3], %[step1_3], %[step2_28] \n\t" + "sub %[load1], %[step1_3], %[step2_28] \n\t" + "sub %[load2], %[step1_2], %[step2_29] \n\t" + "sub %[load3], %[step1_1], %[step2_30] \n\t" + "sub %[load4], %[step1_0], %[step2_31] \n\t" + "sh %[temp0], 0(%[output]) \n\t" + "sh %[temp1], 64(%[output]) \n\t" + "sh %[temp2], 128(%[output]) \n\t" + "sh %[temp3], 192(%[output]) \n\t" + "sh %[load1], 1792(%[output]) \n\t" + "sh %[load2], 1856(%[output]) \n\t" + "sh %[load3], 1920(%[output]) \n\t" + "sh %[load4], 1984(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_0] "r"(step1_0), [step2_31] "r"(step2_31), + [step1_1] "r"(step1_1), [step2_30] "r"(step2_30), + [step1_2] "r"(step1_2), [step2_29] "r"(step2_29), + [step1_3] "r"(step1_3), [step2_28] "r"(step2_28), + [output] "r"(output)); - : [temp0] "=&r"(temp0), [step1_23] "=r"(step1_23) - : [const_2_power_13] "r"(const_2_power_13), [step2_24] "r"(step2_24), - [step2_23] "r"(step2_23), [cospi_16_64] "r"(cospi_16_64)); + __asm__ __volatile__( + "add %[temp0], %[step1_4], %[step1_27] \n\t" + "add %[temp1], %[step1_5], %[step1_26] \n\t" + "add %[temp2], %[step1_6], %[step1_25] \n\t" + "add %[temp3], %[step1_7], %[step1_24] \n\t" + "sub %[load1], %[step1_7], %[step1_24] \n\t" + "sub %[load2], %[step1_6], %[step1_25] \n\t" + "sub %[load3], %[step1_5], %[step1_26] \n\t" + "sub %[load4], %[step1_4], %[step1_27] \n\t" + "sh %[temp0], 256(%[output]) \n\t" + "sh %[temp1], 320(%[output]) \n\t" + "sh %[temp2], 384(%[output]) \n\t" + "sh %[temp3], 448(%[output]) \n\t" + "sh %[load1], 1536(%[output]) \n\t" + "sh %[load2], 1600(%[output]) \n\t" + "sh %[load3], 1664(%[output]) \n\t" + "sh 
%[load4], 1728(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_4] "r"(step1_4), [step1_27] "r"(step1_27), + [step1_5] "r"(step1_5), [step1_26] "r"(step1_26), + [step1_6] "r"(step1_6), [step1_25] "r"(step1_25), + [step1_7] "r"(step1_7), [step1_24] "r"(step1_24), + [output] "r"(output)); - temp21 = (step2_23 + step2_24) * cospi_16_64; - step1_24 = (temp21 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS; + __asm__ __volatile__( + "add %[temp0], %[step1_8], %[step1_23] \n\t" + "add %[temp1], %[step1_9], %[step1_22] \n\t" + "add %[temp2], %[step1_10], %[step1_21] \n\t" + "add %[temp3], %[step1_11], %[step1_20] \n\t" + "sub %[load1], %[step1_11], %[step1_20] \n\t" + "sub %[load2], %[step1_10], %[step1_21] \n\t" + "sub %[load3], %[step1_9], %[step1_22] \n\t" + "sub %[load4], %[step1_8], %[step1_23] \n\t" + "sh %[temp0], 512(%[output]) \n\t" + "sh %[temp1], 576(%[output]) \n\t" + "sh %[temp2], 640(%[output]) \n\t" + "sh %[temp3], 704(%[output]) \n\t" + "sh %[load1], 1280(%[output]) \n\t" + "sh %[load2], 1344(%[output]) \n\t" + "sh %[load3], 1408(%[output]) \n\t" + "sh %[load4], 1472(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_8] "r"(step1_8), [step1_23] "r"(step1_23), + [step1_9] "r"(step1_9), [step1_22] "r"(step1_22), + [step1_10] "r"(step1_10), [step1_21] "r"(step1_21), + [step1_11] "r"(step1_11), [step1_20] "r"(step1_20), + [output] "r"(output)); - // final stage - output[0 * 32] = step1_0 + step2_31; - output[1 * 32] = step1_1 + step2_30; - output[2 * 32] = step1_2 + step2_29; - output[3 * 32] = step1_3 + step2_28; - output[4 * 32] = step1_4 + step1_27; - output[5 * 32] = step1_5 + step1_26; - output[6 * 32] = step1_6 + step1_25; - output[7 * 32] = step1_7 + step1_24; - output[8 * 32] = step1_8 + step1_23; - output[9 * 32] = step1_9 + step1_22; - output[10 * 32] = step1_10 + step1_21; - output[11 * 32] = step1_11 + step1_20; - output[12 * 32] = step1_12 + step2_19; - output[13 * 32] = step1_13 + step2_18; - output[14 * 32] = step1_14 + step2_17; - output[15 * 32] = step1_15 + step2_16; - output[16 * 32] = step1_15 - step2_16; - output[17 * 32] = step1_14 - step2_17; - output[18 * 32] = step1_13 - step2_18; - output[19 * 32] = step1_12 - step2_19; - output[20 * 32] = step1_11 - step1_20; - output[21 * 32] = step1_10 - step1_21; - output[22 * 32] = step1_9 - step1_22; - output[23 * 32] = step1_8 - step1_23; - output[24 * 32] = step1_7 - step1_24; - output[25 * 32] = step1_6 - step1_25; - output[26 * 32] = step1_5 - step1_26; - output[27 * 32] = step1_4 - step1_27; - output[28 * 32] = step1_3 - step2_28; - output[29 * 32] = step1_2 - step2_29; - output[30 * 32] = step1_1 - step2_30; - output[31 * 32] = step1_0 - step2_31; + __asm__ __volatile__( + "add %[temp0], %[step1_12], %[step2_19] \n\t" + "add %[temp1], %[step1_13], %[step2_18] \n\t" + "add %[temp2], %[step1_14], %[step2_17] \n\t" + "add %[temp3], %[step1_15], %[step2_16] \n\t" + "sub %[load1], %[step1_15], %[step2_16] \n\t" + "sub %[load2], %[step1_14], %[step2_17] \n\t" + "sub %[load3], %[step1_13], %[step2_18] \n\t" + "sub %[load4], %[step1_12], %[step2_19] \n\t" + "sh %[temp0], 768(%[output]) \n\t" + "sh %[temp1], 832(%[output]) \n\t" + "sh %[temp2], 896(%[output]) \n\t" + "sh %[temp3], 960(%[output]) \n\t" + "sh 
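
In the final stage the C stores of the form output[k * 32] = ... are replaced by sh instructions with immediate byte offsets. Since output points to int16_t, row k lives at byte offset k * 32 * sizeof(int16_t) = k * 64, which is where the constants 0, 64, ..., 1984 come from. A small sketch of the equivalence (helper name is illustrative):

static void store_row_model(int16_t *output, int k, int16_t value) {
  /* Old form: plain C indexing, one int16_t every 32 elements. */
  output[k * 32] = value;
  /* New form: the same store expressed with the byte offset used by sh. */
  *(int16_t *)((char *)output + k * 64) = value;
}
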
%[load1], 1024(%[output]) \n\t" + "sh %[load2], 1088(%[output]) \n\t" + "sh %[load3], 1152(%[output]) \n\t" + "sh %[load4], 1216(%[output]) \n\t" + + : [temp0] "=&r"(temp0), [load1] "=&r"(load1), [temp1] "=&r"(temp1), + [load2] "=&r"(load2), [temp2] "=&r"(temp2), [load3] "=&r"(load3), + [temp3] "=&r"(temp3), [load4] "=&r"(load4) + : [step1_12] "r"(step1_12), [step2_19] "r"(step2_19), + [step1_13] "r"(step1_13), [step2_18] "r"(step2_18), + [step1_14] "r"(step1_14), [step2_17] "r"(step2_17), + [step1_15] "r"(step1_15), [step2_16] "r"(step2_16), + [output] "r"(output)); input += 32; output += 1; @@ -836,7 +966,7 @@ static void idct32_rows_dspr2(const int16_t *input, int16_t *output, } void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { + int stride) { DECLARE_ALIGNED(32, int16_t, out[32 * 32]); int16_t *outptr = out; uint32_t pos = 45; @@ -850,7 +980,7 @@ void vpx_idct32x32_1024_add_dspr2(const int16_t *input, uint8_t *dest, idct32_rows_dspr2(input, outptr, 32); // Columns - vpx_idct32_cols_add_blk_dspr2(out, dest, dest_stride); + vpx_idct32_cols_add_blk_dspr2(out, dest, stride); } void vpx_idct32x32_34_add_dspr2(const int16_t *input, uint8_t *dest, @@ -941,7 +1071,7 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, "abs %[absa1], %[a1] \n\t" "replv.qb %[vector_a1], %[absa1] \n\t" - : [absa1] "=r"(absa1), [vector_a1] "=r"(vector_a1) + : [absa1] "=&r"(absa1), [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { @@ -980,12 +1110,71 @@ void vpx_idct32x32_1_add_dspr2(const int16_t *input, uint8_t *dest, [dest] "+&r"(dest) : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 1; + a12 = a1 - a11; + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 32; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "lw %[t3], 8(%[dest]) \n\t" + "lw %[t4], 12(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "sw %[vector_3], 8(%[dest]) \n\t" + "sw %[vector_4], 12(%[dest]) \n\t" + + "lw %[t1], 16(%[dest]) \n\t" + "lw %[t2], 20(%[dest]) \n\t" + "lw %[t3], 24(%[dest]) \n\t" + "lw %[t4], 28(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_3], %[t3], %[vector_a11] \n\t" + "addu_s.qb %[vector_4], %[t4], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "addu_s.qb %[vector_3], %[vector_3], %[vector_a12] \n\t" + "addu_s.qb %[vector_4], %[vector_4], %[vector_a12] \n\t" + "sw %[vector_1], 16(%[dest]) \n\t" + "sw %[vector_2], 20(%[dest]) \n\t" + "sw %[vector_3], 24(%[dest]) \n\t" + "sw %[vector_4], 28(%[dest]) \n\t" + + "add 
%[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [t3] "=&r"(t3), [t4] "=&r"(t4), + [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), + [vector_3] "=&r"(vector_3), [vector_4] "=&r"(vector_4), + [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); + } } else { /* use quad-byte * input and output memory are four byte aligned */ __asm__ __volatile__("replv.qb %[vector_a1], %[a1] \n\t" - : [vector_a1] "=r"(vector_a1) + : [vector_a1] "=&r"(vector_a1) : [a1] "r"(a1)); for (r = 32; r--;) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c index 516ea80f4ae..3f985b847b1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans4_dspr2.c @@ -15,7 +15,7 @@ #if HAVE_DSPR2 void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { - int16_t step_0, step_1, step_2, step_3; + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; int i; @@ -96,23 +96,13 @@ void vpx_idct4_rows_dspr2(const int16_t *input, int16_t *output) { } void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { - int16_t step_0, step_1, step_2, step_3; + int stride) { + int step_0, step_1, step_2, step_3; int Temp0, Temp1, Temp2, Temp3; const int const_2_power_13 = 8192; + const int const_255 = 255; int i; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 4; ++i) { dest_pix = (dest + i); @@ -172,51 +162,62 @@ void vpx_idct4_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "add %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "add %[Temp0], %[step_1], %[step_2] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_1], %[step_2] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_1], %[step_2] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "sub %[Temp0], %[step_0], %[step_3] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - 
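
The new a1 > 255 branch in vpx_idct32x32_1_add_dspr2 exists because replv.qb can only broadcast an 8-bit value into each byte lane. The DC offset is therefore split into two halves (a11 = a1 >> 1, a12 = a1 - a11) and applied with two saturating byte additions (addu_s.qb). A per-byte scalar model of that path, with an illustrative helper name:

static unsigned char add_dc_split_model(unsigned char pixel, int a1) {
  const int a11 = a1 >> 1;   /* first value broadcast with replv.qb */
  const int a12 = a1 - a11;  /* second value broadcast with replv.qb */
  int t = pixel + a11;
  if (t > 255) t = 255;      /* addu_s.qb saturates each byte lane at 255 */
  t += a12;
  if (t > 255) t = 255;
  return (unsigned char)t;
}
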
"addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" + "sub %[Temp0], %[step_0], %[step_3] \n\t" "addi %[Temp0], %[Temp0], 8 \n\t" "sra %[Temp0], %[Temp0], 4 \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [step_0] "=&r"(step_0), [step_1] "=&r"(step_1), [step_2] "=&r"(step_2), [step_3] "=&r"(step_3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_8_64] "r"(cospi_8_64), [cospi_16_64] "r"(cospi_16_64), - [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 4; } } -void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[4 * 4]); int16_t *outptr = out; uint32_t pos = 45; @@ -230,11 +231,10 @@ void vpx_idct4x4_16_add_dspr2(const int16_t *input, uint8_t *dest, vpx_idct4_rows_dspr2(input, outptr); // Columns - vpx_idct4_columns_add_blk_dspr2(&out[0], dest, dest_stride); + vpx_idct4_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { int a1, absa1; int r; int32_t out; @@ -271,10 +271,43 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "subu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 3; + a12 = a1 - (a11 * 7); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 4; r--;) { + __asm__ __volatile__( + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_a], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a11] \n\t" + "addu_s.qb %[vector_a], %[vector_a], %[vector_a12] \n\t" + "sw %[vector_a], 0(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), 
[vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -288,10 +321,10 @@ void vpx_idct4x4_1_add_dspr2(const int16_t *input, uint8_t *dest, "lw %[t2], 0(%[dest]) \n\t" "addu_s.qb %[vector_a], %[t2], %[vector_a1] \n\t" "sw %[vector_a], 0(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t2] "=&r"(t2), [vector_a] "=&r"(vector_a), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c index 08a6c78b6e4..d4d246965c3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/itrans8_dspr2.c @@ -192,24 +192,13 @@ void idct8_rows_dspr2(const int16_t *input, int16_t *output, uint32_t no_rows) { } } -void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, - int dest_stride) { +void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, int stride) { int step1_0, step1_1, step1_2, step1_3, step1_4, step1_5, step1_6, step1_7; int Temp0, Temp1, Temp2, Temp3; int i; const int const_2_power_13 = 8192; + const int const_255 = 255; uint8_t *dest_pix; - uint8_t *cm = vpx_ff_cropTbl; - - /* prefetch vpx_ff_cropTbl */ - prefetch_load(vpx_ff_cropTbl); - prefetch_load(vpx_ff_cropTbl + 32); - prefetch_load(vpx_ff_cropTbl + 64); - prefetch_load(vpx_ff_cropTbl + 96); - prefetch_load(vpx_ff_cropTbl + 128); - prefetch_load(vpx_ff_cropTbl + 160); - prefetch_load(vpx_ff_cropTbl + 192); - prefetch_load(vpx_ff_cropTbl + 224); for (i = 0; i < 8; ++i) { dest_pix = (dest + i); @@ -356,70 +345,94 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "add %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], 
%[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_3], %[step1_4] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_2], %[step1_5] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_1], %[step1_6] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" "sub %[Temp0], %[step1_0], %[step1_7] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" - "addu %[dest_pix], %[dest_pix], %[dest_stride] \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" + "addu %[dest_pix], %[dest_pix], %[stride] \n\t" "lbu %[Temp1], 0(%[dest_pix]) \n\t" "addi %[Temp0], %[Temp0], 16 \n\t" "sra %[Temp0], %[Temp0], 5 \n\t" "add %[Temp1], %[Temp1], %[Temp0] \n\t" - "lbux %[Temp2], %[Temp1](%[cm]) \n\t" - "sb %[Temp2], 0(%[dest_pix]) \n\t" + "slt %[Temp2], %[Temp1], %[const_255] \n\t" + "slt %[Temp3], $zero, %[Temp1] \n\t" + "movz %[Temp1], %[const_255], %[Temp2] \n\t" + "movz %[Temp1], $zero, %[Temp3] \n\t" + "sb %[Temp1], 0(%[dest_pix]) \n\t" : [step1_0] "=&r"(step1_0), [step1_1] "=&r"(step1_1), [step1_2] "=&r"(step1_2), [step1_3] "=&r"(step1_3), @@ -427,19 +440,18 @@ void idct8_columns_add_blk_dspr2(int16_t *input, uint8_t *dest, [step1_6] "=&r"(step1_6), [step1_7] "=&r"(step1_7), [Temp0] "=&r"(Temp0), [Temp1] "=&r"(Temp1), [Temp2] "=&r"(Temp2), [Temp3] "=&r"(Temp3), [dest_pix] "+r"(dest_pix) - : [const_2_power_13] "r"(const_2_power_13), + : [const_2_power_13] "r"(const_2_power_13), [const_255] "r"(const_255), [cospi_16_64] "r"(cospi_16_64), [cospi_28_64] "r"(cospi_28_64), [cospi_4_64] "r"(cospi_4_64), [cospi_12_64] "r"(cospi_12_64), [cospi_20_64] "r"(cospi_20_64), [cospi_8_64] "r"(cospi_8_64), 
- [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), [cm] "r"(cm), - [dest_stride] "r"(dest_stride)); + [cospi_24_64] "r"(cospi_24_64), [input] "r"(input), + [stride] "r"(stride)); input += 8; } } -void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -451,11 +463,10 @@ void vpx_idct8x8_64_add_dspr2(const int16_t *input, uint8_t *dest, idct8_rows_dspr2(input, outptr, 8); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { DECLARE_ALIGNED(32, int16_t, out[8 * 8]); int16_t *outptr = out; uint32_t pos = 45; @@ -490,11 +501,10 @@ void vpx_idct8x8_12_add_dspr2(const int16_t *input, uint8_t *dest, : [outptr] "r"(outptr)); // Then transform columns and add to dest - idct8_columns_add_blk_dspr2(&out[0], dest, dest_stride); + idct8_columns_add_blk_dspr2(&out[0], dest, stride); } -void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, - int dest_stride) { +void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, int stride) { uint32_t pos = 45; int32_t out; int32_t r; @@ -533,11 +543,47 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "subu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] \n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+&r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); + } + } else if (a1 > 255) { + int32_t a11, a12, vector_a11, vector_a12; + + /* use quad-byte + * input and output memory are four byte aligned */ + a11 = a1 >> 2; + a12 = a1 - (a11 * 3); + + __asm__ __volatile__( + "replv.qb %[vector_a11], %[a11] \n\t" + "replv.qb %[vector_a12], %[a12] \n\t" + + : [vector_a11] "=&r"(vector_a11), [vector_a12] "=&r"(vector_a12) + : [a11] "r"(a11), [a12] "r"(a12)); + + for (r = 8; r--;) { + __asm__ __volatile__( + "lw %[t1], 0(%[dest]) \n\t" + "lw %[t2], 4(%[dest]) \n\t" + "addu_s.qb %[vector_1], %[t1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[t2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a11] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a11] \n\t" + "addu_s.qb %[vector_1], %[vector_1], %[vector_a12] \n\t" + "addu_s.qb %[vector_2], %[vector_2], %[vector_a12] \n\t" + "sw %[vector_1], 0(%[dest]) \n\t" + "sw %[vector_2], 4(%[dest]) \n\t" + "add %[dest], %[dest], %[stride] \n\t" + + : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), + [vector_2] "=&r"(vector_2), [dest] "+r"(dest) + : [stride] "r"(stride), [vector_a11] "r"(vector_a11), + [vector_a12] "r"(vector_a12)); } } else { /* use quad-byte @@ -555,11 +601,11 @@ void vpx_idct8x8_1_add_dspr2(const int16_t *input, uint8_t *dest, "addu_s.qb %[vector_2], %[t2], %[vector_a1] \n\t" "sw %[vector_1], 0(%[dest]) \n\t" "sw %[vector_2], 4(%[dest]) \n\t" - "add %[dest], %[dest], %[dest_stride] 
\n\t" + "add %[dest], %[dest], %[stride] \n\t" : [t1] "=&r"(t1), [t2] "=&r"(t2), [vector_1] "=&r"(vector_1), [vector_2] "=&r"(vector_2), [dest] "+r"(dest) - : [dest_stride] "r"(dest_stride), [vector_a1] "r"(vector_a1)); + : [stride] "r"(stride), [vector_a1] "r"(vector_a1)); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h index da100f6a980..f077fa4814a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/txfm_macros_msa.h @@ -15,19 +15,24 @@ #define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) \ { \ - v8i16 k0_m = __msa_fill_h(cnst0); \ - v4i32 s0_m, s1_m, s2_m, s3_m; \ + v4i32 s0_m, s1_m, s2_m, s3_m, s4_m, s5_m; \ + v8i16 k0_m, k1_m, k2_m, zero = { 0 }; \ \ - s0_m = (v4i32)__msa_fill_h(cnst1); \ - k0_m = __msa_ilvev_h((v8i16)s0_m, k0_m); \ + k0_m = __msa_fill_h(cnst0); \ + k1_m = __msa_fill_h(cnst1); \ + k2_m = __msa_ilvev_h((v8i16)k1_m, k0_m); \ + k0_m = __msa_ilvev_h((v8i16)zero, k0_m); \ + k1_m = __msa_ilvev_h(k1_m, (v8i16)zero); \ \ - ILVRL_H2_SW((-reg1), reg0, s1_m, s0_m); \ + ILVRL_H2_SW(reg1, reg0, s5_m, s4_m); \ ILVRL_H2_SW(reg0, reg1, s3_m, s2_m); \ - DOTP_SH2_SW(s1_m, s0_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s5_m, s4_m, k0_m, k0_m, s1_m, s0_m); \ + s1_m = __msa_dpsub_s_w(s1_m, (v8i16)s5_m, k1_m); \ + s0_m = __msa_dpsub_s_w(s0_m, (v8i16)s4_m, k1_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out0 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ \ - DOTP_SH2_SW(s3_m, s2_m, k0_m, k0_m, s1_m, s0_m); \ + DOTP_SH2_SW(s3_m, s2_m, k2_m, k2_m, s1_m, s0_m); \ SRARI_W2_SW(s1_m, s0_m, DCT_CONST_BITS); \ out1 = __msa_pckev_h((v8i16)s0_m, (v8i16)s1_m); \ } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index 2909beb0f6c..bb20ea27421 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -48,6 +48,7 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_ssse3.asm ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH ifneq ($(filter yes,$(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),) @@ -56,6 +57,7 @@ DSP_SRCS-yes += deblock.c DSP_SRCS-yes += postproc.h DSP_SRCS-$(HAVE_MSA) += mips/add_noise_msa.c DSP_SRCS-$(HAVE_MSA) += mips/deblock_msa.c +DSP_SRCS-$(HAVE_NEON) += arm/deblock_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/add_noise_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/deblock_sse2.asm endif # CONFIG_POSTPROC @@ -140,14 +142,11 @@ DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64) += x86/loopfilter_sse2.c DSP_SRCS-$(HAVE_AVX2) += x86/loopfilter_avx2.c ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/loopfilter_vertical_4_dual_neon.c DSP_SRCS-yes += arm/loopfilter_16_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_8_neon$(ASM) DSP_SRCS-yes += arm/loopfilter_4_neon$(ASM) else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/loopfilter_neon.c -endif # HAVE_NEON +DSP_SRCS-$(HAVE_NEON) += arm/loopfilter_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_MSA) += mips/loopfilter_msa.h @@ -203,17 +202,6 @@ endif # ARCH_X86_64 DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) -ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += 
arm/idct16x16_add_neon$(ASM) -else -ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct16x16_add_neon.c -endif # HAVE_NEON -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c -DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c - DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c DSP_SRCS-$(HAVE_MSA) += mips/idct8x8_msa.c @@ -226,6 +214,9 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c +else # CONFIG_VP9_HIGHBITDEPTH +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) @@ -235,15 +226,21 @@ DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_neon.c else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c endif # CONFIG_VP9 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index ee403be3975..ee1b2927938 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -96,6 +96,7 @@ specialize qw/vpx_h_predictor_8x8 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_8x8 neon/; add_proto qw/void vpx_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_8x8 ssse3/; @@ -139,6 +140,7 @@ specialize qw/vpx_h_predictor_16x16 neon dspr2 msa sse2/; add_proto qw/void vpx_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_16x16 neon/; add_proto qw/void vpx_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_16x16 ssse3/; @@ -167,7 +169,7 @@ specialize qw/vpx_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_d207e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vpx_d45_predictor_32x32 ssse3/; +specialize qw/vpx_d45_predictor_32x32 
neon ssse3/; add_proto qw/void vpx_d45e_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; @@ -182,6 +184,7 @@ specialize qw/vpx_h_predictor_32x32 neon msa sse2/; add_proto qw/void vpx_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; add_proto qw/void vpx_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; +specialize qw/vpx_d135_predictor_32x32 neon/; add_proto qw/void vpx_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; specialize qw/vpx_d153_predictor_32x32 ssse3/; @@ -211,6 +214,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d45e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -219,33 +223,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_4x4 sse2/; + specialize qw/vpx_highbd_v_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_4x4 sse2/; + specialize qw/vpx_highbd_tm_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_4x4 sse2/; + specialize qw/vpx_highbd_dc_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_4x4 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_4x4 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_4x4 neon/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, 
int bd"; add_proto qw/void vpx_highbd_d207e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d45e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -254,33 +264,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_8x8 sse2/; + specialize qw/vpx_highbd_v_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_8x8 sse2/; + specialize qw/vpx_highbd_tm_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_8x8 sse2/;; + specialize qw/vpx_highbd_dc_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_8x8 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_8x8 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_8x8 neon/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d207e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d45e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -289,33 +305,39 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void 
vpx_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_16x16 sse2/; + specialize qw/vpx_highbd_v_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_16x16 sse2/; + specialize qw/vpx_highbd_tm_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_16x16 sse2/; + specialize qw/vpx_highbd_dc_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_16x16 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_16x16 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_16x16 neon/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d207e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d45_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d45e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; @@ -324,27 +346,32 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d63e_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_h_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void vpx_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d135_predictor_32x32 neon/; add_proto qw/void vpx_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; add_proto qw/void 
vpx_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_v_predictor_32x32 sse2/; + specialize qw/vpx_highbd_v_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_tm_predictor_32x32 sse2/; + specialize qw/vpx_highbd_tm_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vpx_highbd_dc_predictor_32x32 sse2/; + specialize qw/vpx_highbd_dc_predictor_32x32 neon sse2/; add_proto qw/void vpx_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_top_predictor_32x32 neon/; add_proto qw/void vpx_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_left_predictor_32x32 neon/; add_proto qw/void vpx_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_dc_128_predictor_32x32 neon/; } # CONFIG_VP9_HIGHBITDEPTH # @@ -585,193 +612,193 @@ if (vpx_config("CONFIG_VP9") eq "yes") { if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Note as optimized versions of these functions are added we need to add a check to ensure # that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only. - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_16_add sse2/; - add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct4x4_1_add neon/; - add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_1_add neon/; - add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize 
qw/vpx_highbd_idct32x32_1_add sse2/; - add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void 
vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; } else { - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_16_add neon sse2/; - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_1_add neon sse2/; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_1_add neon sse2/; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_256_add sse2/; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_256_add neon sse2/; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_10_add sse2/; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_10_add neon sse2/; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_1_add neon sse2/; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_135_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64"; # Need to add 135 eob idct32x32 implementations. 
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add neon sse2/; - add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct4x4_16_add sse2/; + add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct4x4_16_add neon sse2/; - add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_64_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_64_add neon sse2/; - add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_12_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; - add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_256_add sse2/; - add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; specialize qw/vpx_highbd_idct16x16_10_add sse2/; } # CONFIG_EMULATE_HARDWARE } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 if (vpx_config("CONFIG_EMULATE_HARDWARE") eq "yes") { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, 
uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; } else { - add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct4x4_16_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_10_add 
sse2 neon dspr2 msa/; - add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - # Need to add 135 eob idct32x32 implementations. $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; - $vpx_idct32x32_135_add_neon=vpx_idct32x32_1024_add_neon; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; - add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/; - add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_1_add msa/; - add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; + add_proto qw/void vpx_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_iwht4x4_16_add msa sse2/; } # CONFIG_EMULATE_HARDWARE } # CONFIG_VP9_HIGHBITDEPTH @@ -1724,15 +1751,13 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") specialize qw/vpx_plane_add_noise sse2 msa/; add_proto qw/void vpx_mbpost_proc_down/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_down sse2 msa/; - $vpx_mbpost_proc_down_sse2=vpx_mbpost_proc_down_xmm; + specialize qw/vpx_mbpost_proc_down sse2 neon msa/; add_proto qw/void vpx_mbpost_proc_across_ip/, "unsigned char *dst, int pitch, int rows, int cols,int flimit"; - specialize qw/vpx_mbpost_proc_across_ip sse2 msa/; - $vpx_mbpost_proc_across_ip_sse2=vpx_mbpost_proc_across_ip_xmm; + specialize qw/vpx_mbpost_proc_across_ip sse2 neon msa/; add_proto qw/void vpx_post_proc_down_and_across_mb_row/, "unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size"; - specialize qw/vpx_post_proc_down_and_across_mb_row sse2 msa/; + specialize qw/vpx_post_proc_down_and_across_mb_row sse2 neon msa/; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm index 6df360df44f..ebca50930a0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm @@ -230,11 +230,11 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2): ret %undef flimit -;void vpx_mbpost_proc_down_xmm(unsigned char *dst, -; int pitch, int rows, int cols,int flimit) +;void vpx_mbpost_proc_down_sse2(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) extern 
sym(vpx_rv) -global sym(vpx_mbpost_proc_down_xmm) PRIVATE -sym(vpx_mbpost_proc_down_xmm): +global sym(vpx_mbpost_proc_down_sse2) PRIVATE +sym(vpx_mbpost_proc_down_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 @@ -462,10 +462,10 @@ sym(vpx_mbpost_proc_down_xmm): %undef flimit4 -;void vpx_mbpost_proc_across_ip_xmm(unsigned char *src, -; int pitch, int rows, int cols,int flimit) -global sym(vpx_mbpost_proc_across_ip_xmm) PRIVATE -sym(vpx_mbpost_proc_across_ip_xmm): +;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src, +; int pitch, int rows, int cols,int flimit) +global sym(vpx_mbpost_proc_across_ip_sse2) PRIVATE +sym(vpx_mbpost_proc_across_ip_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index d5fc1440c41..487a474a675 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -402,10 +402,10 @@ void iadst4_sse2(__m128i *in) { MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, stg2_1, \ stg2_2, stg2_3, stp2_0, stp2_1, stp2_2, stp2_3) \ \ - stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \ - stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \ - stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \ - stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \ + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \ + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \ + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \ + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \ } \ \ /* Stage3 */ \ @@ -413,10 +413,10 @@ void iadst4_sse2(__m128i *in) { const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \ \ - stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \ - stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \ - stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \ - stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \ + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \ + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \ + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \ \ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \ @@ -438,14 +438,14 @@ void iadst4_sse2(__m128i *in) { } \ \ /* Stage4 */ \ - out0 = _mm_adds_epi16(stp1_0, stp2_7); \ - out1 = _mm_adds_epi16(stp1_1, stp1_6); \ - out2 = _mm_adds_epi16(stp1_2, stp1_5); \ - out3 = _mm_adds_epi16(stp1_3, stp2_4); \ - out4 = _mm_subs_epi16(stp1_3, stp2_4); \ - out5 = _mm_subs_epi16(stp1_2, stp1_5); \ - out6 = _mm_subs_epi16(stp1_1, stp1_6); \ - out7 = _mm_subs_epi16(stp1_0, stp2_7); \ + out0 = _mm_add_epi16(stp1_0, stp2_7); \ + out1 = _mm_add_epi16(stp1_1, stp1_6); \ + out2 = _mm_add_epi16(stp1_2, stp1_5); \ + out3 = _mm_add_epi16(stp1_3, stp2_4); \ + out4 = _mm_sub_epi16(stp1_3, stp2_4); \ + out5 = _mm_sub_epi16(stp1_2, stp1_5); \ + out6 = _mm_sub_epi16(stp1_1, stp1_6); \ + out7 = _mm_sub_epi16(stp1_0, stp2_7); \ } void vpx_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, @@ -866,8 +866,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, stp2_0 = _mm_packs_epi32(tmp0, tmp2); stp2_2 = _mm_packs_epi32(tmp6, tmp4); - tmp0 = _mm_adds_epi16(stp1_4, stp1_5); - tmp1 = _mm_subs_epi16(stp1_4, stp1_5); + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); stp2_4 = tmp0; stp2_5 = _mm_unpacklo_epi64(tmp1, zero); @@ -878,8 +878,8 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, { const __m128i lo_56 = 
_mm_unpacklo_epi16(stp2_5, stp2_6); - tmp4 = _mm_adds_epi16(stp2_0, stp2_2); - tmp6 = _mm_subs_epi16(stp2_0, stp2_2); + tmp4 = _mm_add_epi16(stp2_0, stp2_2); + tmp6 = _mm_sub_epi16(stp2_0, stp2_2); stp1_2 = _mm_unpackhi_epi64(tmp6, tmp4); stp1_3 = _mm_unpacklo_epi64(tmp6, tmp4); @@ -896,10 +896,10 @@ void vpx_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, } // Stage4 - tmp0 = _mm_adds_epi16(stp1_3, stp2_4); - tmp1 = _mm_adds_epi16(stp1_2, stp1_5); - tmp2 = _mm_subs_epi16(stp1_3, stp2_4); - tmp3 = _mm_subs_epi16(stp1_2, stp1_5); + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) @@ -3449,7 +3449,7 @@ static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { __m128i ubounded, retval; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); ubounded = _mm_cmpgt_epi16(value, max); retval = _mm_andnot_si128(ubounded, value); ubounded = _mm_and_si128(ubounded, max); @@ -4012,7 +4012,7 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, __m128i dc_value, d; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_subs_epi16(_mm_slli_epi16(one, bd), one); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); int a, i, j; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 20baf820f6b..dee64e3ad36 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -263,7 +263,7 @@ cglobal idct8x8_64_add, 3, 5, 13, input, output, stride RET -; inverse 8x8 2D-DCT transform with only first 10 coeffs non-zero +; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero cglobal idct8x8_12_add, 3, 5, 13, input, output, stride mova m8, [pd_8192] mova m11, [pw_16] diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c index 79c60f7a191..4f9d480ade6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/arm_cpudetect.c @@ -58,8 +58,12 @@ int arm_cpu_caps(void) { #elif defined(_MSC_VER) /* end !CONFIG_RUNTIME_CPU_DETECT */ /*For GetExceptionCode() and EXCEPTION_ILLEGAL_INSTRUCTION.*/ +#ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN +#endif +#ifndef WIN32_EXTRA_LEAN #define WIN32_EXTRA_LEAN +#endif #include <windows.h> int arm_cpu_caps(void) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h index 4aae30e9474..c1f1b602750 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/vpx_timer.h @@ -21,6 +21,8 @@ /* * Win32 specific includes */ +#undef NOMINMAX +#define NOMINMAX #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h 
b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h index 6ba02cf1fcc..5aabb9e3afa 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_ports/x86.h @@ -140,6 +140,11 @@ static INLINE uint64_t xgetbv(void) { #endif #if defined(_MSC_VER) && _MSC_VER >= 1700 +#undef NOMINMAX +#define NOMINMAX +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif #include <windows.h> #if WINAPI_FAMILY_PARTITION(WINAPI_FAMILY_APP) #define getenv(x) NULL diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c index a0f760574c8..9cd10ab2eb4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c @@ -1657,7 +1657,7 @@ static void get_cx_data(struct stream_state *stream, *got_data = 0; while ((pkt = vpx_codec_get_cx_data(&stream->encoder, &iter))) { static size_t fsize = 0; - static int64_t ivf_header_pos = 0; + static FileOffset ivf_header_pos = 0; switch (pkt->kind) { case VPX_CODEC_CX_FRAME_PKT: @@ -1683,7 +1683,7 @@ static void get_cx_data(struct stream_state *stream, fsize += pkt->data.frame.sz; if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) { - const int64_t currpos = ftello(stream->file); + const FileOffset currpos = ftello(stream->file); fseeko(stream->file, ivf_header_pos, SEEK_SET); ivf_write_frame_size(stream->file, fsize); fseeko(stream->file, currpos, SEEK_SET); |
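The DSPr2 inverse-transform hunks above follow two recurring patterns: the slt/movz sequences clamp the reconstructed pixel to [0, 255] directly in registers instead of indexing vpx_ff_cropTbl with lbux (which is why the prefetch_load() warm-up of vpx_ff_cropTbl disappears), and the new a1 > 255 branches split the DC value into byte-sized pieces, since replv.qb can only replicate an 8-bit value, and apply them with saturating quad-byte adds (addu_s.qb). The following is only a rough scalar sketch of that arithmetic, not code from libvpx; the helper names are hypothetical.

    #include <stdint.h>
    #include <stdio.h>

    /* Scalar view of the slt/movz sequence that replaces the
     * vpx_ff_cropTbl lookup: clamp pred + residual to [0, 255]. */
    static uint8_t clamp_add_sketch(uint8_t pred, int residual) {
      int v = pred + residual;
      if (v < 0) v = 0;
      if (v > 255) v = 255;
      return (uint8_t)v;
    }

    /* Scalar view of the a1 > 255 branch in the 4x4 DC path:
     * a1 is split as a1 = 7 * a11 + a12 so each piece fits in a byte
     * and can be applied with repeated saturating byte adds. */
    static uint8_t dc_add_split_sketch(uint8_t pred, int a1) {
      int a11 = a1 >> 3;      /* replicated piece, added seven times */
      int a12 = a1 - 7 * a11; /* remainder applied in the final add */
      int v = pred;
      int i;
      for (i = 0; i < 7; ++i) {
        v += a11;
        if (v > 255) v = 255; /* addu_s.qb saturates at every step */
      }
      v += a12;
      if (v > 255) v = 255;
      return (uint8_t)v;
    }

    int main(void) {
      /* 250 + 20 clamps to 255; 10 - 30 clamps to 0. */
      printf("%u %u\n", clamp_add_sketch(250, 20), clamp_add_sketch(10, -30));
      /* DC value 300 splits as 7 * 37 + 41 = 300 before the saturating adds. */
      printf("%u\n", dc_add_split_sketch(100, 300));
      return 0;
    }

The 8x8 variant in the patch uses the same idea with a coarser split (a11 = a1 >> 2, a12 = a1 - 3 * a11), trading fewer adds per row for the wider store of two words per iteration.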