author    | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-03-08 10:28:10 +0100
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2017-03-20 13:40:30 +0000
commit    | e733310db58160074f574c429d48f8308c0afe17 (patch)
tree      | f8aef4b7e62a69928dbcf880620eece20f98c6df /chromium/third_party/libvpx
parent    | 2f583e4aec1ae3a86fa047829c96b310dc12ecdf (diff)
download  | qtwebengine-chromium-e733310db58160074f574c429d48f8308c0afe17.tar.gz
BASELINE: Update Chromium to 56.0.2924.122
Change-Id: I4e04de8f47e47e501c46ed934c76a431c6337ced
Reviewed-by: Michael Brüning <michael.bruning@qt.io>
Diffstat (limited to 'chromium/third_party/libvpx')
152 files changed, 10123 insertions, 4249 deletions
diff --git a/chromium/third_party/libvpx/BUILD.gn b/chromium/third_party/libvpx/BUILD.gn
index 330022c9047..0a39291205b 100644
--- a/chromium/third_party/libvpx/BUILD.gn
+++ b/chromium/third_party/libvpx/BUILD.gn
@@ -32,9 +32,15 @@ if (current_cpu == "x86") {
 if (is_nacl) {
   platform_include_dir = "//third_party/libvpx/source/config/nacl"
 } else {
-  if (is_posix && !is_mac) {
+  # The mac configurations are currently a relic. They were useful when
+  # x86inc.asm did not work for MACH_O but now the build is identical to the
+  # linux config. iOS for arm on the other hand needs an apple-specific twist in
+  # vpx_config.asm
+  if (is_ios && current_cpu == "arm") {
+    os_category = current_os
+  } else if (is_posix) {  # Should cover linux, mac, and the ios simulator.
     os_category = "linux"
-  } else {
+  } else {  # This should only match windows.
     os_category = current_os
   }
   platform_include_dir =
@@ -48,18 +54,6 @@ config("libvpx_config") {
     "//third_party/libvpx/source/libvpx",
     "$root_gen_dir/third_party/libvpx",  # Provides vpx_rtcd.h.
   ]
-
-  if (current_cpu == "arm" && is_clang) {
-    # TODO(hans) Enable integrated-as (crbug.com/124610).
-    asmflags = [ "-fno-integrated-as" ]
-    if (is_android) {
-      rebased_android_toolchain_root =
-          rebase_path(android_toolchain_root, root_build_dir)
-
-      # Else /usr/bin/as gets picked up.
-      asmflags += [ "-B${rebased_android_toolchain_root}/bin" ]
-    }
-  }
 }
 
 # gn orders flags on a target before flags from configs. The default config
@@ -240,8 +234,11 @@ if (current_cpu == "arm" && arm_assembly_sources != []) {
   action_foreach("convert_arm_assembly") {
     script = "//third_party/libvpx/run_perl.py"
     sources = arm_assembly_sources
+    gen_file =
+        get_label_info("//third_party/libvpx/source/libvpx", "root_gen_dir") +
+        "/{{source_root_relative_dir}}/{{source_file_part}}.S"
     outputs = [
-      "$target_gen_dir/{{source_name_part}}.S",
+      gen_file,
     ]
     if (is_ios) {
       ads2gas_script =
@@ -256,7 +253,7 @@ if (current_cpu == "arm" && arm_assembly_sources != []) {
       "-i",
       "{{source}}",
       "-o",
-      rebase_path("$target_gen_dir/{{source_name_part}}.S"),
+      rebase_path(gen_file),
     ]
   }
@@ -267,6 +264,11 @@ if (current_cpu == "arm" && arm_assembly_sources != []) {
     configs += [ ":libvpx_warnings" ]
     if (cpu_arch_full == "arm-neon" || cpu_arch_full == "arm-neon-cpu-detect") {
       asmflags = [ "-mfpu=neon" ]
+
+      # allow asm files to include generated sources which match the source
+      # tree layout, e.g., vpx_dsp/arm/...
+      include_dirs = [ get_label_info("//third_party/libvpx/source/libvpx",
+                                      "target_gen_dir") ]
     }
     deps = [
       ":convert_arm_assembly",
diff --git a/chromium/third_party/libvpx/README.chromium b/chromium/third_party/libvpx/README.chromium
index 241c65f953a..390b58ebdc1 100644
--- a/chromium/third_party/libvpx/README.chromium
+++ b/chromium/third_party/libvpx/README.chromium
@@ -5,9 +5,9 @@ License: BSD
 License File: source/libvpx/LICENSE
 Security Critical: yes
 
-Date: Tuesday October 04 2016
+Date: Tuesday November 08 2016
 Branch: master
-Commit: 897870497024e4b6cbed4a6a7c1feeab438508f6
+Commit: 5c64c01c7ca3780d30f140e54a30088f780ae66a
 
 Description:
 Contains the sources used to compile libvpx binaries used by Google Chrome and
diff --git a/chromium/third_party/libvpx/generate_gni.sh b/chromium/third_party/libvpx/generate_gni.sh
index cdc8c77e2b8..67ffdf747fe 100755
--- a/chromium/third_party/libvpx/generate_gni.sh
+++ b/chromium/third_party/libvpx/generate_gni.sh
@@ -124,7 +124,7 @@ function convert_srcs_to_project_files {
     # Select all arm neon files ending in _neon.c and all asm files.
    # The asm files need to be included in the intrinsics target because
    # they need the -mfpu=neon flag.
-    # the pattern may need to be updated if vpx_scale gets intrinics
+    # the pattern may need to be updated if vpx_scale gets intrinsics
    local intrinsic_list=$(echo "$source_list" | \
      egrep 'neon.*(\.c|\.asm)$')
  fi
@@ -179,9 +179,9 @@ function make_clean {
 # Lint a pair of vpx_config.h and vpx_config.asm to make sure they match.
 # $1 - Header file directory.
 function lint_config {
-  # mips does not contain any assembly so the header does not need to be
-  # compared to the asm.
-  if [[ "$1" != *mipsel && "$1" != *mips64el ]]; then
+  # mips and native client do not contain any assembly so the headers do not
+  # need to be compared to the asm.
+  if [[ "$1" != *mipsel && "$1" != *mips64el && "$1" != nacl ]]; then
    $BASE_DIR/lint_config.sh \
      -h $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.h \
      -a $BASE_DIR/$LIBVPX_CONFIG_DIR/$1/vpx_config.asm
@@ -219,7 +219,7 @@ function gen_rtcd_header {
  echo "Generate $LIBVPX_CONFIG_DIR/$1/*_rtcd.h files."
 
  rm -rf $BASE_DIR/$TEMP_DIR/libvpx.config
-  if [[ "$2" == "mipsel" || "$2" == "mips64el" ]]; then
+  if [[ "$2" == "mipsel" || "$2" == "mips64el" || "$2" == nacl ]]; then
    print_config_basic $1 > $BASE_DIR/$TEMP_DIR/libvpx.config
  else
    $BASE_DIR/lint_config.sh -p \
@@ -267,15 +267,22 @@ function gen_config_files {
  ./configure $2 > /dev/null
 
  # Disable HAVE_UNISTD_H as it causes vp8 to try to detect how many cpus
-  # available, which doesn't work from iniside a sandbox on linux.
+  # available, which doesn't work from inside a sandbox on linux.
  ( echo '/HAVE_UNISTD_H/s/[01]/0/' ; echo 'w' ; echo 'q' ) | ed -s vpx_config.h
 
-  # Generate vpx_config.asm. Do not create one for mips.
-  if [[ "$1" != *mipsel && "$1" != *mips64el ]]; then
+  # Use the correct ads2gas script.
+  if [[ "$1" == linux* ]]; then
+    local ASM_CONV=ads2gas.pl
+  else
+    local ASM_CONV=ads2gas_apple.pl
+  fi
+
+  # Generate vpx_config.asm. Do not create one for mips or native client.
+  if [[ "$1" != *mipsel && "$1" != *mips64el && "$1" != nacl ]]; then
    if [[ "$1" == *x64* ]] || [[ "$1" == *ia32* ]]; then
      egrep "#define [A-Z0-9_]+ [01]" vpx_config.h | awk '{print "%define " $2 " " $3}' > vpx_config.asm
    else
-      egrep "#define [A-Z0-9_]+ [01]" vpx_config.h | awk '{print $2 " EQU " $3}' | perl $BASE_DIR/$LIBVPX_SRC_DIR/build/make/ads2gas.pl > vpx_config.asm
+      egrep "#define [A-Z0-9_]+ [01]" vpx_config.h | awk '{print $2 " EQU " $3}' | perl $BASE_DIR/$LIBVPX_SRC_DIR/build/make/$ASM_CONV > vpx_config.asm
    fi
  fi
@@ -329,6 +336,8 @@ gen_config_files win/ia32 "--target=x86-win32-vs12 ${all_platforms} ${x86_platforms}"
 gen_config_files win/x64 "--target=x86_64-win64-vs12 ${all_platforms} ${x86_platforms}"
 gen_config_files mac/ia32 "--target=x86-darwin9-gcc ${all_platforms} ${x86_platforms}"
 gen_config_files mac/x64 "--target=x86_64-darwin9-gcc ${all_platforms} ${x86_platforms}"
+gen_config_files ios/arm-neon "--target=armv7-linux-gcc ${all_platforms}"
+gen_config_files ios/arm64 "--target=armv8-linux-gcc ${all_platforms}"
 gen_config_files nacl "--target=generic-gnu $HIGHBD ${all_platforms}"
 
 echo "Remove temporary directory."
@@ -349,6 +358,8 @@ lint_config win/ia32
 lint_config win/x64
 lint_config mac/ia32
 lint_config mac/x64
+lint_config ios/arm-neon
+lint_config ios/arm64
 lint_config nacl
 
 echo "Create temporary directory."
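A minimal sketch of the conversion the gen_config_files hunk above wires up for the new iOS targets, assuming a vpx_config.h full of ordinary "#define NAME 0/1" lines (paths here are illustrative, not the script's variables):

  # RVDS-style "NAME EQU value" text is piped through ads2gas_apple.pl, which
  # emits the apple-as ".set NAME , value" directives seen in the new
  # ios/arm-neon/vpx_config.asm further down in this diff.
  egrep "#define [A-Z0-9_]+ [01]" vpx_config.h \
    | awk '{print $2 " EQU " $3}' \
    | perl ./build/make/ads2gas_apple.pl > vpx_config.asm
  # e.g. "#define HAVE_NEON 1" -> "HAVE_NEON EQU 1" -> ".set HAVE_NEON , 1"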
@@ -370,6 +381,8 @@ gen_rtcd_header win/ia32 x86
 gen_rtcd_header win/x64 x86_64
 gen_rtcd_header mac/ia32 x86
 gen_rtcd_header mac/x64 x86_64
+gen_rtcd_header ios/arm-neon armv7
+gen_rtcd_header ios/arm64 armv8
 gen_rtcd_header nacl nacl
 
 echo "Prepare Makefile."
@@ -452,5 +465,3 @@ cd $BASE_DIR/$LIBVPX_SRC_DIR
 update_readme
 
 cd $BASE_DIR
-
-# TODO(fgalligan): Can we turn on "--enable-realtime-only" for mipsel?
diff --git a/chromium/third_party/libvpx/libvpx_srcs.gni b/chromium/third_party/libvpx/libvpx_srcs.gni
index 560fd843f06..37850e29793 100644
--- a/chromium/third_party/libvpx/libvpx_srcs.gni
+++ b/chromium/third_party/libvpx/libvpx_srcs.gni
@@ -314,6 +314,7 @@ libvpx_srcs_x86 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
@@ -750,6 +751,7 @@ libvpx_srcs_x86_64 = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/convolve.h",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_avx2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_dct32x32_impl_sse2.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_impl_sse2.h",
@@ -1516,6 +1518,10 @@ libvpx_srcs_arm_neon = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c",
@@ -1587,12 +1593,11 @@ libvpx_srcs_arm_neon = [
 libvpx_srcs_arm_neon_assembly = [
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm",
-  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm",
+  "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm",
   "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm",
@@ -1876,6 +1881,7 @@ libvpx_srcs_arm_neon_cpu_detect = [
   "//third_party/libvpx/source/libvpx/vpx/vpx_image.h",
   "//third_party/libvpx/source/libvpx/vpx/vpx_integer.h",
"//third_party/libvpx/source/libvpx/vpx_dsp/add_noise.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h", "//third_party/libvpx/source/libvpx/vpx_dsp/avg.c", "//third_party/libvpx/source/libvpx/vpx_dsp/bitreader.c", @@ -1939,12 +1945,11 @@ libvpx_srcs_arm_neon_cpu_detect = [ libvpx_srcs_arm_neon_cpu_detect_assembly = [ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm", - "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm", - "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon_asm.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_16_neon.asm", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_4_neon.asm", @@ -1984,6 +1989,9 @@ libvpx_srcs_arm_neon_cpu_detect_neon = [ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_vertical_4_dual_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c", @@ -2297,11 +2305,13 @@ libvpx_srcs_arm64 = [ "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c", + "//third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c", "//third_party/libvpx/source/libvpx/vpx_dsp/arm/sad4d_neon.c", diff --git a/chromium/third_party/libvpx/lint_config.sh b/chromium/third_party/libvpx/lint_config.sh index d57e4518fde..1a6c96dfbb9 100755 --- a/chromium/third_party/libvpx/lint_config.sh +++ b/chromium/third_party/libvpx/lint_config.sh @@ -51,6 +51,8 @@ fi combined_config="$(cat $header_file $asm_file | grep -E ' +[01] *$')" # Extra filtering for known exceptions. +combined_config="$(echo "$combined_config" | grep -v WIDE_REFERENCE)" +combined_config="$(echo "$combined_config" | grep -v ARCHITECTURE)" combined_config="$(echo "$combined_config" | grep -v DO1STROUNDING)" # Remove all spaces. 
@@ -62,6 +64,7 @@ combined_config="$(echo "$combined_config" | sed 's/.*define//')"
 # Remove equ in the ASM file.
 combined_config="$(echo "$combined_config" | sed 's/\.equ//')" # gas style
 combined_config="$(echo "$combined_config" | sed 's/equ//')" # rvds style
+combined_config="$(echo "$combined_config" | sed 's/\.set//')" # apple style
 
 # Remove %define in YASM ASM files.
 combined_config="$(echo "$combined_config" | sed 's/%define\s *//')" # yasm style
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp8_rtcd.h
new file mode 100644
index 00000000000..6a7f92cafdb
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp8_rtcd.h
@@ -0,0 +1,218 @@
+#ifndef VP8_RTCD_H_
+#define VP8_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP8
+ */
+
+struct blockd;
+struct macroblockd;
+struct loop_filter_info;
+
+/* Encoder forward decls */
+struct block;
+struct macroblock;
+struct variance_vtable;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon
+
+void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon
+
+void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon
+
+void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon
+
+void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_b vp8_blend_b_c
+
+void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_inner vp8_blend_mb_inner_c
+
+void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride);
+#define vp8_blend_mb_outer vp8_blend_mb_outer_c
+
+int vp8_block_error_c(short *coeff, short *dqcoeff);
+#define vp8_block_error vp8_block_error_c
+
+void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon
+
+void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon
+
+void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch);
+#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon
+
+void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride);
+#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon
+
+int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter vp8_denoiser_filter_neon
+
+int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising);
+#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon
+
+void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride);
+void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride);
+#define vp8_dequant_idct_add vp8_dequant_idct_add_neon
+
+void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs);
+#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon
+
+void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs);
+#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon
+
+void vp8_dequantize_b_c(struct blockd*, short *dqc);
+void vp8_dequantize_b_neon(struct blockd*, short *dqc);
+#define vp8_dequantize_b vp8_dequantize_b_neon
+
+int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_diamond_search_sad vp8_diamond_search_sad_c
+
+void vp8_fast_quantize_b_c(struct block *, struct blockd *);
+void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
+#define vp8_fast_quantize_b vp8_fast_quantize_b_neon
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_c
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
+#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_c
+
+int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_full_search_sad vp8_full_search_sad_c
+
+void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bh vp8_loop_filter_bh_neon
+
+void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_bv vp8_loop_filter_bv_neon
+
+void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon
+
+void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi);
+#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon
+
+void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon
+
+void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon
+
+void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon
+
+void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit);
+void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit);
+#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon
+
+int vp8_mbblock_error_c(struct macroblock *mb, int dc);
+#define vp8_mbblock_error vp8_mbblock_error_c
+
+int vp8_mbuverror_c(struct macroblock *mb);
+#define vp8_mbuverror vp8_mbuverror_c
+
+int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
+#define vp8_refining_search_sad vp8_refining_search_sad_c
+
+void vp8_regular_quantize_b_c(struct block *, struct blockd *);
+#define vp8_regular_quantize_b vp8_regular_quantize_b_c
+
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+void vp8_short_fdct8x4_neon(short *input, short *output, int pitch);
+#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon
+
+void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride);
+#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon
+
+void vp8_short_inv_walsh4x4_c(short *input, short *output);
+void vp8_short_inv_walsh4x4_neon(short *input, short *output);
+#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon
+
+void vp8_short_inv_walsh4x4_1_c(short *input, short *output);
+#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+void vp8_short_walsh4x4_neon(short *input, short *output, int pitch);
+#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon
+
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon
+
+void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon
+
+void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon
+
+void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch);
+#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon
+
+void vp8_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
new file mode 100644
index 00000000000..5f0e862cbfa
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vp9_rtcd.h
@@ -0,0 +1,113 @@
+#ifndef VP9_RTCD_H_
+#define VP9_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * VP9
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
+
+struct macroblockd;
+
+/* Encoder forward decls */
+struct macroblock;
+struct vp9_variance_vtable;
+struct search_site_config;
+struct mv;
+union int_mv;
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz);
+#define vp9_block_error vp9_block_error_c
+
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size);
+#define vp9_block_error_fp vp9_block_error_fp_neon
+
+int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude);
+#define vp9_denoiser_filter vp9_denoiser_filter_c
+
+int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
+#define vp9_diamond_search_sad vp9_diamond_search_sad_c
+
+void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_fdct8x8_quant vp9_fdct8x8_quant_neon
+
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_c
+
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_c
+
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_c
+
+void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
+#define vp9_filter_by_weight16x16 vp9_filter_by_weight16x16_c
+
+void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight);
+#define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_c
+
+int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv);
+#define vp9_full_search_sad vp9_full_search_sad_c
+
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vp9_fwht4x4 vp9_fwht4x4_c
+
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type);
+#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c
+
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon
+
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type);
+#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
+
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp vp9_quantize_fp_neon
+
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
+
+void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst);
+#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c
+
+void vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count);
+#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c
+
+void vp9_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.asm b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.asm
new file mode 100644
index 00000000000..08380931415
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.asm
@@ -0,0 +1,87 @@
+@ This file was created from a .asm file
+@ using the ads2gas_apple.pl script.
+
+ .set WIDE_REFERENCE, 0
+ .set ARCHITECTURE, 5
+ .set DO1STROUNDING, 0
+.set ARCH_ARM , 1
+.set ARCH_MIPS , 0
+.set ARCH_X86 , 0
+.set ARCH_X86_64 , 0
+.set HAVE_NEON , 1
+.set HAVE_NEON_ASM , 1
+.set HAVE_MIPS32 , 0
+.set HAVE_DSPR2 , 0
+.set HAVE_MSA , 0
+.set HAVE_MIPS64 , 0
+.set HAVE_MMX , 0
+.set HAVE_SSE , 0
+.set HAVE_SSE2 , 0
+.set HAVE_SSE3 , 0
+.set HAVE_SSSE3 , 0
+.set HAVE_SSE4_1 , 0
+.set HAVE_AVX , 0
+.set HAVE_AVX2 , 0
+.set HAVE_VPX_PORTS , 1
+.set HAVE_PTHREAD_H , 1
+.set HAVE_UNISTD_H , 0
+.set CONFIG_DEPENDENCY_TRACKING , 1
+.set CONFIG_EXTERNAL_BUILD , 1
+.set CONFIG_INSTALL_DOCS , 0
+.set CONFIG_INSTALL_BINS , 1
+.set CONFIG_INSTALL_LIBS , 1
+.set CONFIG_INSTALL_SRCS , 0
+.set CONFIG_DEBUG , 0
+.set CONFIG_GPROF , 0
+.set CONFIG_GCOV , 0
+.set CONFIG_RVCT , 0
+.set CONFIG_GCC , 1
+.set CONFIG_MSVS , 0
+.set CONFIG_PIC , 0
+.set CONFIG_BIG_ENDIAN , 0
+.set CONFIG_CODEC_SRCS , 0
+.set CONFIG_DEBUG_LIBS , 0
+.set CONFIG_DEQUANT_TOKENS , 0
+.set CONFIG_DC_RECON , 0
+.set CONFIG_RUNTIME_CPU_DETECT , 0
+.set CONFIG_POSTPROC , 1
+.set CONFIG_VP9_POSTPROC , 1
+.set CONFIG_MULTITHREAD , 1
+.set CONFIG_INTERNAL_STATS , 0
+.set CONFIG_VP8_ENCODER , 1
+.set CONFIG_VP8_DECODER , 1
+.set CONFIG_VP9_ENCODER , 1
+.set CONFIG_VP9_DECODER , 1
+.set CONFIG_VP8 , 1
+.set CONFIG_VP9 , 1
+.set CONFIG_ENCODERS , 1
+.set CONFIG_DECODERS , 1
+.set CONFIG_STATIC_MSVCRT , 0
+.set CONFIG_SPATIAL_RESAMPLING , 1
+.set CONFIG_REALTIME_ONLY , 1
+.set CONFIG_ONTHEFLY_BITPACKING , 0
+.set CONFIG_ERROR_CONCEALMENT , 0
+.set CONFIG_SHARED , 0
+.set CONFIG_STATIC , 1
+.set CONFIG_SMALL , 0
+.set CONFIG_POSTPROC_VISUALIZER , 0
+.set CONFIG_OS_SUPPORT , 1
+.set CONFIG_UNIT_TESTS , 1
+.set CONFIG_WEBM_IO , 1
+.set CONFIG_LIBYUV , 1
+.set CONFIG_DECODE_PERF_TESTS , 0
+.set CONFIG_ENCODE_PERF_TESTS , 0
+.set CONFIG_MULTI_RES_ENCODING , 1
+.set CONFIG_TEMPORAL_DENOISING , 1
+.set CONFIG_VP9_TEMPORAL_DENOISING , 1
+.set CONFIG_COEFFICIENT_RANGE_CHECKING , 0
+.set CONFIG_VP9_HIGHBITDEPTH , 0
+.set CONFIG_BETTER_HW_COMPATIBILITY , 0
+.set CONFIG_EXPERIMENTAL , 0
+.set CONFIG_SIZE_LIMIT , 1
+.set CONFIG_SPATIAL_SVC , 0
+.set CONFIG_FP_MB_STATS , 0
+.set CONFIG_EMULATE_HARDWARE , 0
+.set CONFIG_MISC_FIXES , 0
+.set DECODE_WIDTH_LIMIT , 16384
+.set DECODE_HEIGHT_LIMIT , 16384
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.c b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.c
new file mode 100644
index 00000000000..22b0617ad4d
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.c
@@ -0,0 +1,10 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/* */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS. All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+#include "vpx/vpx_codec.h"
+static const char* const cfg = "--target=armv7-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs";
+const char *vpx_codec_build_config(void) {return cfg;}
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.h
new file mode 100644
index 00000000000..2d9a3cedbd0
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_config.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */
+/* */
+/* Use of this source code is governed by a BSD-style license */
+/* that can be found in the LICENSE file in the root of the source */
+/* tree. An additional intellectual property rights grant can be found */
+/* in the file PATENTS. All contributing project authors may */
+/* be found in the AUTHORS file in the root of the source tree. */
+/* This file automatically generated by configure. Do not edit! */
+#ifndef VPX_CONFIG_H
+#define VPX_CONFIG_H
+#define RESTRICT
+#define INLINE inline
+#define ARCH_ARM 1
+#define ARCH_MIPS 0
+#define ARCH_X86 0
+#define ARCH_X86_64 0
+#define HAVE_NEON 1
+#define HAVE_NEON_ASM 1
+#define HAVE_MIPS32 0
+#define HAVE_DSPR2 0
+#define HAVE_MSA 0
+#define HAVE_MIPS64 0
+#define HAVE_MMX 0
+#define HAVE_SSE 0
+#define HAVE_SSE2 0
+#define HAVE_SSE3 0
+#define HAVE_SSSE3 0
+#define HAVE_SSE4_1 0
+#define HAVE_AVX 0
+#define HAVE_AVX2 0
+#define HAVE_VPX_PORTS 1
+#define HAVE_PTHREAD_H 1
+#define HAVE_UNISTD_H 0
+#define CONFIG_DEPENDENCY_TRACKING 1
+#define CONFIG_EXTERNAL_BUILD 1
+#define CONFIG_INSTALL_DOCS 0
+#define CONFIG_INSTALL_BINS 1
+#define CONFIG_INSTALL_LIBS 1
+#define CONFIG_INSTALL_SRCS 0
+#define CONFIG_DEBUG 0
+#define CONFIG_GPROF 0
+#define CONFIG_GCOV 0
+#define CONFIG_RVCT 0
+#define CONFIG_GCC 1
+#define CONFIG_MSVS 0
+#define CONFIG_PIC 0
+#define CONFIG_BIG_ENDIAN 0
+#define CONFIG_CODEC_SRCS 0
+#define CONFIG_DEBUG_LIBS 0
+#define CONFIG_DEQUANT_TOKENS 0
+#define CONFIG_DC_RECON 0
+#define CONFIG_RUNTIME_CPU_DETECT 0
+#define CONFIG_POSTPROC 1
+#define CONFIG_VP9_POSTPROC 1
+#define CONFIG_MULTITHREAD 1
+#define CONFIG_INTERNAL_STATS 0
+#define CONFIG_VP8_ENCODER 1
+#define CONFIG_VP8_DECODER 1
+#define CONFIG_VP9_ENCODER 1
+#define CONFIG_VP9_DECODER 1
+#define CONFIG_VP8 1
+#define CONFIG_VP9 1
+#define CONFIG_ENCODERS 1
+#define CONFIG_DECODERS 1
+#define CONFIG_STATIC_MSVCRT 0
+#define CONFIG_SPATIAL_RESAMPLING 1
+#define CONFIG_REALTIME_ONLY 1
+#define CONFIG_ONTHEFLY_BITPACKING 0
+#define CONFIG_ERROR_CONCEALMENT 0
+#define CONFIG_SHARED 0
+#define CONFIG_STATIC 1
+#define CONFIG_SMALL 0
+#define CONFIG_POSTPROC_VISUALIZER 0
+#define CONFIG_OS_SUPPORT 1
+#define CONFIG_UNIT_TESTS 1
+#define CONFIG_WEBM_IO 1
+#define CONFIG_LIBYUV 1
+#define CONFIG_DECODE_PERF_TESTS 0
+#define CONFIG_ENCODE_PERF_TESTS 0
+#define CONFIG_MULTI_RES_ENCODING 1
+#define CONFIG_TEMPORAL_DENOISING 1
+#define CONFIG_VP9_TEMPORAL_DENOISING 1
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0
+#define CONFIG_VP9_HIGHBITDEPTH 0
+#define CONFIG_BETTER_HW_COMPATIBILITY 0
+#define CONFIG_EXPERIMENTAL 0
+#define CONFIG_SIZE_LIMIT 1
+#define CONFIG_SPATIAL_SVC 0
+#define CONFIG_FP_MB_STATS 0
+#define CONFIG_EMULATE_HARDWARE 0
+#define CONFIG_MISC_FIXES 0
+#define DECODE_WIDTH_LIMIT 16384
+#define DECODE_HEIGHT_LIMIT 16384
+#endif /* VPX_CONFIG_H */
diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
new file mode 100644
index 00000000000..a5c50f21727
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_dsp_rtcd.h
@@ -0,0 +1,851 @@
+#ifndef VPX_DSP_RTCD_H_
+#define VPX_DSP_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+/*
+ * DSP
+ */
+
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+unsigned int vpx_avg_4x4_c(const uint8_t *, int p);
+unsigned int vpx_avg_4x4_neon(const uint8_t *, int p);
+#define vpx_avg_4x4 vpx_avg_4x4_neon
+
+unsigned int vpx_avg_8x8_c(const uint8_t *, int p);
+unsigned int vpx_avg_8x8_neon(const uint8_t *, int p);
+#define vpx_avg_8x8 vpx_avg_8x8_neon
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride);
+#define vpx_comp_avg_pred vpx_comp_avg_pred_c
+
+void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8 vpx_convolve8_neon
+
+void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg vpx_convolve8_avg_neon
+
+void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon
+
+void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon
+
+void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_horiz vpx_convolve8_horiz_neon
+
+void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve8_vert vpx_convolve8_vert_neon
+
+void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_avg vpx_convolve_avg_neon
+
+void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h);
+#define vpx_convolve_copy vpx_convolve_copy_neon
+
+void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c
+
+void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c
+
+void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c
+
+void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c
+
+void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c
+
+void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c
+
+void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon
+
+void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c
+
+void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c
+
+void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c
+
+void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c
+
+void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c
+
+void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c
+
+void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c
+
+void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c
+
+void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c
+
+void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c
+
+void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c
+
+void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c
+
+void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c
+
+void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon
+
+void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c
+
+void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon
+
+void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon
+
+void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c
+
+void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c
+
+void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c
+
+void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c
+
+void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c
+
+void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c
+
+void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c
+
+void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c
+
+void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c
+
+void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c
+
+void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c
+
+void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c
+
+void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c
+
+void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon
+
+void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon
+
+void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon
+
+void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon
+
+void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon
+
+void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon
+
+void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon
+
+void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon
+
+void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon
+
+void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon
+
+void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon
+
+void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon
+
+void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon
+
+void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon
+
+void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon
+
+void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon
+
+void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16 vpx_fdct16x16_c
+
+void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct16x16_1 vpx_fdct16x16_1_c
+
+void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32 vpx_fdct32x32_c
+
+void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_1 vpx_fdct32x32_1_c
+
+void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c
+
+void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4 vpx_fdct4x4_c
+
+void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct4x4_1 vpx_fdct4x4_1_c
+
+void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8 vpx_fdct8x8_neon
+
+void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride);
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride);
+#define vpx_fdct8x8_1 vpx_fdct8x8_1_neon
+
+void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get16x16var vpx_get16x16var_neon
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride);
+#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon
+
+void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum);
+#define vpx_get8x8var vpx_get8x8var_neon
+
+unsigned int vpx_get_mb_ss_c(const int16_t *);
+#define vpx_get_mb_ss vpx_get_mb_ss_c
+
+void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon
+
+void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon
+
+void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon
+
+void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon
+
+void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon
+
+void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff);
+void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff);
+#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon
+
+void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c
+
+void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon
+
+void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon
+
+void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon
+
+void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon
+
+void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
+
+void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
+
+void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon
+
+void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon
+
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon
+
+void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon
+
+void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon
+
+int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
+int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
+#define vpx_int_pro_col vpx_int_pro_col_neon
+
+void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+#define vpx_int_pro_row vpx_int_pro_row_neon
+
+void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c
+
+void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c
+
+void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon
+
+void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon
+
+void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh);
+#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon
+
+void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1);
+void
vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int 
dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_neon + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +#define vpx_plane_add_noise vpx_plane_add_noise_c + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_neon + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x16x3 vpx_sad16x16x3_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x16x4d vpx_sad16x16x4d_neon + +void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x16x8 vpx_sad16x16x8_c + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, 
int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_neon + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x3 vpx_sad16x8x3_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x8 vpx_sad16x8x8_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_neon + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_c + +void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x3 vpx_sad32x32x3_c + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x4d vpx_sad32x32x4d_neon + +void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x8 vpx_sad32x32x8_c + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x64x4d vpx_sad32x64x4d_c + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, 
const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x3 vpx_sad4x4x3_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x8 vpx_sad4x4x8_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x8x8 vpx_sad4x8x8_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad64x32x4d vpx_sad64x32x4d_c + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_neon + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_c + +void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x3 vpx_sad64x64x3_c + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x4d vpx_sad64x64x4d_neon + +void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x8 vpx_sad64x64x8_c + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x3_c(const uint8_t *src_ptr, int 
src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x3 vpx_sad8x16x3_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x8 vpx_sad8x16x8_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x4x8 vpx_sad8x4x8_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x3 vpx_sad8x8x3_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x8 vpx_sad8x8x8_c + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t 
*filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t 
*sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c + +uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sse); +uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon + +void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride); +#define vpx_subtract_block vpx_subtract_block_neon + +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size); +#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c + +void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon + +void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon + +void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon + +void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon + +void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon + +void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon + +void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon + +void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon + +unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x16 vpx_variance16x16_neon + +unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x32 vpx_variance16x32_c + +unsigned 
int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance16x8 vpx_variance16x8_neon + +unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x16 vpx_variance32x16_c + +unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x32 vpx_variance32x32_neon + +unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance32x64 vpx_variance32x64_neon + +unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x4 vpx_variance4x4_c + +unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance4x8 vpx_variance4x8_c + +unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x32 vpx_variance64x32_neon + +unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance64x64 vpx_variance64x64_neon + +unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x16 vpx_variance8x16_neon + +unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x4 vpx_variance8x4_c + +unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse); +#define vpx_variance8x8 vpx_variance8x8_neon + +void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c + +int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl); +int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl); +#define vpx_vector_var vpx_vector_var_neon + +void vpx_dsp_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + 
(void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_scale_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_scale_rtcd.h new file mode 100644 index 00000000000..a1564b7ad6c --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm-neon/vpx_scale_rtcd.h @@ -0,0 +1,71 @@ +#ifndef VPX_SCALE_RTCD_H_ +#define VPX_SCALE_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c + +void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c + +void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width); +#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c + +void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c + +void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c + +void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c + +void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width); +#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c + +void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c + +void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c + +void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_borders vpx_extend_frame_borders_c + +void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf); +#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c + +void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc); +#define vpx_yv12_copy_y vpx_yv12_copy_y_c + +void vpx_scale_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp8_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp8_rtcd.h new file mode 100644 index 00000000000..6a7f92cafdb --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp8_rtcd.h @@ -0,0 +1,218 @@ +#ifndef VP8_RTCD_H_ +#define VP8_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern 
+#endif + +/* + * VP8 + */ + +struct blockd; +struct macroblockd; +struct loop_filter_info; + +/* Encoder forward decls */ +struct block; +struct macroblock; +struct variance_vtable; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +void vp8_bilinear_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict16x16 vp8_bilinear_predict16x16_neon + +void vp8_bilinear_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict4x4 vp8_bilinear_predict4x4_neon + +void vp8_bilinear_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x4 vp8_bilinear_predict8x4_neon + +void vp8_bilinear_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_bilinear_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_bilinear_predict8x8 vp8_bilinear_predict8x8_neon + +void vp8_blend_b_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_b vp8_blend_b_c + +void vp8_blend_mb_inner_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_inner vp8_blend_mb_inner_c + +void vp8_blend_mb_outer_c(unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride); +#define vp8_blend_mb_outer vp8_blend_mb_outer_c + +int vp8_block_error_c(short *coeff, short *dqcoeff); +#define vp8_block_error vp8_block_error_c + +void vp8_copy_mem16x16_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem16x16_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem16x16 vp8_copy_mem16x16_neon + +void vp8_copy_mem8x4_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x4_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x4 vp8_copy_mem8x4_neon + +void vp8_copy_mem8x8_c(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +void vp8_copy_mem8x8_neon(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch); +#define vp8_copy_mem8x8 vp8_copy_mem8x8_neon + +void vp8_dc_only_idct_add_c(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +void vp8_dc_only_idct_add_neon(short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride); +#define vp8_dc_only_idct_add vp8_dc_only_idct_add_neon + +int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_neon(unsigned char *mc_running_avg_y, int mc_avg_y_stride, unsigned char *running_avg_y, int avg_y_stride, unsigned 
char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter vp8_denoiser_filter_neon + +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +int vp8_denoiser_filter_uv_neon(unsigned char *mc_running_avg, int mc_avg_stride, unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising); +#define vp8_denoiser_filter_uv vp8_denoiser_filter_uv_neon + +void vp8_dequant_idct_add_c(short *input, short *dq, unsigned char *output, int stride); +void vp8_dequant_idct_add_neon(short *input, short *dq, unsigned char *output, int stride); +#define vp8_dequant_idct_add vp8_dequant_idct_add_neon + +void vp8_dequant_idct_add_uv_block_c(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +void vp8_dequant_idct_add_uv_block_neon(short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs); +#define vp8_dequant_idct_add_uv_block vp8_dequant_idct_add_uv_block_neon + +void vp8_dequant_idct_add_y_block_c(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +void vp8_dequant_idct_add_y_block_neon(short *q, short *dq, unsigned char *dst, int stride, char *eobs); +#define vp8_dequant_idct_add_y_block vp8_dequant_idct_add_y_block_neon + +void vp8_dequantize_b_c(struct blockd*, short *dqc); +void vp8_dequantize_b_neon(struct blockd*, short *dqc); +#define vp8_dequantize_b vp8_dequantize_b_neon + +int vp8_diamond_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_diamond_search_sad vp8_diamond_search_sad_c + +void vp8_fast_quantize_b_c(struct block *, struct blockd *); +void vp8_fast_quantize_b_neon(struct block *, struct blockd *); +#define vp8_fast_quantize_b vp8_fast_quantize_b_neon + +void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight16x16 vp8_filter_by_weight16x16_c + +void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight4x4 vp8_filter_by_weight4x4_c + +void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight); +#define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_c + +int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_full_search_sad vp8_full_search_sad_c + +void vp8_loop_filter_bh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bh vp8_loop_filter_bh_neon + +void vp8_loop_filter_bv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_bv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int 
uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_bv vp8_loop_filter_bv_neon + +void vp8_loop_filter_mbh_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbh_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbh vp8_loop_filter_mbh_neon + +void vp8_loop_filter_mbv_c(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +void vp8_loop_filter_mbv_neon(unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi); +#define vp8_loop_filter_mbv vp8_loop_filter_mbv_neon + +void vp8_loop_filter_bhs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bh vp8_loop_filter_bhs_neon + +void vp8_loop_filter_bvs_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_bvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_bv vp8_loop_filter_bvs_neon + +void vp8_loop_filter_simple_horizontal_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbhs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbh vp8_loop_filter_mbhs_neon + +void vp8_loop_filter_simple_vertical_edge_c(unsigned char *y, int ystride, const unsigned char *blimit); +void vp8_loop_filter_mbvs_neon(unsigned char *y, int ystride, const unsigned char *blimit); +#define vp8_loop_filter_simple_mbv vp8_loop_filter_mbvs_neon + +int vp8_mbblock_error_c(struct macroblock *mb, int dc); +#define vp8_mbblock_error vp8_mbblock_error_c + +int vp8_mbuverror_c(struct macroblock *mb); +#define vp8_mbuverror vp8_mbuverror_c + +int vp8_refining_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv); +#define vp8_refining_search_sad vp8_refining_search_sad_c + +void vp8_regular_quantize_b_c(struct block *, struct blockd *); +#define vp8_regular_quantize_b vp8_regular_quantize_b_c + +void vp8_short_fdct4x4_c(short *input, short *output, int pitch); +void vp8_short_fdct4x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct4x4 vp8_short_fdct4x4_neon + +void vp8_short_fdct8x4_c(short *input, short *output, int pitch); +void vp8_short_fdct8x4_neon(short *input, short *output, int pitch); +#define vp8_short_fdct8x4 vp8_short_fdct8x4_neon + +void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +void vp8_short_idct4x4llm_neon(short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride); +#define vp8_short_idct4x4llm vp8_short_idct4x4llm_neon + +void vp8_short_inv_walsh4x4_c(short *input, short *output); +void vp8_short_inv_walsh4x4_neon(short *input, short *output); +#define vp8_short_inv_walsh4x4 vp8_short_inv_walsh4x4_neon + +void vp8_short_inv_walsh4x4_1_c(short *input, short *output); +#define vp8_short_inv_walsh4x4_1 vp8_short_inv_walsh4x4_1_c + +void vp8_short_walsh4x4_c(short *input, short *output, int pitch); +void vp8_short_walsh4x4_neon(short *input, short *output, int pitch); +#define vp8_short_walsh4x4 vp8_short_walsh4x4_neon + 
+void vp8_sixtap_predict16x16_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict16x16_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict16x16 vp8_sixtap_predict16x16_neon + +void vp8_sixtap_predict4x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict4x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict4x4 vp8_sixtap_predict4x4_neon + +void vp8_sixtap_predict8x4_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x4_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x4 vp8_sixtap_predict8x4_neon + +void vp8_sixtap_predict8x8_c(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +void vp8_sixtap_predict8x8_neon(unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch); +#define vp8_sixtap_predict8x8 vp8_sixtap_predict8x8_neon + +void vp8_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h new file mode 100644 index 00000000000..5f0e862cbfa --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vp9_rtcd.h @@ -0,0 +1,113 @@ +#ifndef VP9_RTCD_H_ +#define VP9_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * VP9 + */ + +#include "vpx/vpx_integer.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_enums.h" + +struct macroblockd; + +/* Encoder forward decls */ +struct macroblock; +struct vp9_variance_vtable; +struct search_site_config; +struct mv; +union int_mv; +struct yv12_buffer_config; + +#ifdef __cplusplus +extern "C" { +#endif + +int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz); +#define vp9_block_error vp9_block_error_c + +int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +int64_t vp9_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size); +#define vp9_block_error_fp vp9_block_error_fp_neon + +int vp9_denoiser_filter_c(const uint8_t *sig, int sig_stride, const uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg, int avg_stride, int increase_denoising, BLOCK_SIZE bs, int motion_magnitude); +#define vp9_denoiser_filter vp9_denoiser_filter_c + +int vp9_diamond_search_sad_c(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); +#define vp9_diamond_search_sad vp9_diamond_search_sad_c + +void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t 
*eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_fdct8x8_quant vp9_fdct8x8_quant_neon + +void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht16x16 vp9_fht16x16_c + +void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht4x4 vp9_fht4x4_c + +void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type); +#define vp9_fht8x8 vp9_fht8x8_c + +void vp9_filter_by_weight16x16_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight16x16 vp9_filter_by_weight16x16_c + +void vp9_filter_by_weight8x8_c(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int src_weight); +#define vp9_filter_by_weight8x8 vp9_filter_by_weight8x8_c + +int vp9_full_search_sad_c(const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv); +#define vp9_full_search_sad vp9_full_search_sad_c + +void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vp9_fwht4x4 vp9_fwht4x4_c + +void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *output, int pitch, int tx_type); +#define vp9_iht16x16_256_add vp9_iht16x16_256_add_c + +void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht4x4_16_add vp9_iht4x4_16_add_neon + +void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type); +#define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon + +void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp vp9_quantize_fp_neon + +void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c + +void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst); +#define vp9_scale_and_extend_frame vp9_scale_and_extend_frame_c + +void 
vp9_temporal_filter_apply_c(uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count); +#define vp9_temporal_filter_apply vp9_temporal_filter_apply_c + +void vp9_rtcd(void); + +#include "vpx_config.h" + +#ifdef RTCD_C +#include "vpx_ports/arm.h" +static void setup_rtcd_internal(void) +{ + int flags = arm_cpu_caps(); + + (void)flags; + +} +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.asm b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.asm new file mode 100644 index 00000000000..787a3e8dd69 --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.asm @@ -0,0 +1,87 @@ +@ This file was created from a .asm file +@ using the ads2gas_apple.pl script. + + .set WIDE_REFERENCE, 0 + .set ARCHITECTURE, 5 + .set DO1STROUNDING, 0 +.set ARCH_ARM , 1 +.set ARCH_MIPS , 0 +.set ARCH_X86 , 0 +.set ARCH_X86_64 , 0 +.set HAVE_NEON , 1 +.set HAVE_NEON_ASM , 0 +.set HAVE_MIPS32 , 0 +.set HAVE_DSPR2 , 0 +.set HAVE_MSA , 0 +.set HAVE_MIPS64 , 0 +.set HAVE_MMX , 0 +.set HAVE_SSE , 0 +.set HAVE_SSE2 , 0 +.set HAVE_SSE3 , 0 +.set HAVE_SSSE3 , 0 +.set HAVE_SSE4_1 , 0 +.set HAVE_AVX , 0 +.set HAVE_AVX2 , 0 +.set HAVE_VPX_PORTS , 1 +.set HAVE_PTHREAD_H , 1 +.set HAVE_UNISTD_H , 0 +.set CONFIG_DEPENDENCY_TRACKING , 1 +.set CONFIG_EXTERNAL_BUILD , 1 +.set CONFIG_INSTALL_DOCS , 0 +.set CONFIG_INSTALL_BINS , 1 +.set CONFIG_INSTALL_LIBS , 1 +.set CONFIG_INSTALL_SRCS , 0 +.set CONFIG_DEBUG , 0 +.set CONFIG_GPROF , 0 +.set CONFIG_GCOV , 0 +.set CONFIG_RVCT , 0 +.set CONFIG_GCC , 1 +.set CONFIG_MSVS , 0 +.set CONFIG_PIC , 0 +.set CONFIG_BIG_ENDIAN , 0 +.set CONFIG_CODEC_SRCS , 0 +.set CONFIG_DEBUG_LIBS , 0 +.set CONFIG_DEQUANT_TOKENS , 0 +.set CONFIG_DC_RECON , 0 +.set CONFIG_RUNTIME_CPU_DETECT , 0 +.set CONFIG_POSTPROC , 1 +.set CONFIG_VP9_POSTPROC , 1 +.set CONFIG_MULTITHREAD , 1 +.set CONFIG_INTERNAL_STATS , 0 +.set CONFIG_VP8_ENCODER , 1 +.set CONFIG_VP8_DECODER , 1 +.set CONFIG_VP9_ENCODER , 1 +.set CONFIG_VP9_DECODER , 1 +.set CONFIG_VP8 , 1 +.set CONFIG_VP9 , 1 +.set CONFIG_ENCODERS , 1 +.set CONFIG_DECODERS , 1 +.set CONFIG_STATIC_MSVCRT , 0 +.set CONFIG_SPATIAL_RESAMPLING , 1 +.set CONFIG_REALTIME_ONLY , 1 +.set CONFIG_ONTHEFLY_BITPACKING , 0 +.set CONFIG_ERROR_CONCEALMENT , 0 +.set CONFIG_SHARED , 0 +.set CONFIG_STATIC , 1 +.set CONFIG_SMALL , 0 +.set CONFIG_POSTPROC_VISUALIZER , 0 +.set CONFIG_OS_SUPPORT , 1 +.set CONFIG_UNIT_TESTS , 1 +.set CONFIG_WEBM_IO , 1 +.set CONFIG_LIBYUV , 1 +.set CONFIG_DECODE_PERF_TESTS , 0 +.set CONFIG_ENCODE_PERF_TESTS , 0 +.set CONFIG_MULTI_RES_ENCODING , 1 +.set CONFIG_TEMPORAL_DENOISING , 1 +.set CONFIG_VP9_TEMPORAL_DENOISING , 1 +.set CONFIG_COEFFICIENT_RANGE_CHECKING , 0 +.set CONFIG_VP9_HIGHBITDEPTH , 0 +.set CONFIG_BETTER_HW_COMPATIBILITY , 0 +.set CONFIG_EXPERIMENTAL , 0 +.set CONFIG_SIZE_LIMIT , 1 +.set CONFIG_SPATIAL_SVC , 0 +.set CONFIG_FP_MB_STATS , 0 +.set CONFIG_EMULATE_HARDWARE , 0 +.set CONFIG_MISC_FIXES , 0 +.set DECODE_WIDTH_LIMIT , 16384 +.set DECODE_HEIGHT_LIMIT , 16384 diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c new file mode 100644 index 00000000000..5f93ebfb676 --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.c @@ -0,0 +1,10 @@ +/* Copyright (c) 2011 The WebM project 
authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +#include "vpx/vpx_codec.h" +static const char* const cfg = "--target=arm64-linux-gcc --enable-external-build --enable-postproc --enable-multi-res-encoding --enable-temporal-denoising --enable-vp9-temporal-denoising --enable-vp9-postproc --size-limit=16384x16384 --enable-realtime-only --disable-install-docs"; +const char *vpx_codec_build_config(void) {return cfg;} diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.h new file mode 100644 index 00000000000..9a9ecb4dc19 --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_config.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2011 The WebM project authors. All Rights Reserved. */ +/* */ +/* Use of this source code is governed by a BSD-style license */ +/* that can be found in the LICENSE file in the root of the source */ +/* tree. An additional intellectual property rights grant can be found */ +/* in the file PATENTS. All contributing project authors may */ +/* be found in the AUTHORS file in the root of the source tree. */ +/* This file automatically generated by configure. Do not edit! */ +#ifndef VPX_CONFIG_H +#define VPX_CONFIG_H +#define RESTRICT +#define INLINE inline +#define ARCH_ARM 1 +#define ARCH_MIPS 0 +#define ARCH_X86 0 +#define ARCH_X86_64 0 +#define HAVE_NEON 1 +#define HAVE_NEON_ASM 0 +#define HAVE_MIPS32 0 +#define HAVE_DSPR2 0 +#define HAVE_MSA 0 +#define HAVE_MIPS64 0 +#define HAVE_MMX 0 +#define HAVE_SSE 0 +#define HAVE_SSE2 0 +#define HAVE_SSE3 0 +#define HAVE_SSSE3 0 +#define HAVE_SSE4_1 0 +#define HAVE_AVX 0 +#define HAVE_AVX2 0 +#define HAVE_VPX_PORTS 1 +#define HAVE_PTHREAD_H 1 +#define HAVE_UNISTD_H 0 +#define CONFIG_DEPENDENCY_TRACKING 1 +#define CONFIG_EXTERNAL_BUILD 1 +#define CONFIG_INSTALL_DOCS 0 +#define CONFIG_INSTALL_BINS 1 +#define CONFIG_INSTALL_LIBS 1 +#define CONFIG_INSTALL_SRCS 0 +#define CONFIG_DEBUG 0 +#define CONFIG_GPROF 0 +#define CONFIG_GCOV 0 +#define CONFIG_RVCT 0 +#define CONFIG_GCC 1 +#define CONFIG_MSVS 0 +#define CONFIG_PIC 0 +#define CONFIG_BIG_ENDIAN 0 +#define CONFIG_CODEC_SRCS 0 +#define CONFIG_DEBUG_LIBS 0 +#define CONFIG_DEQUANT_TOKENS 0 +#define CONFIG_DC_RECON 0 +#define CONFIG_RUNTIME_CPU_DETECT 0 +#define CONFIG_POSTPROC 1 +#define CONFIG_VP9_POSTPROC 1 +#define CONFIG_MULTITHREAD 1 +#define CONFIG_INTERNAL_STATS 0 +#define CONFIG_VP8_ENCODER 1 +#define CONFIG_VP8_DECODER 1 +#define CONFIG_VP9_ENCODER 1 +#define CONFIG_VP9_DECODER 1 +#define CONFIG_VP8 1 +#define CONFIG_VP9 1 +#define CONFIG_ENCODERS 1 +#define CONFIG_DECODERS 1 +#define CONFIG_STATIC_MSVCRT 0 +#define CONFIG_SPATIAL_RESAMPLING 1 +#define CONFIG_REALTIME_ONLY 1 +#define CONFIG_ONTHEFLY_BITPACKING 0 +#define CONFIG_ERROR_CONCEALMENT 0 +#define CONFIG_SHARED 0 +#define CONFIG_STATIC 1 +#define CONFIG_SMALL 0 +#define CONFIG_POSTPROC_VISUALIZER 0 +#define CONFIG_OS_SUPPORT 1 +#define CONFIG_UNIT_TESTS 1 +#define CONFIG_WEBM_IO 1 +#define CONFIG_LIBYUV 1 +#define CONFIG_DECODE_PERF_TESTS 0 +#define CONFIG_ENCODE_PERF_TESTS 0 +#define CONFIG_MULTI_RES_ENCODING 1 +#define CONFIG_TEMPORAL_DENOISING 1 +#define CONFIG_VP9_TEMPORAL_DENOISING 1 
+#define CONFIG_COEFFICIENT_RANGE_CHECKING 0 +#define CONFIG_VP9_HIGHBITDEPTH 0 +#define CONFIG_BETTER_HW_COMPATIBILITY 0 +#define CONFIG_EXPERIMENTAL 0 +#define CONFIG_SIZE_LIMIT 1 +#define CONFIG_SPATIAL_SVC 0 +#define CONFIG_FP_MB_STATS 0 +#define CONFIG_EMULATE_HARDWARE 0 +#define CONFIG_MISC_FIXES 0 +#define DECODE_WIDTH_LIMIT 16384 +#define DECODE_HEIGHT_LIMIT 16384 +#endif /* VPX_CONFIG_H */ diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h new file mode 100644 index 00000000000..a5c50f21727 --- /dev/null +++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_dsp_rtcd.h @@ -0,0 +1,851 @@ +#ifndef VPX_DSP_RTCD_H_ +#define VPX_DSP_RTCD_H_ + +#ifdef RTCD_C +#define RTCD_EXTERN +#else +#define RTCD_EXTERN extern +#endif + +/* + * DSP + */ + +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + + +#ifdef __cplusplus +extern "C" { +#endif + +unsigned int vpx_avg_4x4_c(const uint8_t *, int p); +unsigned int vpx_avg_4x4_neon(const uint8_t *, int p); +#define vpx_avg_4x4 vpx_avg_4x4_neon + +unsigned int vpx_avg_8x8_c(const uint8_t *, int p); +unsigned int vpx_avg_8x8_neon(const uint8_t *, int p); +#define vpx_avg_8x8 vpx_avg_8x8_neon + +void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride); +#define vpx_comp_avg_pred vpx_comp_avg_pred_c + +void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8 vpx_convolve8_neon + +void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg vpx_convolve8_avg_neon + +void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_horiz vpx_convolve8_avg_horiz_neon + +void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_avg_vert vpx_convolve8_avg_vert_neon + +void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_horiz vpx_convolve8_horiz_neon + +void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve8_vert vpx_convolve8_vert_neon + +void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve_avg vpx_convolve_avg_neon + +void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_convolve_copy vpx_convolve_copy_neon + +void vpx_d117_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_16x16 vpx_d117_predictor_16x16_c + +void vpx_d117_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_32x32 vpx_d117_predictor_32x32_c + +void vpx_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_4x4 vpx_d117_predictor_4x4_c + +void vpx_d117_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d117_predictor_8x8 vpx_d117_predictor_8x8_c + +void vpx_d135_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_16x16 vpx_d135_predictor_16x16_c + +void vpx_d135_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_32x32 vpx_d135_predictor_32x32_c + +void vpx_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_4x4 vpx_d135_predictor_4x4_neon + +void vpx_d135_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d135_predictor_8x8 vpx_d135_predictor_8x8_c + +void vpx_d153_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_16x16 vpx_d153_predictor_16x16_c + +void vpx_d153_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_32x32 vpx_d153_predictor_32x32_c + +void vpx_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_4x4 vpx_d153_predictor_4x4_c 
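The generated rtcd headers in this patch all follow one compile-time dispatch idiom: each routine is declared in a portable _c form and, where an optimized version exists, in a _neon form, and a #define binds the generic name to whichever variant the configuration selects. Because this ios/arm64 config sets CONFIG_RUNTIME_CPU_DETECT to 0, the choice is frozen at build time and setup_rtcd_internal() reduces to a no-op. Below is a minimal sketch of the same idiom in plain C; the names (my_sum, and HAVE_NEON standing in for the value from vpx_config.h) are illustrative, not libvpx symbols.

  #include <stddef.h>
  #include <stdint.h>

  /* Portable reference implementation, always present. */
  int32_t my_sum_c(const int16_t *v, size_t n);

  #if HAVE_NEON
  /* Optimized variant, compiled only when the config enables NEON. */
  int32_t my_sum_neon(const int16_t *v, size_t n);
  /* The generic name is resolved once, at compile time. */
  #define my_sum my_sum_neon
  #else
  #define my_sum my_sum_c
  #endif

Callers invoke my_sum() and never see which variant they got; no function pointer or startup check is involved, which is why the pairs of declarations plus a #define repeat for every routine above.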
+ +void vpx_d153_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d153_predictor_8x8 vpx_d153_predictor_8x8_c + +void vpx_d207_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_16x16 vpx_d207_predictor_16x16_c + +void vpx_d207_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_32x32 vpx_d207_predictor_32x32_c + +void vpx_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_4x4 vpx_d207_predictor_4x4_c + +void vpx_d207_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207_predictor_8x8 vpx_d207_predictor_8x8_c + +void vpx_d207e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_16x16 vpx_d207e_predictor_16x16_c + +void vpx_d207e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_32x32 vpx_d207e_predictor_32x32_c + +void vpx_d207e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_4x4 vpx_d207e_predictor_4x4_c + +void vpx_d207e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d207e_predictor_8x8 vpx_d207e_predictor_8x8_c + +void vpx_d45_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_16x16 vpx_d45_predictor_16x16_neon + +void vpx_d45_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_32x32 vpx_d45_predictor_32x32_c + +void vpx_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_4x4 vpx_d45_predictor_4x4_neon + +void vpx_d45_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45_predictor_8x8 vpx_d45_predictor_8x8_neon + +void vpx_d45e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_16x16 vpx_d45e_predictor_16x16_c + +void vpx_d45e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_32x32 vpx_d45e_predictor_32x32_c + +void vpx_d45e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_4x4 vpx_d45e_predictor_4x4_c + +void vpx_d45e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d45e_predictor_8x8 vpx_d45e_predictor_8x8_c + +void vpx_d63_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_16x16 vpx_d63_predictor_16x16_c + +void vpx_d63_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define 
vpx_d63_predictor_32x32 vpx_d63_predictor_32x32_c + +void vpx_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_4x4 vpx_d63_predictor_4x4_c + +void vpx_d63_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63_predictor_8x8 vpx_d63_predictor_8x8_c + +void vpx_d63e_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_16x16 vpx_d63e_predictor_16x16_c + +void vpx_d63e_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_32x32 vpx_d63e_predictor_32x32_c + +void vpx_d63e_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_4x4 vpx_d63e_predictor_4x4_c + +void vpx_d63e_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63e_predictor_8x8 vpx_d63e_predictor_8x8_c + +void vpx_d63f_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_d63f_predictor_4x4 vpx_d63f_predictor_4x4_c + +void vpx_dc_128_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_16x16 vpx_dc_128_predictor_16x16_neon + +void vpx_dc_128_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_32x32 vpx_dc_128_predictor_32x32_neon + +void vpx_dc_128_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_4x4 vpx_dc_128_predictor_4x4_neon + +void vpx_dc_128_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_128_predictor_8x8 vpx_dc_128_predictor_8x8_neon + +void vpx_dc_left_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_16x16 vpx_dc_left_predictor_16x16_neon + +void vpx_dc_left_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_32x32 vpx_dc_left_predictor_32x32_neon + +void vpx_dc_left_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_left_predictor_4x4 vpx_dc_left_predictor_4x4_neon + +void vpx_dc_left_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define 
vpx_dc_left_predictor_8x8 vpx_dc_left_predictor_8x8_neon + +void vpx_dc_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_16x16 vpx_dc_predictor_16x16_neon + +void vpx_dc_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_32x32 vpx_dc_predictor_32x32_neon + +void vpx_dc_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_4x4 vpx_dc_predictor_4x4_neon + +void vpx_dc_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_predictor_8x8 vpx_dc_predictor_8x8_neon + +void vpx_dc_top_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_16x16 vpx_dc_top_predictor_16x16_neon + +void vpx_dc_top_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_32x32 vpx_dc_top_predictor_32x32_neon + +void vpx_dc_top_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_4x4 vpx_dc_top_predictor_4x4_neon + +void vpx_dc_top_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_dc_top_predictor_8x8 vpx_dc_top_predictor_8x8_neon + +void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16 vpx_fdct16x16_c + +void vpx_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct16x16_1 vpx_fdct16x16_1_c + +void vpx_fdct32x32_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32 vpx_fdct32x32_c + +void vpx_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_1 vpx_fdct32x32_1_c + +void vpx_fdct32x32_rd_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct32x32_rd vpx_fdct32x32_rd_c + +void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4 vpx_fdct4x4_c + +void vpx_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct4x4_1 vpx_fdct4x4_1_c + +void vpx_fdct8x8_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *output, int stride); +#define vpx_fdct8x8 vpx_fdct8x8_neon + +void vpx_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride); +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride); +#define 
vpx_fdct8x8_1 vpx_fdct8x8_1_neon + +void vpx_get16x16var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get16x16var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get16x16var vpx_get16x16var_neon + +unsigned int vpx_get4x4sse_cs_c(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +unsigned int vpx_get4x4sse_cs_neon(const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride); +#define vpx_get4x4sse_cs vpx_get4x4sse_cs_neon + +void vpx_get8x8var_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +void vpx_get8x8var_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum); +#define vpx_get8x8var vpx_get8x8var_neon + +unsigned int vpx_get_mb_ss_c(const int16_t *); +#define vpx_get_mb_ss vpx_get_mb_ss_c + +void vpx_h_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_16x16 vpx_h_predictor_16x16_neon + +void vpx_h_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_32x32 vpx_h_predictor_32x32_neon + +void vpx_h_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_4x4 vpx_h_predictor_4x4_neon + +void vpx_h_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_h_predictor_8x8 vpx_h_predictor_8x8_neon + +void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +#define vpx_hadamard_16x16 vpx_hadamard_16x16_neon + +void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, int16_t *coeff); +void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff); +#define vpx_hadamard_8x8 vpx_hadamard_8x8_neon + +void vpx_he_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left); +#define vpx_he_predictor_4x4 vpx_he_predictor_4x4_c + +void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_10_add vpx_idct16x16_10_add_neon + +void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_1_add vpx_idct16x16_1_add_neon + +void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct16x16_256_add vpx_idct16x16_256_add_neon + +void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t 
*dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_1024_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_135_add vpx_idct32x32_1024_add_neon + +void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon + +void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon + +void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct4x4_16_add vpx_idct4x4_16_add_neon + +void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct4x4_1_add vpx_idct4x4_1_add_neon + +void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_12_add vpx_idct8x8_12_add_neon + +void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_1_add vpx_idct8x8_1_add_neon + +void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_idct8x8_64_add vpx_idct8x8_64_add_neon + +int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width); +int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width); +#define vpx_int_pro_col vpx_int_pro_col_neon + +void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height); +#define vpx_int_pro_row vpx_int_pro_row_neon + +void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_16_add vpx_iwht4x4_16_add_c + +void vpx_iwht4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride); +#define vpx_iwht4x4_1_add vpx_iwht4x4_1_add_c + +void vpx_lpf_horizontal_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16 vpx_lpf_horizontal_16_neon + +void vpx_lpf_horizontal_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_16_dual vpx_lpf_horizontal_16_dual_neon + +void vpx_lpf_horizontal_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const 
uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_4 vpx_lpf_horizontal_4_neon + +void vpx_lpf_horizontal_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_4_dual vpx_lpf_horizontal_4_dual_neon + +void vpx_lpf_horizontal_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_horizontal_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_horizontal_8 vpx_lpf_horizontal_8_neon + +void vpx_lpf_horizontal_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_horizontal_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_horizontal_8_dual vpx_lpf_horizontal_8_dual_neon + +void vpx_lpf_vertical_16_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16 vpx_lpf_vertical_16_neon + +void vpx_lpf_vertical_16_dual_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_16_dual vpx_lpf_vertical_16_dual_neon + +void vpx_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_4_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_4 vpx_lpf_vertical_4_neon + +void vpx_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_4_dual vpx_lpf_vertical_4_dual_neon + +void vpx_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +void vpx_lpf_vertical_8_neon(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh); +#define vpx_lpf_vertical_8 vpx_lpf_vertical_8_neon + +void vpx_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1); +#define vpx_lpf_vertical_8_dual vpx_lpf_vertical_8_dual_neon + +void vpx_mbpost_proc_across_ip_c(unsigned char *dst, int pitch, int rows, 
int cols,int flimit); +#define vpx_mbpost_proc_across_ip vpx_mbpost_proc_across_ip_c + +void vpx_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols,int flimit); +#define vpx_mbpost_proc_down vpx_mbpost_proc_down_c + +void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +void vpx_minmax_8x8_neon(const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max); +#define vpx_minmax_8x8 vpx_minmax_8x8_neon + +unsigned int vpx_mse16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +unsigned int vpx_mse16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x16 vpx_mse16x16_neon + +unsigned int vpx_mse16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse16x8 vpx_mse16x8_c + +unsigned int vpx_mse8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x16 vpx_mse8x16_c + +unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse); +#define vpx_mse8x8 vpx_mse8x8_c + +void vpx_plane_add_noise_c(uint8_t *start, const int8_t *noise, int blackclamp, int whiteclamp, int width, int height, int pitch); +#define vpx_plane_add_noise vpx_plane_add_noise_c + +void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size); +#define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_c + +void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b vpx_quantize_b_c + +void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan); +#define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c + +unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x16 vpx_sad16x16_neon + +unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x16_avg vpx_sad16x16_avg_c + +void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x16x3 vpx_sad16x16x3_c + +void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x16x4d vpx_sad16x16x4d_neon + +void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, 
uint32_t *sad_array); +#define vpx_sad16x16x8 vpx_sad16x16x8_c + +unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x32 vpx_sad16x32_c + +unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x32_avg vpx_sad16x32_avg_c + +void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x32x4d vpx_sad16x32x4d_c + +unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad16x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad16x8 vpx_sad16x8_neon + +unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad16x8_avg vpx_sad16x8_avg_c + +void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x3 vpx_sad16x8x3_c + +void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x4d vpx_sad16x8x4d_c + +void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad16x8x8 vpx_sad16x8x8_c + +unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x16 vpx_sad32x16_c + +unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x16_avg vpx_sad32x16_avg_c + +void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x16x4d vpx_sad32x16x4d_c + +unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad32x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x32 vpx_sad32x32_neon + +unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x32_avg vpx_sad32x32_avg_c + +void vpx_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x3 vpx_sad32x32x3_c + +void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x4d vpx_sad32x32x4d_neon + +void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad32x32x8 vpx_sad32x32x8_c + +unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad32x64 vpx_sad32x64_c + +unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad32x64_avg vpx_sad32x64_avg_c + +void vpx_sad32x64x4d_c(const uint8_t *src_ptr, 
int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad32x64x4d vpx_sad32x64x4d_c + +unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad4x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x4 vpx_sad4x4_neon + +unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x4_avg vpx_sad4x4_avg_c + +void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x3 vpx_sad4x4x3_c + +void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x4d vpx_sad4x4x4d_c + +void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x4x8 vpx_sad4x4x8_c + +unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad4x8 vpx_sad4x8_c + +unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad4x8_avg vpx_sad4x8_avg_c + +void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad4x8x4d vpx_sad4x8x4d_c + +void vpx_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad4x8x8 vpx_sad4x8x8_c + +unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x32 vpx_sad64x32_c + +unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x32_avg vpx_sad64x32_avg_c + +void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad64x32x4d vpx_sad64x32x4d_c + +unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad64x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad64x64 vpx_sad64x64_neon + +unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad64x64_avg vpx_sad64x64_avg_c + +void vpx_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x3 vpx_sad64x64x3_c + +void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x4d vpx_sad64x64x4d_neon + +void vpx_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad64x64x8 vpx_sad64x64x8_c + +unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t 
*ref_ptr, int ref_stride); +#define vpx_sad8x16 vpx_sad8x16_neon + +unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x16_avg vpx_sad8x16_avg_c + +void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x3 vpx_sad8x16x3_c + +void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x4d vpx_sad8x16x4d_c + +void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x16x8 vpx_sad8x16x8_c + +unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x4 vpx_sad8x4_c + +unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x4_avg vpx_sad8x4_avg_c + +void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x4x4d vpx_sad8x4x4d_c + +void vpx_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x4x8 vpx_sad8x4x8_c + +unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +unsigned int vpx_sad8x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride); +#define vpx_sad8x8 vpx_sad8x8_neon + +unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred); +#define vpx_sad8x8_avg vpx_sad8x8_avg_c + +void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x3 vpx_sad8x8x3_c + +void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x4d vpx_sad8x8x4d_c + +void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array); +#define vpx_sad8x8x8 vpx_sad8x8x8_c + +int vpx_satd_c(const int16_t *coeff, int length); +int vpx_satd_neon(const int16_t *coeff, int length); +#define vpx_satd vpx_satd_neon + +void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_2d vpx_scaled_2d_c + +void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_2d vpx_scaled_avg_2d_c + +void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_horiz vpx_scaled_avg_horiz_c + +void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_avg_vert vpx_scaled_avg_vert_c + +void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t 
src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_horiz vpx_scaled_horiz_c + +void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h); +#define vpx_scaled_vert vpx_scaled_vert_c + +uint32_t vpx_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x16 vpx_sub_pixel_avg_variance16x16_c + +uint32_t vpx_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x32 vpx_sub_pixel_avg_variance16x32_c + +uint32_t vpx_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance16x8 vpx_sub_pixel_avg_variance16x8_c + +uint32_t vpx_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x16 vpx_sub_pixel_avg_variance32x16_c + +uint32_t vpx_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x32 vpx_sub_pixel_avg_variance32x32_c + +uint32_t vpx_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance32x64 vpx_sub_pixel_avg_variance32x64_c + +uint32_t vpx_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x4 vpx_sub_pixel_avg_variance4x4_c + +uint32_t vpx_sub_pixel_avg_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance4x8 vpx_sub_pixel_avg_variance4x8_c + +uint32_t vpx_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x32 vpx_sub_pixel_avg_variance64x32_c + +uint32_t vpx_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance64x64 vpx_sub_pixel_avg_variance64x64_c + +uint32_t vpx_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x16 vpx_sub_pixel_avg_variance8x16_c + +uint32_t vpx_sub_pixel_avg_variance8x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t 
*ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x4 vpx_sub_pixel_avg_variance8x4_c + +uint32_t vpx_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred); +#define vpx_sub_pixel_avg_variance8x8 vpx_sub_pixel_avg_variance8x8_c + +uint32_t vpx_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance16x16_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x16 vpx_sub_pixel_variance16x16_neon + +uint32_t vpx_sub_pixel_variance16x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x32 vpx_sub_pixel_variance16x32_c + +uint32_t vpx_sub_pixel_variance16x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance16x8 vpx_sub_pixel_variance16x8_c + +uint32_t vpx_sub_pixel_variance32x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x16 vpx_sub_pixel_variance32x16_c + +uint32_t vpx_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance32x32_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x32 vpx_sub_pixel_variance32x32_neon + +uint32_t vpx_sub_pixel_variance32x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance32x64 vpx_sub_pixel_variance32x64_c + +uint32_t vpx_sub_pixel_variance4x4_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x4 vpx_sub_pixel_variance4x4_c + +uint32_t vpx_sub_pixel_variance4x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance4x8 vpx_sub_pixel_variance4x8_c + +uint32_t vpx_sub_pixel_variance64x32_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x32 vpx_sub_pixel_variance64x32_c + +uint32_t vpx_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +uint32_t vpx_sub_pixel_variance64x64_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance64x64 vpx_sub_pixel_variance64x64_neon + +uint32_t vpx_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse); +#define vpx_sub_pixel_variance8x16 vpx_sub_pixel_variance8x16_c + +uint32_t vpx_sub_pixel_variance8x4_c(const uint8_t *src_ptr, int 
source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x4 vpx_sub_pixel_variance8x4_c
+
+uint32_t vpx_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+uint32_t vpx_sub_pixel_variance8x8_neon(const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse);
+#define vpx_sub_pixel_variance8x8 vpx_sub_pixel_variance8x8_neon
+
+void vpx_subtract_block_c(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+void vpx_subtract_block_neon(int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride);
+#define vpx_subtract_block vpx_subtract_block_neon
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
+#define vpx_sum_squares_2d_i16 vpx_sum_squares_2d_i16_c
+
+void vpx_tm_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_16x16 vpx_tm_predictor_16x16_neon
+
+void vpx_tm_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_32x32 vpx_tm_predictor_32x32_neon
+
+void vpx_tm_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_4x4 vpx_tm_predictor_4x4_neon
+
+void vpx_tm_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_tm_predictor_8x8 vpx_tm_predictor_8x8_neon
+
+void vpx_v_predictor_16x16_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_16x16 vpx_v_predictor_16x16_neon
+
+void vpx_v_predictor_32x32_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_32x32 vpx_v_predictor_32x32_neon
+
+void vpx_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_4x4 vpx_v_predictor_4x4_neon
+
+void vpx_v_predictor_8x8_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_v_predictor_8x8 vpx_v_predictor_8x8_neon
+
+unsigned int vpx_variance16x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x16 vpx_variance16x16_neon
+
+unsigned int vpx_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x32 vpx_variance16x32_c
+
+unsigned int vpx_variance16x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance16x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance16x8 vpx_variance16x8_neon
+
+unsigned int vpx_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x16 vpx_variance32x16_c
+
+unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x32 vpx_variance32x32_neon
+
+unsigned int vpx_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance32x64 vpx_variance32x64_neon
+
+unsigned int vpx_variance4x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x4 vpx_variance4x4_c
+
+unsigned int vpx_variance4x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance4x8 vpx_variance4x8_c
+
+unsigned int vpx_variance64x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x32_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x32 vpx_variance64x32_neon
+
+unsigned int vpx_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance64x64_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance64x64 vpx_variance64x64_neon
+
+unsigned int vpx_variance8x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x16_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x16 vpx_variance8x16_neon
+
+unsigned int vpx_variance8x4_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x4 vpx_variance8x4_c
+
+unsigned int vpx_variance8x8_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+unsigned int vpx_variance8x8_neon(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
+#define vpx_variance8x8 vpx_variance8x8_neon
+
+void vpx_ve_predictor_4x4_c(uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left);
+#define vpx_ve_predictor_4x4 vpx_ve_predictor_4x4_c
+
+int vpx_vector_var_c(const int16_t *ref, const int16_t *src, const int bwl);
+int vpx_vector_var_neon(const int16_t *ref, const int16_t *src, const int bwl);
+#define vpx_vector_var vpx_vector_var_neon
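[Note: this ios/arm64 header is a fixed-capability configuration. NEON is guaranteed, so each generated triple declares the C and NEON variants and then binds the public name with a #define; every call resolves at compile time. A minimal caller, assuming the header above is on the include path:

#include <stdint.h>
#include "vpx_dsp_rtcd.h"

/* vpx_variance8x8 is a plain macro in this configuration, so the call
 * below compiles to a direct call of vpx_variance8x8_neon -- no
 * function pointer, no CPU probe. */
static unsigned int block_variance8x8(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return vpx_variance8x8(src, src_stride, ref, ref_stride, &sse);
}
]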
+
+void vpx_dsp_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
diff --git a/chromium/third_party/libvpx/source/config/ios/arm64/vpx_scale_rtcd.h b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_scale_rtcd.h
new file mode 100644
index 00000000000..a1564b7ad6c
--- /dev/null
+++ b/chromium/third_party/libvpx/source/config/ios/arm64/vpx_scale_rtcd.h
@@ -0,0 +1,71 @@
+#ifndef VPX_SCALE_RTCD_H_
+#define VPX_SCALE_RTCD_H_
+
+#ifdef RTCD_C
+#define RTCD_EXTERN
+#else
+#define RTCD_EXTERN extern
+#endif
+
+struct yv12_buffer_config;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp8_horizontal_line_2_1_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_2_1_scale vp8_horizontal_line_2_1_scale_c
+
+void vp8_horizontal_line_5_3_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_3_scale vp8_horizontal_line_5_3_scale_c
+
+void vp8_horizontal_line_5_4_scale_c(const unsigned char *source, unsigned int source_width, unsigned char *dest, unsigned int dest_width);
+#define vp8_horizontal_line_5_4_scale vp8_horizontal_line_5_4_scale_c
+
+void vp8_vertical_band_2_1_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale vp8_vertical_band_2_1_scale_c
+
+void vp8_vertical_band_2_1_scale_i_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_2_1_scale_i vp8_vertical_band_2_1_scale_i_c
+
+void vp8_vertical_band_5_3_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_3_scale vp8_vertical_band_5_3_scale_c
+
+void vp8_vertical_band_5_4_scale_c(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width);
+#define vp8_vertical_band_5_4_scale vp8_vertical_band_5_4_scale_c
+
+void vp8_yv12_copy_frame_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vp8_yv12_copy_frame vp8_yv12_copy_frame_c
+
+void vp8_yv12_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vp8_yv12_extend_frame_borders vp8_yv12_extend_frame_borders_c
+
+void vpx_extend_frame_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_borders vpx_extend_frame_borders_c
+
+void vpx_extend_frame_inner_borders_c(struct yv12_buffer_config *ybf);
+#define vpx_extend_frame_inner_borders vpx_extend_frame_inner_borders_c
+
+void vpx_yv12_copy_y_c(const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc);
+#define vpx_yv12_copy_y vpx_yv12_copy_y_c
+
+void vpx_scale_rtcd(void);
+
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/arm.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = arm_cpu_caps();
+
+    (void)flags;
+
+}
+#endif
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif
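[Note: the RTCD_C / RTCD_EXTERN pair lets one generated header act as both declaration and definition: ordinary includers see extern declarations, while the single translation unit that defines RTCD_C also receives setup_rtcd_internal() and owns any dispatch storage. Condensed from the usual libvpx wiring (sketch, not the verbatim source file):

/* vpx_scale_rtcd.c -- the one file that owns the RTCD symbols. */
#define RTCD_C                /* RTCD_EXTERN expands to nothing here */
#include "vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"

void vpx_scale_rtcd(void) {
  once(setup_rtcd_internal);  /* run the CPU probe exactly once */
}
]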
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
index 61f47c62761..0028d86c3ed 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon-cpu-detect/vpx_dsp_rtcd.h
@@ -337,7 +337,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_s
 RTCD_EXTERN void (*vpx_idct32x32_1_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
 RTCD_EXTERN void (*vpx_idct32x32_34_add)(const tran_low_t *input, uint8_t *dest, int dest_stride);
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
@@ -936,7 +936,7 @@ static void setup_rtcd_internal(void)
     vpx_idct32x32_1_add = vpx_idct32x32_1_add_c;
     if (flags & HAS_NEON) vpx_idct32x32_1_add = vpx_idct32x32_1_add_neon;
     vpx_idct32x32_34_add = vpx_idct32x32_34_add_c;
-    if (flags & HAS_NEON) vpx_idct32x32_34_add = vpx_idct32x32_1024_add_neon;
+    if (flags & HAS_NEON) vpx_idct32x32_34_add = vpx_idct32x32_34_add_neon;
     vpx_idct4x4_16_add = vpx_idct4x4_16_add_c;
     if (flags & HAS_NEON) vpx_idct4x4_16_add = vpx_idct4x4_16_add_neon;
     vpx_idct4x4_1_add = vpx_idct4x4_1_add_c;
diff --git a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
index 312c22d665a..a5c50f21727 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm-neon/vpx_dsp_rtcd.h
@@ -337,8 +337,8 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_s
 #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
diff --git a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
index 312c22d665a..a5c50f21727 100644
--- a/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/arm64/vpx_dsp_rtcd.h
@@ -337,8 +337,8 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int dest_s
 #define vpx_idct32x32_1_add vpx_idct32x32_1_add_neon
 
 void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
-void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
-#define vpx_idct32x32_34_add vpx_idct32x32_1024_add_neon
+void vpx_idct32x32_34_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
+#define vpx_idct32x32_34_add vpx_idct32x32_34_add_neon
 
 void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride);
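[Note: the _34 suffix encodes the routine's assumption that only the first 34 coefficients in scan order (the low-frequency corner) can be non-zero; until a dedicated NEON version existed, the 34-coefficient slot pointed at the full 1024-coefficient transform, which is presumably correct but does far more work than needed. A hedged sketch of how a decoder typically picks a variant from the end-of-block count (the helper is illustrative, not the literal libvpx code):

#include "vpx_dsp_rtcd.h"

/* eob = index of the last non-zero coefficient + 1 for one
 * 32x32 transform block, as produced by the entropy decoder. */
static void inverse_transform_32x32(const tran_low_t *coeffs, int eob,
                                    uint8_t *dst, int stride) {
  if (eob == 1)
    vpx_idct32x32_1_add(coeffs, dst, stride);     /* DC only */
  else if (eob <= 34)
    vpx_idct32x32_34_add(coeffs, dst, stride);    /* sparse corner */
  else
    vpx_idct32x32_1024_add(coeffs, dst, stride);  /* full transform */
}
]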
 void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride);
diff --git a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
index 394b709e0f9..163cf7611a8 100644
--- a/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/generic/vpx_dsp_rtcd.h
@@ -925,8 +925,8 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-#define vpx_highbd_idct8x8_10_add vpx_highbd_idct8x8_10_add_c
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
index 85cac2502a9..55c229554e3 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -156,6 +157,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_denoiser_filter = vp9_denoiser_filter_sse2;
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
diff --git a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
index 5cfa864325c..5e31286207d 100644
--- a/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/ia32/vpx_dsp_rtcd.h
@@ -1112,9 +1112,9 @@ RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
@@ -2443,8 +2443,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_highbd_idct32x32_1_add = vpx_highbd_idct32x32_1_add_sse2;
     vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_sse2;
-    vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_c;
-    if (flags & HAS_SSE2) vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_sse2;
+    vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_c;
+    if (flags & HAS_SSE2) vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_sse2;
     vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_sse2;
     vpx_highbd_lpf_horizontal_16 = vpx_highbd_lpf_horizontal_16_c;
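[Note: vp9_fdct8x8_quant fuses the forward 8x8 transform with quantization so coefficients stay hot between the two stages; the hunks above route it through runtime dispatch and prefer the SSSE3 version when available. A rough, unfused sketch of what the routine computes -- illustrative only, since the real code also honors zbin_ptr, round_ptr, quant_shift_ptr, skip_block and the scan tables from its parameter list:

#include <stdlib.h>
#include "vpx_dsp_rtcd.h"  /* vpx_fdct8x8, tran_low_t */

static void fdct8x8_then_quant(const int16_t *input, int stride,
                               tran_low_t *coeff, tran_low_t *qcoeff,
                               tran_low_t *dqcoeff,
                               const int16_t quant[2],      /* {DC, AC} */
                               const int16_t dequant[2]) {  /* {DC, AC} */
  vpx_fdct8x8(input, coeff, stride);           /* stage 1: forward DCT */
  for (int i = 0; i < 64; ++i) {               /* stage 2: quantize */
    const int coef = (int)coeff[i];
    const int sign = coef < 0 ? -1 : 1;
    const int level = (abs(coef) * quant[i == 0 ? 0 : 1]) >> 16;
    qcoeff[i] = (tran_low_t)(sign * level);
    dqcoeff[i] = qcoeff[i] * dequant[i == 0 ? 0 : 1];
  }
}
]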
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
index d7819b8ee65..f747ed67d01 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -154,6 +155,8 @@ static void setup_rtcd_internal(void)
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
 
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
diff --git a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
index b4d1372e746..1188bb43b56 100644
--- a/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/linux/x64/vpx_dsp_rtcd.h
@@ -1119,9 +1119,9 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-#define vpx_highbd_idct8x8_10_add vpx_highbd_idct8x8_10_add_sse2
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
index 85cac2502a9..55c229554e3 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -156,6 +157,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_denoiser_filter = vp9_denoiser_filter_sse2;
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
diff --git a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
index 5cfa864325c..5e31286207d 100644
--- a/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/ia32/vpx_dsp_rtcd.h
@@ -1112,9 +1112,9 @@ RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
@@ -2443,8 +2443,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_highbd_idct32x32_1_add = vpx_highbd_idct32x32_1_add_sse2;
     vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_sse2;
-    vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_c;
-    if (flags & HAS_SSE2) vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_sse2;
+    vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_c;
+    if (flags & HAS_SSE2) vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_sse2;
     vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_sse2;
     vpx_highbd_lpf_horizontal_16 = vpx_highbd_lpf_horizontal_16_c;
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
index d7819b8ee65..f747ed67d01 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -154,6 +155,8 @@ static void setup_rtcd_internal(void)
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
 
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
diff --git a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
index b4d1372e746..1188bb43b56 100644
--- a/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/mac/x64/vpx_dsp_rtcd.h
@@ -1119,9 +1119,9 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-#define vpx_highbd_idct8x8_10_add vpx_highbd_idct8x8_10_add_sse2
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm b/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm
deleted file mode 100644
index 827b4a76013..00000000000
--- a/chromium/third_party/libvpx/source/config/nacl/vpx_config.asm
+++ /dev/null
@@ -1,85 +0,0 @@
-@ This file was created from a .asm file
-@ using the ads2gas.pl script.
-	.equ DO1STROUNDING, 0
-.equ ARCH_ARM , 0
-.equ ARCH_MIPS , 0
-.equ ARCH_X86 , 0
-.equ ARCH_X86_64 , 0
-.equ HAVE_NEON , 0
-.equ HAVE_NEON_ASM , 0
-.equ HAVE_MIPS32 , 0
-.equ HAVE_DSPR2 , 0
-.equ HAVE_MSA , 0
-.equ HAVE_MIPS64 , 0
-.equ HAVE_MMX , 0
-.equ HAVE_SSE , 0
-.equ HAVE_SSE2 , 0
-.equ HAVE_SSE3 , 0
-.equ HAVE_SSSE3 , 0
-.equ HAVE_SSE4_1 , 0
-.equ HAVE_AVX , 0
-.equ HAVE_AVX2 , 0
-.equ HAVE_VPX_PORTS , 1
-.equ HAVE_PTHREAD_H , 1
-.equ HAVE_UNISTD_H , 0
-.equ CONFIG_DEPENDENCY_TRACKING , 1
-.equ CONFIG_EXTERNAL_BUILD , 1
-.equ CONFIG_INSTALL_DOCS , 0
-.equ CONFIG_INSTALL_BINS , 1
-.equ CONFIG_INSTALL_LIBS , 1
-.equ CONFIG_INSTALL_SRCS , 0
-.equ CONFIG_DEBUG , 0
-.equ CONFIG_GPROF , 0
-.equ CONFIG_GCOV , 0
-.equ CONFIG_RVCT , 0
-.equ CONFIG_GCC , 1
-.equ CONFIG_MSVS , 0
-.equ CONFIG_PIC , 0
-.equ CONFIG_BIG_ENDIAN , 0
-.equ CONFIG_CODEC_SRCS , 0
-.equ CONFIG_DEBUG_LIBS , 0
-.equ CONFIG_DEQUANT_TOKENS , 0
-.equ CONFIG_DC_RECON , 0
-.equ CONFIG_RUNTIME_CPU_DETECT , 0
-.equ CONFIG_POSTPROC , 1
-.equ CONFIG_VP9_POSTPROC , 1
-.equ CONFIG_MULTITHREAD , 1
-.equ CONFIG_INTERNAL_STATS , 0
-.equ CONFIG_VP8_ENCODER , 1
-.equ CONFIG_VP8_DECODER , 1
-.equ CONFIG_VP9_ENCODER , 1
-.equ CONFIG_VP9_DECODER , 1
-.equ CONFIG_VP8 , 1
-.equ CONFIG_VP9 , 1
-.equ CONFIG_ENCODERS , 1
-.equ CONFIG_DECODERS , 1
-.equ CONFIG_STATIC_MSVCRT , 0
-.equ CONFIG_SPATIAL_RESAMPLING , 1
-.equ CONFIG_REALTIME_ONLY , 1
-.equ CONFIG_ONTHEFLY_BITPACKING , 0
-.equ CONFIG_ERROR_CONCEALMENT , 0
-.equ CONFIG_SHARED , 0
-.equ CONFIG_STATIC , 1
-.equ CONFIG_SMALL , 0
-.equ CONFIG_POSTPROC_VISUALIZER , 0
-.equ CONFIG_OS_SUPPORT , 1
-.equ CONFIG_UNIT_TESTS , 1
-.equ CONFIG_WEBM_IO , 1
-.equ CONFIG_LIBYUV , 1
-.equ CONFIG_DECODE_PERF_TESTS , 0
-.equ CONFIG_ENCODE_PERF_TESTS , 0
-.equ CONFIG_MULTI_RES_ENCODING , 1
-.equ CONFIG_TEMPORAL_DENOISING , 1
-.equ CONFIG_VP9_TEMPORAL_DENOISING , 1
-.equ CONFIG_COEFFICIENT_RANGE_CHECKING , 0
-.equ CONFIG_VP9_HIGHBITDEPTH , 1
-.equ CONFIG_BETTER_HW_COMPATIBILITY , 0
-.equ CONFIG_EXPERIMENTAL , 0
-.equ CONFIG_SIZE_LIMIT , 1
-.equ CONFIG_SPATIAL_SVC , 0
-.equ CONFIG_FP_MB_STATS , 0
-.equ CONFIG_EMULATE_HARDWARE , 0
-.equ CONFIG_MISC_FIXES , 0
-.equ DECODE_WIDTH_LIMIT , 16384
-.equ DECODE_HEIGHT_LIMIT , 16384
-	.section	.note.GNU-stack,"",%progbits
diff --git a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
index 394b709e0f9..163cf7611a8 100644
--- a/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/nacl/vpx_dsp_rtcd.h
@@ -925,8 +925,8 @@ void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int des
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-#define vpx_highbd_idct8x8_10_add vpx_highbd_idct8x8_10_add_c
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_c
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
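[Note: the deleted vpx_config.asm was the assembly-side mirror of vpx_config.h; since the NaCl build compiles no assembly, only the C header matters there. These flags gate code paths at compile time -- for instance, this is roughly how vpx_dsp sizes its transform coefficient type from the high-bit-depth flag listed above (paraphrased, not the verbatim header):

#include <stdint.h>
#include "vpx_config.h"

#if CONFIG_VP9_HIGHBITDEPTH
typedef int32_t tran_low_t;  /* 10/12-bit input needs >16-bit headroom */
#else
typedef int16_t tran_low_t;
#endif
]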
diff --git a/chromium/third_party/libvpx/source/config/vpx_version.h b/chromium/third_party/libvpx/source/config/vpx_version.h
index 2909eeaf4bf..97666fffaf6 100644
--- a/chromium/third_party/libvpx/source/config/vpx_version.h
+++ b/chromium/third_party/libvpx/source/config/vpx_version.h
@@ -1,7 +1,7 @@
 #define VERSION_MAJOR 1
 #define VERSION_MINOR 6
 #define VERSION_PATCH 0
-#define VERSION_EXTRA "535-g8978704"
+#define VERSION_EXTRA "702-g5c64c01"
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.6.0-535-g8978704"
-#define VERSION_STRING " v1.6.0-535-g8978704"
+#define VERSION_STRING_NOSP "v1.6.0-702-g5c64c01"
+#define VERSION_STRING " v1.6.0-702-g5c64c01"
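[Note: VERSION_PACKED folds major/minor/patch into one integer so a version gate is a single comparison. A small self-contained check built only from the macros in this header:

#include <stdio.h>
#include "vpx_version.h"

int main(void) {
  /* For v1.6.0: (1 << 16) | (6 << 8) | 0 == 0x10600. */
  printf("packed: 0x%x -> %d.%d.%d (%s)\n", VERSION_PACKED,
         (VERSION_PACKED >> 16) & 0xff, (VERSION_PACKED >> 8) & 0xff,
         VERSION_PACKED & 0xff, VERSION_STRING_NOSP);
  return 0;
}
]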
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
index 85cac2502a9..55c229554e3 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -156,6 +157,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vp9_denoiser_filter = vp9_denoiser_filter_sse2;
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_fht16x16 = vp9_fht16x16_c;
     if (flags & HAS_SSE2) vp9_fht16x16 = vp9_fht16x16_sse2;
     vp9_fht4x4 = vp9_fht4x4_c;
diff --git a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
index 5cfa864325c..5e31286207d 100644
--- a/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/ia32/vpx_dsp_rtcd.h
@@ -1112,9 +1112,9 @@ RTCD_EXTERN void (*vpx_highbd_idct4x4_16_add)(const tran_low_t *input, uint8_t *
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-RTCD_EXTERN void (*vpx_highbd_idct8x8_10_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+RTCD_EXTERN void (*vpx_highbd_idct8x8_12_add)(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
@@ -2443,8 +2443,8 @@ static void setup_rtcd_internal(void)
     if (flags & HAS_SSE2) vpx_highbd_idct32x32_1_add = vpx_highbd_idct32x32_1_add_sse2;
     vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct4x4_16_add = vpx_highbd_idct4x4_16_add_sse2;
-    vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_c;
-    if (flags & HAS_SSE2) vpx_highbd_idct8x8_10_add = vpx_highbd_idct8x8_10_add_sse2;
+    vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_c;
+    if (flags & HAS_SSE2) vpx_highbd_idct8x8_12_add = vpx_highbd_idct8x8_12_add_sse2;
     vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_c;
     if (flags & HAS_SSE2) vpx_highbd_idct8x8_64_add = vpx_highbd_idct8x8_64_add_sse2;
     vpx_highbd_lpf_horizontal_16 = vpx_highbd_lpf_horizontal_16_c;
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
index d7819b8ee65..f747ed67d01 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vp9_rtcd.h
@@ -41,7 +41,8 @@ int vp9_diamond_search_sad_avx(const struct macroblock *x, const struct search_s
 RTCD_EXTERN int (*vp9_diamond_search_sad)(const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv);
 
 void vp9_fdct8x8_quant_c(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-#define vp9_fdct8x8_quant vp9_fdct8x8_quant_c
+void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+RTCD_EXTERN void (*vp9_fdct8x8_quant)(const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
 void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride, int tx_type);
@@ -154,6 +155,8 @@ static void setup_rtcd_internal(void)
     vp9_diamond_search_sad = vp9_diamond_search_sad_c;
     if (flags & HAS_AVX) vp9_diamond_search_sad = vp9_diamond_search_sad_avx;
 
+    vp9_fdct8x8_quant = vp9_fdct8x8_quant_c;
+    if (flags & HAS_SSSE3) vp9_fdct8x8_quant = vp9_fdct8x8_quant_ssse3;
     vp9_full_search_sad = vp9_full_search_sad_c;
     if (flags & HAS_SSE3) vp9_full_search_sad = vp9_full_search_sadx3;
     if (flags & HAS_SSE4_1) vp9_full_search_sad = vp9_full_search_sadx8;
diff --git a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
index b4d1372e746..1188bb43b56 100644
--- a/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
+++ b/chromium/third_party/libvpx/source/config/win/x64/vpx_dsp_rtcd.h
@@ -1119,9 +1119,9 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int
 void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct4x4_1_add vpx_highbd_idct4x4_1_add_c
 
-void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
-#define vpx_highbd_idct8x8_10_add vpx_highbd_idct8x8_10_add_sse2
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
+#define vpx_highbd_idct8x8_12_add vpx_highbd_idct8x8_12_add_sse2
 
 void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int dest_stride, int bd);
 #define vpx_highbd_idct8x8_1_add vpx_highbd_idct8x8_1_add_c
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
index 36120170e81..09bdc5d2f70 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Android.mk
@@ -71,7 +71,7 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   include $(CONFIG_DIR)libs-armv7-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-  include $(CONFIG_DIR)libs-armv8-android-gcc.mk
+  include $(CONFIG_DIR)libs-arm64-android-gcc.mk
   LOCAL_ARM_MODE := arm
 else ifeq ($(TARGET_ARCH_ABI),x86)
   include $(CONFIG_DIR)libs-x86-android-gcc.mk
@@ -101,8 +101,8 @@ LOCAL_CFLAGS := -O3
 # like x86inc.asm and x86_abi_support.asm
 LOCAL_ASMFLAGS := -I$(LIBVPX_PATH)
 
-.PRECIOUS: %.asm.s
-$(ASM_CNV_PATH)/libvpx/%.asm.s: $(LIBVPX_PATH)/%.asm
+.PRECIOUS: %.asm.S
+$(ASM_CNV_PATH)/libvpx/%.asm.S: $(LIBVPX_PATH)/%.asm
 	@mkdir -p $(dir $@)
 	@$(CONFIG_DIR)$(ASM_CONVERSION) <$< > $@
@@ -132,7 +132,7 @@ endif
 # Pull out assembly files, splitting NEON from the rest. This is
 # done to specify that the NEON assembly files use NEON assembler flags.
-# x86 assembly matches %.asm, arm matches %.asm.s
+# x86 assembly matches %.asm, arm matches %.asm.S
 
 # x86:
@@ -140,12 +140,12 @@ CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE))
 LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file))
 
 # arm:
-CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE))
+CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.S, $(CODEC_SRCS_UNIQUE))
 CODEC_SRCS_ASM_ARM = $(foreach v, \
                      $(CODEC_SRCS_ASM_ARM_ALL), \
                      $(if $(findstring neon,$(v)),,$(v)))
-CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \
-                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.S, \
+                         $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                          $(CODEC_SRCS_ASM_ARM))
 LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS)
@@ -153,18 +153,19 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
   CODEC_SRCS_ASM_NEON = $(foreach v, \
                         $(CODEC_SRCS_ASM_ARM_ALL),\
                         $(if $(findstring neon,$(v)),$(v),))
-  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \
-                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \
+  CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.S, \
+                                $(ASM_CNV_PATH_LOCAL)/libvpx/%.S, \
                                 $(CODEC_SRCS_ASM_NEON))
-  LOCAL_SRC_FILES += $(patsubst %.s, \
-                     %.s.neon, \
+  LOCAL_SRC_FILES += $(patsubst %.S, \
+                     %.S.neon, \
                      $(CODEC_SRCS_ASM_NEON_ADS2GAS))
 endif
 
 LOCAL_CFLAGS += \
     -DHAVE_CONFIG_H=vpx_config.h \
     -I$(LIBVPX_PATH) \
-    -I$(ASM_CNV_PATH)
+    -I$(ASM_CNV_PATH) \
+    -I$(ASM_CNV_PATH)/libvpx
 
 LOCAL_MODULE := libvpx
@@ -185,7 +186,8 @@ endif
 $$(rtcd_dep_template_SRCS): vpx_scale_rtcd.h
 $$(rtcd_dep_template_SRCS): vpx_dsp_rtcd.h
 
-ifneq ($(findstring $(TARGET_ARCH_ABI),x86 x86_64),)
+rtcd_dep_template_CONFIG_ASM_ABIS := x86 x86_64 armeabi-v7a
+ifneq ($(findstring $(TARGET_ARCH_ABI),$(rtcd_dep_template_CONFIG_ASM_ABIS)),)
 $$(rtcd_dep_template_SRCS): vpx_config.asm
 endif
 endef
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
index 469eb74c3aa..cba605786cb 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/Makefile
@@ -90,7 +90,7 @@ all:
 .PHONY: clean
 clean::
-	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.s.o=.asm.s)
+	rm -f $(OBJS-yes) $(OBJS-yes:.o=.d) $(OBJS-yes:.asm.S.o=.asm.S)
 	rm -f $(CLEAN-OBJS)
 
 .PHONY: clean
@@ -180,13 +180,13 @@ $(BUILD_PFX)%.asm.o: %.asm
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
 
-$(BUILD_PFX)%.s.d: %.s
+$(BUILD_PFX)%.S.d: %.S
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(SRC_PATH_BARE)/build/make/gen_asm_deps.sh \
             --build-pfx=$(BUILD_PFX) --depfile=$@ $(ASFLAGS) $< > $@
 
-$(BUILD_PFX)%.s.o: %.s
+$(BUILD_PFX)%.S.o: %.S
 	$(if $(quiet),@echo "    [AS] $@")
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(AS) $(ASFLAGS) -o $@ $<
@@ -198,8 +198,8 @@ $(BUILD_PFX)%.c.S: %.c
 	$(qexec)$(if $(CONFIG_DEPENDENCY_TRACKING),,mkdir -p $(dir $@))
 	$(qexec)$(CC) -S $(CFLAGS) -o $@ $<
 
-.PRECIOUS: %.asm.s
-$(BUILD_PFX)%.asm.s: %.asm
+.PRECIOUS: %.asm.S
+$(BUILD_PFX)%.asm.S: %.asm
 	$(if $(quiet),@echo "    [ASM CONVERSION] $@")
 	$(qexec)mkdir -p $(dir $@)
 	$(qexec)$(ASM_CONVERSION) <$< >$@
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
index 7272424af2e..029cc4a56f2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas.pl
@@ -138,14 +138,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;
 
-    # RN to .req
-    if (s/RN\s+([Rr]\d+|lr)/.req $1/)
-    {
-        print;
-        print "$comment_sub$comment\n" if defined $comment;
-        next;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
index 1a9e105ba8d..e1ae7b4f871 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/ads2gas_apple.pl
@@ -120,18 +120,6 @@ while (<STDIN>)
     s/DCD(.*)/.long $1/;
     s/DCB(.*)/.byte $1/;
 
-    # Build a hash of all the register - alias pairs.
-    if (s/(.*)RN(.*)/$1 .req $2/g)
-    {
-        $register_aliases{trim($1)} = trim($2);
-        next;
-    }
-
-    while (($key, $value) = each(%register_aliases))
-    {
-        s/\b$key\b/$value/g;
-    }
-
     # Make function visible to linker, and make additional symbol with
     # prepended underscore
     s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
index 35609e89af4..007e0200023 100644
--- a/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/configure.sh
@@ -635,7 +635,7 @@ setup_gnu_toolchain() {
   AS=${AS:-${CROSS}as}
   STRIP=${STRIP:-${CROSS}strip}
   NM=${NM:-${CROSS}nm}
-  AS_SFX=.s
+  AS_SFX=.S
   EXE_SFX=
 }
@@ -926,7 +926,7 @@ EOF
       ;;
       vs*)
         asm_conversion_cmd="${source_path}/build/make/ads2armasm_ms.pl"
-        AS_SFX=.s
+        AS_SFX=.S
         msvs_arch_dir=arm-msvs
         disable_feature multithread
         disable_feature unit_tests
@@ -936,6 +936,7 @@ EOF
           # only "AppContainerApplication" which requires an AppxManifest.
          # Therefore disable the examples, just build the library.
          disable_feature examples
+          disable_feature tools
        fi
       ;;
       rvct)
@@ -1034,7 +1035,7 @@ EOF
     STRIP="$(${XCRUN_FIND} strip)"
     NM="$(${XCRUN_FIND} nm)"
     RANLIB="$(${XCRUN_FIND} ranlib)"
-    AS_SFX=.s
+    AS_SFX=.S
     LD="${CXX:-$(${XCRUN_FIND} ld)}"
 
     # ASFLAGS is written here instead of using check_add_asflags
diff --git a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
index e98611d1024..2cf62c117c2 100755
--- a/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/chromium/third_party/libvpx/source/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -82,7 +82,7 @@ generate_filter() {
             | sed -e "s,$src_path_bare,," \
                   -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
 
-        if ([ "$pat" == "asm" ] || [ "$pat" == "s" ]) && $asm_use_custom_step; then
+        if ([ "$pat" == "asm" ] || [ "$pat" == "s" ] || [ "$pat" == "S" ]) && $asm_use_custom_step; then
            # Avoid object file name collisions, i.e. vpx_config.c and
            # vpx_config.asm produce the same object file without
            # this additional suffix.
@@ -203,7 +203,7 @@ for opt in "$@"; do
            # The paths in file_list are fixed outside of the loop.
            file_list[${#file_list[@]}]="$opt"
            case "$opt" in
-                *.asm|*.s) uses_asm=true
+                *.asm|*.[Ss]) uses_asm=true
                ;;
            esac
            ;;
@@ -452,7 +452,7 @@ generate_vcxproj() {
    done
 
    open_tag ItemGroup
-    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s"
+    generate_filter "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx;s;S"
    close_tag ItemGroup
    open_tag ItemGroup
    generate_filter "Header Files"   "h;hm;inl;inc;xsd"
diff --git a/chromium/third_party/libvpx/source/libvpx/configure b/chromium/third_party/libvpx/source/libvpx/configure
index 7065dfef538..fb732acf3e5 100755
--- a/chromium/third_party/libvpx/source/libvpx/configure
+++ b/chromium/third_party/libvpx/source/libvpx/configure
@@ -22,6 +22,7 @@ show_help(){
 Advanced options:
   ${toggle_libs}                  libraries
   ${toggle_examples}              examples
+  ${toggle_tools}                 tools
   ${toggle_docs}                  documentation
   ${toggle_unit_tests}            unit tests
   ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
@@ -155,7 +156,7 @@ all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
 # note that these should be in dependency order for now.
-all_targets="libs examples docs"
+all_targets="libs examples tools docs"
 
 # all targets available are enabled, by default.
 for t in ${all_targets}; do
@@ -331,6 +332,7 @@ CMDLINE_SELECT="
     libs
     examples
+    tools
     docs
     libc
     as
@@ -476,7 +478,7 @@ EOF
     #
     # Write makefiles for all enabled targets
     #
-    for tgt in libs examples docs solution; do
+    for tgt in libs examples tools docs solution; do
        tgt_fn="$tgt-$toolchain.mk"
 
        if enabled $tgt; then
diff --git a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
index cecdce0804c..fa2df7271b2 100644
--- a/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/chromium/third_party/libvpx/source/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -84,6 +84,8 @@ static const arg_def_t speed_arg =
     ARG_DEF("sp", "speed", 1, "speed configuration");
 static const arg_def_t aqmode_arg =
     ARG_DEF("aq", "aqmode", 1, "aq-mode off/on");
+static const arg_def_t bitrates_arg =
+    ARG_DEF("bl", "bitrates", 1, "bitrates[sl * num_tl + tl]");
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static const struct arg_enum_list bitdepth_enum[] = {
@@ -124,6 +126,7 @@ static const arg_def_t *svc_args[] = {
   &frames_arg,
 #endif
   &speed_arg,          &rc_end_usage_arg,
+  &bitrates_arg,
   NULL
 };
 
 static const uint32_t default_frames_to_skip = 0;
@@ -250,6 +253,9 @@ static void parse_command_line(int argc, const char **argv_,
     } else if (arg_match(&arg, &scale_factors_arg, argi)) {
       snprintf(string_options, sizeof(string_options), "%s scale-factors=%s",
                string_options, arg.val);
+    } else if (arg_match(&arg, &bitrates_arg, argi)) {
+      snprintf(string_options, sizeof(string_options), "%s bitrates=%s",
+               string_options, arg.val);
     } else if (arg_match(&arg, &passes_arg, argi)) {
       passes = arg_parse_uint(&arg);
       if (passes < 1 || passes > 2) {
@@ -417,7 +423,6 @@ static void set_rate_control_stats(struct RateControlStats *rc,
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
-      const int tlayer0 = sl * cfg->ts_number_layers;
       if (cfg->ts_number_layers == 1)
         rc->layer_framerate[layer] = framerate;
       else
@@ -428,8 +433,8 @@ static void set_rate_control_stats(struct RateControlStats *rc,
             cfg->layer_target_bitrate[layer - 1]) /
            (rc->layer_framerate[layer] - rc->layer_framerate[layer - 1]);
       } else {
-        rc->layer_pfb[tlayer0] = 1000.0 * cfg->layer_target_bitrate[tlayer0] /
-                                 rc->layer_framerate[tlayer0];
+        rc->layer_pfb[layer] = 1000.0 * cfg->layer_target_bitrate[layer] /
+                               rc->layer_framerate[layer];
       }
       rc->layer_input_frames[layer] = 0;
       rc->layer_enc_frames[layer] = 0;
@@ -449,12 +454,13 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
                                           vpx_codec_enc_cfg_t *cfg,
                                           int frame_cnt) {
   unsigned int sl, tl;
-  int tot_num_frames = 0;
   double perc_fluctuation = 0.0;
+  int tot_num_frames = 0;
   printf("Total number of processed frames: %d\n\n", frame_cnt - 1);
   printf("Rate control layer stats for sl%d tl%d layer(s):\n\n",
         cfg->ss_number_layers, cfg->ts_number_layers);
   for (sl = 0; sl < cfg->ss_number_layers; ++sl) {
+    tot_num_frames = 0;
     for (tl = 0; tl < cfg->ts_number_layers; ++tl) {
       const int layer = sl * cfg->ts_number_layers + tl;
       const int num_dropped =
@@ -462,7 +468,7 @@ static void printout_rate_control_summary(struct RateControlStats *rc,
              ? (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer])
              : (rc->layer_input_frames[layer] - rc->layer_enc_frames[layer] -
                1);
-      if (!sl) tot_num_frames += rc->layer_input_frames[layer];
+      tot_num_frames += rc->layer_input_frames[layer];
       rc->layer_encoding_bitrate[layer] = 0.001 * rc->layer_framerate[layer] *
                                           rc->layer_encoding_bitrate[layer] /
                                           tot_num_frames;
@@ -620,7 +626,7 @@ int main(int argc, const char **argv) {
   struct RateControlStats rc;
   vpx_svc_layer_id_t layer_id;
   vpx_svc_ref_frame_config_t ref_frame_config;
-  int sl, tl;
+  unsigned int sl, tl;
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
@@ -695,6 +701,8 @@ int main(int argc, const char **argv) {
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (svc_ctx.threads >> 1));
   if (svc_ctx.speed >= 5 && svc_ctx.aqmode == 1)
     vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
+  if (svc_ctx.speed >= 5)
+    vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 1);
 
   // Encode frames
   while (!end_of_stream) {
@@ -730,7 +738,7 @@ int main(int argc, const char **argv) {
                            &ref_frame_config);
     // Keep track of input frames, to account for frame drops in rate control
     // stats/metrics.
-    for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
+    for (sl = 0; sl < (unsigned int)enc_cfg.ss_number_layers; ++sl) {
      ++rc.layer_input_frames[sl * enc_cfg.ts_number_layers +
                              layer_id.temporal_layer_id];
     }
@@ -793,7 +801,7 @@ int main(int argc, const char **argv) {
               rc.layer_encoding_bitrate[layer] += 8.0 * sizes[sl];
               // Keep count of rate control stats per layer, for non-key
               // frames.
-              if (tl == layer_id.temporal_layer_id &&
+              if (tl == (unsigned int)layer_id.temporal_layer_id &&
                  !(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY)) {
                rc.layer_avg_frame_size[layer] += 8.0 * sizes[sl];
                rc.layer_avg_rate_mismatch[layer] +=
@@ -807,7 +815,7 @@ int main(int argc, const char **argv) {
            // Update for short-time encoding bitrate states, for moving
            // window of size rc->window, shifted by rc->window / 2.
            // Ignore first window segment, due to key frame.
-            if (frame_cnt > rc.window_size) {
+            if (frame_cnt > (unsigned int)rc.window_size) {
              tl = layer_id.temporal_layer_id;
              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                sum_bitrate += 0.001 * 8.0 * sizes[sl] * framerate;
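[Note: both rate-control fixes in this file come down to the flat layer index layer = sl * ts_number_layers + tl: the removed tlayer0 wrote every temporal layer's per-frame budget into the base slot, and tot_num_frames used to accumulate across spatial layers instead of being reset per layer. A compact sketch of the corrected base-layer budget bookkeeping (names follow the example; struct plumbing omitted):

/* Per-layer target bitrate (kbps) to per-frame budget (bits), indexed
 * exactly as the encoder indexes layers: sl * num_tl + tl. */
static void per_frame_budget(const int *layer_target_bitrate,
                             const double *layer_framerate,
                             double *layer_pfb,
                             unsigned int num_sl, unsigned int num_tl) {
  for (unsigned int sl = 0; sl < num_sl; ++sl) {
    for (unsigned int tl = 0; tl < num_tl; ++tl) {
      const unsigned int layer = sl * num_tl + tl;
      if (tl == 0)  /* base temporal layer: budget straight from its rate */
        layer_pfb[layer] =
            1000.0 * layer_target_bitrate[layer] / layer_framerate[layer];
      /* higher temporal layers use rate/framerate deltas, as in the hunk */
    }
  }
}
]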
@@ -823,13 +831,14 @@ int main(int argc, const char **argv) {
            }
            // Second shifted window.
-            if (frame_cnt > rc.window_size + rc.window_size / 2) {
+            if (frame_cnt >
+                (unsigned int)(rc.window_size + rc.window_size / 2)) {
              tl = layer_id.temporal_layer_id;
              for (sl = 0; sl < enc_cfg.ss_number_layers; ++sl) {
                sum_bitrate2 += 0.001 * 8.0 * sizes[sl] * framerate;
              }
 
-              if (frame_cnt > 2 * rc.window_size &&
+              if (frame_cnt > (unsigned int)(2 * rc.window_size) &&
                  frame_cnt % rc.window_size == 0) {
                rc.window_count += 1;
                rc.avg_st_encoding_bitrate += sum_bitrate2 / rc.window_size;
@@ -842,10 +851,11 @@ int main(int argc, const char **argv) {
          }
 #endif
      }
-
+      /*
       printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
             !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
             (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+      */
       if (enc_cfg.ss_number_layers == 1 && enc_cfg.ts_number_layers == 1)
        si->bytes_sum[0] += (int)cx_pkt->data.frame.sz;
       ++frames_received;
diff --git a/chromium/third_party/libvpx/source/libvpx/libs.mk b/chromium/third_party/libvpx/source/libvpx/libs.mk
index 6e12b540454..f4f48cc1621 100644
--- a/chromium/third_party/libvpx/source/libvpx/libs.mk
+++ b/chromium/third_party/libvpx/source/libvpx/libs.mk
@@ -12,7 +12,7 @@
 # ARM assembly files are written in RVCT-style. We use some make magic to
 # filter those files to allow GCC compilation
 ifeq ($(ARCH_ARM),yes)
-  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.s,.asm)
+  ASM:=$(if $(filter yes,$(CONFIG_GCC)$(CONFIG_MSVS)),.asm.S,.asm)
 else
   ASM:=.asm
 endif
@@ -366,7 +366,7 @@ endif
 #
 # Add assembler dependencies for configuration.
 #
-$(filter %.s.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
+$(filter %.S.o,$(OBJS-yes)):     $(BUILD_PFX)vpx_config.asm
 $(filter %$(ASM).o,$(OBJS-yes)): $(BUILD_PFX)vpx_config.asm
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
index 73f83032225..1f8a13d78c1 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 32d5ac49414a8914ec1e1f285f3f927c6e8ec29d
+Version: 9732ae991efb71aced4267d4794918279e362d99
 License: BSD
 License File: LICENSE.txt
diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
index 4f91318f3e9..6dab146dd98 100644
--- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
+++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/file_util.cc
@@ -14,6 +14,7 @@
 
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
 #include <fstream>
 #include <ios>
 
@@ -21,13 +22,23 @@ namespace libwebm {
 
 std::string GetTempFileName() {
 #if !defined _MSC_VER && !defined __MINGW32__
-  char temp_file_name_template[] = "libwebm_temp.XXXXXX";
+  std::string temp_file_name_template_str =
+      std::string(std::getenv("TEST_TMPDIR") ? std::getenv("TEST_TMPDIR") :
+                                               ".") +
+      "/libwebm_temp.XXXXXX";
+  char* temp_file_name_template =
+      new char[temp_file_name_template_str.length() + 1];
+  memset(temp_file_name_template, 0, temp_file_name_template_str.length() + 1);
+  temp_file_name_template_str.copy(temp_file_name_template,
+                                   temp_file_name_template_str.length(), 0);
   int fd = mkstemp(temp_file_name_template);
+  std::string temp_file_name =
+      (fd != -1) ? std::string(temp_file_name_template) : std::string();
+  delete[] temp_file_name_template;
   if (fd != -1) {
     close(fd);
-    return std::string(temp_file_name_template);
   }
-  return std::string();
+  return temp_file_name;
 #else
   char tmp_file_name[_MAX_PATH];
   errno_t err = tmpnam_s(tmp_file_name);
muxer_colour->set_chroma_siting_vert(parser_colour.chroma_siting_vert); if (ColourValuePresent(parser_colour.range)) - muxer_colour->range = parser_colour.range; - if (ColourValuePresent(parser_colour.transfer_characteristics)) - muxer_colour->transfer_characteristics = - parser_colour.transfer_characteristics; + muxer_colour->set_range(parser_colour.range); + if (ColourValuePresent(parser_colour.transfer_characteristics)) { + muxer_colour->set_transfer_characteristics( + parser_colour.transfer_characteristics); + } if (ColourValuePresent(parser_colour.primaries)) - muxer_colour->primaries = parser_colour.primaries; + muxer_colour->set_primaries(parser_colour.primaries); if (ColourValuePresent(parser_colour.max_cll)) - muxer_colour->max_cll = parser_colour.max_cll; + muxer_colour->set_max_cll(parser_colour.max_cll); if (ColourValuePresent(parser_colour.max_fall)) - muxer_colour->max_fall = parser_colour.max_fall; + muxer_colour->set_max_fall(parser_colour.max_fall); if (parser_colour.mastering_metadata) { mkvmuxer::MasteringMetadata muxer_mm; @@ -116,8 +122,8 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 // +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ -// | ID Byte | Length | | -// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | +// | ID Byte | Length | | +// +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ | // | | // : Bytes 1..Length of Codec Feature : // | | @@ -132,51 +138,83 @@ bool CopyColour(const mkvparser::Colour& parser_colour, // // The X bit is reserved. // -// Currently only profile level is supported. ID byte must be set to 1, and -// length must be 1. Supported values are: -// -// 10: Level 1 -// 11: Level 1.1 -// 20: Level 2 -// 21: Level 2.1 -// 30: Level 3 -// 31: Level 3.1 -// 40: Level 4 -// 41: Level 4.1 -// 50: Level 5 -// 51: Level 5.1 -// 52: Level 5.2 -// 60: Level 6 -// 61: Level 6.1 -// 62: Level 6.2 -// // See the following link for more information: // http://www.webmproject.org/vp9/profiles/ -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length) { - const int kVpxCodecPrivateLength = 3; - if (!private_data || length != kVpxCodecPrivateLength) - return 0; - - const uint8_t id_byte = *private_data; - if (id_byte != 1) - return 0; - - const int kVpxProfileLength = 1; - const uint8_t length_byte = private_data[1]; - if (length_byte != kVpxProfileLength) - return 0; - - const int level = static_cast<int>(private_data[2]); - - const int kNumLevels = 14; - const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, - 41, 50, 51, 52, 60, 61, 62}; +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features) { + const int kVpxCodecPrivateMinLength = 3; + if (!private_data || !features || length < kVpxCodecPrivateMinLength) + return false; - for (int i = 0; i < kNumLevels; ++i) { - if (level == levels[i]) - return level; - } + const uint8_t kVp9ProfileId = 1; + const uint8_t kVp9LevelId = 2; + const uint8_t kVp9BitDepthId = 3; + const uint8_t kVp9ChromaSubsamplingId = 4; + const int kVpxFeatureLength = 1; + int offset = 0; + + // Set features to not set. 
+ features->profile = Vp9CodecFeatures::kValueNotPresent; + features->level = Vp9CodecFeatures::kValueNotPresent; + features->bit_depth = Vp9CodecFeatures::kValueNotPresent; + features->chroma_subsampling = Vp9CodecFeatures::kValueNotPresent; + do { + const uint8_t id_byte = private_data[offset++]; + const uint8_t length_byte = private_data[offset++]; + if (length_byte != kVpxFeatureLength) + return false; + if (id_byte == kVp9ProfileId) { + const int priv_profile = static_cast<int>(private_data[offset++]); + if (priv_profile < 0 || priv_profile > 3) + return false; + if (features->profile != Vp9CodecFeatures::kValueNotPresent && + features->profile != priv_profile) { + return false; + } + features->profile = priv_profile; + } else if (id_byte == kVp9LevelId) { + const int priv_level = static_cast<int>(private_data[offset++]); + + const int kNumLevels = 14; + const int levels[kNumLevels] = {10, 11, 20, 21, 30, 31, 40, + 41, 50, 51, 52, 60, 61, 62}; + + for (int i = 0; i < kNumLevels; ++i) { + if (priv_level == levels[i]) { + if (features->level != Vp9CodecFeatures::kValueNotPresent && + features->level != priv_level) { + return false; + } + features->level = priv_level; + break; + } + } + if (features->level == Vp9CodecFeatures::kValueNotPresent) + return false; + } else if (id_byte == kVp9BitDepthId) { + const int priv_profile = static_cast<int>(private_data[offset++]); + if (priv_profile != 8 && priv_profile != 10 && priv_profile != 12) + return false; + if (features->bit_depth != Vp9CodecFeatures::kValueNotPresent && + features->bit_depth != priv_profile) { + return false; + } + features->bit_depth = priv_profile; + } else if (id_byte == kVp9ChromaSubsamplingId) { + const int priv_profile = static_cast<int>(private_data[offset++]); + if (priv_profile != 0 && priv_profile != 2 && priv_profile != 3) + return false; + if (features->chroma_subsampling != Vp9CodecFeatures::kValueNotPresent && + features->chroma_subsampling != priv_profile) { + return false; + } + features->chroma_subsampling = priv_profile; + } else { + // Invalid ID. + return false; + } + } while (offset + kVpxCodecPrivateMinLength <= length); - return 0; + return true; } } // namespace libwebm diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h index d30c2b9f2a0..689fb30a3fc 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/hdr_util.h @@ -28,6 +28,25 @@ namespace libwebm { // TODO(tomfinegan): These should be moved to libwebm_utils once c++11 is // required by libwebm. +// Features of the VP9 codec that may be set in the CodecPrivate of a VP9 video +// stream. A value of kValueNotPresent represents that the value was not set in +// the CodecPrivate. 
+struct Vp9CodecFeatures { + static const int kValueNotPresent; + + Vp9CodecFeatures() + : profile(kValueNotPresent), + level(kValueNotPresent), + bit_depth(kValueNotPresent), + chroma_subsampling(kValueNotPresent) {} + ~Vp9CodecFeatures() {} + + int profile; + int level; + int bit_depth; + int chroma_subsampling; +}; + typedef std::auto_ptr<mkvmuxer::PrimaryChromaticity> PrimaryChromaticityPtr; bool CopyPrimaryChromaticity(const mkvparser::PrimaryChromaticity& parser_pc, @@ -43,8 +62,9 @@ bool ColourValuePresent(long long value); bool CopyColour(const mkvparser::Colour& parser_colour, mkvmuxer::Colour* muxer_colour); -// Returns VP9 profile upon success or 0 upon failure. -int ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length); +// Returns true if |features| is set to one or more valid values. +bool ParseVpxCodecPrivate(const uint8_t* private_data, int32_t length, + Vp9CodecFeatures* features); } // namespace libwebm diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h index 32a0c5fb911..89d722a71bc 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/common/webmids.h @@ -124,6 +124,14 @@ enum MkvId { kMkvLuminanceMin = 0x55DA, // end mastering metadata // end colour + // projection + kMkvProjection = 0x7670, + kMkvProjectionType = 0x7671, + kMkvProjectionPrivate = 0x7672, + kMkvProjectionPoseYaw = 0x7673, + kMkvProjectionPosePitch = 0x7674, + kMkvProjectionPoseRoll = 0x7675, + // end projection // audio kMkvAudio = 0xE1, kMkvSamplingFrequency = 0xB5, diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc index c79ce24ed35..299b45c989c 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc @@ -16,6 +16,7 @@ #include <ctime> #include <memory> #include <new> +#include <string> #include <vector> #include "common/webmids.h" @@ -25,10 +26,19 @@ namespace mkvmuxer { +const float PrimaryChromaticity::kChromaticityMin = 0.0f; +const float PrimaryChromaticity::kChromaticityMax = 1.0f; +const float MasteringMetadata::kMinLuminance = 0.0f; +const float MasteringMetadata::kMinLuminanceMax = 999.99f; +const float MasteringMetadata::kMaxLuminanceMax = 9999.99f; const float MasteringMetadata::kValueNotPresent = FLT_MAX; const uint64_t Colour::kValueNotPresent = UINT64_MAX; namespace { + +const char kDocTypeWebm[] = "webm"; +const char kDocTypeMatroska[] = "matroska"; + // Deallocate the string designated by |dst|, and then copy the |src| // string to |dst|. 
The caller owns both the |src| string and the // |dst| copy (hence the caller is responsible for eventually @@ -63,7 +73,7 @@ bool CopyChromaticity(const PrimaryChromaticity* src, if (!dst) return false; - dst->reset(new (std::nothrow) PrimaryChromaticity(src->x, src->y)); + dst->reset(new (std::nothrow) PrimaryChromaticity(src->x(), src->y())); if (!dst->get()) return false; @@ -80,36 +90,57 @@ IMkvWriter::IMkvWriter() {} IMkvWriter::~IMkvWriter() {} -bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type) { // Level 0 - uint64_t size = EbmlElementSize(libwebm::kMkvEBMLVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, UINT64_C(1)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, UINT64_C(4)); - size += EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8)); - size += EbmlElementSize(libwebm::kMkvDocType, "webm"); - size += EbmlElementSize(libwebm::kMkvDocTypeVersion, doc_type_version); - size += EbmlElementSize(libwebm::kMkvDocTypeReadVersion, UINT64_C(2)); + uint64_t size = + EbmlElementSize(libwebm::kMkvEBMLVersion, static_cast<uint64>(1)); + size += EbmlElementSize(libwebm::kMkvEBMLReadVersion, static_cast<uint64>(1)); + size += EbmlElementSize(libwebm::kMkvEBMLMaxIDLength, static_cast<uint64>(4)); + size += + EbmlElementSize(libwebm::kMkvEBMLMaxSizeLength, static_cast<uint64>(8)); + size += EbmlElementSize(libwebm::kMkvDocType, doc_type); + size += EbmlElementSize(libwebm::kMkvDocTypeVersion, + static_cast<uint64>(doc_type_version)); + size += + EbmlElementSize(libwebm::kMkvDocTypeReadVersion, static_cast<uint64>(2)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvEBML, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, UINT64_C(1))) + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLVersion, + static_cast<uint64>(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, UINT64_C(1))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLReadVersion, + static_cast<uint64>(1))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, UINT64_C(4))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxIDLength, + static_cast<uint64>(4))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, UINT64_C(8))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvEBMLMaxSizeLength, + static_cast<uint64>(8))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocType, "webm")) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocType, doc_type)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, doc_type_version)) + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeVersion, + static_cast<uint64>(doc_type_version))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, UINT64_C(2))) + } + if (!WriteEbmlElement(writer, libwebm::kMkvDocTypeReadVersion, + static_cast<uint64>(2))) { return false; + } return true; } +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version) { + return WriteEbmlHeader(writer, doc_type_version, kDocTypeWebm); +} + bool WriteEbmlHeader(IMkvWriter* writer) { return WriteEbmlHeader(writer, mkvmuxer::Segment::kDefaultDocTypeVersion); } @@ -262,15 +293,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (!writer || track_ < 1 || cluster_pos_ < 1) return false; - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size 
+= EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast<uint64>(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast<uint64>(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) + + track_pos_size; if (!WriteEbmlMasterElement(writer, libwebm::kMkvCuePoint, payload_size)) return false; @@ -279,18 +312,27 @@ bool CuePoint::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, time_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTime, + static_cast<uint64>(time_))) { return false; + } if (!WriteEbmlMasterElement(writer, libwebm::kMkvCueTrackPositions, size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, track_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCueTrack, + static_cast<uint64>(track_))) { return false; - if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, cluster_pos_)) + } + if (!WriteEbmlElement(writer, libwebm::kMkvCueClusterPosition, + static_cast<uint64>(cluster_pos_))) { return false; - if (output_block_number_ && block_number_ > 1) - if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, block_number_)) + } + if (output_block_number_ && block_number_ > 1) { + if (!WriteEbmlElement(writer, libwebm::kMkvCueBlockNumber, + static_cast<uint64>(block_number_))) { return false; + } + } const int64_t stop_position = writer->Position(); if (stop_position < 0) @@ -303,15 +345,17 @@ bool CuePoint::Write(IMkvWriter* writer) const { } uint64_t CuePoint::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvCueClusterPosition, cluster_pos_); - size += EbmlElementSize(libwebm::kMkvCueTrack, track_); + uint64_t size = EbmlElementSize(libwebm::kMkvCueClusterPosition, + static_cast<uint64>(cluster_pos_)); + size += EbmlElementSize(libwebm::kMkvCueTrack, static_cast<uint64>(track_)); if (output_block_number_ && block_number_ > 1) - size += EbmlElementSize(libwebm::kMkvCueBlockNumber, block_number_); + size += EbmlElementSize(libwebm::kMkvCueBlockNumber, + static_cast<uint64>(block_number_)); const uint64_t track_pos_size = EbmlMasterElementSize(libwebm::kMkvCueTrackPositions, size) + size; const uint64_t payload_size = - EbmlElementSize(libwebm::kMkvCueTime, time_) + track_pos_size; + EbmlElementSize(libwebm::kMkvCueTime, static_cast<uint64>(time_)) + + track_pos_size; return payload_size; } @@ -456,8 +500,9 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvAESSettingsCipherMode, - cipher_mode_)) + static_cast<uint64>(cipher_mode_))) { return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -468,8 +513,8 @@ bool ContentEncAESSettings::Write(IMkvWriter* writer) const { } uint64_t ContentEncAESSettings::PayloadSize() const { - uint64_t size = - EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, cipher_mode_); + uint64_t size = EbmlElementSize(libwebm::kMkvAESSettingsCipherMode, + static_cast<uint64>(cipher_mode_)); return size; } @@ 
-529,20 +574,22 @@ bool ContentEncoding::Write(IMkvWriter* writer) const { encoding_size)) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingOrder, - encoding_order_)) + static_cast<uint64>(encoding_order_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingScope, - encoding_scope_)) + static_cast<uint64>(encoding_scope_))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvContentEncodingType, - encoding_type_)) + static_cast<uint64>(encoding_type_))) return false; if (!WriteEbmlMasterElement(writer, libwebm::kMkvContentEncryption, encryption_size)) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, enc_algo_)) + if (!WriteEbmlElement(writer, libwebm::kMkvContentEncAlgo, + static_cast<uint64>(enc_algo_))) { return false; + } if (!WriteEbmlElement(writer, libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_)) return false; @@ -571,12 +618,12 @@ uint64_t ContentEncoding::EncodingSize(uint64_t compresion_size, EbmlMasterElementSize(libwebm::kMkvContentEncryption, encryption_size) + encryption_size; } - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingType, encoding_type_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingScope, encoding_scope_); - encoding_size += - EbmlElementSize(libwebm::kMkvContentEncodingOrder, encoding_order_); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingType, + static_cast<uint64>(encoding_type_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingScope, + static_cast<uint64>(encoding_scope_)); + encoding_size += EbmlElementSize(libwebm::kMkvContentEncodingOrder, + static_cast<uint64>(encoding_order_)); return encoding_size; } @@ -586,7 +633,8 @@ uint64_t ContentEncoding::EncryptionSize() const { uint64_t encryption_size = EbmlElementSize(libwebm::kMkvContentEncKeyID, enc_key_id_, enc_key_id_length_); - encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, enc_algo_); + encryption_size += EbmlElementSize(libwebm::kMkvContentEncAlgo, + static_cast<uint64>(enc_algo_)); return encryption_size + aes_size; } @@ -664,9 +712,10 @@ ContentEncoding* Track::GetContentEncodingByIndex(uint32_t index) const { } uint64_t Track::PayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) @@ -676,15 +725,22 @@ uint64_t Track::PayloadSize() const { size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); - if (max_block_additional_id_) + if (max_block_additional_id_) { size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); - if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); - if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); - if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + static_cast<uint64>(max_block_additional_id_)); + } + if (codec_delay_) { + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast<uint64>(codec_delay_)); + 
} + if (seek_pre_roll_) { + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast<uint64>(seek_pre_roll_)); + } + if (default_duration_) { + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast<uint64>(default_duration_)); + } if (content_encoding_entries_size_ > 0) { uint64_t content_encodings_size = 0; @@ -722,55 +778,64 @@ bool Track::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvTrackEntry, payload_size)) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTrackNumber, number_); - size += EbmlElementSize(libwebm::kMkvTrackUID, uid_); - size += EbmlElementSize(libwebm::kMkvTrackType, type_); + uint64_t size = + EbmlElementSize(libwebm::kMkvTrackNumber, static_cast<uint64>(number_)); + size += EbmlElementSize(libwebm::kMkvTrackUID, static_cast<uint64>(uid_)); + size += EbmlElementSize(libwebm::kMkvTrackType, static_cast<uint64>(type_)); if (codec_id_) size += EbmlElementSize(libwebm::kMkvCodecID, codec_id_); if (codec_private_) size += EbmlElementSize(libwebm::kMkvCodecPrivate, codec_private_, - codec_private_length_); + static_cast<uint64>(codec_private_length_)); if (language_) size += EbmlElementSize(libwebm::kMkvLanguage, language_); if (name_) size += EbmlElementSize(libwebm::kMkvName, name_); if (max_block_additional_id_) size += EbmlElementSize(libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_); + static_cast<uint64>(max_block_additional_id_)); if (codec_delay_) - size += EbmlElementSize(libwebm::kMkvCodecDelay, codec_delay_); + size += EbmlElementSize(libwebm::kMkvCodecDelay, + static_cast<uint64>(codec_delay_)); if (seek_pre_roll_) - size += EbmlElementSize(libwebm::kMkvSeekPreRoll, seek_pre_roll_); + size += EbmlElementSize(libwebm::kMkvSeekPreRoll, + static_cast<uint64>(seek_pre_roll_)); if (default_duration_) - size += EbmlElementSize(libwebm::kMkvDefaultDuration, default_duration_); + size += EbmlElementSize(libwebm::kMkvDefaultDuration, + static_cast<uint64>(default_duration_)); const int64_t payload_position = writer->Position(); if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, number_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackNumber, + static_cast<uint64>(number_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackUID, + static_cast<uint64>(uid_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, type_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTrackType, + static_cast<uint64>(type_))) return false; if (max_block_additional_id_) { if (!WriteEbmlElement(writer, libwebm::kMkvMaxBlockAdditionID, - max_block_additional_id_)) { + static_cast<uint64>(max_block_additional_id_))) { return false; } } if (codec_delay_) { - if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, codec_delay_)) + if (!WriteEbmlElement(writer, libwebm::kMkvCodecDelay, + static_cast<uint64>(codec_delay_))) return false; } if (seek_pre_roll_) { - if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, seek_pre_roll_)) + if (!WriteEbmlElement(writer, libwebm::kMkvSeekPreRoll, + static_cast<uint64>(seek_pre_roll_))) return false; } if (default_duration_) { if (!WriteEbmlElement(writer, libwebm::kMkvDefaultDuration, - default_duration_)) + static_cast<uint64>(default_duration_))) return false; } if (codec_id_) { @@ -779,7 +844,7 @@ bool Track::Write(IMkvWriter* writer) const { } if (codec_private_) { if (!WriteEbmlElement(writer, libwebm::kMkvCodecPrivate, 
codec_private_, - codec_private_length_)) + static_cast<uint64>(codec_private_length_))) return false; } if (language_) { @@ -890,14 +955,23 @@ void Track::set_name(const char* name) { // // Colour and its child elements -uint64_t PrimaryChromaticity::PrimaryChromaticityPayloadSize( +uint64_t PrimaryChromaticity::PrimaryChromaticitySize( libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return EbmlElementSize(x_id, x) + EbmlElementSize(y_id, y); + return EbmlElementSize(x_id, x_) + EbmlElementSize(y_id, y_); } bool PrimaryChromaticity::Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const { - return WriteEbmlElement(writer, x_id, x) && WriteEbmlElement(writer, y_id, y); + if (!Valid()) { + return false; + } + return WriteEbmlElement(writer, x_id, x_) && + WriteEbmlElement(writer, y_id, y_); +} + +bool PrimaryChromaticity::Valid() const { + return (x_ >= kChromaticityMin && x_ <= kChromaticityMax && + y_ >= kChromaticityMin && y_ <= kChromaticityMax); } uint64_t MasteringMetadata::MasteringMetadataSize() const { @@ -909,6 +983,31 @@ uint64_t MasteringMetadata::MasteringMetadataSize() const { return size; } +bool MasteringMetadata::Valid() const { + if (luminance_min_ != kValueNotPresent) { + if (luminance_min_ < kMinLuminance || luminance_min_ > kMinLuminanceMax || + luminance_min_ > luminance_max_) { + return false; + } + } + if (luminance_max_ != kValueNotPresent) { + if (luminance_max_ < kMinLuminance || luminance_max_ > kMaxLuminanceMax || + luminance_max_ < luminance_min_) { + return false; + } + } + if (r_ && !r_->Valid()) + return false; + if (g_ && !g_->Valid()) + return false; + if (b_ && !b_->Valid()) + return false; + if (white_point_ && !white_point_->Valid()) + return false; + + return true; +} + bool MasteringMetadata::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -918,12 +1017,12 @@ bool MasteringMetadata::Write(IMkvWriter* writer) const { if (!WriteEbmlMasterElement(writer, libwebm::kMkvMasteringMetadata, size)) return false; - if (luminance_max != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max)) { + if (luminance_max_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMax, luminance_max_)) { return false; } - if (luminance_min != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min)) { + if (luminance_min_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvLuminanceMin, luminance_min_)) { return false; } if (r_ && @@ -984,25 +1083,25 @@ bool MasteringMetadata::SetChromaticity( uint64_t MasteringMetadata::PayloadSize() const { uint64_t size = 0; - if (luminance_max != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max); - if (luminance_min != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min); + if (luminance_max_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMax, luminance_max_); + if (luminance_min_ != kValueNotPresent) + size += EbmlElementSize(libwebm::kMkvLuminanceMin, luminance_min_); if (r_) { - size += r_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryRChromaticityX, libwebm::kMkvPrimaryRChromaticityY); + size += r_->PrimaryChromaticitySize(libwebm::kMkvPrimaryRChromaticityX, + libwebm::kMkvPrimaryRChromaticityY); } if (g_) { - size += g_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryGChromaticityX, libwebm::kMkvPrimaryGChromaticityY); + size += 
g_->PrimaryChromaticitySize(libwebm::kMkvPrimaryGChromaticityX, + libwebm::kMkvPrimaryGChromaticityY); } if (b_) { - size += b_->PrimaryChromaticityPayloadSize( - libwebm::kMkvPrimaryBChromaticityX, libwebm::kMkvPrimaryBChromaticityY); + size += b_->PrimaryChromaticitySize(libwebm::kMkvPrimaryBChromaticityX, + libwebm::kMkvPrimaryBChromaticityY); } if (white_point_) { - size += white_point_->PrimaryChromaticityPayloadSize( + size += white_point_->PrimaryChromaticitySize( libwebm::kMkvWhitePointChromaticityX, libwebm::kMkvWhitePointChromaticityY); } @@ -1019,6 +1118,33 @@ uint64_t Colour::ColourSize() const { return size; } +bool Colour::Valid() const { + if (mastering_metadata_ && !mastering_metadata_->Valid()) + return false; + if (matrix_coefficients_ != kValueNotPresent && + !IsMatrixCoefficientsValueValid(matrix_coefficients_)) { + return false; + } + if (chroma_siting_horz_ != kValueNotPresent && + !IsChromaSitingHorzValueValid(chroma_siting_horz_)) { + return false; + } + if (chroma_siting_vert_ != kValueNotPresent && + !IsChromaSitingVertValueValid(chroma_siting_vert_)) { + return false; + } + if (range_ != kValueNotPresent && !IsColourRangeValueValid(range_)) + return false; + if (transfer_characteristics_ != kValueNotPresent && + !IsTransferCharacteristicsValueValid(transfer_characteristics_)) { + return false; + } + if (primaries_ != kValueNotPresent && !IsPrimariesValueValid(primaries_)) + return false; + + return true; +} + bool Colour::Write(IMkvWriter* writer) const { const uint64_t size = PayloadSize(); @@ -1026,69 +1152,77 @@ bool Colour::Write(IMkvWriter* writer) const { if (size == 0) return true; + // Don't write an invalid element. + if (!Valid()) + return false; + if (!WriteEbmlMasterElement(writer, libwebm::kMkvColour, size)) return false; - if (matrix_coefficients != kValueNotPresent && + if (matrix_coefficients_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvMatrixCoefficients, - matrix_coefficients)) { + static_cast<uint64>(matrix_coefficients_))) { return false; } - if (bits_per_channel != kValueNotPresent && + if (bits_per_channel_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvBitsPerChannel, - bits_per_channel)) { + static_cast<uint64>(bits_per_channel_))) { return false; } - if (chroma_subsampling_horz != kValueNotPresent && + if (chroma_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz)) { + static_cast<uint64>(chroma_subsampling_horz_))) { return false; } - if (chroma_subsampling_vert != kValueNotPresent && + if (chroma_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert)) { + static_cast<uint64>(chroma_subsampling_vert_))) { return false; } - if (cb_subsampling_horz != kValueNotPresent && + if (cb_subsampling_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingHorz, - cb_subsampling_horz)) { + static_cast<uint64>(cb_subsampling_horz_))) { return false; } - if (cb_subsampling_vert != kValueNotPresent && + if (cb_subsampling_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvCbSubsamplingVert, - cb_subsampling_vert)) { + static_cast<uint64>(cb_subsampling_vert_))) { return false; } - if (chroma_siting_horz != kValueNotPresent && + if (chroma_siting_horz_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingHorz, - chroma_siting_horz)) { + static_cast<uint64>(chroma_siting_horz_))) { return false; } - if 
(chroma_siting_vert != kValueNotPresent && + if (chroma_siting_vert_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvChromaSitingVert, - chroma_siting_vert)) { + static_cast<uint64>(chroma_siting_vert_))) { return false; } - if (range != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvRange, range)) { + if (range_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvRange, + static_cast<uint64>(range_))) { return false; } - if (transfer_characteristics != kValueNotPresent && + if (transfer_characteristics_ != kValueNotPresent && !WriteEbmlElement(writer, libwebm::kMkvTransferCharacteristics, - transfer_characteristics)) { + static_cast<uint64>(transfer_characteristics_))) { return false; } - if (primaries != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvPrimaries, primaries)) { + if (primaries_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvPrimaries, + static_cast<uint64>(primaries_))) { return false; } - if (max_cll != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, max_cll)) { + if (max_cll_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxCLL, + static_cast<uint64>(max_cll_))) { return false; } - if (max_fall != kValueNotPresent && - !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, max_fall)) { + if (max_fall_ != kValueNotPresent && + !WriteEbmlElement(writer, libwebm::kMkvMaxFALL, + static_cast<uint64>(max_fall_))) { return false; } @@ -1103,8 +1237,8 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { if (!mm_ptr.get()) return false; - mm_ptr->luminance_max = mastering_metadata.luminance_max; - mm_ptr->luminance_min = mastering_metadata.luminance_min; + mm_ptr->set_luminance_max(mastering_metadata.luminance_max()); + mm_ptr->set_luminance_min(mastering_metadata.luminance_min()); if (!mm_ptr->SetChromaticity(mastering_metadata.r(), mastering_metadata.g(), mastering_metadata.b(), @@ -1120,38 +1254,56 @@ bool Colour::SetMasteringMetadata(const MasteringMetadata& mastering_metadata) { uint64_t Colour::PayloadSize() const { uint64_t size = 0; - if (matrix_coefficients != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvMatrixCoefficients, matrix_coefficients); - if (bits_per_channel != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvBitsPerChannel, bits_per_channel); - if (chroma_subsampling_horz != kValueNotPresent) + if (matrix_coefficients_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMatrixCoefficients, + static_cast<uint64>(matrix_coefficients_)); + } + if (bits_per_channel_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvBitsPerChannel, + static_cast<uint64>(bits_per_channel_)); + } + if (chroma_subsampling_horz_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingHorz, - chroma_subsampling_horz); - if (chroma_subsampling_vert != kValueNotPresent) + static_cast<uint64>(chroma_subsampling_horz_)); + } + if (chroma_subsampling_vert_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvChromaSubsamplingVert, - chroma_subsampling_vert); - if (cb_subsampling_horz != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, cb_subsampling_horz); - if (cb_subsampling_vert != kValueNotPresent) - size += - EbmlElementSize(libwebm::kMkvCbSubsamplingVert, cb_subsampling_vert); - if (chroma_siting_horz != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, chroma_siting_horz); - if (chroma_siting_vert != kValueNotPresent) - size += 
EbmlElementSize(libwebm::kMkvChromaSitingVert, chroma_siting_vert); - if (range != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvRange, range); - if (transfer_characteristics != kValueNotPresent) + static_cast<uint64>(chroma_subsampling_vert_)); + } + if (cb_subsampling_horz_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingHorz, + static_cast<uint64>(cb_subsampling_horz_)); + } + if (cb_subsampling_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvCbSubsamplingVert, + static_cast<uint64>(cb_subsampling_vert_)); + } + if (chroma_siting_horz_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvChromaSitingHorz, + static_cast<uint64>(chroma_siting_horz_)); + } + if (chroma_siting_vert_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvChromaSitingVert, + static_cast<uint64>(chroma_siting_vert_)); + } + if (range_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvRange, static_cast<uint64>(range_)); + } + if (transfer_characteristics_ != kValueNotPresent) { size += EbmlElementSize(libwebm::kMkvTransferCharacteristics, - transfer_characteristics); - if (primaries != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvPrimaries, primaries); - if (max_cll != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxCLL, max_cll); - if (max_fall != kValueNotPresent) - size += EbmlElementSize(libwebm::kMkvMaxFALL, max_fall); + static_cast<uint64>(transfer_characteristics_)); + } + if (primaries_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvPrimaries, + static_cast<uint64>(primaries_)); + } + if (max_cll_ != kValueNotPresent) { + size += EbmlElementSize(libwebm::kMkvMaxCLL, static_cast<uint64>(max_cll_)); + } + if (max_fall_ != kValueNotPresent) { + size += + EbmlElementSize(libwebm::kMkvMaxFALL, static_cast<uint64>(max_fall_)); + } if (mastering_metadata_) size += mastering_metadata_->MasteringMetadataSize(); @@ -1161,12 +1313,103 @@ uint64_t Colour::PayloadSize() const { /////////////////////////////////////////////////////////////// // +// Projection element + +uint64_t Projection::ProjectionSize() const { + uint64_t size = PayloadSize(); + + if (size > 0) + size += EbmlMasterElementSize(libwebm::kMkvProjection, size); + + return size; +} + +bool Projection::Write(IMkvWriter* writer) const { + const uint64_t size = PayloadSize(); + + // Don't write an empty element. 
+ if (size == 0) + return true; + + if (!WriteEbmlMasterElement(writer, libwebm::kMkvProjection, size)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionType, + static_cast<uint64>(type_))) { + return false; + } + + if (private_data_length_ > 0 && private_data_ != NULL && + !WriteEbmlElement(writer, libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseYaw, pose_yaw_)) + return false; + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPosePitch, + pose_pitch_)) { + return false; + } + + if (!WriteEbmlElement(writer, libwebm::kMkvProjectionPoseRoll, pose_roll_)) { + return false; + } + + return true; +} + +bool Projection::SetProjectionPrivate(const uint8_t* data, + uint64_t data_length) { + if (data == NULL || data_length == 0) { + return false; + } + + if (data_length != static_cast<size_t>(data_length)) { + return false; + } + + uint8_t* new_private_data = + new (std::nothrow) uint8_t[static_cast<size_t>(data_length)]; + if (new_private_data == NULL) { + return false; + } + + delete[] private_data_; + private_data_ = new_private_data; + private_data_length_ = data_length; + memcpy(private_data_, data, static_cast<size_t>(data_length)); + + return true; +} + +uint64_t Projection::PayloadSize() const { + uint64_t size = + EbmlElementSize(libwebm::kMkvProjection, static_cast<uint64>(type_)); + + if (private_data_length_ > 0 && private_data_ != NULL) { + size += EbmlElementSize(libwebm::kMkvProjectionPrivate, private_data_, + private_data_length_); + } + + size += EbmlElementSize(libwebm::kMkvProjectionPoseYaw, pose_yaw_); + size += EbmlElementSize(libwebm::kMkvProjectionPosePitch, pose_pitch_); + size += EbmlElementSize(libwebm::kMkvProjectionPoseRoll, pose_roll_); + + return size; +} + +/////////////////////////////////////////////////////////////// +// // VideoTrack Class VideoTrack::VideoTrack(unsigned int* seed) : Track(seed), display_height_(0), display_width_(0), + pixel_height_(0), + pixel_width_(0), crop_left_(0), crop_right_(0), crop_top_(0), @@ -1176,9 +1419,13 @@ VideoTrack::VideoTrack(unsigned int* seed) stereo_mode_(0), alpha_mode_(0), width_(0), - colour_(NULL) {} + colour_(NULL), + projection_(NULL) {} -VideoTrack::~VideoTrack() { delete colour_; } +VideoTrack::~VideoTrack() { + delete colour_; + delete projection_; +} bool VideoTrack::SetStereoMode(uint64_t stereo_mode) { if (stereo_mode != kMono && stereo_mode != kSideBySideLeftIsFirst && @@ -1221,40 +1468,52 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelWidth, width_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelWidth, + static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvPixelHeight, height_)) + if (!WriteEbmlElement( + writer, libwebm::kMkvPixelHeight, + static_cast<uint64>((pixel_height_ > 0) ? 
pixel_height_ : height_))) return false; if (display_width_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, display_width_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayWidth, + static_cast<uint64>(display_width_))) return false; } if (display_height_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, display_height_)) + if (!WriteEbmlElement(writer, libwebm::kMkvDisplayHeight, + static_cast<uint64>(display_height_))) return false; } if (crop_left_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, crop_left_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropLeft, + static_cast<uint64>(crop_left_))) return false; } if (crop_right_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, crop_right_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropRight, + static_cast<uint64>(crop_right_))) return false; } if (crop_top_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, crop_top_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropTop, + static_cast<uint64>(crop_top_))) return false; } if (crop_bottom_ > 0) { - if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, crop_bottom_)) + if (!WriteEbmlElement(writer, libwebm::kMkvPixelCropBottom, + static_cast<uint64>(crop_bottom_))) return false; } if (stereo_mode_ > kMono) { - if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, stereo_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvStereoMode, + static_cast<uint64>(stereo_mode_))) return false; } if (alpha_mode_ > kNoAlpha) { - if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, alpha_mode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvAlphaMode, + static_cast<uint64>(alpha_mode_))) return false; } if (frame_rate_ > 0.0) { @@ -1267,6 +1526,10 @@ bool VideoTrack::Write(IMkvWriter* writer) const { if (!colour_->Write(writer)) return false; } + if (projection_) { + if (!projection_->Write(writer)) + return false; + } const int64_t stop_position = writer->Position(); if (stop_position < 0 || @@ -1287,47 +1550,83 @@ bool VideoTrack::SetColour(const Colour& colour) { return false; } - colour_ptr->matrix_coefficients = colour.matrix_coefficients; - colour_ptr->bits_per_channel = colour.bits_per_channel; - colour_ptr->chroma_subsampling_horz = colour.chroma_subsampling_horz; - colour_ptr->chroma_subsampling_vert = colour.chroma_subsampling_vert; - colour_ptr->cb_subsampling_horz = colour.cb_subsampling_horz; - colour_ptr->cb_subsampling_vert = colour.cb_subsampling_vert; - colour_ptr->chroma_siting_horz = colour.chroma_siting_horz; - colour_ptr->chroma_siting_vert = colour.chroma_siting_vert; - colour_ptr->range = colour.range; - colour_ptr->transfer_characteristics = colour.transfer_characteristics; - colour_ptr->primaries = colour.primaries; - colour_ptr->max_cll = colour.max_cll; - colour_ptr->max_fall = colour.max_fall; + colour_ptr->set_matrix_coefficients(colour.matrix_coefficients()); + colour_ptr->set_bits_per_channel(colour.bits_per_channel()); + colour_ptr->set_chroma_subsampling_horz(colour.chroma_subsampling_horz()); + colour_ptr->set_chroma_subsampling_vert(colour.chroma_subsampling_vert()); + colour_ptr->set_cb_subsampling_horz(colour.cb_subsampling_horz()); + colour_ptr->set_cb_subsampling_vert(colour.cb_subsampling_vert()); + colour_ptr->set_chroma_siting_horz(colour.chroma_siting_horz()); + colour_ptr->set_chroma_siting_vert(colour.chroma_siting_vert()); + colour_ptr->set_range(colour.range()); + 
colour_ptr->set_transfer_characteristics(colour.transfer_characteristics()); + colour_ptr->set_primaries(colour.primaries()); + colour_ptr->set_max_cll(colour.max_cll()); + colour_ptr->set_max_fall(colour.max_fall()); + delete colour_; colour_ = colour_ptr.release(); return true; } +bool VideoTrack::SetProjection(const Projection& projection) { + std::auto_ptr<Projection> projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + if (projection.private_data()) { + if (!projection_ptr->SetProjectionPrivate( + projection.private_data(), projection.private_data_length())) { + return false; + } + } + + projection_ptr->set_type(projection.type()); + projection_ptr->set_pose_yaw(projection.pose_yaw()); + projection_ptr->set_pose_pitch(projection.pose_pitch()); + projection_ptr->set_pose_roll(projection.pose_roll()); + delete projection_; + projection_ = projection_ptr.release(); + return true; +} + uint64_t VideoTrack::VideoPayloadSize() const { - uint64_t size = EbmlElementSize(libwebm::kMkvPixelWidth, width_); - size += EbmlElementSize(libwebm::kMkvPixelHeight, height_); + uint64_t size = EbmlElementSize( + libwebm::kMkvPixelWidth, + static_cast<uint64>((pixel_width_ > 0) ? pixel_width_ : width_)); + size += EbmlElementSize( + libwebm::kMkvPixelHeight, + static_cast<uint64>((pixel_height_ > 0) ? pixel_height_ : height_)); if (display_width_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayWidth, display_width_); + size += EbmlElementSize(libwebm::kMkvDisplayWidth, + static_cast<uint64>(display_width_)); if (display_height_ > 0) - size += EbmlElementSize(libwebm::kMkvDisplayHeight, display_height_); + size += EbmlElementSize(libwebm::kMkvDisplayHeight, + static_cast<uint64>(display_height_)); if (crop_left_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropLeft, crop_left_); + size += EbmlElementSize(libwebm::kMkvPixelCropLeft, + static_cast<uint64>(crop_left_)); if (crop_right_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropRight, crop_right_); + size += EbmlElementSize(libwebm::kMkvPixelCropRight, + static_cast<uint64>(crop_right_)); if (crop_top_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropTop, crop_top_); + size += EbmlElementSize(libwebm::kMkvPixelCropTop, + static_cast<uint64>(crop_top_)); if (crop_bottom_ > 0) - size += EbmlElementSize(libwebm::kMkvPixelCropBottom, crop_bottom_); + size += EbmlElementSize(libwebm::kMkvPixelCropBottom, + static_cast<uint64>(crop_bottom_)); if (stereo_mode_ > kMono) - size += EbmlElementSize(libwebm::kMkvStereoMode, stereo_mode_); + size += EbmlElementSize(libwebm::kMkvStereoMode, + static_cast<uint64>(stereo_mode_)); if (alpha_mode_ > kNoAlpha) - size += EbmlElementSize(libwebm::kMkvAlphaMode, alpha_mode_); + size += EbmlElementSize(libwebm::kMkvAlphaMode, + static_cast<uint64>(alpha_mode_)); if (frame_rate_ > 0.0) size += EbmlElementSize(libwebm::kMkvFrameRate, static_cast<float>(frame_rate_)); if (colour_) size += colour_->ColourSize(); + if (projection_) + size += projection_->ProjectionSize(); return size; } @@ -1346,9 +1645,11 @@ uint64_t AudioTrack::PayloadSize() const { uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast<float>(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_)); size += 
EbmlMasterElementSize(libwebm::kMkvAudio, size); return parent_size + size; @@ -1361,9 +1662,11 @@ bool AudioTrack::Write(IMkvWriter* writer) const { // Calculate AudioSettings size. uint64_t size = EbmlElementSize(libwebm::kMkvSamplingFrequency, static_cast<float>(sample_rate_)); - size += EbmlElementSize(libwebm::kMkvChannels, channels_); + size += + EbmlElementSize(libwebm::kMkvChannels, static_cast<uint64>(channels_)); if (bit_depth_ > 0) - size += EbmlElementSize(libwebm::kMkvBitDepth, bit_depth_); + size += + EbmlElementSize(libwebm::kMkvBitDepth, static_cast<uint64>(bit_depth_)); if (!WriteEbmlMasterElement(writer, libwebm::kMkvAudio, size)) return false; @@ -1375,10 +1678,12 @@ bool AudioTrack::Write(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvSamplingFrequency, static_cast<float>(sample_rate_))) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvChannels, channels_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChannels, + static_cast<uint64>(channels_))) return false; if (bit_depth_ > 0) - if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, bit_depth_)) + if (!WriteEbmlElement(writer, libwebm::kMkvBitDepth, + static_cast<uint64>(bit_depth_))) return false; const int64_t stop_position = writer->Position(); @@ -1398,6 +1703,10 @@ const char Tracks::kVorbisCodecId[] = "A_VORBIS"; const char Tracks::kVp8CodecId[] = "V_VP8"; const char Tracks::kVp9CodecId[] = "V_VP9"; const char Tracks::kVp10CodecId[] = "V_VP10"; +const char Tracks::kWebVttCaptionsId[] = "D_WEBVTT/CAPTIONS"; +const char Tracks::kWebVttDescriptionsId[] = "D_WEBVTT/DESCRIPTIONS"; +const char Tracks::kWebVttMetadataId[] = "D_WEBVTT/METADATA"; +const char Tracks::kWebVttSubtitlesId[] = "D_WEBVTT/SUBTITLES"; Tracks::Tracks() : track_entries_(NULL), track_entries_size_(0), wrote_tracks_(false) {} @@ -1650,9 +1959,11 @@ bool Chapter::ExpandDisplaysArray() { uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { uint64_t payload_size = EbmlElementSize(libwebm::kMkvChapterStringUID, id_) + - EbmlElementSize(libwebm::kMkvChapterUID, uid_) + - EbmlElementSize(libwebm::kMkvChapterTimeStart, start_timecode_) + - EbmlElementSize(libwebm::kMkvChapterTimeEnd, end_timecode_); + EbmlElementSize(libwebm::kMkvChapterUID, static_cast<uint64>(uid_)) + + EbmlElementSize(libwebm::kMkvChapterTimeStart, + static_cast<uint64>(start_timecode_)) + + EbmlElementSize(libwebm::kMkvChapterTimeEnd, + static_cast<uint64>(end_timecode_)); for (int idx = 0; idx < displays_count_; ++idx) { const Display& d = displays_[idx]; @@ -1674,13 +1985,16 @@ uint64_t Chapter::WriteAtom(IMkvWriter* writer) const { if (!WriteEbmlElement(writer, libwebm::kMkvChapterStringUID, id_)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, uid_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterUID, + static_cast<uint64>(uid_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, start_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeStart, + static_cast<uint64>(start_timecode_))) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, end_timecode_)) + if (!WriteEbmlElement(writer, libwebm::kMkvChapterTimeEnd, + static_cast<uint64>(end_timecode_))) return 0; for (int idx = 0; idx < displays_count_; ++idx) { @@ -2125,7 +2439,17 @@ Cluster::Cluster(uint64_t timecode, int64_t cues_pos, uint64_t timecode_scale, write_last_frame_with_duration_(write_last_frame_with_duration), writer_(NULL) {} -Cluster::~Cluster() {} +Cluster::~Cluster() { + // Delete any stored frames 
that are left behind. This will happen if the + // Cluster was not Finalized for whatever reason. + while (!stored_frames_.empty()) { + while (!stored_frames_.begin()->second.empty()) { + delete stored_frames_.begin()->second.front(); + stored_frames_.begin()->second.pop_front(); + } + stored_frames_.erase(stored_frames_.begin()->first); + } +} bool Cluster::Init(IMkvWriter* ptr_writer) { if (!ptr_writer) { @@ -2421,10 +2745,10 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { for (int32_t i = 0; i < kSeekEntryCount; ++i) { if (seek_entry_id_[i] != 0) { - entry_size[i] = EbmlElementSize( - libwebm::kMkvSeekID, static_cast<uint64_t>(seek_entry_id_[i])); - entry_size[i] += - EbmlElementSize(libwebm::kMkvSeekPosition, seek_entry_pos_[i]); + entry_size[i] = EbmlElementSize(libwebm::kMkvSeekID, + static_cast<uint64>(seek_entry_id_[i])); + entry_size[i] += EbmlElementSize( + libwebm::kMkvSeekPosition, static_cast<uint64>(seek_entry_pos_[i])); payload_size += EbmlMasterElementSize(libwebm::kMkvSeek, entry_size[i]) + @@ -2449,11 +2773,11 @@ bool SeekHead::Finalize(IMkvWriter* writer) const { return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekID, - static_cast<uint64_t>(seek_entry_id_[i]))) + static_cast<uint64>(seek_entry_id_[i]))) return false; if (!WriteEbmlElement(writer, libwebm::kMkvSeekPosition, - seek_entry_pos_[i])) + static_cast<uint64>(seek_entry_pos_[i]))) return false; } } @@ -2522,8 +2846,10 @@ bool SeekHead::SetSeekEntry(int index, uint32_t id, uint64_t position) { uint64_t SeekHead::MaxEntrySize() const { const uint64_t max_entry_payload_size = - EbmlElementSize(libwebm::kMkvSeekID, UINT64_C(0xffffffff)) + - EbmlElementSize(libwebm::kMkvSeekPosition, UINT64_C(0xffffffffffffffff)); + EbmlElementSize(libwebm::kMkvSeekID, + static_cast<uint64>(UINT64_C(0xffffffff))) + + EbmlElementSize(libwebm::kMkvSeekPosition, + static_cast<uint64>(UINT64_C(0xffffffffffffffff))); const uint64_t max_entry_size = EbmlMasterElementSize(libwebm::kMkvSeek, max_entry_payload_size) + max_entry_payload_size; @@ -2613,7 +2939,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (!writer || !muxing_app_ || !writing_app_) return false; - uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, timecode_scale_); + uint64_t size = EbmlElementSize(libwebm::kMkvTimecodeScale, + static_cast<uint64>(timecode_scale_)); if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast<float>(duration_)); @@ -2629,7 +2956,8 @@ bool SegmentInfo::Write(IMkvWriter* writer) { if (payload_position < 0) return false; - if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, timecode_scale_)) + if (!WriteEbmlElement(writer, libwebm::kMkvTimecodeScale, + static_cast<uint64>(timecode_scale_))) return false; if (duration_ > 0.0) { @@ -2725,10 +3053,12 @@ Segment::Segment() output_cues_(true), accurate_cluster_duration_(false), fixed_size_cluster_timecode_(false), + estimate_file_duration_(true), payload_pos_(0), size_position_(0), doc_type_version_(kDefaultDocTypeVersion), doc_type_version_written_(0), + duration_(0.0), writer_cluster_(NULL), writer_cues_(NULL), writer_header_(NULL) { @@ -2833,6 +3163,10 @@ bool Segment::Init(IMkvWriter* ptr_writer) { writer_cluster_ = ptr_writer; writer_cues_ = ptr_writer; writer_header_ = ptr_writer; + memset(&track_frames_written_, 0, + sizeof(track_frames_written_[0]) * kMaxTrackNumber); + memset(&last_track_timestamp_, 0, + sizeof(last_track_timestamp_[0]) * kMaxTrackNumber); return segment_info_.Init(); } @@ -2876,7 +3210,10 @@ bool Segment::Finalize() { 
if (WriteFramesAll() < 0) return false; - if (cluster_list_size_ > 0) { + // In kLive mode, call Cluster::Finalize only if |accurate_cluster_duration_| + // is set. In all other modes, always call Cluster::Finalize. + if ((mode_ == kLive ? accurate_cluster_duration_ : true) && + cluster_list_size_ > 0) { // Update last cluster's size Cluster* const old_cluster = cluster_list_[cluster_list_size_ - 1]; @@ -2892,9 +3229,30 @@ bool Segment::Finalize() { chunk_count_++; } - const double duration = + double duration = (static_cast<double>(last_timestamp_) + last_block_duration_) / segment_info_.timecode_scale(); + if (duration_ > 0.0) { + duration = duration_; + } else { + if (last_block_duration_ == 0 && estimate_file_duration_) { + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int i = 0; i < num_tracks; ++i) { + if (track_frames_written_[i] < 2) + continue; + + // Estimate the duration for the last block of a Track. + const double nano_per_frame = + static_cast<double>(last_track_timestamp_[i]) / + (track_frames_written_[i] - 1); + const double track_duration = + (last_track_timestamp_[i] + nano_per_frame) / + segment_info_.timecode_scale(); + if (track_duration > duration) + duration = track_duration; + } + } + } segment_info_.set_duration(duration); if (!segment_info_.Finalize(writer_header_)) return false; @@ -2941,7 +3299,9 @@ bool Segment::Finalize() { if (writer_header_->Position(0)) return false; - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; if (writer_header_->Position() != ebml_header_size_) return false; @@ -3138,7 +3498,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { Frame* const new_frame = new (std::nothrow) Frame(); if (!new_frame || !new_frame->CopyFrom(*frame)) return false; - return QueueFrame(new_frame); + if (!QueueFrame(new_frame)) + return false; + track_frames_written_[frame->track_number() - 1]++; + return true; } if (!DoNewClusterProcessing(frame->track_number(), frame->timestamp(), @@ -3178,10 +3541,10 @@ bool Segment::AddGenericFrame(const Frame* frame) { last_timestamp_ = frame->timestamp(); last_track_timestamp_[frame->track_number() - 1] = frame->timestamp(); last_block_duration_ = frame->duration(); + track_frames_written_[frame->track_number() - 1]++; if (frame_created) delete frame; - return true; } @@ -3292,8 +3655,9 @@ Track* Segment::GetTrackByNumber(uint64_t track_number) const { bool Segment::WriteSegmentHeader() { UpdateDocTypeVersion(); - // TODO(fgalligan): Support more than one segment. - if (!WriteEbmlHeader(writer_header_, doc_type_version_)) + const char* const doc_type = + DocTypeIsWebm() ? kDocTypeWebm : kDocTypeMatroska; + if (!WriteEbmlHeader(writer_header_, doc_type_version_, doc_type)) return false; doc_type_version_written_ = doc_type_version_; ebml_header_size_ = static_cast<int32_t>(writer_header_->Position()); @@ -3766,4 +4130,35 @@ bool Segment::WriteFramesLessThan(uint64_t timestamp) { return true; } +bool Segment::DocTypeIsWebm() const { + const int kNumCodecIds = 9; + + // TODO(vigneshv): Tweak .clang-format. 
+ const char* kWebmCodecIds[kNumCodecIds] = { + Tracks::kOpusCodecId, Tracks::kVorbisCodecId, + Tracks::kVp8CodecId, Tracks::kVp9CodecId, + Tracks::kVp10CodecId, Tracks::kWebVttCaptionsId, + Tracks::kWebVttDescriptionsId, Tracks::kWebVttMetadataId, + Tracks::kWebVttSubtitlesId}; + + const int num_tracks = static_cast<int>(tracks_.track_entries_size()); + for (int track_index = 0; track_index < num_tracks; ++track_index) { + const Track* const track = tracks_.GetTrackByIndex(track_index); + const std::string codec_id = track->codec_id(); + + bool id_is_webm = false; + for (int id_index = 0; id_index < kNumCodecIds; ++id_index) { + if (codec_id == kWebmCodecIds[id_index]) { + id_is_webm = true; + break; + } + } + + if (!id_is_webm) + return false; + } + + return true; +} + } // namespace mkvmuxer diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h index 55ba07196df..46b0029dc47 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.h @@ -64,6 +64,12 @@ class IMkvWriter { LIBWEBM_DISALLOW_COPY_AND_ASSIGN(IMkvWriter); }; +// Writes out the EBML header for a WebM file, but allows caller to specify +// DocType. This function must be called before any other libwebm writing +// functions are called. +bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version, + const char* const doc_type); + // Writes out the EBML header for a WebM file. This function must be called // before any other libwebm writing functions are called. bool WriteEbmlHeader(IMkvWriter* writer, uint64_t doc_type_version); @@ -348,26 +354,42 @@ class ContentEncoding { /////////////////////////////////////////////////////////////// // Colour element. -struct PrimaryChromaticity { - PrimaryChromaticity(float x_val, float y_val) : x(x_val), y(y_val) {} - PrimaryChromaticity() : x(0), y(0) {} +class PrimaryChromaticity { + public: + static const float kChromaticityMin; + static const float kChromaticityMax; + + PrimaryChromaticity(float x_val, float y_val) : x_(x_val), y_(y_val) {} + PrimaryChromaticity() : x_(0), y_(0) {} ~PrimaryChromaticity() {} - uint64_t PrimaryChromaticityPayloadSize(libwebm::MkvId x_id, - libwebm::MkvId y_id) const; + + // Returns sum of |x_id| and |y_id| element id sizes and payload sizes. + uint64_t PrimaryChromaticitySize(libwebm::MkvId x_id, + libwebm::MkvId y_id) const; + bool Valid() const; bool Write(IMkvWriter* writer, libwebm::MkvId x_id, libwebm::MkvId y_id) const; - float x; - float y; + float x() const { return x_; } + void set_x(float new_x) { x_ = new_x; } + float y() const { return y_; } + void set_y(float new_y) { y_ = new_y; } + + private: + float x_; + float y_; }; class MasteringMetadata { public: static const float kValueNotPresent; + static const float kMinLuminance; + static const float kMinLuminanceMax; + static const float kMaxLuminanceMax; MasteringMetadata() - : luminance_max(kValueNotPresent), - luminance_min(kValueNotPresent), + : luminance_max_(kValueNotPresent), + luminance_min_(kValueNotPresent), r_(NULL), g_(NULL), b_(NULL), @@ -381,6 +403,7 @@ class MasteringMetadata { // Returns total size of the MasteringMetadata element. uint64_t MasteringMetadataSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Copies non-null chromaticity. 
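This hunk turns PrimaryChromaticity from a plain struct into a class with kChromaticityMin/kChromaticityMax bounds and a Valid() hook, but the definitions live in mkvmuxer.cc and are not part of this diff. For orientation only, a range check consistent with these declarations (assuming the bounds are the CIE 1931 unit range [0, 1]) could look like:

// Editorial sketch; the shipped PrimaryChromaticity::Valid() may differ.
bool ChromaticityCoordinateValid(float v) {
  // Assumes kChromaticityMin == 0.0f and kChromaticityMax == 1.0f.
  return v >= 0.0f && v <= 1.0f;
}

MasteringMetadata::Valid() presumably applies the analogous luminance bounds (kMinLuminance, kMinLuminanceMax, kMaxLuminanceMax) declared above.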
@@ -393,13 +416,21 @@ class MasteringMetadata { const PrimaryChromaticity* b() const { return b_; } const PrimaryChromaticity* white_point() const { return white_point_; } - float luminance_max; - float luminance_min; + float luminance_max() const { return luminance_max_; } + void set_luminance_max(float luminance_max) { + luminance_max_ = luminance_max; + } + float luminance_min() const { return luminance_min_; } + void set_luminance_min(float luminance_min) { + luminance_min_ = luminance_min; + } private: // Returns size of MasteringMetadata child elements. uint64_t PayloadSize() const; + float luminance_max_; + float luminance_min_; PrimaryChromaticity* r_; PrimaryChromaticity* g_; PrimaryChromaticity* b_; @@ -408,26 +439,90 @@ class MasteringMetadata { class Colour { public: + enum MatrixCoefficients { + kGbr = 0, + kBt709 = 1, + kUnspecifiedMc = 2, + kReserved = 3, + kFcc = 4, + kBt470bg = 5, + kSmpte170MMc = 6, + kSmpte240MMc = 7, + kYcocg = 8, + kBt2020NonConstantLuminance = 9, + kBt2020ConstantLuminance = 10, + }; + enum ChromaSitingHorz { + kUnspecifiedCsh = 0, + kLeftCollocated = 1, + kHalfCsh = 2, + }; + enum ChromaSitingVert { + kUnspecifiedCsv = 0, + kTopCollocated = 1, + kHalfCsv = 2, + }; + enum Range { + kUnspecifiedCr = 0, + kBroadcastRange = 1, + kFullRange = 2, + kMcTcDefined = 3, // Defined by MatrixCoefficients/TransferCharacteristics. + }; + enum TransferCharacteristics { + kIturBt709Tc = 1, + kUnspecifiedTc = 2, + kReservedTc = 3, + kGamma22Curve = 4, + kGamma28Curve = 5, + kSmpte170MTc = 6, + kSmpte240MTc = 7, + kLinear = 8, + kLog = 9, + kLogSqrt = 10, + kIec6196624 = 11, + kIturBt1361ExtendedColourGamut = 12, + kIec6196621 = 13, + kIturBt202010bit = 14, + kIturBt202012bit = 15, + kSmpteSt2084 = 16, + kSmpteSt4281Tc = 17, + kAribStdB67Hlg = 18, + }; + enum Primaries { + kReservedP0 = 0, + kIturBt709P = 1, + kUnspecifiedP = 2, + kReservedP3 = 3, + kIturBt470M = 4, + kIturBt470Bg = 5, + kSmpte170MP = 6, + kSmpte240MP = 7, + kFilm = 8, + kIturBt2020 = 9, + kSmpteSt4281P = 10, + kJedecP22Phosphors = 22, + }; static const uint64_t kValueNotPresent; Colour() - : matrix_coefficients(kValueNotPresent), - bits_per_channel(kValueNotPresent), - chroma_subsampling_horz(kValueNotPresent), - chroma_subsampling_vert(kValueNotPresent), - cb_subsampling_horz(kValueNotPresent), - cb_subsampling_vert(kValueNotPresent), - chroma_siting_horz(kValueNotPresent), - chroma_siting_vert(kValueNotPresent), - range(kValueNotPresent), - transfer_characteristics(kValueNotPresent), - primaries(kValueNotPresent), - max_cll(kValueNotPresent), - max_fall(kValueNotPresent), + : matrix_coefficients_(kValueNotPresent), + bits_per_channel_(kValueNotPresent), + chroma_subsampling_horz_(kValueNotPresent), + chroma_subsampling_vert_(kValueNotPresent), + cb_subsampling_horz_(kValueNotPresent), + cb_subsampling_vert_(kValueNotPresent), + chroma_siting_horz_(kValueNotPresent), + chroma_siting_vert_(kValueNotPresent), + range_(kValueNotPresent), + transfer_characteristics_(kValueNotPresent), + primaries_(kValueNotPresent), + max_cll_(kValueNotPresent), + max_fall_(kValueNotPresent), mastering_metadata_(NULL) {} ~Colour() { delete mastering_metadata_; } // Returns total size of the Colour element. uint64_t ColourSize() const; + bool Valid() const; bool Write(IMkvWriter* writer) const; // Deep copies |mastering_metadata|. 
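The enums added above mirror the Matroska colour code points, and the same patch adds matching per-value validators (IsMatrixCoefficientsValueValid() and friends) to mkvmuxerutil.{h,cc} further down. How Colour::Valid() composes them is not visible in these hunks; a plausible field-by-field check, treating kValueNotPresent as "unset", is sketched below using only names declared in this patch:

#include "mkvmuxer/mkvmuxer.h"      // include paths per the libwebm layout
#include "mkvmuxer/mkvmuxerutil.h"

// Editorial sketch, not the shipped Colour::Valid().
bool ColourFieldsLookValid(const mkvmuxer::Colour& c) {
  using mkvmuxer::Colour;
  if (c.matrix_coefficients() != Colour::kValueNotPresent &&
      !mkvmuxer::IsMatrixCoefficientsValueValid(c.matrix_coefficients()))
    return false;
  if (c.range() != Colour::kValueNotPresent &&
      !mkvmuxer::IsColourRangeValueValid(c.range()))
    return false;
  if (c.transfer_characteristics() != Colour::kValueNotPresent &&
      !mkvmuxer::IsTransferCharacteristicsValueValid(
          c.transfer_characteristics()))
    return false;
  return c.primaries() == Colour::kValueNotPresent ||
         mkvmuxer::IsPrimariesValueValid(c.primaries());
}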
@@ -437,28 +532,125 @@ class Colour { return mastering_metadata_; } - uint64_t matrix_coefficients; - uint64_t bits_per_channel; - uint64_t chroma_subsampling_horz; - uint64_t chroma_subsampling_vert; - uint64_t cb_subsampling_horz; - uint64_t cb_subsampling_vert; - uint64_t chroma_siting_horz; - uint64_t chroma_siting_vert; - uint64_t range; - uint64_t transfer_characteristics; - uint64_t primaries; - uint64_t max_cll; - uint64_t max_fall; + uint64_t matrix_coefficients() const { return matrix_coefficients_; } + void set_matrix_coefficients(uint64_t matrix_coefficients) { + matrix_coefficients_ = matrix_coefficients; + } + uint64_t bits_per_channel() const { return bits_per_channel_; } + void set_bits_per_channel(uint64_t bits_per_channel) { + bits_per_channel_ = bits_per_channel; + } + uint64_t chroma_subsampling_horz() const { return chroma_subsampling_horz_; } + void set_chroma_subsampling_horz(uint64_t chroma_subsampling_horz) { + chroma_subsampling_horz_ = chroma_subsampling_horz; + } + uint64_t chroma_subsampling_vert() const { return chroma_subsampling_vert_; } + void set_chroma_subsampling_vert(uint64_t chroma_subsampling_vert) { + chroma_subsampling_vert_ = chroma_subsampling_vert; + } + uint64_t cb_subsampling_horz() const { return cb_subsampling_horz_; } + void set_cb_subsampling_horz(uint64_t cb_subsampling_horz) { + cb_subsampling_horz_ = cb_subsampling_horz; + } + uint64_t cb_subsampling_vert() const { return cb_subsampling_vert_; } + void set_cb_subsampling_vert(uint64_t cb_subsampling_vert) { + cb_subsampling_vert_ = cb_subsampling_vert; + } + uint64_t chroma_siting_horz() const { return chroma_siting_horz_; } + void set_chroma_siting_horz(uint64_t chroma_siting_horz) { + chroma_siting_horz_ = chroma_siting_horz; + } + uint64_t chroma_siting_vert() const { return chroma_siting_vert_; } + void set_chroma_siting_vert(uint64_t chroma_siting_vert) { + chroma_siting_vert_ = chroma_siting_vert; + } + uint64_t range() const { return range_; } + void set_range(uint64_t range) { range_ = range; } + uint64_t transfer_characteristics() const { + return transfer_characteristics_; + } + void set_transfer_characteristics(uint64_t transfer_characteristics) { + transfer_characteristics_ = transfer_characteristics; + } + uint64_t primaries() const { return primaries_; } + void set_primaries(uint64_t primaries) { primaries_ = primaries; } + uint64_t max_cll() const { return max_cll_; } + void set_max_cll(uint64_t max_cll) { max_cll_ = max_cll; } + uint64_t max_fall() const { return max_fall_; } + void set_max_fall(uint64_t max_fall) { max_fall_ = max_fall; } private: // Returns size of Colour child elements. uint64_t PayloadSize() const; + uint64_t matrix_coefficients_; + uint64_t bits_per_channel_; + uint64_t chroma_subsampling_horz_; + uint64_t chroma_subsampling_vert_; + uint64_t cb_subsampling_horz_; + uint64_t cb_subsampling_vert_; + uint64_t chroma_siting_horz_; + uint64_t chroma_siting_vert_; + uint64_t range_; + uint64_t transfer_characteristics_; + uint64_t primaries_; + uint64_t max_cll_; + uint64_t max_fall_; + MasteringMetadata* mastering_metadata_; }; /////////////////////////////////////////////////////////////// +// Projection element. 
+class Projection { + public: + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const uint64_t kValueNotPresent; + Projection() + : type_(kRectangular), + pose_yaw_(0.0), + pose_pitch_(0.0), + pose_roll_(0.0), + private_data_(NULL), + private_data_length_(0) {} + ~Projection() { delete[] private_data_; } + + uint64_t ProjectionSize() const; + bool Write(IMkvWriter* writer) const; + + bool SetProjectionPrivate(const uint8_t* private_data, + uint64_t private_data_length); + + ProjectionType type() const { return type_; } + void set_type(ProjectionType type) { type_ = type; } + float pose_yaw() const { return pose_yaw_; } + void set_pose_yaw(float pose_yaw) { pose_yaw_ = pose_yaw; } + float pose_pitch() const { return pose_pitch_; } + void set_pose_pitch(float pose_pitch) { pose_pitch_ = pose_pitch; } + float pose_roll() const { return pose_roll_; } + void set_pose_roll(float pose_roll) { pose_roll_ = pose_roll; } + uint8_t* private_data() const { return private_data_; } + uint64_t private_data_length() const { return private_data_length_; } + + private: + // Returns size of VideoProjection child elements. + uint64_t PayloadSize() const; + + ProjectionType type_; + float pose_yaw_; + float pose_pitch_; + float pose_roll_; + uint8_t* private_data_; + uint64_t private_data_length_; +}; + +/////////////////////////////////////////////////////////////// // Track element. class Track { public: @@ -581,6 +773,10 @@ class VideoTrack : public Track { uint64_t display_height() const { return display_height_; } void set_display_width(uint64_t width) { display_width_ = width; } uint64_t display_width() const { return display_width_; } + void set_pixel_height(uint64_t height) { pixel_height_ = height; } + uint64_t pixel_height() const { return pixel_height_; } + void set_pixel_width(uint64_t width) { pixel_width_ = width; } + uint64_t pixel_width() const { return pixel_width_; } void set_crop_left(uint64_t crop_left) { crop_left_ = crop_left; } uint64_t crop_left() const { return crop_left_; } @@ -605,6 +801,11 @@ class VideoTrack : public Track { // Deep copies |colour|. bool SetColour(const Colour& colour); + Projection* projection() { return projection_; } + + // Deep copies |projection|. + bool SetProjection(const Projection& projection); + private: // Returns the size in bytes of the Video element. uint64_t VideoPayloadSize() const; @@ -612,6 +813,8 @@ class VideoTrack : public Track { // Video track element names. 
uint64_t display_height_; uint64_t display_width_; + uint64_t pixel_height_; + uint64_t pixel_width_; uint64_t crop_left_; uint64_t crop_right_; uint64_t crop_top_; @@ -623,6 +826,7 @@ class VideoTrack : public Track { uint64_t width_; Colour* colour_; + Projection* projection_; LIBWEBM_DISALLOW_COPY_AND_ASSIGN(VideoTrack); }; @@ -670,6 +874,10 @@ class Tracks { static const char kVp8CodecId[]; static const char kVp9CodecId[]; static const char kVp10CodecId[]; + static const char kWebVttCaptionsId[]; + static const char kWebVttDescriptionsId[]; + static const char kWebVttMetadataId[]; + static const char kWebVttSubtitlesId[]; Tracks(); ~Tracks(); @@ -1294,8 +1502,8 @@ class Segment { kBeforeClusters = 0x1 // Position Cues before Clusters }; - const static uint32_t kDefaultDocTypeVersion = 2; - const static uint64_t kDefaultMaxClusterDuration = 30000000000ULL; + static const uint32_t kDefaultDocTypeVersion = 4; + static const uint64_t kDefaultMaxClusterDuration = 30000000000ULL; Segment(); ~Segment(); @@ -1481,7 +1689,16 @@ class Segment { Mode mode() const { return mode_; } CuesPosition cues_position() const { return cues_position_; } bool output_cues() const { return output_cues_; } + void set_estimate_file_duration(bool estimate_duration) { + estimate_file_duration_ = estimate_duration; + } + bool estimate_file_duration() const { return estimate_file_duration_; } const SegmentInfo* segment_info() const { return &segment_info_; } + void set_duration(double duration) { duration_ = duration; } + double duration() const { return duration_; } + + // Returns true when codec IDs are valid for WebM. + bool DocTypeIsWebm() const; private: // Checks if header information has been output and initialized. If not it @@ -1637,6 +1854,9 @@ class Segment { // Last timestamp in nanoseconds by track number added to a cluster. uint64_t last_track_timestamp_[kMaxTrackNumber]; + // Number of frames written per track. + uint64_t track_frames_written_[kMaxTrackNumber]; + // Maximum time in nanoseconds for a cluster duration. This variable is a // guideline and some clusters may have a longer duration. Default is 30 // seconds. @@ -1665,6 +1885,9 @@ class Segment { // Flag whether or not to write the Cluster Timecode using exactly 8 bytes. bool fixed_size_cluster_timecode_; + // Flag whether or not to estimate the file duration. + bool estimate_file_duration_; + // The size of the EBML header, used to validate the header if // WriteEbmlHeader() is called more than once. int32_t ebml_header_size_; @@ -1682,6 +1905,9 @@ class Segment { uint32_t doc_type_version_; uint32_t doc_type_version_written_; + // If |duration_| is > 0, then explicitly set the duration of the segment. + double duration_; + // Pointer to the writer objects. Not owned by this class. IMkvWriter* writer_cluster_; IMkvWriter* writer_cues_; diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc index 3562b8ab828..1ba17ac1ba0 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc @@ -31,20 +31,20 @@ namespace { // Date elements are always 8 octets in size. 
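// Editorial sketch, not part of the patch: a Matroska/EBML Date element
// stores a signed 64-bit count of nanoseconds relative to
// 2001-01-01T00:00:00 UTC, which is why its payload is always exactly
// 8 octets. Converting a Unix timestamp would look roughly like:
//   const int64_t kMillenniumUnixSeconds = 978307200;  // 2001-01-01 UTC
//   const int64_t mkv_date_ns =
//       (unix_seconds - kMillenniumUnixSeconds) * INT64_C(1000000000);
// WriteEbmlDateElement() below relies on the fixed size recorded here.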
const int kDateElementSize = 8; -uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode, uint64_t timecode_scale) { - uint64_t block_additional_elem_size = 0; - uint64_t block_addid_elem_size = 0; - uint64_t block_more_payload_size = 0; - uint64_t block_more_elem_size = 0; - uint64_t block_additions_payload_size = 0; - uint64_t block_additions_elem_size = 0; +uint64 WriteBlock(IMkvWriter* writer, const Frame* const frame, int64 timecode, + uint64 timecode_scale) { + uint64 block_additional_elem_size = 0; + uint64 block_addid_elem_size = 0; + uint64 block_more_payload_size = 0; + uint64 block_more_elem_size = 0; + uint64 block_additions_payload_size = 0; + uint64 block_additions_elem_size = 0; if (frame->additional()) { block_additional_elem_size = EbmlElementSize(libwebm::kMkvBlockAdditional, frame->additional(), frame->additional_length()); - block_addid_elem_size = - EbmlElementSize(libwebm::kMkvBlockAddID, frame->add_id()); + block_addid_elem_size = EbmlElementSize( + libwebm::kMkvBlockAddID, static_cast<uint64>(frame->add_id())); block_more_payload_size = block_addid_elem_size + block_additional_elem_size; @@ -58,32 +58,33 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_additions_payload_size; } - uint64_t discard_padding_elem_size = 0; + uint64 discard_padding_elem_size = 0; if (frame->discard_padding() != 0) { discard_padding_elem_size = - EbmlElementSize(libwebm::kMkvDiscardPadding, frame->discard_padding()); + EbmlElementSize(libwebm::kMkvDiscardPadding, + static_cast<int64>(frame->discard_padding())); } - const uint64_t reference_block_timestamp = + const uint64 reference_block_timestamp = frame->reference_block_timestamp() / timecode_scale; - uint64_t reference_block_elem_size = 0; + uint64 reference_block_elem_size = 0; if (!frame->is_key()) { reference_block_elem_size = EbmlElementSize(libwebm::kMkvReferenceBlock, reference_block_timestamp); } - const uint64_t duration = frame->duration() / timecode_scale; - uint64_t block_duration_elem_size = 0; + const uint64 duration = frame->duration() / timecode_scale; + uint64 block_duration_elem_size = 0; if (duration > 0) block_duration_elem_size = EbmlElementSize(libwebm::kMkvBlockDuration, duration); - const uint64_t block_payload_size = 4 + frame->length(); - const uint64_t block_elem_size = + const uint64 block_payload_size = 4 + frame->length(); + const uint64 block_elem_size = EbmlMasterElementSize(libwebm::kMkvBlock, block_payload_size) + block_payload_size; - const uint64_t block_group_payload_size = + const uint64 block_group_payload_size = block_elem_size + block_additions_elem_size + block_duration_elem_size + discard_padding_elem_size + reference_block_elem_size; @@ -105,7 +106,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (SerializeInt(writer, 0, 1)) return 0; - if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length()))) + if (writer->Write(frame->frame(), static_cast<uint32>(frame->length()))) return 0; if (frame->additional()) { @@ -118,7 +119,8 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_more_payload_size)) return 0; - if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, frame->add_id())) + if (!WriteEbmlElement(writer, libwebm::kMkvBlockAddID, + static_cast<uint64>(frame->add_id()))) return 0; if (!WriteEbmlElement(writer, libwebm::kMkvBlockAdditional, @@ -129,7 +131,7 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, if (frame->discard_padding() != 0 && 
!WriteEbmlElement(writer, libwebm::kMkvDiscardPadding, - frame->discard_padding())) { + static_cast<int64>(frame->discard_padding()))) { return false; } @@ -148,38 +150,38 @@ uint64_t WriteBlock(IMkvWriter* writer, const Frame* const frame, block_group_payload_size; } -uint64_t WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, - int64_t timecode) { +uint64 WriteSimpleBlock(IMkvWriter* writer, const Frame* const frame, + int64 timecode) { if (WriteID(writer, libwebm::kMkvSimpleBlock)) return 0; - const int32_t size = static_cast<int32_t>(frame->length()) + 4; + const int32 size = static_cast<int32>(frame->length()) + 4; if (WriteUInt(writer, size)) return 0; - if (WriteUInt(writer, static_cast<uint64_t>(frame->track_number()))) + if (WriteUInt(writer, static_cast<uint64>(frame->track_number()))) return 0; if (SerializeInt(writer, timecode, 2)) return 0; - uint64_t flags = 0; + uint64 flags = 0; if (frame->is_key()) flags |= 0x80; if (SerializeInt(writer, flags, 1)) return 0; - if (writer->Write(frame->frame(), static_cast<uint32_t>(frame->length()))) + if (writer->Write(frame->frame(), static_cast<uint32>(frame->length()))) return 0; - return static_cast<uint64_t>(GetUIntSize(libwebm::kMkvSimpleBlock) + - GetCodedUIntSize(size) + 4 + frame->length()); + return GetUIntSize(libwebm::kMkvSimpleBlock) + GetCodedUIntSize(size) + 4 + + frame->length(); } } // namespace -int32_t GetCodedUIntSize(uint64_t value) { +int32 GetCodedUIntSize(uint64 value) { if (value < 0x000000000000007FULL) return 1; else if (value < 0x0000000000003FFFULL) @@ -197,7 +199,7 @@ int32_t GetCodedUIntSize(uint64_t value) { return 8; } -int32_t GetUIntSize(uint64_t value) { +int32 GetUIntSize(uint64 value) { if (value < 0x0000000000000100ULL) return 1; else if (value < 0x0000000000010000ULL) @@ -215,26 +217,26 @@ int32_t GetUIntSize(uint64_t value) { return 8; } -int32_t GetIntSize(int64_t value) { +int32 GetIntSize(int64 value) { // Doubling the requested value ensures positive values with their high bit // set are written with 0-padding to avoid flipping the signedness. - const uint64_t v = (value < 0) ? value ^ -1LL : value; + const uint64 v = (value < 0) ? value ^ -1LL : value; return GetUIntSize(2 * v); } -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value) { +uint64 EbmlMasterElementSize(uint64 type, uint64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetCodedUIntSize(value); - return static_cast<uint64_t>(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, int64_t value) { +uint64 EbmlElementSize(uint64 type, int64 value) { // Size of EBML ID - int32_t ebml_size = GetUIntSize(type); + int32 ebml_size = GetUIntSize(type); // Datasize ebml_size += GetIntSize(value); @@ -242,20 +244,19 @@ uint64_t EbmlElementSize(uint64_t type, int64_t value) { // Size of Datasize ebml_size++; - return static_cast<uint64_t>(ebml_size); + return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, uint64_t value) { +uint64 EbmlElementSize(uint64 type, uint64 value) { return EbmlElementSize(type, value, 0); } -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size) { // Size of EBML ID - uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize - ebml_size += - (fixed_size > 0) ? 
fixed_size : static_cast<uint64_t>(GetUIntSize(value)); + ebml_size += (fixed_size > 0) ? fixed_size : GetUIntSize(value); // Size of Datasize ebml_size++; @@ -263,9 +264,9 @@ uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, float /* value */) { +uint64 EbmlElementSize(uint64 type, float /* value */) { // Size of EBML ID - uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += sizeof(float); @@ -276,12 +277,12 @@ uint64_t EbmlElementSize(uint64_t type, float /* value */) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const char* value) { +uint64 EbmlElementSize(uint64 type, const char* value) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += strlen(value); @@ -292,12 +293,12 @@ uint64_t EbmlElementSize(uint64_t type, const char* value) { return ebml_size; } -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size) { if (!value) return 0; // Size of EBML ID - uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += size; @@ -308,9 +309,9 @@ uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size) { return ebml_size; } -uint64_t EbmlDateElementSize(uint64_t type) { +uint64 EbmlDateElementSize(uint64 type) { // Size of EBML ID - uint64_t ebml_size = static_cast<uint64_t>(GetUIntSize(type)); + uint64 ebml_size = GetUIntSize(type); // Datasize ebml_size += kDateElementSize; @@ -321,18 +322,18 @@ uint64_t EbmlDateElementSize(uint64_t type) { return ebml_size; } -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size) { if (!writer || size < 1 || size > 8) return -1; - for (int32_t i = 1; i <= size; ++i) { - const int32_t byte_count = size - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= size; ++i) { + const int32 byte_count = size - i; + const int32 bit_count = byte_count * 8; - const int64_t bb = value >> bit_count; - const uint8_t b = static_cast<uint8_t>(bb); + const int64 bb = value >> bit_count; + const uint8 b = static_cast<uint8>(bb); - const int32_t status = writer->Write(&b, 1); + const int32 status = writer->Write(&b, 1); if (status < 0) return status; @@ -341,26 +342,26 @@ int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size) { return 0; } -int32_t SerializeFloat(IMkvWriter* writer, float f) { +int32 SerializeFloat(IMkvWriter* writer, float f) { if (!writer) return -1; - assert(sizeof(uint32_t) == sizeof(float)); + assert(sizeof(uint32) == sizeof(float)); // This union is merely used to avoid a reinterpret_cast from float& to // uint32& which will result in violation of strict aliasing. 
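// Editorial aside: reading the float through a casted uint32 pointer would
// break the strict-aliasing rules, as the comment above notes. The other
// common idiom is an explicit byte copy (sketch, assuming <string.h> is
// available in this translation unit):
//   uint32 bits;
//   memcpy(&bits, &f, sizeof(bits));
// The union below gives the same bit-level access while keeping the
// byte-by-byte big-endian write loop unchanged.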
union U32 { - uint32_t u32; + uint32 u32; float f; } value; value.f = f; - for (int32_t i = 1; i <= 4; ++i) { - const int32_t byte_count = 4 - i; - const int32_t bit_count = byte_count * 8; + for (int32 i = 1; i <= 4; ++i) { + const int32 byte_count = 4 - i; + const int32 bit_count = byte_count * 8; - const uint8_t byte = static_cast<uint8_t>(value.u32 >> bit_count); + const uint8 byte = static_cast<uint8>(value.u32 >> bit_count); - const int32_t status = writer->Write(&byte, 1); + const int32 status = writer->Write(&byte, 1); if (status < 0) return status; @@ -369,21 +370,21 @@ int32_t SerializeFloat(IMkvWriter* writer, float f) { return 0; } -int32_t WriteUInt(IMkvWriter* writer, uint64_t value) { +int32 WriteUInt(IMkvWriter* writer, uint64 value) { if (!writer) return -1; - int32_t size = GetCodedUIntSize(value); + int32 size = GetCodedUIntSize(value); return WriteUIntSize(writer, value, size); } -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size) { if (!writer || size < 0 || size > 8) return -1; if (size > 0) { - const uint64_t bit = 1LL << (size * 7); + const uint64 bit = 1LL << (size * 7); if (value > (bit - 2)) return -1; @@ -391,11 +392,11 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { value |= bit; } else { size = 1; - int64_t bit; + int64 bit; for (;;) { bit = 1LL << (size * 7); - const uint64_t max = bit - 2; + const uint64 max = bit - 2; if (value <= max) break; @@ -412,18 +413,18 @@ int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size) { return SerializeInt(writer, value, size); } -int32_t WriteID(IMkvWriter* writer, uint64_t type) { +int32 WriteID(IMkvWriter* writer, uint64 type) { if (!writer) return -1; writer->ElementStartNotify(type, writer->Position()); - const int32_t size = GetUIntSize(type); + const int32 size = GetUIntSize(type); return SerializeInt(writer, type, size); } -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 type, uint64 size) { if (!writer) return false; @@ -436,19 +437,19 @@ bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t type, uint64_t size) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value) { return WriteEbmlElement(writer, type, value, 0); } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size) { if (!writer) return false; if (WriteID(writer, type)) return false; - uint64_t size = static_cast<uint64_t>(GetUIntSize(value)); + uint64 size = GetUIntSize(value); if (fixed_size > 0) { if (size > fixed_size) return false; @@ -457,30 +458,30 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, static_cast<int32_t>(size))) + if (SerializeInt(writer, value, static_cast<int32>(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; if (WriteID(writer, type)) return 0; - const uint64_t size = GetIntSize(value); + const uint64 size = GetIntSize(value); if (WriteUInt(writer, size)) return false; - if (SerializeInt(writer, value, 
static_cast<int32_t>(size))) + if (SerializeInt(writer, value, static_cast<int32>(size))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value) { if (!writer) return false; @@ -496,25 +497,25 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value) { return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value) { if (!writer || !value) return false; if (WriteID(writer, type)) return false; - const uint64_t length = strlen(value); + const uint64 length = strlen(value); if (WriteUInt(writer, length)) return false; - if (writer->Write(value, static_cast<const uint32_t>(length))) + if (writer->Write(value, static_cast<const uint32>(length))) return false; return true; } -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size) { +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size) { if (!writer || !value || size < 1) return false; @@ -524,13 +525,13 @@ bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, if (WriteUInt(writer, size)) return false; - if (writer->Write(value, static_cast<uint32_t>(size))) + if (writer->Write(value, static_cast<uint32>(size))) return false; return true; } -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value) { if (!writer) return false; @@ -546,8 +547,8 @@ bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value) { return true; } -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster) { +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster) { if (!writer || !frame || !frame->IsValid() || !cluster || !cluster->timecode_scale()) return 0; @@ -556,7 +557,7 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, // timecode for the cluster itself (remember that block timecode // is a signed, 16-bit integer). However, as a simplification we // only permit non-negative cluster-relative timecodes for blocks. - const int64_t relative_timecode = cluster->GetRelativeTimecode( + const int64 relative_timecode = cluster->GetRelativeTimecode( frame->timestamp() / cluster->timecode_scale()); if (relative_timecode < 0 || relative_timecode > kMaxBlockTimecode) return 0; @@ -567,20 +568,19 @@ uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, cluster->timecode_scale()); } -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size) { if (!writer) return false; // Subtract one for the void ID and the coded size. 
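// Editorial worked example: for size = 100, the one-byte Void ID (0xEC) and
// a one-byte coded length (GetCodedUIntSize(99) == 1) leave a 98-byte
// payload, so the element totals 1 + 1 + 98 = 100 bytes as requested. The
// void_size == size comparison below rejects the boundary cases where the
// size - 1 estimate and the real coded length cannot be reconciled.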
- uint64_t void_entry_size = size - 1 - GetCodedUIntSize(size - 1); - uint64_t void_size = - EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + - void_entry_size; + uint64 void_entry_size = size - 1 - GetCodedUIntSize(size - 1); + uint64 void_size = EbmlMasterElementSize(libwebm::kMkvVoid, void_entry_size) + + void_entry_size; if (void_size != size) return 0; - const int64_t payload_position = writer->Position(); + const int64 payload_position = writer->Position(); if (payload_position < 0) return 0; @@ -590,30 +590,29 @@ uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size) { if (WriteUInt(writer, void_entry_size)) return 0; - const uint8_t value = 0; - for (int32_t i = 0; i < static_cast<int32_t>(void_entry_size); ++i) { + const uint8 value = 0; + for (int32 i = 0; i < static_cast<int32>(void_entry_size); ++i) { if (writer->Write(&value, 1)) return 0; } - const int64_t stop_position = writer->Position(); + const int64 stop_position = writer->Position(); if (stop_position < 0 || - stop_position - payload_position != static_cast<int64_t>(void_size)) + stop_position - payload_position != static_cast<int64>(void_size)) return 0; return void_size; } -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision) { +void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 2; *build = 1; *revision = 0; } -uint64_t MakeUID(unsigned int* seed) { - uint64_t uid = 0; +uint64 MakeUID(unsigned int* seed) { + uint64 uid = 0; #ifdef __MINGW32__ srand(*seed); @@ -625,21 +624,22 @@ uint64_t MakeUID(unsigned int* seed) { // TODO(fgalligan): Move random number generation to platform specific code. #ifdef _MSC_VER (void)seed; - const int32_t nn = rand(); + const int32 nn = rand(); #elif __ANDROID__ - int32_t temp_num = 1; + (void)seed; + int32 temp_num = 1; int fd = open("/dev/urandom", O_RDONLY); if (fd != -1) { read(fd, &temp_num, sizeof(temp_num)); close(fd); } - const int32_t nn = temp_num; + const int32 nn = temp_num; #elif defined __MINGW32__ - const int32_t nn = rand(); + const int32 nn = rand(); #else - const int32_t nn = rand_r(seed); + const int32 nn = rand_r(seed); #endif - const int32_t n = 0xFF & (nn >> 4); // throw away low-order bits + const int32 n = 0xFF & (nn >> 4); // throw away low-order bits uid |= n; } @@ -647,4 +647,97 @@ uint64_t MakeUID(unsigned int* seed) { return uid; } +bool IsMatrixCoefficientsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kGbr: + case mkvmuxer::Colour::kBt709: + case mkvmuxer::Colour::kUnspecifiedMc: + case mkvmuxer::Colour::kReserved: + case mkvmuxer::Colour::kFcc: + case mkvmuxer::Colour::kBt470bg: + case mkvmuxer::Colour::kSmpte170MMc: + case mkvmuxer::Colour::kSmpte240MMc: + case mkvmuxer::Colour::kYcocg: + case mkvmuxer::Colour::kBt2020NonConstantLuminance: + case mkvmuxer::Colour::kBt2020ConstantLuminance: + return true; + } + return false; +} + +bool IsChromaSitingHorzValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsh: + case mkvmuxer::Colour::kLeftCollocated: + case mkvmuxer::Colour::kHalfCsh: + return true; + } + return false; +} + +bool IsChromaSitingVertValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCsv: + case mkvmuxer::Colour::kTopCollocated: + case mkvmuxer::Colour::kHalfCsv: + return true; + } + return false; +} + +bool IsColourRangeValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kUnspecifiedCr: + case mkvmuxer::Colour::kBroadcastRange: + case 
mkvmuxer::Colour::kFullRange: + case mkvmuxer::Colour::kMcTcDefined: + return true; + } + return false; +} + +bool IsTransferCharacteristicsValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kIturBt709Tc: + case mkvmuxer::Colour::kUnspecifiedTc: + case mkvmuxer::Colour::kReservedTc: + case mkvmuxer::Colour::kGamma22Curve: + case mkvmuxer::Colour::kGamma28Curve: + case mkvmuxer::Colour::kSmpte170MTc: + case mkvmuxer::Colour::kSmpte240MTc: + case mkvmuxer::Colour::kLinear: + case mkvmuxer::Colour::kLog: + case mkvmuxer::Colour::kLogSqrt: + case mkvmuxer::Colour::kIec6196624: + case mkvmuxer::Colour::kIturBt1361ExtendedColourGamut: + case mkvmuxer::Colour::kIec6196621: + case mkvmuxer::Colour::kIturBt202010bit: + case mkvmuxer::Colour::kIturBt202012bit: + case mkvmuxer::Colour::kSmpteSt2084: + case mkvmuxer::Colour::kSmpteSt4281Tc: + case mkvmuxer::Colour::kAribStdB67Hlg: + return true; + } + return false; +} + +bool IsPrimariesValueValid(uint64_t value) { + switch (value) { + case mkvmuxer::Colour::kReservedP0: + case mkvmuxer::Colour::kIturBt709P: + case mkvmuxer::Colour::kUnspecifiedP: + case mkvmuxer::Colour::kReservedP3: + case mkvmuxer::Colour::kIturBt470M: + case mkvmuxer::Colour::kIturBt470Bg: + case mkvmuxer::Colour::kSmpte170MP: + case mkvmuxer::Colour::kSmpte240MP: + case mkvmuxer::Colour::kFilm: + case mkvmuxer::Colour::kIturBt2020: + case mkvmuxer::Colour::kSmpteSt4281P: + case mkvmuxer::Colour::kJedecP22Phosphors: + return true; + } + return false; +} + } // namespace mkvmuxer diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h index 0e21a2dcbe5..132388da599 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.h @@ -8,87 +8,104 @@ #ifndef MKVMUXER_MKVMUXERUTIL_H_ #define MKVMUXER_MKVMUXERUTIL_H_ -#include <stdint.h> +#include "mkvmuxertypes.h" + +#include "stdint.h" namespace mkvmuxer { class Cluster; class Frame; class IMkvWriter; -const uint64_t kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; -const int64_t kMaxBlockTimecode = 0x07FFFLL; +// TODO(tomfinegan): mkvmuxer:: integer types continue to be used here because +// changing them causes pain for downstream projects. It would be nice to find +// a solution that allows removal of the mkvmuxer:: integer types while +// avoiding pain for downstream users of libwebm. Considering that +// mkvmuxerutil.{cc,h} are really, for the great majority of cases, EBML size +// calculation and writer functions, perhaps a more EBML-focused utility would +// be the way to go as a first step. + +const uint64 kEbmlUnknownValue = 0x01FFFFFFFFFFFFFFULL; +const int64 kMaxBlockTimecode = 0x07FFFLL; // Writes out |value| in Big Endian order. Returns 0 on success. -int32_t SerializeInt(IMkvWriter* writer, int64_t value, int32_t size); +int32 SerializeInt(IMkvWriter* writer, int64 value, int32 size); // Returns the size in bytes of the element.
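// Editorial note: an EBML "coded" unsigned integer embeds its own length in
// the leading descriptor bits, leaving 7 usable payload bits per byte. That
// is why GetCodedUIntSize() steps at the 7-bits-per-byte boundaries (0x7F,
// 0x3FFF, 0x1FFFFF, ...), while the plain GetUIntSize() steps at the full
// byte boundaries (0xFF, 0xFFFF, ...).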
-int32_t GetUIntSize(uint64_t value); -int32_t GetIntSize(int64_t value); -int32_t GetCodedUIntSize(uint64_t value); -uint64_t EbmlMasterElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, int64_t value); -uint64_t EbmlElementSize(uint64_t type, uint64_t value); -uint64_t EbmlElementSize(uint64_t type, float value); -uint64_t EbmlElementSize(uint64_t type, const char* value); -uint64_t EbmlElementSize(uint64_t type, const uint8_t* value, uint64_t size); -uint64_t EbmlDateElementSize(uint64_t type); +int32 GetUIntSize(uint64 value); +int32 GetIntSize(int64 value); +int32 GetCodedUIntSize(uint64 value); +uint64 EbmlMasterElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, int64 value); +uint64 EbmlElementSize(uint64 type, uint64 value); +uint64 EbmlElementSize(uint64 type, float value); +uint64 EbmlElementSize(uint64 type, const char* value); +uint64 EbmlElementSize(uint64 type, const uint8* value, uint64 size); +uint64 EbmlDateElementSize(uint64 type); // Returns the size in bytes of the element assuming that the element was // written using |fixed_size| bytes. If |fixed_size| is set to zero, then it // computes the necessary number of bytes based on |value|. -uint64_t EbmlElementSize(uint64_t type, uint64_t value, uint64_t fixed_size); +uint64 EbmlElementSize(uint64 type, uint64 value, uint64 fixed_size); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |value|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUInt(IMkvWriter* writer, uint64_t value); +int32 WriteUInt(IMkvWriter* writer, uint64 value); // Creates an EBML coded number from |value| and writes it out. The size of // the coded number is determined by the value of |size|. |value| must not // be in a coded form. Returns 0 on success. -int32_t WriteUIntSize(IMkvWriter* writer, uint64_t value, int32_t size); +int32 WriteUIntSize(IMkvWriter* writer, uint64 value, int32 size); // Output an Mkv master element. Returns true if the element was written. -bool WriteEbmlMasterElement(IMkvWriter* writer, uint64_t value, uint64_t size); +bool WriteEbmlMasterElement(IMkvWriter* writer, uint64 value, uint64 size); // Outputs an Mkv ID, calls |IMkvWriter::ElementStartNotify|, and passes the // ID to |SerializeInt|. Returns 0 on success. -int32_t WriteID(IMkvWriter* writer, uint64_t type); +int32 WriteID(IMkvWriter* writer, uint64 type); // Output an Mkv non-master element. Returns true if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, int64_t value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, float value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const char* value); -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, const uint8_t* value, - uint64_t size); -bool WriteEbmlDateElement(IMkvWriter* writer, uint64_t type, int64_t value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, int64 value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, float value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const char* value); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, const uint8* value, + uint64 size); +bool WriteEbmlDateElement(IMkvWriter* writer, uint64 type, int64 value); // Output an Mkv non-master element using fixed size. 
The element will be // written out using exactly |fixed_size| bytes. If |fixed_size| is set to zero // then it computes the necessary number of bytes based on |value|. Returns true // if the element was written. -bool WriteEbmlElement(IMkvWriter* writer, uint64_t type, uint64_t value, - uint64_t fixed_size); +bool WriteEbmlElement(IMkvWriter* writer, uint64 type, uint64 value, + uint64 fixed_size); // Output a Mkv Frame. It decides the correct element to write (Block vs // SimpleBlock) based on the parameters of the Frame. -uint64_t WriteFrame(IMkvWriter* writer, const Frame* const frame, - Cluster* cluster); +uint64 WriteFrame(IMkvWriter* writer, const Frame* const frame, + Cluster* cluster); // Output a void element. |size| must be the entire size in bytes that will be // void. The function will calculate the size of the void header and subtract // it from |size|. -uint64_t WriteVoidElement(IMkvWriter* writer, uint64_t size); +uint64 WriteVoidElement(IMkvWriter* writer, uint64 size); // Returns the version number of the muxer in |major|, |minor|, |build|, // and |revision|. -void GetVersion(int32_t* major, int32_t* minor, int32_t* build, - int32_t* revision); +void GetVersion(int32* major, int32* minor, int32* build, int32* revision); // Returns a random number to be used for UID, using |seed| to seed // the random-number generator (see POSIX rand_r() for semantics). -uint64_t MakeUID(unsigned int* seed); +uint64 MakeUID(unsigned int* seed); + +// Colour field validation helpers. All return true when |value| is valid. +bool IsMatrixCoefficientsValueValid(uint64_t value); +bool IsChromaSitingHorzValueValid(uint64_t value); +bool IsChromaSitingVertValueValid(uint64_t value); +bool IsColourRangeValueValid(uint64_t value); +bool IsTransferCharacteristicsValueValid(uint64_t value); +bool IsPrimariesValueValid(uint64_t value); } // namespace mkvmuxer diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc index ca48e149c6d..ec34e4df818 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvmuxer/mkvwriter.cc @@ -77,7 +77,7 @@ int32 MkvWriter::Position(int64 position) { #ifdef _MSC_VER return _fseeki64(file_, position, SEEK_SET); #else - return fseek(file_, position, SEEK_SET); + return fseeko(file_, static_cast<off_t>(position), SEEK_SET); #endif } diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc index 21801154d9f..e62d6f6075c 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.cc @@ -25,6 +25,7 @@ namespace mkvparser { const float MasteringMetadata::kValueNotPresent = FLT_MAX; const long long Colour::kValueNotPresent = LLONG_MAX; +const float Projection::kValueNotPresent = FLT_MAX; #ifdef MSC_COMPAT inline bool isnan(double val) { return !!_isnan(val); } @@ -1475,6 +1476,8 @@ long Segment::Load() { } } +SeekHead::Entry::Entry() : id(0), pos(0), element_start(0), element_size(0) {} + SeekHead::SeekHead(Segment* pSegment, long long start, long long size_, long long element_start, long long element_size) : m_pSegment(pSegment), @@ -1766,18 +1769,7 @@ bool SeekHead::ParseEntry(IMkvReader* 
pReader, long long start, long long size_, if ((pos + seekIdSize) > stop) return false; - // Note that the SeekId payload really is serialized - // as a "Matroska integer", not as a plain binary value. - // In fact, Matroska requires that ID values in the - // stream exactly match the binary representation as listed - // in the Matroska specification. - // - // This parser is more liberal, and permits IDs to have - // any width. (This could make the representation in the stream - // different from what's in the spec, but it doesn't matter here, - // since we always normalize "Matroska integer" values.) - - pEntry->id = ReadUInt(pReader, pos, len); // payload + pEntry->id = ReadID(pReader, pos, len); // payload if (pEntry->id <= 0) return false; @@ -4125,7 +4117,7 @@ ContentEncoding::~ContentEncoding() { } const ContentEncoding::ContentCompression* - ContentEncoding::GetCompressionByIndex(unsigned long idx) const { +ContentEncoding::GetCompressionByIndex(unsigned long idx) const { const ptrdiff_t count = compression_entries_end_ - compression_entries_; assert(count >= 0); @@ -5188,11 +5180,92 @@ bool Colour::Parse(IMkvReader* reader, long long colour_start, return true; } +bool Projection::Parse(IMkvReader* reader, long long start, long long size, + Projection** projection) { + if (!reader || *projection) + return false; + + std::auto_ptr<Projection> projection_ptr(new Projection()); + if (!projection_ptr.get()) + return false; + + const long long end = start + size; + long long read_pos = start; + + while (read_pos < end) { + long long child_id = 0; + long long child_size = 0; + + const long long status = + ParseElementHeader(reader, read_pos, end, child_id, child_size); + if (status < 0) + return false; + + if (child_id == libwebm::kMkvProjectionType) { + long long projection_type = kTypeNotPresent; + projection_type = UnserializeUInt(reader, read_pos, child_size); + if (projection_type < 0) + return false; + + projection_ptr->type = static_cast<ProjectionType>(projection_type); + } else if (child_id == libwebm::kMkvProjectionPrivate) { + unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size); + + if (data == NULL) + return false; + + const int status = + reader->Read(read_pos, static_cast<long>(child_size), data); + + if (status) { + delete[] data; + return false; + } + + projection_ptr->private_data = data; + projection_ptr->private_data_length = static_cast<size_t>(child_size); + } else { + double value = 0; + const long long value_parse_status = + UnserializeFloat(reader, read_pos, child_size, value); + if (value_parse_status < 0) { + return false; + } + + switch (child_id) { + case libwebm::kMkvProjectionPoseYaw: + projection_ptr->pose_yaw = static_cast<float>(value); + break; + case libwebm::kMkvProjectionPosePitch: + projection_ptr->pose_pitch = static_cast<float>(value); + break; + case libwebm::kMkvProjectionPoseRoll: + projection_ptr->pose_roll = static_cast<float>(value); + break; + default: + return false; + } + } + + read_pos += child_size; + if (read_pos > end) + return false; + } + + *projection = projection_ptr.release(); + return true; +} + VideoTrack::VideoTrack(Segment* pSegment, long long element_start, long long element_size) - : Track(pSegment, element_start, element_size), m_colour(NULL) {} + : Track(pSegment, element_start, element_size), + m_colour(NULL), + m_projection(NULL) {} -VideoTrack::~VideoTrack() { delete m_colour; } +VideoTrack::~VideoTrack() { + delete m_colour; + delete m_projection; +} long VideoTrack::Parse(Segment* pSegment, const Info& 
info, long long element_start, long long element_size, @@ -5224,6 +5297,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, const long long stop = pos + s.size; Colour* colour = NULL; + Projection* projection = NULL; while (pos < stop) { long long id, size; @@ -5274,6 +5348,9 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, } else if (id == libwebm::kMkvColour) { if (!Colour::Parse(pReader, pos, size, &colour)) return E_FILE_FORMAT_INVALID; + } else if (id == libwebm::kMkvProjection) { + if (!Projection::Parse(pReader, pos, size, &projection)) + return E_FILE_FORMAT_INVALID; } pos += size; // consume payload @@ -5305,6 +5382,7 @@ long VideoTrack::Parse(Segment* pSegment, const Info& info, pTrack->m_stereo_mode = stereo_mode; pTrack->m_rate = rate; pTrack->m_colour = colour; + pTrack->m_projection = projection; pResult = pTrack; return 0; // success @@ -5405,6 +5483,8 @@ long VideoTrack::Seek(long long time_ns, const BlockEntry*& pResult) const { Colour* VideoTrack::GetColour() const { return m_colour; } +Projection* VideoTrack::GetProjection() const { return m_projection; } + long long VideoTrack::GetWidth() const { return m_width; } long long VideoTrack::GetHeight() const { return m_height; } @@ -6698,8 +6778,10 @@ Cluster::Cluster(Segment* pSegment, long idx, long long element_start {} Cluster::~Cluster() { - if (m_entries_count <= 0) + if (m_entries_count <= 0) { + delete[] m_entries; return; + } BlockEntry** i = m_entries; BlockEntry** const j = m_entries + m_entries_count; diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h index 42e6e88ab46..26c2b7e5ebf 100644 --- a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvparser.h @@ -473,6 +473,34 @@ struct Colour { MasteringMetadata* mastering_metadata; }; +struct Projection { + enum ProjectionType { + kTypeNotPresent = -1, + kRectangular = 0, + kEquirectangular = 1, + kCubeMap = 2, + kMesh = 3, + }; + static const float kValueNotPresent; + Projection() + : type(kTypeNotPresent), + private_data(NULL), + private_data_length(0), + pose_yaw(kValueNotPresent), + pose_pitch(kValueNotPresent), + pose_roll(kValueNotPresent) {} + ~Projection() { delete[] private_data; } + static bool Parse(IMkvReader* reader, long long element_start, + long long element_size, Projection** projection); + + ProjectionType type; + unsigned char* private_data; + size_t private_data_length; + float pose_yaw; + float pose_pitch; + float pose_roll; +}; + class VideoTrack : public Track { VideoTrack(const VideoTrack&); VideoTrack& operator=(const VideoTrack&); @@ -497,6 +525,8 @@ class VideoTrack : public Track { Colour* GetColour() const; + Projection* GetProjection() const; + private: long long m_width; long long m_height; @@ -508,6 +538,7 @@ class VideoTrack : public Track { double m_rate; Colour* m_colour; + Projection* m_projection; }; class AudioTrack : public Track { @@ -813,6 +844,8 @@ class SeekHead { long Parse(); struct Entry { + Entry(); + // the SeekHead entry payload long long id; long long pos; diff --git a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc index 9f90d8c4f86..b8fd00c2635 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc +++ b/chromium/third_party/libvpx/source/libvpx/third_party/libwebm/mkvparser/mkvreader.cc @@ -117,7 +117,7 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { if (status) return -1; // error #else - fseek(m_file, offset, SEEK_SET); + fseeko(m_file, static_cast<off_t>(offset), SEEK_SET); #endif const size_t size = fread(buffer, 1, len, m_file); @@ -128,4 +128,4 @@ int MkvReader::Read(long long offset, long len, unsigned char* buffer) { return 0; // success } -} // namespace mkvparser
\ No newline at end of file +} // namespace mkvparser diff --git a/chromium/third_party/libvpx/source/libvpx/tools.mk b/chromium/third_party/libvpx/source/libvpx/tools.mk new file mode 100644 index 00000000000..3c660b1dfd5 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/tools.mk @@ -0,0 +1,110 @@ +## +## Copyright (c) 2016 The WebM project authors. All Rights Reserved. +## +## Use of this source code is governed by a BSD-style license +## that can be found in the LICENSE file in the root of the source +## tree. An additional intellectual property rights grant can be found +## in the file PATENTS. All contributing project authors may +## be found in the AUTHORS file in the root of the source tree. +## + +# List of tools to build. +TOOLS-yes += tiny_ssim.c +tiny_ssim.SRCS += vpx/vpx_integer.h +tiny_ssim.GUID = 3afa9b05-940b-4d68-b5aa-55157d8ed7b4 +tiny_ssim.DESCRIPTION = Generate SSIM/PSNR from raw .yuv files + +# +# End of specified files. The rest of the build rules should happen +# automagically from here. +# + + +# Expand list of selected tools to build (as specified above) +TOOLS = $(addprefix tools/,$(call enabled,TOOLS)) +ALL_SRCS = $(foreach ex,$(TOOLS),$($(notdir $(ex:.c=)).SRCS)) + + +# Expand each tool's sources into a variable containing all sources +# for that tool (not just the main one specified in TOOLS) +# and add this file to the list (for MSVS workspace generation) +$(foreach ex,$(TOOLS),$(eval $(notdir $(ex:.c=)).SRCS += $(ex) tools.mk)) + + +# Create build/install dependencies for all tools. The common case +# is handled here. The MSVS case is handled below. +NOT_MSVS = $(if $(CONFIG_MSVS),,yes) +DIST-BINS-$(NOT_MSVS) += $(addprefix bin/,$(TOOLS:.c=$(EXE_SFX))) +DIST-SRCS-yes += $(ALL_SRCS) +OBJS-$(NOT_MSVS) += $(call objs,$(ALL_SRCS)) +BINS-$(NOT_MSVS) += $(addprefix $(BUILD_PFX),$(TOOLS:.c=$(EXE_SFX))) + + +# Instantiate linker template for all tools. +$(foreach bin,$(BINS-yes),\ $(eval $(bin):)\ $(eval $(call linker_template,$(bin),\ $(call objs,$($(notdir $(bin:$(EXE_SFX)=)).SRCS)) \ -lm\ ))) + + +# The following pairs define a mapping of locations in the distribution +# tree to locations in the source/build trees. +INSTALL_MAPS += src/%.c %.c +INSTALL_MAPS += src/% $(SRC_PATH_BARE)/% +INSTALL_MAPS += bin/% % +INSTALL_MAPS += % % + + +# Build Visual Studio Projects. We use a template here to instantiate +# explicit rules rather than using an implicit rule because we want to +# leverage make's VPATH searching rather than specifying the paths on +# each file in TOOLS. This has the unfortunate side effect that +# touching the source files triggers a rebuild of the project files +# even though there is no real dependency there (the dependency is on +# the makefiles). We may want to revisit this.
+define vcproj_template +$(1): $($(1:.$(VCPROJ_SFX)=).SRCS) vpx.$(VCPROJ_SFX) + $(if $(quiet),@echo " [vcproj] $$@") + $(qexec)$$(GEN_VCPROJ)\ + --exe\ + --target=$$(TOOLCHAIN)\ + --name=$$(@:.$(VCPROJ_SFX)=)\ + --ver=$$(CONFIG_VS_VERSION)\ + --proj-guid=$$($$(@:.$(VCPROJ_SFX)=).GUID)\ + --src-path-bare="$(SRC_PATH_BARE)" \ + $$(if $$(CONFIG_STATIC_MSVCRT),--static-crt) \ + --out=$$@ $$(INTERNAL_CFLAGS) $$(CFLAGS) \ + $$(INTERNAL_LDFLAGS) $$(LDFLAGS) $$^ +endef +TOOLS_BASENAME := $(notdir $(TOOLS)) +PROJECTS-$(CONFIG_MSVS) += $(TOOLS_BASENAME:.c=.$(VCPROJ_SFX)) +INSTALL-BINS-$(CONFIG_MSVS) += $(foreach p,$(VS_PLATFORMS),\ + $(addprefix bin/$(p)/,$(TOOLS_BASENAME:.c=.exe))) +$(foreach proj,$(call enabled,PROJECTS),\ + $(eval $(call vcproj_template,$(proj)))) + +# +# Documentation Rules +# +%.dox: %.c + @echo " [DOXY] $@" + @mkdir -p $(dir $@) + @echo "/*!\page tools_$(@F:.dox=) $(@F:.dox=)" > $@ + @echo " \includelineno $(<F)" >> $@ + @echo "*/" >> $@ + +tools.dox: tools.mk + @echo " [DOXY] $@" + @echo "/*!\page tools Tools" > $@ + @echo " This SDK includes a number of tools/utilities."\ + "The following tools are included: ">>$@ + @$(foreach ex,$(sort $(notdir $(TOOLS:.c=))),\ + echo " - \subpage tools_$(ex) $($(ex).DESCRIPTION)" >> $@;) + @echo "*/" >> $@ + +CLEAN-OBJS += tools.doxy tools.dox $(TOOLS:.c=.dox) +DOCS-yes += tools.doxy tools.dox +tools.doxy: tools.dox $(TOOLS:.c=.dox) + @echo "INPUT += $^" > $@ diff --git a/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c b/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c new file mode 100644 index 00000000000..28052e0a84d --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/tools/tiny_ssim.c @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */
+
+#include <errno.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "vpx/vpx_integer.h"
+
+void vp8_ssim_parms_8x8_c(unsigned char *s, int sp, unsigned char *r, int rp,
+                          uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+                          uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+  int i, j;
+  for (i = 0; i < 8; i++, s += sp, r += rp) {
+    for (j = 0; j < 8; j++) {
+      *sum_s += s[j];
+      *sum_r += r[j];
+      *sum_sq_s += s[j] * s[j];
+      *sum_sq_r += r[j] * r[j];
+      *sum_sxr += s[j] * r[j];
+    }
+  }
+}
+
+static const int64_t cc1 = 26634;   // (64^2*(.01*255)^2)
+static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2)
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+                         uint32_t sum_sq_r, uint32_t sum_sxr, int count) {
+  int64_t ssim_n, ssim_d;
+  int64_t c1, c2;
+
+  // scale the constants by number of pixels
+  c1 = (cc1 * count * count) >> 12;
+  c2 = (cc2 * count * count) >> 12;
+
+  ssim_n = (2 * sum_s * sum_r + c1) *
+           ((int64_t)2 * count * sum_sxr - (int64_t)2 * sum_s * sum_r + c2);
+
+  ssim_d = (sum_s * sum_s + sum_r * sum_r + c1) *
+           ((int64_t)count * sum_sq_s - (int64_t)sum_s * sum_s +
+            (int64_t)count * sum_sq_r - (int64_t)sum_r * sum_r + c2);
+
+  return ssim_n * 1.0 / ssim_d;
+}
+
+static double ssim_8x8(unsigned char *s, int sp, unsigned char *r, int rp) {
+  uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+  vp8_ssim_parms_8x8_c(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+                       &sum_sxr);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+}
+
+// We are using an 8x8 moving window with the starting location of each 8x8
+// window on the 4x4 pixel grid. Such an arrangement allows the windows to
+// overlap block boundaries to penalize blocking artifacts.
+double vp8_ssim2(unsigned char *img1, unsigned char *img2, int stride_img1,
+                 int stride_img2, int width, int height) {
+  int i, j;
+  int samples = 0;
+  double ssim_total = 0;
+
+  // sample points start at each 4x4 location
+  for (i = 0; i <= height - 8;
+       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+    for (j = 0; j <= width - 8; j += 4) {
+      double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+      ssim_total += v;
+      samples++;
+    }
+  }
+  ssim_total /= samples;
+  return ssim_total;
+}
+
+static uint64_t calc_plane_error(uint8_t *orig, int orig_stride, uint8_t *recon,
+                                 int recon_stride, unsigned int cols,
+                                 unsigned int rows) {
+  unsigned int row, col;
+  uint64_t total_sse = 0;
+  int diff;
+
+  for (row = 0; row < rows; row++) {
+    for (col = 0; col < cols; col++) {
+      diff = orig[col] - recon[col];
+      total_sse += diff * diff;
+    }
+
+    orig += orig_stride;
+    recon += recon_stride;
+  }
+
+  return total_sse;
+}
+
+#define MAX_PSNR 100
+
+double vp9_mse2psnr(double samples, double peak, double mse) {
+  double psnr;
+
+  if (mse > 0.0)
+    psnr = 10.0 * log10(peak * peak * samples / mse);
+  else
+    psnr = MAX_PSNR;  // Limit to prevent / 0
+
+  if (psnr > MAX_PSNR) psnr = MAX_PSNR;
+
+  return psnr;
+}
+
+int main(int argc, char *argv[]) {
+  FILE *f[2];
+  uint8_t *buf[2];
+  int w, h, n_frames, tl_skip = 0, tl_skips_remaining = 0;
+  double ssim = 0, psnravg = 0, psnrglb = 0;
+  double ssimy, ssimu, ssimv;
+  uint64_t psnry, psnru, psnrv;
+
+  if (argc < 4) {
+    fprintf(stderr, "Usage: %s file1.yuv file2.yuv WxH [tl_skip={0,1,3}]\n",
+            argv[0]);
+    return 1;
+  }
+  f[0] = strcmp(argv[1], "-") ? fopen(argv[1], "rb") : stdin;
+  f[1] = strcmp(argv[2], "-") ?
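/* (Annotation, not part of the patch: similarity() above is an
 * integer-scaled form of the standard SSIM formula,
 *
 *   SSIM(x, y) = ((2*mu_x*mu_y + C1) * (2*cov_xy + C2)) /
 *                ((mu_x^2 + mu_y^2 + C1) * (var_x + var_y + C2)),
 *
 * evaluated on sums over count = 64 pixels per 8x8 window, so C1 and C2
 * are pre-scaled by 64^2: cc1 = 4096 * (0.01*255)^2 ~= 26634 and
 * cc2 = 4096 * (0.03*255)^2 ~= 239708, i.e. the usual K1 = 0.01 and
 * K2 = 0.03 at 8-bit dynamic range L = 255. vp9_mse2psnr() above is the
 * usual PSNR = 10 * log10(peak^2 * N / SSE), clamped to 100 dB so that a
 * zero SSE does not divide by zero.)
 */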
fopen(argv[2], "rb") : stdin; + sscanf(argv[3], "%dx%d", &w, &h); + // Number of frames to skip from file1.yuv for every frame used. Normal values + // 0, 1 and 3 correspond to TL2, TL1 and TL0 respectively for a 3TL encoding + // in mode 10. 7 would be reasonable for comparing TL0 of a 4-layer encoding. + if (argc > 4) { + sscanf(argv[4], "%d", &tl_skip); + } + if (!f[0] || !f[1]) { + fprintf(stderr, "Could not open input files: %s\n", strerror(errno)); + return 1; + } + if (w <= 0 || h <= 0 || w & 1 || h & 1) { + fprintf(stderr, "Invalid size %dx%d\n", w, h); + return 1; + } + buf[0] = malloc(w * h * 3 / 2); + buf[1] = malloc(w * h * 3 / 2); + n_frames = 0; + while (1) { + size_t r1, r2; + r1 = fread(buf[0], w * h * 3 / 2, 1, f[0]); + if (r1) { + // Reading parts of file1.yuv that were not used in temporal layer. + if (tl_skips_remaining > 0) { + --tl_skips_remaining; + continue; + } + // Use frame, but skip |tl_skip| after it. + tl_skips_remaining = tl_skip; + } + r2 = fread(buf[1], w * h * 3 / 2, 1, f[1]); + if (r1 && r2 && r1 != r2) { + fprintf(stderr, "Failed to read data: %s [%d/%d]\n", strerror(errno), + (int)r1, (int)r2); + return 1; + } else if (r1 == 0 || r2 == 0) { + break; + } +#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \ + ssim = vp8_ssim2(buf0, buf1, w, w, w, h); \ + psnr = calc_plane_error(buf0, w, buf1, w, w, h); + psnr_and_ssim(ssimy, psnry, buf[0], buf[1], w, h); + psnr_and_ssim(ssimu, psnru, buf[0] + w * h, buf[1] + w * h, w / 2, h / 2); + psnr_and_ssim(ssimv, psnrv, buf[0] + w * h * 5 / 4, buf[1] + w * h * 5 / 4, + w / 2, h / 2); + ssim += 0.8 * ssimy + 0.1 * (ssimu + ssimv); + psnravg += + vp9_mse2psnr(w * h * 6 / 4, 255.0, (double)psnry + psnru + psnrv); + psnrglb += psnry + psnru + psnrv; + n_frames++; + } + free(buf[0]); + free(buf[1]); + ssim /= n_frames; + psnravg /= n_frames; + psnrglb = vp9_mse2psnr((double)n_frames * w * h * 6 / 4, 255.0, psnrglb); + + printf("AvgPSNR: %lf\n", psnravg); + printf("GlbPSNR: %lf\n", psnrglb); + printf("SSIM: %lf\n", 100 * pow(ssim, 8.0)); + printf("Nframes: %d\n", n_frames); + + if (strcmp(argv[1], "-")) fclose(f[0]); + if (strcmp(argv[2], "-")) fclose(f[1]); + + return 0; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c index 1f60721e1cd..2a7cde8788f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/loopfilter_filters.c @@ -63,8 +63,8 @@ static void vp8_filter(signed char mask, uc hev, uc *op1, uc *op0, uc *oq0, filter_value &= mask; /* save bottom 3 bits so that we round one side +4 and the other +3 - * if it equals 4 we'll set to adjust by -1 to account for the fact - * we'd round 3 the other way + * if it equals 4 we'll set it to adjust by -1 to account for the fact + * we'd round it by 3 the other way */ Filter1 = vp8_signed_char_clamp(filter_value + 4); Filter2 = vp8_signed_char_clamp(filter_value + 3); diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c index e1759c875e4..3d516d0f81a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/idct_msa.c @@ -90,8 +90,7 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 
res0, res1, res2, res3; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; LD_SH2(input, 8, input0, input1); UNPCK_SH_SW(input0, in0, in1); @@ -111,20 +110,17 @@ static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, int32_t pred_stride, uint8_t *dest, int32_t dest_stride) { - v8i16 vec; - v8i16 res0, res1, res2, res3; + v8i16 vec, res0, res1, res2, res3, dst0, dst1; v16i8 zero = { 0 }; - v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3; - v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; + v16i8 pred0, pred1, pred2, pred3; vec = __msa_fill_h(in_dc); vec = __msa_srari_h(vec, 3); @@ -133,55 +129,59 @@ static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, dst0, dst1); + dst0 = (v8i16)__msa_pckev_w((v4i32)dst1, (v4i32)dst0); + ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dest, dest_stride); } void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff) { - v8i16 input0, input1; - v4i32 in0, in1, in2, in3, a1, b1, c1, d1; - v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v8i16 input0, input1, tmp0, tmp1, tmp2, tmp3, out0, out1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; LD_SH2(input, 8, input0, input1); - UNPCK_SH_SW(input0, in0, in1); - UNPCK_SH_SW(input1, in2, in3); - BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2); - TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); - BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1); - BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2); - ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3); - SRA_4V(vt0, vt1, vt2, vt3, 3); - mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0); - mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0); - mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0); - mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0); - mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2); - mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2); - mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2); - mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2); - mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4); - mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4); - mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4); - mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4); - mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6); - 
mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6); - mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6); - mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6); + input1 = (v8i16)__msa_sldi_b((v16i8)input1, (v16i8)input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADD2(tmp0, 3, tmp1, 3, out0, out1); + out0 >>= 3; + out1 >>= 3; + mb_dq_coeff[0] = __msa_copy_s_h(out0, 0); + mb_dq_coeff[16] = __msa_copy_s_h(out0, 4); + mb_dq_coeff[32] = __msa_copy_s_h(out1, 0); + mb_dq_coeff[48] = __msa_copy_s_h(out1, 4); + mb_dq_coeff[64] = __msa_copy_s_h(out0, 1); + mb_dq_coeff[80] = __msa_copy_s_h(out0, 5); + mb_dq_coeff[96] = __msa_copy_s_h(out1, 1); + mb_dq_coeff[112] = __msa_copy_s_h(out1, 5); + mb_dq_coeff[128] = __msa_copy_s_h(out0, 2); + mb_dq_coeff[144] = __msa_copy_s_h(out0, 6); + mb_dq_coeff[160] = __msa_copy_s_h(out1, 2); + mb_dq_coeff[176] = __msa_copy_s_h(out1, 6); + mb_dq_coeff[192] = __msa_copy_s_h(out0, 3); + mb_dq_coeff[208] = __msa_copy_s_h(out0, 7); + mb_dq_coeff[224] = __msa_copy_s_h(out1, 3); + mb_dq_coeff[240] = __msa_copy_s_h(out1, 7); } static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1; - v8i16 in0, in1, in2, in3; - v8i16 hz0_h, hz1_h, hz2_h, hz3_h; - v16i8 dest0, dest1, dest2, dest3; - v4i32 hz0_w, hz1_w, hz2_w, hz3_w; - v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h; + v16u8 dest0, dest1, dest2, dest3; + v4i32 hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v2i64 zero = { 0 }; - v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }; LD_SH2(input, 8, input0, input1); LD_SH2(dequant_input, 8, dequant_in0, dequant_in1); @@ -196,7 +196,7 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3); SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); - LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3); + LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3); ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, res2, res3); ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, @@ -206,19 +206,17 @@ static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input, res1 = CLIP_SW_0_255(res1); res2 = CLIP_SW_0_255(res2); res3 = CLIP_SW_0_255(res3); - VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1); - VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3); - ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dest, dest_stride); } static void dequant_idct4x4_addblk_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { v16u8 dest0, dest1, dest2, dest3; - v8i16 in0, in1, in2, in3; - v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1; - v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; - v8i16 res0, res1, res2, res3; + v8i16 in0, in1, in2, in3, mul0, 
mul1, mul2, mul3, dequant_in0, dequant_in1; + v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3; v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r; v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r; v16i8 zero = { 0 }; @@ -247,11 +245,8 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, res2, res3); ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SW(res1, res0, res3, res2, vt0l, vt1l); + ST8x4_UB(vt0l, vt1l, dest, dest_stride); __asm__ __volatile__( "sw $zero, 0(%[input]) \n\t" @@ -276,10 +271,9 @@ static void dequant_idct4x4_addblk_2x_msa(int16_t *input, static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, uint8_t *dest, int32_t dest_stride) { - v8i16 input_dc0, input_dc1, vec; + v8i16 input_dc0, input_dc1, vec, res0, res1, res2, res3; v16u8 dest0, dest1, dest2, dest3; v16i8 zero = { 0 }; - v8i16 res0, res1, res2, res3; input_dc0 = __msa_fill_h(input[0] * dequant_input[0]); input_dc1 = __msa_fill_h(input[16] * dequant_input[0]); @@ -292,11 +286,8 @@ static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input, res2, res3); ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3); CLIP_SH4_0_255(res0, res1, res2, res3); - PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1, res2, - res3); - PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1); - PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3); - ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride); + PCKEV_B2_SH(res1, res0, res3, res2, res0, res1); + ST8x4_UB(res0, res1, dest, dest_stride); } void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr, diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c index f5f1790ef53..98a4fc09a35 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/loopfilter_filters_msa.c @@ -24,208 +24,145 @@ mask = ((v16u8)mask <= b_limit); \ } -#define VP8_LPF_FILTER4_4W(p1_in_out, p0_in_out, q0_in_out, q1_in_out, \ - mask_in, hev_in) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in_out, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in_out, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in_out, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in_out, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += 
q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_in_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_in_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_in_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_in_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, q0_sub_p0_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b, filt_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)(mask); \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ +#define VP8_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) \ + { \ + v16i8 p1_m, 
p0_m, q0_m, q1_m, filt, filt1, filt2; \ + v16i8 q0_sub_p0; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + filt1 = __msa_adds_s_b(filt, cnst4b); \ + filt1 >>= cnst3b; \ + filt2 = __msa_adds_s_b(filt, cnst3b); \ + filt2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, filt1); \ + p0_m = __msa_adds_s_b(p0_m, filt2); \ + q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \ } -#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ - { \ - v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ - v16i8 filt, q0_sub_p0, cnst4b, cnst3b; \ - v16i8 u, filt1, filt2, filt_sign, q0_sub_p0_sign; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_r, u_r, u_l, filt_l; \ - v8i16 cnst3h, cnst27h, cnst18h, cnst63h; \ - \ - cnst3h = __msa_ldi_h(3); \ - \ - p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ - p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ - q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - q0_sub_p0 = q0_m - p0_m; \ - q0_sub_p0_sign = __msa_clti_s_b(q0_sub_p0, 0); \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_r *= cnst3h; \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r = filt_r + q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0_sign, q0_sub_p0); \ - q0_sub_p0_l *= cnst3h; \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l = filt_l + q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask; \ - filt2 = filt & (v16i8)hev; \ - \ - hev = __msa_xori_b(hev, 0xff); \ - filt = filt & (v16i8)hev; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt2, cnst4b); \ - filt1 >>= 3; \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt2, cnst3b); \ - filt2 >>= 3; \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - \ - filt_sign = __msa_clti_s_b(filt, 0); \ - ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ - \ - cnst27h = __msa_ldi_h(27); \ - cnst63h = __msa_ldi_h(63); \ - \ - u_r = filt_r * cnst27h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - u_l = filt_l * cnst27h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q0_m = __msa_subs_s_b(q0_m, u); \ - q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, u); \ - p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ - cnst18h = __msa_ldi_h(18); \ - u_r = filt_r * cnst18h; \ - u_r += cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l * cnst18h; \ - u_l += cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q1_m = __msa_subs_s_b(q1_m, u); \ - q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, u); \ - p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ - u_r = filt_r << 3; \ - u_r += filt_r 
+ cnst63h; \ - u_r >>= 7; \ - u_r = __msa_sat_s_h(u_r, 7); \ - \ - u_l = filt_l << 3; \ - u_l += filt_l + cnst63h; \ - u_l >>= 7; \ - u_l = __msa_sat_s_h(u_l, 7); \ - u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ - q2_m = __msa_subs_s_b(q2_m, u); \ - q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ - p2_m = __msa_adds_s_b(p2_m, u); \ - p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ +#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \ + { \ + v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \ + v16i8 u, filt, t1, t2, filt_sign, q0_sub_p0; \ + v8i16 filt_r, filt_l, u_r, u_l; \ + v8i16 temp0, temp1, temp2, temp3; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + const v8i16 cnst9h = __msa_ldi_h(9); \ + const v8i16 cnst63h = __msa_ldi_h(63); \ + \ + p2_m = (v16i8)__msa_xori_b(p2, 0x80); \ + p1_m = (v16i8)__msa_xori_b(p1, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1, 0x80); \ + q2_m = (v16i8)__msa_xori_b(q2, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + \ + t2 = filt & hev; \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + t1 = __msa_adds_s_b(t2, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(t2, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + filt_sign = __msa_clti_s_b(filt, 0); \ + ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \ + temp0 = filt_r * cnst9h; \ + temp1 = temp0 + cnst63h; \ + temp2 = filt_l * cnst9h; \ + temp3 = temp2 + cnst63h; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q2_m = __msa_subs_s_b(q2_m, u); \ + p2_m = __msa_adds_s_b(p2_m, u); \ + q2 = __msa_xori_b((v16u8)q2_m, 0x80); \ + p2 = __msa_xori_b((v16u8)p2_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q1_m = __msa_subs_s_b(q1_m, u); \ + p1_m = __msa_adds_s_b(p1_m, u); \ + q1 = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1 = __msa_xori_b((v16u8)p1_m, 0x80); \ + \ + temp1 += temp0; \ + temp3 += temp2; \ + \ + u_r = temp1 >> 7; \ + u_r = __msa_sat_s_h(u_r, 7); \ + u_l = temp3 >> 7; \ + u_l = __msa_sat_s_h(u_l, 7); \ + u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \ + q0_m = __msa_subs_s_b(q0_m, u); \ + p0_m = __msa_adds_s_b(p0_m, u); \ + q0 = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0 = __msa_xori_b((v16u8)p0_m, 0x80); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h index 65905f6c027..6bec3adec39 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/mips/msa/vp8_macros_msa.h @@ -1221,6 +1221,8 @@ } #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) 
PCKEV_B2(v4i32, __VA_ARGS__) #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3) \ diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h index 43e3c29b509..72fba2ec56b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/common/onyx.h @@ -110,6 +110,8 @@ typedef struct { int Sharpness; int cpu_used; unsigned int rc_max_intra_bitrate_pct; + /* percent of rate boost for golden frame in CBR mode. */ + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; /* mode -> diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c index 87560f28b1e..c5389594553 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_if.c @@ -1467,6 +1467,12 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) { cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL; + // GF behavior for 1 pass CBR, used when error_resilience is off. + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode == MODE_REALTIME) + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + #if (CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING) cpi->oxcf.token_partitions = 3; #endif @@ -1766,9 +1772,13 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->mse_source_denoised = 0; /* Should we use the cyclic refresh method. - * Currently this is tied to error resilliant mode + * Currently there is no external control for this. + * Enable it for error_resilient_mode, or for 1 pass CBR mode. */ - cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode; + cpi->cyclic_refresh_mode_enabled = + (cpi->oxcf.error_resilient_mode || + (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && + cpi->oxcf.Mode <= 2)); cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 7; if (cpi->oxcf.number_of_layers == 1) { @@ -1781,6 +1791,23 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { cpi->cyclic_refresh_mode_index = 0; cpi->cyclic_refresh_q = 32; + // GF behavior for 1 pass CBR, used when error_resilience is off. + cpi->gf_update_onepass_cbr = 0; + cpi->gf_noboost_onepass_cbr = 0; + if (!cpi->oxcf.error_resilient_mode && + cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER && cpi->oxcf.Mode <= 2) { + cpi->gf_update_onepass_cbr = 1; + cpi->gf_noboost_onepass_cbr = 1; + cpi->gf_interval_onepass_cbr = + cpi->cyclic_refresh_mode_max_mbs_perframe > 0 + ? 
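/* (Annotation, not part of the patch: with the default
 * cyclic_refresh_mode_max_mbs_perframe of (mb_rows * mb_cols) / 7 set
 * above, the expression below comes to about 2 * 7 = 14 frames, i.e.
 * twice the time cyclic refresh needs to touch every macroblock; the
 * VPXMIN/VPXMAX step then clamps the interval to the range [6, 40].)
 */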
(2 * (cpi->common.mb_rows * cpi->common.mb_cols) / + cpi->cyclic_refresh_mode_max_mbs_perframe) + : 10; + cpi->gf_interval_onepass_cbr = + VPXMIN(40, VPXMAX(6, cpi->gf_interval_onepass_cbr)); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + } + if (cpi->cyclic_refresh_mode_enabled) { CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1)); @@ -3925,7 +3952,6 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, #else /* transform / motion compensation build reconstruction frame */ vp8_encode_frame(cpi); - if (cpi->oxcf.screen_content_mode == 2) { if (vp8_drop_encodedframe_overshoot(cpi, Q)) return; } @@ -4203,6 +4229,20 @@ static void encode_frame_to_data_rate(VP8_COMP *cpi, size_t *size, } } while (Loop == 1); +#if defined(DROP_UNCODED_FRAMES) + /* if there are no coded macroblocks at all drop this frame */ + if (cpi->common.MBs == cpi->mb.skip_true_count && + (cpi->drop_frame_count & 7) != 7 && cm->frame_type != KEY_FRAME) { + cpi->common.current_video_frame++; + cpi->frames_since_key++; + cpi->drop_frame_count++; + // We advance the temporal pattern for dropped frames. + cpi->temporal_pattern_counter++; + return; + } + cpi->drop_frame_count = 0; +#endif + #if 0 /* Experimental code for lagged and one pass * Update stats used for one pass GF selection diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h index 59ad5773a64..bfcc6457c19 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/onyx_int.h @@ -413,6 +413,9 @@ typedef struct VP8_COMP { int drop_frames_allowed; /* Are we permitted to drop frames? */ int drop_frame; /* Drop this frame? */ +#if defined(DROP_UNCODED_FRAMES) + int drop_frame_count; +#endif vp8_prob frame_coef_probs[BLOCK_TYPES][COEF_BANDS][PREV_COEF_CONTEXTS] [ENTROPY_NODES]; @@ -501,6 +504,11 @@ typedef struct VP8_COMP { int force_maxqp; + // GF update for 1 pass cbr. + int gf_update_onepass_cbr; + int gf_interval_onepass_cbr; + int gf_noboost_onepass_cbr; + #if CONFIG_MULTITHREAD /* multithread data */ int *mt_current_mb_col; diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c index 4d6afc19b35..e89247ae4ae 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/encoder/ratectrl.c @@ -885,61 +885,61 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { /* Adjust target frame size for Golden Frames: */ if (cpi->oxcf.error_resilient_mode == 0 && (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame) { - int Q = - (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q; - - int gf_frame_useage = 0; /* Golden frame useage since last GF */ - int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + - cpi->recent_ref_frame_usage[LAST_FRAME] + - cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]; - - int pct_gf_active = (100 * cpi->gf_active_count) / - (cpi->common.mb_rows * cpi->common.mb_cols); - - if (tot_mbs) { - gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + - cpi->recent_ref_frame_usage[ALTREF_FRAME]) * - 100 / tot_mbs; - } + if (!cpi->gf_update_onepass_cbr) { + int Q = (cpi->oxcf.fixed_q < 0) ? 
cpi->last_q[INTER_FRAME] + : cpi->oxcf.fixed_q; + + int gf_frame_useage = 0; /* Golden frame useage since last GF */ + int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME] + + cpi->recent_ref_frame_usage[LAST_FRAME] + + cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]; + + int pct_gf_active = (100 * cpi->gf_active_count) / + (cpi->common.mb_rows * cpi->common.mb_cols); + + if (tot_mbs) { + gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * + 100 / tot_mbs; + } - if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; + if (pct_gf_active > gf_frame_useage) gf_frame_useage = pct_gf_active; - /* Is a fixed manual GF frequency being used */ - if (cpi->auto_gold) { - /* For one pass throw a GF if recent frame intra useage is - * low or the GF useage is high - */ - if ((cpi->pass == 0) && - (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { - cpi->common.refresh_golden_frame = 1; + /* Is a fixed manual GF frequency being used */ + if (cpi->auto_gold) { + /* For one pass throw a GF if recent frame intra useage is + * low or the GF useage is high + */ + if ((cpi->pass == 0) && + (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5)) { + cpi->common.refresh_golden_frame = 1; - /* Two pass GF descision */ - } else if (cpi->pass == 2) { - cpi->common.refresh_golden_frame = 1; + /* Two pass GF descision */ + } else if (cpi->pass == 2) { + cpi->common.refresh_golden_frame = 1; + } } - } #if 0 - /* Debug stats */ - if (0) - { - FILE *f; + /* Debug stats */ + if (0) { + FILE *f; - f = fopen("gf_useaget.stt", "a"); - fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", - cpi->common.current_video_frame, cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); - fclose(f); - } + f = fopen("gf_useaget.stt", "a"); + fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n", + cpi->common.current_video_frame, cpi->gfu_boost, + GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage); + fclose(f); + } #endif - if (cpi->common.refresh_golden_frame == 1) { + if (cpi->common.refresh_golden_frame == 1) { #if 0 - if (0) - { + if (0) { FILE *f; f = fopen("GFexit.stt", "a"); @@ -949,61 +949,76 @@ static void calc_pframe_target_size(VP8_COMP *cpi) { #endif - if (cpi->auto_adjust_gold_quantizer) { - calc_gf_params(cpi); - } - - /* If we are using alternate ref instead of gf then do not apply the - * boost It will instead be applied to the altref update Jims - * modified boost - */ - if (!cpi->source_alt_ref_active) { - if (cpi->oxcf.fixed_q < 0) { - if (cpi->pass == 2) { - /* The spend on the GF is defined in the two pass - * code for two pass encodes - */ - cpi->this_frame_target = cpi->per_frame_bandwidth; - } else { - int Boost = cpi->last_boost; - int frames_in_section = cpi->frames_till_gf_update_due + 1; - int allocation_chunks = (frames_in_section * 100) + (Boost - 100); - int bits_in_section = cpi->inter_frame_target * frames_in_section; - - /* Normalize Altboost and allocations chunck down to - * prevent overflow - */ - while (Boost > 1000) { - Boost /= 2; - allocation_chunks /= 2; - } + if (cpi->auto_adjust_gold_quantizer) { + calc_gf_params(cpi); + } - /* Avoid loss of precision but avoid overflow */ - if ((bits_in_section >> 7) > allocation_chunks) { - cpi->this_frame_target = - Boost * (bits_in_section / allocation_chunks); + /* If we are using alternate ref instead of gf then do not apply the + * boost It will instead be applied to the altref update Jims + * modified boost + */ + if 
(!cpi->source_alt_ref_active) { + if (cpi->oxcf.fixed_q < 0) { + if (cpi->pass == 2) { + /* The spend on the GF is defined in the two pass + * code for two pass encodes + */ + cpi->this_frame_target = cpi->per_frame_bandwidth; } else { - cpi->this_frame_target = - (Boost * bits_in_section) / allocation_chunks; + int Boost = cpi->last_boost; + int frames_in_section = cpi->frames_till_gf_update_due + 1; + int allocation_chunks = (frames_in_section * 100) + (Boost - 100); + int bits_in_section = cpi->inter_frame_target * frames_in_section; + + /* Normalize Altboost and allocations chunck down to + * prevent overflow + */ + while (Boost > 1000) { + Boost /= 2; + allocation_chunks /= 2; + } + + /* Avoid loss of precision but avoid overflow */ + if ((bits_in_section >> 7) > allocation_chunks) { + cpi->this_frame_target = + Boost * (bits_in_section / allocation_chunks); + } else { + cpi->this_frame_target = + (Boost * bits_in_section) / allocation_chunks; + } } + } else { + cpi->this_frame_target = + (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * + cpi->last_boost) / + 100; } } else { - cpi->this_frame_target = - (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) * - cpi->last_boost) / - 100; + /* If there is an active ARF at this location use the minimum + * bits on this frame even if it is a contructed arf. + * The active maximum quantizer insures that an appropriate + * number of bits will be spent if needed for contstructed ARFs. + */ + cpi->this_frame_target = 0; } + cpi->current_gf_interval = cpi->frames_till_gf_update_due; } - /* If there is an active ARF at this location use the minimum - * bits on this frame even if it is a contructed arf. - * The active maximum quantizer insures that an appropriate - * number of bits will be spent if needed for contstructed ARFs. - */ - else { - cpi->this_frame_target = 0; + } else { + // Special case for 1 pass CBR: fixed gf period. + // TODO(marpan): Adjust this boost/interval logic. + // If gf_cbr_boost_pct is small (below threshold) set the flag + // gf_noboost_onepass_cbr = 1, which forces the gf to use the same + // rate correction factor as last. + cpi->gf_noboost_onepass_cbr = (cpi->oxcf.gf_cbr_boost_pct <= 100); + cpi->baseline_gf_interval = cpi->gf_interval_onepass_cbr; + // Skip this update if the zero_mvcount is low. 
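// (Annotation, not part of the patch: zeromv_count > MBs/2 means more
// than half the frame was coded with zero motion, i.e. largely static
// content, which is when a boosted golden frame is most useful as a
// long-term reference. For the boost arithmetic below, a setting of
// gf_cbr_boost_pct = 150 would scale the frame target by
// (100 + 150) / 100, that is by 2.5x.)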
+ if (cpi->zeromv_count > (cpi->common.MBs >> 1)) { + cpi->common.refresh_golden_frame = 1; + cpi->this_frame_target = + (cpi->this_frame_target * (100 + cpi->oxcf.gf_cbr_boost_pct)) / 100; } - + cpi->frames_till_gf_update_due = cpi->baseline_gf_interval; cpi->current_gf_interval = cpi->frames_till_gf_update_due; } } @@ -1025,8 +1040,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { rate_correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { rate_correction_factor = cpi->gf_rate_correction_factor; } else { rate_correction_factor = cpi->rate_correction_factor; @@ -1102,8 +1118,9 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) { if (cpi->common.frame_type == KEY_FRAME) { cpi->key_frame_rate_correction_factor = rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && (cpi->common.refresh_alt_ref_frame || - cpi->common.refresh_golden_frame)) { + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && + (cpi->common.refresh_alt_ref_frame || + cpi->common.refresh_golden_frame)) { cpi->gf_rate_correction_factor = rate_correction_factor; } else { cpi->rate_correction_factor = rate_correction_factor; @@ -1118,7 +1135,6 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { cpi->active_worst_quality = cpi->worst_quality; return cpi->worst_quality; } - /* Reset Zbin OQ value */ cpi->mb.zbin_over_quant = 0; @@ -1128,10 +1144,12 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { Q = cpi->oxcf.key_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_alt_ref_frame) { + cpi->common.refresh_alt_ref_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.alt_q; } else if (cpi->oxcf.number_of_layers == 1 && - cpi->common.refresh_golden_frame) { + cpi->common.refresh_golden_frame && + !cpi->gf_noboost_onepass_cbr) { Q = cpi->oxcf.gold_q; } } else { @@ -1145,7 +1163,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { correction_factor = cpi->key_frame_rate_correction_factor; } else { - if (cpi->oxcf.number_of_layers == 1 && + if (cpi->oxcf.number_of_layers == 1 && !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)) { correction_factor = cpi->gf_rate_correction_factor; @@ -1199,6 +1217,7 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { if (cpi->common.frame_type == KEY_FRAME) { zbin_oqmax = 0; } else if (cpi->oxcf.number_of_layers == 1 && + !cpi->gf_noboost_onepass_cbr && (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c index fac237eec02..f8475ed61da 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp8/vp8_cx_iface.c @@ -40,6 +40,7 @@ struct vp8_extracfg { vp8e_tuning tuning; unsigned int cq_level; /* constrained quality level */ unsigned int rc_max_intra_bitrate_pct; + unsigned int gf_cbr_boost_pct; unsigned int screen_content_mode; 
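  /* (Annotation, not part of the patch: gf_cbr_boost_pct is the percent
   * rate boost given to golden frames in 1-pass CBR mode; it is copied
   * into VP8_CONFIG by set_vp8e_config() below, and values of 100 or
   * less set gf_noboost_onepass_cbr so the golden frame reuses the
   * normal rate-correction factor and quantizer. An application would
   * set it through the new control, e.g.:
   *
   *   vpx_codec_control(&codec, VP8E_SET_GF_CBR_BOOST_PCT, 150);
   */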
}; @@ -65,6 +66,7 @@ static struct vp8_extracfg default_extracfg = { 0, /* tuning*/ 10, /* cq_level */ 0, /* rc_max_intra_bitrate_pct */ + 0, /* gf_cbr_boost_pct */ 0, /* screen_content_mode */ }; @@ -315,6 +317,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->target_bandwidth = cfg.rc_target_bitrate; oxcf->rc_max_intra_bitrate_pct = vp8_cfg.rc_max_intra_bitrate_pct; + oxcf->gf_cbr_boost_pct = vp8_cfg.gf_cbr_boost_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; @@ -558,6 +561,13 @@ static vpx_codec_err_t set_rc_max_intra_bitrate_pct(vpx_codec_alg_priv_t *ctx, return update_extracfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_rc_gf_cbr_boost_pct(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp8_extracfg extra_cfg = ctx->vp8_cfg; + extra_cfg.gf_cbr_boost_pct = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args); + return update_extracfg(ctx, &extra_cfg); +} + static vpx_codec_err_t set_screen_content_mode(vpx_codec_alg_priv_t *ctx, va_list args) { struct vp8_extracfg extra_cfg = ctx->vp8_cfg; @@ -1159,6 +1169,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] = { { VP8E_SET_CQ_LEVEL, set_cq_level }, { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct }, { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode }, + { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct }, { -1, NULL }, }; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c index efcf2bf885b..a254e79d20e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_frame_buffers.c @@ -52,14 +52,12 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size, if (i == int_fb_list->num_internal_frame_buffers) return -1; if (int_fb_list->int_fb[i].size < min_size) { - int_fb_list->int_fb[i].data = - (uint8_t *)vpx_realloc(int_fb_list->int_fb[i].data, min_size); - if (!int_fb_list->int_fb[i].data) return -1; - - // This memset is needed for fixing valgrind error from C loop filter + vpx_free(int_fb_list->int_fb[i].data); + // The data must be zeroed to fix a valgrind error from the C loop filter // due to access uninitialized memory in frame border. It could be - // removed if border is totally removed. - memset(int_fb_list->int_fb[i].data, 0, min_size); + // skipped if border were totally removed. 
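    // (Annotation, not part of the patch: the replacement below swaps
    // vpx_realloc() + memset() for vpx_free() + vpx_calloc(); calloc
    // provides the required zero fill directly and avoids realloc's
    // copy of the old contents, which the memset overwrote anyway.)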
+ int_fb_list->int_fb[i].data = (uint8_t *)vpx_calloc(1, min_size); + if (!int_fb_list->int_fb[i].data) return -1; int_fb_list->int_fb[i].size = min_size; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c index c6a39f85ca0..e3a088e2870 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_idct.c @@ -331,8 +331,8 @@ void vp9_highbd_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride, // DC only DCT coefficient if (eob == 1) { vpx_highbd_idct8x8_1_add(input, dest, stride, bd); - } else if (eob <= 10) { - vpx_highbd_idct8x8_10_add(input, dest, stride, bd); + } else if (eob <= 12) { + vpx_highbd_idct8x8_12_add(input, dest, stride, bd); } else { vpx_highbd_idct8x8_64_add(input, dest, stride, bd); } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl index fafc6598393..abef0676396 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vp9/common/vp9_rtcd_defs.pl @@ -137,6 +137,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; add_proto qw/void vp9_fdct8x8_quant/, "const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/vp9_fdct8x8_quant ssse3/; } else { add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; specialize qw/vp9_block_error avx2 msa sse2/; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c index fde0b7e318c..628d1c8d2bc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodeframe.c @@ -1517,7 +1517,6 @@ static int tile_worker_hook(TileWorkerData *const tile_data, return 0; } - tile_data->xd.error_info = &tile_data->error_info; tile_data->xd.corrupted = 0; do { @@ -1529,6 +1528,8 @@ static int tile_worker_hook(TileWorkerData *const tile_data, &tile_data->error_info, &tile_data->bit_reader, pbi->decrypt_cb, pbi->decrypt_state); vp9_init_macroblockd(&pbi->common, &tile_data->xd, tile_data->dqcoeff); + // init resets xd.error_info + tile_data->xd.error_info = &tile_data->error_info; for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end; mi_row += MI_BLOCK_SIZE) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c index 4372ba0371d..1a4152436a2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c +++ 
b/chromium/third_party/libvpx/source/libvpx/vp9/decoder/vp9_decodemv.c @@ -770,6 +770,10 @@ static void read_inter_block_mode_info(VP9Decoder *const pbi, int idx, idy; PREDICTION_MODE b_mode; int_mv best_sub8x8[2]; + const uint32_t invalid_mv = 0x80008000; + // Initialize the 2nd element as even though it won't be used meaningfully + // if is_compound is false, copying/clamping it may trigger a MSan warning. + best_sub8x8[1].as_int = invalid_mv; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { const int j = idy * 2 + idx; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c index 3f1c430f98d..49aea69ebd1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.c @@ -80,8 +80,8 @@ static void prob_diff_update(const vpx_tree_index *tree, vp9_cond_prob_diff_update(w, &probs[i], branch_ct[i]); } -static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, - vpx_writer *w) { +static void write_selected_tx_size(const VP9_COMMON *cm, + const MACROBLOCKD *const xd, vpx_writer *w) { TX_SIZE tx_size = xd->mi[0]->tx_size; BLOCK_SIZE bsize = xd->mi[0]->sb_type; const TX_SIZE max_tx_size = max_txsize_lookup[bsize]; @@ -95,7 +95,7 @@ static void write_selected_tx_size(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static int write_skip(const VP9_COMMON *cm, const MACROBLOCKD *const xd, int segment_id, const MODE_INFO *mi, vpx_writer *w) { if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) { return 1; @@ -195,7 +195,7 @@ static void write_segment_id(vpx_writer *w, const struct segmentation *seg, } // This function encodes the reference frame -static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, +static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *const xd, vpx_writer *w) { const MODE_INFO *const mi = xd->mi[0]; const int is_compound = has_second_ref(mi); @@ -230,14 +230,16 @@ static void write_ref_frames(const VP9_COMMON *cm, const MACROBLOCKD *xd, } } -static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, - vpx_writer *w) { +static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd, + const MB_MODE_INFO_EXT *const mbmi_ext, + vpx_writer *w, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES] + [SWITCHABLE]) { VP9_COMMON *const cm = &cpi->common; const nmv_context *nmvc = &cm->fc->nmvc; - const MACROBLOCK *const x = &cpi->td.mb; - const MACROBLOCKD *const xd = &x->e_mbd; const struct segmentation *const seg = &cm->seg; - const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext; + const MODE_INFO *const mi = xd->mi[0]; const PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; const BLOCK_SIZE bsize = mi->sb_type; @@ -299,7 +301,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, vp9_write_token(w, vp9_switchable_interp_tree, cm->fc->switchable_interp_prob[ctx], &switchable_interp_encodings[mi->interp_filter]); - ++cpi->interp_filter_selected[0][mi->interp_filter]; + ++interp_filter_selected[0][mi->interp_filter]; } else { assert(mi->interp_filter == cm->interp_filter); } @@ -317,7 +319,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, 
&mi->bmi[j].as_mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, - nmvc, allow_hp); + nmvc, allow_hp, max_mv_magnitude); } } } @@ -326,16 +328,16 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, const MODE_INFO *mi, for (ref = 0; ref < 1 + is_compound; ++ref) vp9_encode_mv(cpi, w, &mi->mv[ref].as_mv, &mbmi_ext->ref_mvs[mi->ref_frame[ref]][0].as_mv, nmvc, - allow_hp); + allow_hp, max_mv_magnitude); } } } } static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, - MODE_INFO **mi_8x8, vpx_writer *w) { + vpx_writer *w) { const struct segmentation *const seg = &cm->seg; - const MODE_INFO *const mi = mi_8x8[0]; + const MODE_INFO *const mi = xd->mi[0]; const MODE_INFO *const above_mi = xd->above_mi; const MODE_INFO *const left_mi = xd->left_mi; const BLOCK_SIZE bsize = mi->sb_type; @@ -366,27 +368,29 @@ static void write_mb_modes_kf(const VP9_COMMON *cm, const MACROBLOCKD *xd, write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]); } -static void write_modes_b(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col) { +static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES] + [SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; + const MB_MODE_INFO_EXT *const mbmi_ext = + cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); MODE_INFO *m; xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col); m = xd->mi[0]; - cpi->td.mb.mbmi_ext = - cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col); - set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[m->sb_type], mi_col, num_8x8_blocks_wide_lookup[m->sb_type], cm->mi_rows, cm->mi_cols); if (frame_is_intra_only(cm)) { - write_mb_modes_kf(cm, xd, xd->mi, w); + write_mb_modes_kf(cm, xd, w); } else { - pack_inter_mode_mvs(cpi, m, w); + pack_inter_mode_mvs(cpi, xd, mbmi_ext, w, max_mv_magnitude, + interp_filter_selected); } assert(*tok < tok_end); @@ -415,13 +419,14 @@ static void write_partition(const VP9_COMMON *const cm, } } -static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end, int mi_row, - int mi_col, BLOCK_SIZE bsize) { +static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + int mi_row, int mi_col, BLOCK_SIZE bsize, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES] + [SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; - const int bsl = b_width_log2_lookup[bsize]; const int bs = (1 << bsl) / 4; PARTITION_TYPE partition; @@ -436,30 +441,37 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w); subsize = get_subsize(bsize, partition); if (subsize < BLOCK_8X8) { - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); } else { switch (partition) { case PARTITION_NONE: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, 
tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_HORZ: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); if (mi_row + bs < cm->mi_rows) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_VERT: - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + max_mv_magnitude, interp_filter_selected); if (mi_col + bs < cm->mi_cols) - write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs); + write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + max_mv_magnitude, interp_filter_selected); break; case PARTITION_SPLIT: - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col, - subsize); - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, - subsize); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, + max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col, + subsize, max_mv_magnitude, interp_filter_selected); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, + subsize, max_mv_magnitude, interp_filter_selected); break; default: assert(0); } @@ -471,11 +483,13 @@ static void write_modes_sb(VP9_COMP *cpi, const TileInfo *const tile, update_partition_context(xd, mi_row, mi_col, subsize, bsize); } -static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, - vpx_writer *w, TOKENEXTRA **tok, - const TOKENEXTRA *const tok_end) { +static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd, + const TileInfo *const tile, vpx_writer *w, + TOKENEXTRA **tok, const TOKENEXTRA *const tok_end, + unsigned int *const max_mv_magnitude, + int interp_filter_selected[MAX_REF_FRAMES] + [SWITCHABLE]) { const VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; int mi_row, mi_col; set_partition_probs(cm, xd); @@ -485,7 +499,8 @@ static void write_modes(VP9_COMP *cpi, const TileInfo *const tile, vp9_zero(xd->left_seg_context); for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end; mi_col += MI_BLOCK_SIZE) - write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, BLOCK_64X64); + write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, + BLOCK_64X64, max_mv_magnitude, interp_filter_selected); } } @@ -900,8 +915,128 @@ int vp9_get_refresh_mask(VP9_COMP *cpi) { } } +static int encode_tile_worker(VP9_COMP *cpi, VP9BitstreamWorkerData *data) { + MACROBLOCKD *const xd = &data->xd; + vpx_start_encode(&data->bit_writer, data->dest); + write_modes(cpi, xd, &cpi->tile_data[data->tile_idx].tile_info, + &data->bit_writer, &data->tok, data->tok_end, + &data->max_mv_magnitude, data->interp_filter_selected); + assert(data->tok == data->tok_end); + vpx_stop_encode(&data->bit_writer); + return 1; +} + +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi) { + if (cpi->vp9_bitstream_worker_data) { + int i; + for (i = 1; i < cpi->num_workers; ++i) { + 
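// Worker 0 has no dest of its own: in encode_tiles_mt() below it writes + // straight into the caller's output buffer, so only workers + // 1..num_workers-1 own a heap allocation that needs freeing here. +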
vpx_free(cpi->vp9_bitstream_worker_data[i].dest); + } + vpx_free(cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = NULL; + } +} + +static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) { + int i; + const size_t worker_data_size = + cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data); + cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size); + if (!cpi->vp9_bitstream_worker_data) return 1; + memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size); + for (i = 1; i < cpi->num_workers; ++i) { + cpi->vp9_bitstream_worker_data[i].dest_size = + cpi->oxcf.width * cpi->oxcf.height; + cpi->vp9_bitstream_worker_data[i].dest = + vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size); + if (!cpi->vp9_bitstream_worker_data[i].dest) return 1; + } + return 0; +} + +static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = cpi->num_workers; + size_t total_size = 0; + int tile_col = 0; + + if (!cpi->vp9_bitstream_worker_data || + cpi->vp9_bitstream_worker_data[1].dest_size != + (cpi->oxcf.width * cpi->oxcf.height)) { + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + if (encode_tiles_buffer_alloc(cpi)) return 0; + } + + while (tile_col < tile_cols) { + int i, j; + for (i = 0; i < num_workers && tile_col < tile_cols; ++i) { + VPxWorker *const worker = &cpi->workers[i]; + VP9BitstreamWorkerData *const data = &cpi->vp9_bitstream_worker_data[i]; + + // Populate the worker data. + data->xd = cpi->td.mb.e_mbd; + data->tile_idx = tile_col; + data->tok = cpi->tile_tok[0][tile_col]; + data->tok_end = cpi->tile_tok[0][tile_col] + cpi->tok_count[0][tile_col]; + data->max_mv_magnitude = cpi->max_mv_magnitude; + memset(data->interp_filter_selected, 0, + sizeof(data->interp_filter_selected[0][0]) * SWITCHABLE); + + // First thread can directly write into the output buffer. + if (i == 0) { + // If this worker happens to be for the last tile, then do not offset it + // by 4 for the tile size. + data->dest = + data_ptr + total_size + (tile_col == tile_cols - 1 ? 0 : 4); + } + worker->data1 = cpi; + worker->data2 = data; + worker->hook = (VPxWorkerHook)encode_tile_worker; + worker->had_error = 0; + + if (i < num_workers - 1) { + winterface->launch(worker); + } else { + winterface->execute(worker); + } + ++tile_col; + } + for (j = 0; j < i; ++j) { + VPxWorker *const worker = &cpi->workers[j]; + VP9BitstreamWorkerData *const data = + (VP9BitstreamWorkerData *)worker->data2; + uint32_t tile_size; + int k; + + if (!winterface->sync(worker)) return 0; + tile_size = data->bit_writer.pos; + + // Aggregate per-thread bitstream stats. + cpi->max_mv_magnitude = + VPXMAX(cpi->max_mv_magnitude, data->max_mv_magnitude); + for (k = 0; k < SWITCHABLE; ++k) { + cpi->interp_filter_selected[0][k] += data->interp_filter_selected[0][k]; + } + + // Prefix the size of the tile on all but the last. 
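+ // e.g., with three tiles the packed layout is + // [be32 size0][tile0][be32 size1][tile1][tile2] + // -- a 4-byte big-endian length (mem_put_be32 below) precedes every tile + // except the final one, whose extent is implied by the total frame size.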
+ if (tile_col != tile_cols || j < i - 1) { + mem_put_be32(data_ptr + total_size, tile_size); + total_size += 4; + } + if (j > 0) { + memcpy(data_ptr + total_size, data->dest, tile_size); + } + total_size += tile_size; + } + } + return total_size; +} + static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &cpi->td.mb.e_mbd; vpx_writer residual_bc; int tile_row, tile_col; TOKENEXTRA *tok_end; @@ -912,6 +1047,14 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { memset(cm->above_seg_context, 0, sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols)); + // Encoding tiles in parallel is done only for realtime mode now. In other + // modes the speed up is insignificant and requires further testing to ensure + // that it does not make the overall process worse in any case. + if (cpi->oxcf.mode == REALTIME && cpi->num_workers > 1 && tile_rows == 1 && + tile_cols > 1) { + return encode_tiles_mt(cpi, data_ptr); + } + for (tile_row = 0; tile_row < tile_rows; tile_row++) { for (tile_col = 0; tile_col < tile_cols; tile_col++) { int tile_idx = tile_row * tile_cols + tile_col; @@ -925,8 +1068,9 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { else vpx_start_encode(&residual_bc, data_ptr + total_size); - write_modes(cpi, &cpi->tile_data[tile_idx].tile_info, &residual_bc, &tok, - tok_end); + write_modes(cpi, xd, &cpi->tile_data[tile_idx].tile_info, &residual_bc, + &tok, tok_end, &cpi->max_mv_magnitude, + cpi->interp_filter_selected); assert(tok == tok_end); vpx_stop_encode(&residual_bc); if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) { @@ -938,7 +1082,6 @@ static size_t encode_tiles(VP9_COMP *cpi, uint8_t *data_ptr) { total_size += residual_bc.pos; } } - return total_size; } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h index 8c97d37f77e..044a3bbc7bc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_bitstream.h @@ -17,8 +17,26 @@ extern "C" { #include "vp9/encoder/vp9_encoder.h" +typedef struct VP9BitstreamWorkerData { + uint8_t *dest; + int dest_size; + TOKENEXTRA *tok; + TOKENEXTRA *tok_end; + vpx_writer bit_writer; + int tile_idx; + unsigned int max_mv_magnitude; + // The size of interp_filter_selected in VP9_COMP is actually + // MAX_REFERENCE_FRAMES x SWITCHABLE. But when encoding tiles, all we ever do + // is increment the very first index (index 0) for the first dimension. Hence + // this is sufficient. 
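+ // (encode_tiles_mt() likewise reads back only row 0 of this array when + // aggregating the per-worker counts into VP9_COMP.)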
+ int interp_filter_selected[1][SWITCHABLE]; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); +} VP9BitstreamWorkerData; + int vp9_get_refresh_mask(VP9_COMP *cpi); +void vp9_bitstream_encode_tiles_buffer_dealloc(VP9_COMP *const cpi); + void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size); static INLINE int vp9_preserve_existing_gf(VP9_COMP *cpi) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c index 335faca82b1..3ab05375ff7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodeframe.c @@ -795,7 +795,12 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, v16x16 vt2[16]; int force_split[21]; int avg_32x32; + int max_var_32x32 = 0; + int min_var_32x32 = INT_MAX; + int var_32x32; int avg_16x16[4]; + int64_t threshold_4x4avg; + NOISE_LEVEL noise_level = kLow; uint8_t *s; const uint8_t *d; int sp; @@ -829,6 +834,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } } + threshold_4x4avg = + (cpi->oxcf.speed < 8) ? thresholds[1] << 1 : thresholds[2] >> 1; + memset(x->variance_low, 0, sizeof(x->variance_low)); if (xd->mb_to_right_edge < 0) pixels_wide += (xd->mb_to_right_edge >> 3); @@ -846,7 +854,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // that the temporal reference frame will always be of type LAST_FRAME. // TODO(marpan): If that assumption is broken, we need to revisit this code. MODE_INFO *mi = xd->mi[0]; - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); const YV12_BUFFER_CONFIG *yv12_g = NULL; unsigned int y_sad_g, y_sad_thr; @@ -871,9 +879,18 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, y_sad_g = UINT_MAX; } - vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, - &cm->frame_refs[LAST_FRAME - 1].sf); - mi->ref_frame[0] = LAST_FRAME; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) { + yv12 = get_ref_frame_buffer(cpi, ALTREF_FRAME); + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[ALTREF_FRAME - 1].sf); + mi->ref_frame[0] = ALTREF_FRAME; + y_sad_g = UINT_MAX; + } else { + vp9_setup_pre_planes(xd, 0, yv12, mi_row, mi_col, + &cm->frame_refs[LAST_FRAME - 1].sf); + mi->ref_frame[0] = LAST_FRAME; + } mi->ref_frame[1] = NONE; mi->sb_type = BLOCK_64X64; mi->mv[0].as_int = 0; @@ -986,7 +1003,7 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, } if (is_key_frame || (low_res && vt.split[i].split[j].part_variances.none.variance > - (thresholds[1] << 1))) { + threshold_4x4avg)) { force_split[split_index] = 0; // Go down to 4x4 down-sampling for variance. variance4x4downsample[i2 + j] = 1; @@ -1029,6 +1046,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, // (64x64) level. 
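+ // Track each 32x32 variance as it is computed; the max/min spread across + // the four quadrants is compared further below to force a 64x64 split + // when the quadrants look very different.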
if (!force_split[i + 1]) { get_variance(&vt.split[i].part_variances.none); + var_32x32 = vt.split[i].part_variances.none.variance; + max_var_32x32 = VPXMAX(var_32x32, max_var_32x32); + min_var_32x32 = VPXMIN(var_32x32, min_var_32x32); if (vt.split[i].part_variances.none.variance > thresholds[1] || (!is_key_frame && vt.split[i].part_variances.none.variance > (thresholds[1] >> 1) && @@ -1036,15 +1056,27 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, force_split[i + 1] = 1; force_split[0] = 1; } - avg_32x32 += vt.split[i].part_variances.none.variance; + avg_32x32 += var_32x32; } } if (!force_split[0]) { fill_variance_tree(&vt, BLOCK_64X64); get_variance(&vt.part_variances.none); + if (cpi->noise_estimate.enabled) + noise_level = vp9_noise_estimate_extract_level(&cpi->noise_estimate); // If variance of this 64x64 block is above (some threshold of) the average // variance over the sub-32x32 blocks, then force this block to split. - if (!is_key_frame && vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + // Only checking this for noise level >= medium for now. + if (!is_key_frame && noise_level >= kMedium && + vt.part_variances.none.variance > (5 * avg_32x32) >> 4) + force_split[0] = 1; + // Else if the maximum 32x32 variance minus the minimum 32x32 variance in + // a 64x64 block is greater than the threshold and the maximum 32x32 variance + // is above a minimum threshold, then force the split of a 64x64 block. + // Only check this for low noise. + else if (!is_key_frame && noise_level < kMedium && + (max_var_32x32 - min_var_32x32) > 3 * (thresholds[0] >> 3) && + max_var_32x32 > thresholds[0] >> 1) force_split[0] = 1; } @@ -1863,7 +1895,7 @@ static void update_state_rt(VP9_COMP *cpi, ThreadData *td, } } - if (cm->use_prev_frame_mvs || + if (cm->use_prev_frame_mvs || !cm->error_resilient_mode || (cpi->svc.use_base_mv && cpi->svc.number_spatial_layers > 1 && cpi->svc.spatial_layer_id != cpi->svc.number_spatial_layers - 1)) { MV_REF *const frame_mvs = @@ -3942,8 +3974,10 @@ void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, int tile_row, int mi_row; // Set up pointers to per-thread motion search counters. - td->mb.m_search_count_ptr = &td->rd_counts.m_search_count; - td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count; + this_tile->m_search_count = 0; // Count of motion search hits. + this_tile->ex_search_count = 0; // Exhaustive mesh search hits. 
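+ // The search counters now live in TileDataEnc rather than in the + // per-thread RD_COUNTS, so each tile keeps its own totals and the + // cross-thread accumulation step in vp9_ethread.c goes away.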
+ td->mb.m_search_count_ptr = &this_tile->m_search_count; + td->mb.ex_search_count_ptr = &this_tile->ex_search_count; for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += MI_BLOCK_SIZE) { if (cpi->sf.use_nonrd_pick_mode) @@ -4048,6 +4082,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(x->zcoeff_blk); if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0 && + !(cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) && !cpi->use_svc) cpi->ref_frame_flags &= (~VP9_GOLD_FLAG); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c index 874a8e4b981..023d087c2ce 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.c @@ -208,7 +208,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, } void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp) { + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude) { const MV diff = { mv->row - ref->row, mv->col - ref->col }; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); usehp = usehp && use_mv_hp(ref); @@ -223,8 +224,8 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, // If auto_mv_step_size is enabled then keep track of the largest // motion vector component used. if (cpi->sf.mv.auto_mv_step_size) { - unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; - cpi->max_mv_magnitude = VPXMAX(maxv, cpi->max_mv_magnitude); + const unsigned int maxv = VPXMAX(abs(mv->row), abs(mv->col)) >> 3; + *max_mv_magnitude = VPXMAX(maxv, *max_mv_magnitude); } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h index ad77b8154f3..9fc7ab8dc45 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encodemv.h @@ -23,7 +23,8 @@ void vp9_write_nmv_probs(VP9_COMMON *cm, int usehp, vpx_writer *w, nmv_context_counts *const counts); void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, - const nmv_context *mvctx, int usehp); + const nmv_context *mvctx, int usehp, + unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context *mvctx, int usehp); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c index 12f02e7c5d9..2a58003829c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.c @@ -2030,7 +2030,10 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); - if (cpi->num_workers > 1) vp9_loop_filter_dealloc(&cpi->lf_row_sync); + if (cpi->num_workers > 1) { + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + vp9_bitstream_encode_tiles_buffer_dealloc(cpi); + } vp9_alt_ref_aq_destroy(cpi->alt_ref_aq); @@ -2438,6 +2441,8 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q, cpi->resize_pending = 1; return 1; } + // Force recode if projected_frame_size > max_frame_bandwidth + if (rc->projected_frame_size >= rc->max_frame_bandwidth) return 1; // TODO(agrange) high_limit could be 
greater than the scale-down threshold. if ((rc->projected_frame_size > high_limit && q < maxq) || @@ -2796,7 +2801,7 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { dc_quant_devisor = 4.0; #endif - fprintf(f, "%10u %dx%d %10d %10d %d %d %10d %10d %10d %10d" + fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d" "%10"PRId64" %10"PRId64" %5d %5d %10"PRId64" " "%10"PRId64" %10"PRId64" %10d " "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf" @@ -2805,8 +2810,6 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { "%10lf %8u %10"PRId64" %10d %10d %10d %10d %10d\n", cpi->common.current_video_frame, cm->width, cm->height, - cpi->td.rd_counts.m_search_count, - cpi->td.rd_counts.ex_search_count, cpi->rc.source_alt_ref_pending, cpi->rc.source_alt_ref_active, cpi->rc.this_frame_target, @@ -3124,7 +3127,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, if (cpi->oxcf.pass == 0 && cpi->oxcf.mode == REALTIME && cpi->oxcf.speed >= 5 && cpi->resize_state == 0 && (cpi->oxcf.content == VP9E_CONTENT_SCREEN || - cpi->oxcf.rc_mode == VPX_VBR)) + cpi->oxcf.rc_mode == VPX_VBR) && + cm->show_frame) vp9_avg_source_sad(cpi); // For 1 pass SVC, since only ZEROMV is allowed for upsampled reference @@ -3214,6 +3218,13 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, vpx_clear_system_state(); } +#define MAX_QSTEP_ADJ 4 +static int get_qstep_adj(int rate_excess, int rate_limit) { + int qstep = + rate_limit ? ((rate_excess + rate_limit / 2) / rate_limit) : INT_MAX; + return VPXMIN(qstep, MAX_QSTEP_ADJ); +} + static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, uint8_t *dest) { VP9_COMMON *const cm = &cpi->common; @@ -3387,6 +3398,7 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, // to attempt to recode. int last_q = q; int retries = 0; + int qstep; if (cpi->resize_pending == 1) { // Change in frame size so go back around the recode loop. @@ -3412,7 +3424,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, q_high = rc->worst_quality; // Raise Qlow as to at least the current value - q_low = q < q_high ? q + 1 : q_high; + qstep = + get_qstep_adj(rc->projected_frame_size, rc->this_frame_target); + q_low = VPXMIN(q + qstep, q_high); + // q_low = q < q_high ? q + 1 : q_high; if (undershoot_seen || loop_at_this_size > 1) { // Update rate_correction_factor unless @@ -3437,7 +3452,10 @@ static void encode_with_recode_loop(VP9_COMP *cpi, size_t *size, overshoot_seen = 1; } else { // Frame is too small - q_high = q > q_low ? q - 1 : q_low; + qstep = + get_qstep_adj(rc->this_frame_target, rc->projected_frame_size); + q_high = VPXMAX(q - qstep, q_low); + // q_high = q > q_low ? 
q - 1 : q_low; if (overshoot_seen || loop_at_this_size > 1) { vp9_rc_update_rate_correction_factors(cpi); @@ -4477,7 +4495,8 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, cpi->svc.layer_context[cpi->svc.spatial_layer_id].has_alt_frame = 1; #endif - if ((oxcf->arnr_max_frames > 0) && (oxcf->arnr_strength > 0)) { + if ((oxcf->mode != REALTIME) && (oxcf->arnr_max_frames > 0) && + (oxcf->arnr_strength > 0)) { int bitrate = cpi->rc.avg_frame_bandwidth / 40; int not_low_bitrate = bitrate > ALT_REF_AQ_LOW_BITRATE_BOUNDARY; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h index 66e41492b57..0007e6395da 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_encoder.h @@ -267,14 +267,14 @@ typedef struct TileDataEnc { TileInfo tile_info; int thresh_freq_fact[BLOCK_SIZES][MAX_MODES]; int mode_map[BLOCK_SIZES][MAX_MODES]; + int m_search_count; + int ex_search_count; } TileDataEnc; typedef struct RD_COUNTS { vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS]; - int m_search_count; - int ex_search_count; } RD_COUNTS; typedef struct ThreadData { @@ -601,6 +601,7 @@ typedef struct VP9_COMP { VPxWorker *workers; struct EncWorkerData *tile_thr_data; VP9LfSync lf_row_sync; + struct VP9BitstreamWorkerData *vp9_bitstream_worker_data; int keep_level_stats; Vp9LevelInfo level_info; @@ -735,7 +736,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) { } static INLINE int is_altref_enabled(const VP9_COMP *const cpi) { - return cpi->oxcf.mode != REALTIME && cpi->oxcf.lag_in_frames > 0 && + return !(cpi->oxcf.mode == REALTIME && cpi->oxcf.rc_mode == VPX_CBR) && + cpi->oxcf.lag_in_frames > 0 && (cpi->oxcf.enable_auto_arf && (!is_two_pass_svc(cpi) || cpi->oxcf.ss_enable_auto_arf[cpi->svc.spatial_layer_id])); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c index 7657573bbf0..f4f7c7baccd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ethread.c @@ -30,10 +30,6 @@ static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { for (n = 0; n < ENTROPY_TOKENS; n++) td->rd_counts.coef_counts[i][j][k][l][m][n] += td_t->rd_counts.coef_counts[i][j][k][l][m][n]; - - // Counts of all motion searches and exhuastive mesh searches. 
- td->rd_counts.m_search_count += td_t->rd_counts.m_search_count; - td->rd_counts.ex_search_count += td_t->rd_counts.ex_search_count; } static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c index 2f1fe360d85..788952d3467 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_firstpass.c @@ -48,10 +48,8 @@ #define FIRST_PASS_Q 10.0 #define GF_MAX_BOOST 96.0 #define INTRA_MODE_PENALTY 1024 -#define KF_MAX_BOOST 128.0 #define MIN_ARF_GF_BOOST 240 #define MIN_DECAY_FACTOR 0.01 -#define MIN_KF_BOOST 300 #define NEW_MV_MODE_PENALTY 32 #define SVC_FACTOR_PT_LOW 0.45 #define DARK_THRESH 64 @@ -1578,7 +1576,7 @@ static double get_sr_decay_rate(const VP9_COMP *cpi, sr_decay = 1.0 - (SR_DIFF_PART * sr_diff) - motion_amplitude_part - (INTRA_PART * modified_pcnt_intra); } - return VPXMAX(sr_decay, VPXMIN(DEFAULT_DECAY_LIMIT, modified_pct_inter)); + return VPXMAX(sr_decay, DEFAULT_DECAY_LIMIT); } // This function gives an estimate of how badly we believe the prediction @@ -1681,6 +1679,7 @@ static void accumulate_frame_motion_stats(const FIRSTPASS_STATS *stats, #define BASELINE_ERR_PER_MB 1000.0 static double calc_frame_boost(VP9_COMP *cpi, const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, double this_frame_mv_in_out, double max_boost) { double frame_boost; const double lq = vp9_convert_qindex_to_q( cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); const double boost_q_correction = VPXMIN((0.5 + (lq * 0.015)), 1.5); int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs : cpi->common.MBs; // Correct for any inactive region in the image num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); // Underlying boost factor is based on inter error ratio. frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / - DOUBLE_DIVIDE_CHECK(this_frame->coded_error); + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out. + if (this_frame_mv_in_out > 0.0) + frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); + + // Q correction and scaling frame_boost = frame_boost * BOOST_FACTOR * boost_q_correction; - // Increase boost for frames where new data coming into frame (e.g. zoom out). - // Slightly reduce boost if there is a net balance of motion out of the frame - // (zoom in). The range for this_frame_mv_in_out is -1.0 to +1.0. + return VPXMIN(frame_boost, max_boost * boost_q_correction); +} + +#define KF_BOOST_FACTOR 12.5 +static double calc_kf_frame_boost(VP9_COMP *cpi, + const FIRSTPASS_STATS *this_frame, + double *sr_accumulator, + double this_frame_mv_in_out, + double max_boost) { + double frame_boost; + const double lq = vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth); + const double boost_q_correction = VPXMIN((0.50 + (lq * 0.015)), 2.00); + int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + + // Correct for any inactive region in the image + num_mbs = (int)VPXMAX(1, num_mbs * calculate_active_area(cpi, this_frame)); + + // Underlying boost factor is based on inter error ratio. 
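+ // (Roughly: raw boost = 1000 * active MBs / coded_error, so a frame whose + // per-MB residual is half the BASELINE_ERR_PER_MB baseline earns about a + // 2x raw boost before the KF_BOOST_FACTOR and q-correction scaling below.)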
+ frame_boost = (BASELINE_ERR_PER_MB * num_mbs) / + DOUBLE_DIVIDE_CHECK(this_frame->coded_error + *sr_accumulator); + + // Update the accumulator for second ref error difference. + // This is intended to give an indication of how much the coded error is + // increasing over time. + *sr_accumulator += (this_frame->sr_coded_error - this_frame->coded_error); + *sr_accumulator = VPXMAX(0.0, *sr_accumulator); + + // Small adjustment for cases where there is a zoom out. if (this_frame_mv_in_out > 0.0) frame_boost += frame_boost * (this_frame_mv_in_out * 2.0); - // In the extreme case the boost is halved. - else - frame_boost += frame_boost * (this_frame_mv_in_out / 2.0); + + // Q correction and scaling + frame_boost = frame_boost * KF_BOOST_FACTOR * boost_q_correction; return VPXMIN(frame_boost, max_boost * boost_q_correction); } @@ -1719,6 +1757,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, double this_frame_mv_in_out = 0.0; double mv_in_out_accumulator = 0.0; double abs_mv_in_out_accumulator = 0.0; + double sr_accumulator = 0.0; int arf_boost; int flash_detected = 0; @@ -1745,9 +1784,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *f_boost = (int)boost_score; @@ -1759,6 +1799,7 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, this_frame_mv_in_out = 0.0; mv_in_out_accumulator = 0.0; abs_mv_in_out_accumulator = 0.0; + sr_accumulator = 0.0; // Search backward towards last gf position. for (i = -1; i >= -b_frames; --i) { @@ -1783,9 +1824,10 @@ static int calc_arf_boost(VP9_COMP *cpi, int offset, int f_frames, int b_frames, : decay_accumulator; } - boost_score += - decay_accumulator * - calc_frame_boost(cpi, this_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, this_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); } *b_boost = (int)boost_score; @@ -2085,7 +2127,6 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_ratio_accumulator = 0.0; double decay_accumulator = 1.0; double zero_motion_accumulator = 1.0; - double loop_decay_rate = 1.00; double last_loop_decay_rate = 1.00; @@ -2095,6 +2136,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double mv_ratio_accumulator_thresh; double mv_in_out_thresh; double abs_mv_in_out_thresh; + double sr_accumulator = 0.0; unsigned int allow_alt_ref = is_altref_enabled(cpi); int f_boost = 0; @@ -2221,9 +2263,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate a boost number for this frame. - boost_score += - decay_accumulator * - calc_frame_boost(cpi, &next_frame, this_frame_mv_in_out, GF_MAX_BOOST); + sr_accumulator = 0.0; + boost_score += decay_accumulator * + calc_frame_boost(cpi, &next_frame, &sr_accumulator, + this_frame_mv_in_out, GF_MAX_BOOST); // Break out conditions. 
if ( @@ -2473,6 +2516,10 @@ static int test_candidate_kf(TWO_PASS *twopass, } #define FRAMES_TO_CHECK_DECAY 8 +#define KF_MAX_FRAME_BOOST 96.0 +#define MIN_KF_TOT_BOOST 300 +#define MAX_KF_TOT_BOOST 5400 +#define KF_BOOST_SCAN_MAX_FRAMES 32 static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { int i, j; @@ -2485,14 +2532,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS next_frame; FIRSTPASS_STATS last_frame; int kf_bits = 0; - int loop_decay_counter = 0; double decay_accumulator = 1.0; - double av_decay_accumulator = 0.0; double zero_motion_accumulator = 1.0; double boost_score = 0.0; double kf_mod_err = 0.0; double kf_group_err = 0.0; double recent_loop_decay[FRAMES_TO_CHECK_DECAY]; + double sr_accumulator = 0.0; vp9_zero(next_frame); @@ -2642,34 +2688,36 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Scan through the kf group collating various stats used to determine // how many bits to spend on it. - decay_accumulator = 1.0; boost_score = 0.0; + for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - // Monitor for static sections. - zero_motion_accumulator = VPXMIN(zero_motion_accumulator, - get_zero_motion_factor(cpi, &next_frame)); - - // Not all frames in the group are necessarily used in calculating boost. - if ((i <= rc->max_gf_interval) || - ((i <= (rc->max_gf_interval * 4)) && (decay_accumulator > 0.5))) { - const double frame_boost = - calc_frame_boost(cpi, &next_frame, 0, KF_MAX_BOOST); - - // How fast is prediction quality decaying. - if (!detect_flash(twopass, 0)) { - const double loop_decay_rate = - get_prediction_decay_rate(cpi, &next_frame); - decay_accumulator *= loop_decay_rate; - decay_accumulator = VPXMAX(decay_accumulator, MIN_DECAY_FACTOR); - av_decay_accumulator += decay_accumulator; - ++loop_decay_counter; - } - boost_score += (decay_accumulator * frame_boost); + if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + double frame_boost; + double zm_factor; + + // Monitor for static sections. + zero_motion_accumulator = VPXMIN( + zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + + // Factor 0.75-1.25 based on how much of the frame is static. + zm_factor = (0.75 + (zero_motion_accumulator / 2.0)); + + // The second (lagging) ref error is not valid immediately after + // a key frame because either the lag has not built up (in the case of + // the first key frame) or it points to a reference before the new key + // frame. + if (i < 2) sr_accumulator = 0.0; + frame_boost = calc_kf_frame_boost(cpi, &next_frame, &sr_accumulator, 0, + KF_MAX_FRAME_BOOST * zm_factor); + + boost_score += frame_boost; + if (frame_boost < 25.00) break; + } else { + break; } } - av_decay_accumulator /= (double)loop_decay_counter; reset_fpf_position(twopass, start_position); @@ -2681,9 +2729,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { start_position, twopass->stats_in_end, rc->frames_to_key); // Apply various clamps for min and max boost - rc->kf_boost = (int)(av_decay_accumulator * boost_score); - rc->kf_boost = VPXMAX(rc->kf_boost, (rc->frames_to_key * 3)); - rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_BOOST); + rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3)); + rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST); + rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST); // Work out how many bits to allocate for the key frame itself. 
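// (kf_boost reaches this point already confined by the clamps above to the range [VPXMAX(3 * frames_to_key, MIN_KF_TOT_BOOST), MAX_KF_TOT_BOOST].)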
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c index 76d2611d89f..2b7ddbcd948 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_pickmode.c @@ -1080,12 +1080,14 @@ typedef struct { PREDICTION_MODE pred_mode; } REF_MODE; -#define RT_INTER_MODES 8 +#define RT_INTER_MODES 12 static const REF_MODE ref_mode_set[RT_INTER_MODES] = { { LAST_FRAME, ZEROMV }, { LAST_FRAME, NEARESTMV }, { GOLDEN_FRAME, ZEROMV }, { LAST_FRAME, NEARMV }, { LAST_FRAME, NEWMV }, { GOLDEN_FRAME, NEARESTMV }, - { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV } + { GOLDEN_FRAME, NEARMV }, { GOLDEN_FRAME, NEWMV }, + { ALTREF_FRAME, ZEROMV }, { ALTREF_FRAME, NEARESTMV }, + { ALTREF_FRAME, NEARMV }, { ALTREF_FRAME, NEWMV } }; static const REF_MODE ref_mode_set_svc[RT_INTER_MODES] = { { LAST_FRAME, ZEROMV }, { GOLDEN_FRAME, ZEROMV }, @@ -1467,6 +1469,10 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, usable_ref_frame = GOLDEN_FRAME; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + (cpi->rc.alt_ref_gf_group || cpi->rc.is_src_frame_alt_ref)) + usable_ref_frame = ALTREF_FRAME; + // For svc mode, on spatial_layer_id > 0: if the reference has different scale // constrain the inter mode to only test zero motion. if (cpi->use_svc && svc->force_zero_mode_spatial_ref && @@ -1506,7 +1512,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int this_early_term = 0; PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode; - if (cpi->use_svc) this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set[idx].ref_frame; + + if (cpi->use_svc) { + this_mode = ref_mode_set_svc[idx].pred_mode; + ref_frame = ref_mode_set_svc[idx].ref_frame; + } + if (ref_frame > usable_ref_frame) continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && this_mode != NEARESTMV) { @@ -1515,9 +1527,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode))) continue; - ref_frame = ref_mode_set[idx].ref_frame; - if (cpi->use_svc) { - ref_frame = ref_mode_set_svc[idx].ref_frame; + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR) { + if (cpi->rc.is_src_frame_alt_ref && + (ref_frame != ALTREF_FRAME || + frame_mv[this_mode][ref_frame].as_int != 0)) + continue; + + if (cpi->rc.alt_ref_gf_group && + cpi->rc.frames_since_golden > (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == GOLDEN_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; + + if (cpi->rc.alt_ref_gf_group && + cpi->rc.frames_since_golden < (cpi->rc.baseline_gf_interval >> 1) && + ref_frame == ALTREF_FRAME && + frame_mv[this_mode][ref_frame].as_int != 0) + continue; } if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue; @@ -1543,13 +1569,27 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; } - if (!force_skip_low_temp_var && + if (sf->reference_masking && !(frame_mv[this_mode][ref_frame].as_int == 0 && ref_frame == LAST_FRAME)) { - i = (ref_frame == LAST_FRAME) ? 
GOLDEN_FRAME : LAST_FRAME; - if ((cpi->ref_frame_flags & flag_list[i]) && sf->reference_masking) - if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + if (usable_ref_frame < ALTREF_FRAME) { + if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) { + i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME; + if ((cpi->ref_frame_flags & flag_list[i])) + if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1)) + ref_frame_skip_mask |= (1 << ref_frame); + } + } else if (!cpi->rc.is_src_frame_alt_ref && + !(frame_mv[this_mode][ref_frame].as_int == 0 && + ref_frame == ALTREF_FRAME)) { + int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME; + int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME; + if (((cpi->ref_frame_flags & flag_list[ref1]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) || + ((cpi->ref_frame_flags & flag_list[ref2]) && + (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1)))) ref_frame_skip_mask |= (1 << ref_frame); + } } if (ref_frame_skip_mask & (1 << ref_frame)) continue; @@ -1884,6 +1924,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, svc_force_zero_mode[best_ref_frame - 1]); inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh; } + if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->rc.is_src_frame_alt_ref) + perform_intra_pred = 0; // Perform intra prediction search, if the best SAD is above a certain // threshold. if ((!force_skip_low_temp_var || bsize < BLOCK_32X32) && perform_intra_pred && diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c index 93eddd655ac..b5cfd5de6c6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.c @@ -45,6 +45,9 @@ #define FRAME_OVERHEAD_BITS 200 +// Use this macro to turn on/off use of alt-refs in one-pass mode. +#define USE_ALTREF_FOR_ONE_PASS 0 + #if CONFIG_VP9_HIGHBITDEPTH #define ASSIGN_MINQ_TABLE(bit_depth, name) \ do { \ @@ -327,6 +330,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->prev_avg_source_sad_lag = 0; rc->high_source_sad = 0; rc->high_source_sad_lagindex = -1; + rc->alt_ref_gf_group = 0; rc->fac_active_worst_inter = 150; rc->fac_active_worst_gf = 100; rc->force_qpmin = 0; @@ -561,6 +565,13 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, q = clamp(q, VPXMIN(cpi->rc.q_1_frame, cpi->rc.q_2_frame), VPXMAX(cpi->rc.q_1_frame, cpi->rc.q_2_frame)); } +#if USE_ALTREF_FOR_ONE_PASS + if (cpi->oxcf.pass == 0 && cpi->oxcf.rc_mode == VPX_VBR && + cpi->oxcf.lag_in_frames > 0 && cpi->rc.is_src_frame_alt_ref && + !cpi->rc.alt_ref_gf_group) { + q = VPXMIN(q, (q + cpi->rc.last_boosted_qindex) >> 1); + } +#endif return q; } @@ -1429,24 +1440,16 @@ void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) { cpi->rc.rc_1_frame = 0; } -// Use this macro to turn on/off use of alt-refs in one-pass mode. -#define USE_ALTREF_FOR_ONE_PASS 1 - static int calc_pframe_target_size_one_pass_vbr(const VP9_COMP *const cpi) { const RATE_CONTROL *const rc = &cpi->rc; - int target; const int af_ratio = rc->af_ratio_onepass_vbr; -#if USE_ALTREF_FOR_ONE_PASS - target = + int target = (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) ? 
(rc->avg_frame_bandwidth * rc->baseline_gf_interval * af_ratio) / (rc->baseline_gf_interval + af_ratio - 1) : (rc->avg_frame_bandwidth * rc->baseline_gf_interval) / (rc->baseline_gf_interval + af_ratio - 1); -#else - target = rc->avg_frame_bandwidth; -#endif return vp9_rc_clamp_pframe_target_size(cpi, target); } @@ -1499,8 +1502,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cpi->oxcf.pass == 0) { vp9_cyclic_refresh_set_golden_update(cpi); } else { - rc->baseline_gf_interval = - (rc->min_gf_interval + rc->max_gf_interval) / 2; + rc->baseline_gf_interval = VPXMIN( + 20, VPXMAX(10, (rc->min_gf_interval + rc->max_gf_interval) / 2)); } rc->af_ratio_onepass_vbr = 10; if (rc->rolling_target_bits > 0) @@ -1526,6 +1529,7 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { rc->frames_till_gf_update_due = rc->baseline_gf_interval; cpi->refresh_golden_frame = 1; rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; + rc->alt_ref_gf_group = USE_ALTREF_FOR_ONE_PASS; } if (cm->frame_type == KEY_FRAME) target = calc_iframe_target_size_one_pass_vbr(cpi); @@ -2088,8 +2092,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->high_source_sad_lagindex = high_source_sad_lagindex; // Adjust some factors for the next GF group, ignore initial key frame, // and only for lag_in_frames not too small. - if (cpi->refresh_golden_frame == 1 && cm->frame_type != KEY_FRAME && - cm->current_video_frame > 30 && cpi->oxcf.lag_in_frames > 8) { + if (cpi->refresh_golden_frame == 1 && cm->current_video_frame > 30 && + cpi->oxcf.lag_in_frames > 8) { int frame_constraint; if (rc->rolling_target_bits > 0) rate_err = @@ -2110,6 +2114,8 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { ? VPXMAX(10, rc->baseline_gf_interval >> 1) : VPXMAX(6, rc->baseline_gf_interval >> 1); } + if (rc->baseline_gf_interval > cpi->oxcf.lag_in_frames - 1) + rc->baseline_gf_interval = cpi->oxcf.lag_in_frames - 1; // Check for constraining gf_interval for up-coming scene/content changes, // or for up-coming key frame, whichever is closer. frame_constraint = rc->frames_to_key; @@ -2133,6 +2139,23 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { rc->af_ratio_onepass_vbr = 5; rc->gfu_boost = DEFAULT_GF_BOOST >> 2; } +#if USE_ALTREF_FOR_ONE_PASS + // Don't use alt-ref if there is a scene cut within the group, + // or content is not low. + if ((rc->high_source_sad_lagindex > 0 && + rc->high_source_sad_lagindex <= rc->frames_till_gf_update_due) || + (avg_source_sad_lag > 3 * sad_thresh1 >> 3)) { + rc->source_alt_ref_pending = 0; + rc->alt_ref_gf_group = 0; + } else { + rc->source_alt_ref_pending = 1; + rc->alt_ref_gf_group = 1; + // If alt-ref is used for this gf group, limit the interval. 
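+ // (10 frames keeps the alt-ref well inside the lag_in_frames window that + // the clamp above already enforces.)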
+ if (rc->baseline_gf_interval > 10 && + rc->baseline_gf_interval < rc->frames_to_key) + rc->baseline_gf_interval = 10; + } +#endif target = calc_pframe_target_size_one_pass_vbr(cpi); vp9_rc_set_frame_target(cpi, target); } @@ -2261,6 +2284,7 @@ void vp9_avg_source_sad(VP9_COMP *cpi) { cpi->ext_refresh_frame_flags_pending == 0) { int target; cpi->refresh_golden_frame = 1; + rc->source_alt_ref_pending = USE_ALTREF_FOR_ONE_PASS; rc->gfu_boost = DEFAULT_GF_BOOST >> 1; rc->baseline_gf_interval = VPXMIN(20, VPXMAX(10, rc->baseline_gf_interval)); diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h index 6006e9b051a..70aef03ffb4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_ratectrl.h @@ -160,6 +160,7 @@ typedef struct { uint64_t avg_source_sad[MAX_LAG_BUFFERS]; uint64_t prev_avg_source_sad_lag; int high_source_sad_lagindex; + int alt_ref_gf_group; int high_source_sad; int count_last_scene_change; int avg_frame_low_motion; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c index ea893609193..3e1ed50a6d2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/vp9_speed_features.c @@ -421,6 +421,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1); sf->max_delta_qindex = is_keyframe ? 20 : 15; sf->partition_search_type = REFERENCE_PARTITION; + if (cpi->oxcf.rc_mode == VPX_VBR && cpi->oxcf.lag_in_frames > 0 && + cpi->rc.is_src_frame_alt_ref) { + sf->partition_search_type = VAR_BASED_PARTITION; + } sf->use_nonrd_pick_mode = 1; sf->allow_skip_recode = 0; sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO; @@ -504,7 +508,6 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf, int speed, sf->short_circuit_low_temp_var = 2; } sf->limit_newmv_early_exit = 0; - sf->bias_golden = 0; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c index fb2a9254172..b3c3d7beb9e 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/encoder/x86/vp9_dct_ssse3.c @@ -12,14 +12,17 @@ #include <tmmintrin.h> // SSSE3 #include "./vp9_rtcd.h" +#include "./vpx_config.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/x86/fdct.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, - int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { __m128i zero; int pass; @@ -328,15 +331,15 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - 
_mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -398,20 +401,21 @@ void vp9_fdct8x8_quant_ssse3( qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign); qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), qcoeff0); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, qcoeff1); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), coeff0); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, coeff1); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } else { - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + // Maybe a more efficient way to store 0? + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); } } @@ -452,10 +456,10 @@ void vp9_fdct8x8_quant_ssse3( } } else { do { - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(dqcoeff_ptr + n_coeffs) + 1, zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs), zero); - _mm_store_si128((__m128i *)(qcoeff_ptr + n_coeffs) + 1, zero); + store_zero_tran_low(dqcoeff_ptr + n_coeffs); + store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8); + store_zero_tran_low(qcoeff_ptr + n_coeffs); + store_zero_tran_low(qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c index 3b5dc3ddac0..0a3e84a0da2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.c @@ -467,8 +467,8 @@ static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx, // as the size of the first intra frame be better? This will // avoid too many deallocate and allocate. 
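// (vpx_free() + vpx_malloc() replaces vpx_realloc() here: the buffer's old contents are about to be overwritten anyway, so realloc's copy of them into the new block would be wasted work.)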
if (frame_worker_data->scratch_buffer_size < data_sz) { - frame_worker_data->scratch_buffer = - (uint8_t *)vpx_realloc(frame_worker_data->scratch_buffer, data_sz); + vpx_free(frame_worker_data->scratch_buffer); + frame_worker_data->scratch_buffer = (uint8_t *)vpx_malloc(data_sz); if (frame_worker_data->scratch_buffer == NULL) { set_error_detail(ctx, "Failed to reallocate scratch buffer"); return VPX_CODEC_MEM_ERROR; @@ -553,6 +553,9 @@ static vpx_codec_err_t decoder_decode(vpx_codec_alg_priv_t *ctx, ctx->decrypt_cb, ctx->decrypt_state); if (res != VPX_CODEC_OK) return res; + if (ctx->svc_decoding && ctx->svc_spatial_layer < frame_count - 1) + frame_count = ctx->svc_spatial_layer + 1; + if (ctx->frame_parallel_decode) { // Decode in frame parallel mode. When decoding in this mode, the frame // passed to the decoder must be either a normal frame or a superframe with @@ -1001,6 +1004,16 @@ static vpx_codec_err_t ctrl_set_skip_loop_filter(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_set_spatial_layer_svc(vpx_codec_alg_priv_t *ctx, + va_list args) { + ctx->svc_decoding = 1; + ctx->svc_spatial_layer = va_arg(args, int); + if (ctx->svc_spatial_layer < 0) + return VPX_CODEC_INVALID_PARAM; + else + return VPX_CODEC_OK; +} + static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VP8_COPY_REFERENCE, ctrl_copy_reference }, @@ -1011,6 +1024,7 @@ static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = { { VPXD_SET_DECRYPTOR, ctrl_set_decryptor }, { VP9_SET_BYTE_ALIGNMENT, ctrl_set_byte_alignment }, { VP9_SET_SKIP_LOOP_FILTER, ctrl_set_skip_loop_filter }, + { VP9_DECODE_SVC_SPATIAL_LAYER, ctrl_set_spatial_layer_svc }, // Getters { VP8D_GET_LAST_REF_UPDATES, ctrl_get_last_ref_updates }, diff --git a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h index cc3d51842ac..c1559599b8c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h +++ b/chromium/third_party/libvpx/source/libvpx/vp9/vp9_dx_iface.h @@ -60,6 +60,10 @@ struct vpx_codec_alg_priv { void *ext_priv; // Private data associated with the external frame buffers. vpx_get_frame_buffer_cb_fn_t get_ext_fb_cb; vpx_release_frame_buffer_cb_fn_t release_ext_fb_cb; + + // Allow for decoding up to a given spatial layer for SVC stream. 
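+ // svc_decoding is latched when VP9_DECODE_SVC_SPATIAL_LAYER is issued; + // decoder_decode() then caps frame_count at svc_spatial_layer + 1 so the + // higher layers of a superframe are simply not decoded.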
+ int svc_decoding; + int svc_spatial_layer; }; #endif // VP9_VP9_DX_IFACE_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c index 5aa0b8ddb84..88b1531d8c4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx/src/svc_encodeframe.c @@ -53,6 +53,10 @@ static const int DEFAULT_SCALE_FACTORS_NUM[VPX_SS_MAX_LAYERS] = { 4, 5, 7, 11, static const int DEFAULT_SCALE_FACTORS_DEN[VPX_SS_MAX_LAYERS] = { 16, 16, 16, 16, 16 }; +static const int DEFAULT_SCALE_FACTORS_NUM_2x[VPX_SS_MAX_LAYERS] = { 1, 2, 4 }; + +static const int DEFAULT_SCALE_FACTORS_DEN_2x[VPX_SS_MAX_LAYERS] = { 4, 4, 4 }; + typedef enum { QUANTIZER = 0, BITRATE, @@ -156,6 +160,9 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, char *token; const char *delim = ","; char *save_ptr; + int num_layers = svc_ctx->spatial_layers; + if (type == BITRATE) + num_layers = svc_ctx->spatial_layers * svc_ctx->temporal_layers; if (input == NULL || option0 == NULL || (option1 == NULL && type == SCALE_FACTOR)) @@ -163,7 +170,7 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, input_string = strdup(input); token = strtok_r(input_string, delim, &save_ptr); - for (i = 0; i < svc_ctx->spatial_layers; ++i) { + for (i = 0; i < num_layers; ++i) { if (token != NULL) { res = extract_option(type, token, option0 + i, option1 + i); if (res != VPX_CODEC_OK) break; @@ -172,11 +179,11 @@ static vpx_codec_err_t parse_layer_options_from_string(SvcContext *svc_ctx, break; } } - if (res == VPX_CODEC_OK && i != svc_ctx->spatial_layers) { + if (res == VPX_CODEC_OK && i != num_layers) { svc_log(svc_ctx, SVC_LOG_ERROR, "svc: layer params type: %d %d values required, " "but only %d specified\n", - type, svc_ctx->spatial_layers, i); + type, num_layers, i); res = VPX_CODEC_INVALID_PARAM; } free(input_string); @@ -287,24 +294,30 @@ vpx_codec_err_t vpx_svc_set_options(SvcContext *svc_ctx, const char *options) { return VPX_CODEC_OK; } -void assign_layer_bitrates(const SvcContext *svc_ctx, - vpx_codec_enc_cfg_t *const enc_cfg) { +vpx_codec_err_t assign_layer_bitrates(const SvcContext *svc_ctx, + vpx_codec_enc_cfg_t *const enc_cfg) { int i; const SvcInternal_t *const si = get_const_svc_internal(svc_ctx); int sl, tl, spatial_layer_target; if (svc_ctx->temporal_layering_mode != 0) { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int total_bitrate = 0; for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { - enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] = 0; + total_bitrate += si->bitrates[sl * svc_ctx->temporal_layers + + svc_ctx->temporal_layers - 1]; for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { enc_cfg->ss_target_bitrate[sl * svc_ctx->temporal_layers] += (unsigned int)si->bitrates[sl * svc_ctx->temporal_layers + tl]; enc_cfg->layer_target_bitrate[sl * svc_ctx->temporal_layers + tl] = si->bitrates[sl * svc_ctx->temporal_layers + tl]; + if (tl > 0 && (si->bitrates[sl * svc_ctx->temporal_layers + tl] <= + si->bitrates[sl * svc_ctx->temporal_layers + tl - 1])) + return VPX_CODEC_INVALID_PARAM; } } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -341,11 +354,14 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } else { if (si->bitrates[0] != 0) { - enc_cfg->rc_target_bitrate = 0; + unsigned int 
total_bitrate = 0; for (i = 0; i < svc_ctx->spatial_layers; ++i) { enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i]; - enc_cfg->rc_target_bitrate += si->bitrates[i]; + enc_cfg->layer_target_bitrate[i] = (unsigned int)si->bitrates[i]; + total_bitrate += si->bitrates[i]; } + if (total_bitrate != enc_cfg->rc_target_bitrate) + return VPX_CODEC_INVALID_PARAM; } else { float total = 0; float alloc_ratio[VPX_MAX_LAYERS] = { 0 }; @@ -368,6 +384,7 @@ void assign_layer_bitrates(const SvcContext *svc_ctx, } } } + return VPX_CODEC_OK; } vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, @@ -412,12 +429,24 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN[sl]; si->svc_params.speed_per_layer[sl] = svc_ctx->speed; } - + if (enc_cfg->rc_end_usage == VPX_CBR && enc_cfg->g_pass == VPX_RC_ONE_PASS && + svc_ctx->spatial_layers <= 3) { + for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { + int sl2 = (svc_ctx->spatial_layers == 2) ? sl + 1 : sl; + si->svc_params.scaling_factor_num[sl] = DEFAULT_SCALE_FACTORS_NUM_2x[sl2]; + si->svc_params.scaling_factor_den[sl] = DEFAULT_SCALE_FACTORS_DEN_2x[sl2]; + } + } for (tl = 0; tl < svc_ctx->temporal_layers; ++tl) { for (sl = 0; sl < svc_ctx->spatial_layers; ++sl) { i = sl * svc_ctx->temporal_layers + tl; si->svc_params.max_quantizers[i] = MAX_QUANTIZER; si->svc_params.min_quantizers[i] = 0; + if (enc_cfg->rc_end_usage == VPX_CBR && + enc_cfg->g_pass == VPX_RC_ONE_PASS) { + si->svc_params.max_quantizers[i] = 56; + si->svc_params.min_quantizers[i] = 2; + } } } @@ -442,7 +471,15 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx, (int)VPX_MAX_LAYERS); return VPX_CODEC_INVALID_PARAM; } - assign_layer_bitrates(svc_ctx, enc_cfg); + res = assign_layer_bitrates(svc_ctx, enc_cfg); + if (res != VPX_CODEC_OK) { + svc_log(svc_ctx, SVC_LOG_ERROR, + "layer bitrates incorrect: \n" + "1) spatial layer bitrates should sum up to target \n" + "2) temporal layer bitrates should be increasing within \n" + "a spatial layer \n"); + return VPX_CODEC_INVALID_PARAM; + } #if CONFIG_SPATIAL_SVC for (i = 0; i < svc_ctx->spatial_layers; ++i) diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h index c8bde5832a5..462785075cb 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/svc_context.h @@ -54,7 +54,7 @@ typedef struct SvcInternal { // values extracted from option, quantizers vpx_svc_extra_cfg_t svc_params; int enable_auto_alt_ref[VPX_SS_MAX_LAYERS]; - int bitrates[VPX_SS_MAX_LAYERS]; + int bitrates[VPX_MAX_LAYERS]; // accumulated statistics double psnr_sum[VPX_SS_MAX_LAYERS][COMPONENTS]; // total/Y/U/V diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h index 8fa25e8bc07..cc90159bc3a 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8cx.h @@ -561,7 +561,22 @@ enum vp8e_enc_control_id { * * Supported in codecs: VP9 */ - VP9E_SET_ALT_REF_AQ + VP9E_SET_ALT_REF_AQ, + + /*!\brief Boost percentage for Golden Frame in CBR mode. + * + * This value controls the amount of boost given to Golden Frame in + * CBR mode. 
It is expressed as a percentage of the average + per-frame bitrate, with the special (and default) value 0 meaning + the feature is off, i.e., no golden frame boost in CBR mode and + the average bitrate target is used. + * + * For example, to allow 100% more bits, i.e., 2X, in a golden frame + * than in an average frame, set this to 100. + * + * Supported in codecs: VP8 + */ + VP8E_SET_GF_CBR_BOOST_PCT, }; /*!\brief vpx 1-D scaling mode @@ -769,6 +784,9 @@ VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTER_BITRATE_PCT, unsigned int) #define VPX_CTRL_VP8E_SET_MAX_INTER_BITRATE_PCT +VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int) +#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT + VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int) #define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE diff --git a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h index 88204acd378..0d7759eb25b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx/vp8dx.h @@ -111,6 +111,11 @@ enum vp8_dec_control_id { */ VP9_SET_SKIP_LOOP_FILTER, + /** control function to decode the SVC stream up to spatial layer x, + * where x is passed in through the control and is 0 for the base layer. + */ + VP9_DECODE_SVC_SPATIAL_LAYER, + VP8_DECODER_CTRL_ID_MAX }; @@ -162,6 +167,8 @@ VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *) #define VPX_CTRL_VP9D_GET_FRAME_SIZE VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int) #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER +#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER +VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int) /*!\endcond */ /*! @} - end defgroup vp8_decoder */ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index 7cb2ba90d2f..e9503f13d70 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -52,10 +52,10 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); - v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); - v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); - v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); - v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64); { const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c new file mode 100644 index 00000000000..5530c6425b2 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_loopfilter_neon.c @@ -0,0 +1,761 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree.
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" + +static INLINE void load_thresh(const uint8_t *blimit, const uint8_t *limit, + const uint8_t *thresh, uint16x8_t *blimit_vec, + uint16x8_t *limit_vec, uint16x8_t *thresh_vec, + const int bd) { + const int16x8_t shift = vdupq_n_s16(bd - 8); + *blimit_vec = vmovl_u8(vld1_dup_u8(blimit)); + *limit_vec = vmovl_u8(vld1_dup_u8(limit)); + *thresh_vec = vmovl_u8(vld1_dup_u8(thresh)); + *blimit_vec = vshlq_u16(*blimit_vec, shift); + *limit_vec = vshlq_u16(*limit_vec, shift); + *thresh_vec = vshlq_u16(*thresh_vec, shift); +} + +// Here flat is 128-bit long, with each 16-bit chunk being a mask of +// a pixel. When used to control filter branches, we only detect whether it is +// all 0s or all 1s. We pairwise add flat to a 32-bit long number flat_status. +// flat equals 0 if and only if flat_status equals 0. +// flat equals -1 (all 1s) if and only if flat_status equals -4. (This is true +// because each mask occupies more than 1 bit.) +static INLINE uint32_t calc_flat_status(const uint16x8_t flat) { + const uint64x1_t t0 = vadd_u64(vreinterpret_u64_u16(vget_low_u16(flat)), + vreinterpret_u64_u16(vget_high_u16(flat))); + const uint64x1_t t1 = vpaddl_u32(vreinterpret_u32_u64(t0)); + return vget_lane_u32(vreinterpret_u32_u64(t1), 0); +} + +static INLINE uint16x8_t +filter_hev_mask4(const uint16x8_t limit, const uint16x8_t blimit, + const uint16x8_t thresh, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, uint16x8_t *hev, uint16x8_t *mask) { + uint16x8_t max, t0, t1; + + max = vabdq_u16(p1, p0); + max = vmaxq_u16(max, vabdq_u16(q1, q0)); + *hev = vcgtq_u16(max, thresh); + *mask = vmaxq_u16(max, vabdq_u16(p3, p2)); + *mask = vmaxq_u16(*mask, vabdq_u16(p2, p1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q2, q1)); + *mask = vmaxq_u16(*mask, vabdq_u16(q3, q2)); + t0 = vabdq_u16(p0, q0); + t1 = vabdq_u16(p1, q1); + t0 = vaddq_u16(t0, t0); + t1 = vshrq_n_u16(t1, 1); + t0 = vaddq_u16(t0, t1); + *mask = vcleq_u16(*mask, limit); + t0 = vcleq_u16(t0, blimit); + *mask = vandq_u16(*mask, t0); + + return max; +} + +static INLINE uint16x8_t filter_flat_hev_mask( + const uint16x8_t limit, const uint16x8_t blimit, const uint16x8_t thresh, + const uint16x8_t p3, const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, uint16x8_t *flat, + uint32_t *flat_status, uint16x8_t *hev, const int bd) { + uint16x8_t mask; + const uint16x8_t max = filter_hev_mask4(limit, blimit, thresh, p3, p2, p1, p0, + q0, q1, q2, q3, hev, &mask); + *flat = vmaxq_u16(max, vabdq_u16(p2, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q2, q0)); + *flat = vmaxq_u16(*flat, vabdq_u16(p3, p0)); + *flat = vmaxq_u16(*flat, vabdq_u16(q3, q0)); + *flat = vcleq_u16(*flat, vdupq_n_u16(1 << (bd - 8))); /* flat_mask4() */ + *flat = vandq_u16(*flat, mask); + *flat_status = calc_flat_status(*flat); + + return mask; +} + +static INLINE uint16x8_t flat_mask5(const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t 
q3, const uint16x8_t q4, + const uint16x8_t flat, + uint32_t *flat2_status, const int bd) { + uint16x8_t flat2 = vabdq_u16(p4, p0); + flat2 = vmaxq_u16(flat2, vabdq_u16(p3, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p2, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(p1, p0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q1, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q2, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q3, q0)); + flat2 = vmaxq_u16(flat2, vabdq_u16(q4, q0)); + flat2 = vcleq_u16(flat2, vdupq_n_u16(1 << (bd - 8))); + flat2 = vandq_u16(flat2, flat); + *flat2_status = calc_flat_status(flat2); + + return flat2; +} + +static INLINE int16x8_t flip_sign(const uint16x8_t v, const int bd) { + const uint16x8_t offset = vdupq_n_u16(0x80 << (bd - 8)); + return vreinterpretq_s16_u16(vsubq_u16(v, offset)); +} + +static INLINE uint16x8_t flip_sign_back(const int16x8_t v, const int bd) { + const int16x8_t offset = vdupq_n_s16(0x80 << (bd - 8)); + return vreinterpretq_u16_s16(vaddq_s16(v, offset)); +} + +static INLINE void filter_update(const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, + uint16x8_t *sum) { + *sum = vsubq_u16(*sum, sub0); + *sum = vsubq_u16(*sum, sub1); + *sum = vaddq_u16(*sum, add0); + *sum = vaddq_u16(*sum, add1); +} + +static INLINE uint16x8_t calc_7_tap_filter_kernel(const uint16x8_t sub0, + const uint16x8_t sub1, + const uint16x8_t add0, + const uint16x8_t add1, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vrshrq_n_u16(*sum, 3); +} + +static INLINE uint16x8_t apply_15_tap_filter_kernel( + const uint16x8_t flat, const uint16x8_t sub0, const uint16x8_t sub1, + const uint16x8_t add0, const uint16x8_t add1, const uint16x8_t in, + uint16x8_t *sum) { + filter_update(sub0, sub1, add0, add1, sum); + return vbslq_u16(flat, vrshrq_n_u16(*sum, 4), in); +} + +// 7-tap filter [1, 1, 1, 2, 1, 1, 1] +static INLINE void calc_7_tap_filter(const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t sum; + sum = vaddq_u16(p3, p3); // 2*p3 + sum = vaddq_u16(sum, p3); // 3*p3 + sum = vaddq_u16(sum, p2); // 3*p3+p2 + sum = vaddq_u16(sum, p2); // 3*p3+2*p2 + sum = vaddq_u16(sum, p1); // 3*p3+2*p2+p1 + sum = vaddq_u16(sum, p0); // 3*p3+2*p2+p1+p0 + sum = vaddq_u16(sum, q0); // 3*p3+2*p2+p1+p0+q0 + *op2 = vrshrq_n_u16(sum, 3); + *op1 = calc_7_tap_filter_kernel(p3, p2, p1, q1, &sum); + *op0 = calc_7_tap_filter_kernel(p3, p1, p0, q2, &sum); + *oq0 = calc_7_tap_filter_kernel(p3, p0, q0, q3, &sum); + *oq1 = calc_7_tap_filter_kernel(p2, q0, q1, q3, &sum); + *oq2 = calc_7_tap_filter_kernel(p1, q1, q2, q3, &sum); +} + +static INLINE void apply_7_tap_filter(const uint16x8_t flat, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, uint16x8_t *oq2) { + uint16x8_t tp1, tp0, tq0, tq1; + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, &tp1, &tp0, &tq0, &tq1, + oq2); + *op2 = vbslq_u16(flat, *op2, p2); + *op1 = vbslq_u16(flat, tp1, *op1); + *op0 = vbslq_u16(flat, tp0, *op0); + *oq0 = vbslq_u16(flat, tq0, *oq0); + *oq1 = vbslq_u16(flat, tq1, *oq1); + *oq2 = vbslq_u16(flat, *oq2, q2); +} + +// 15-tap 
filter [1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1] +static INLINE void apply_15_tap_filter( + const uint16x8_t flat2, const uint16x8_t p7, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, const uint16x8_t p3, + const uint16x8_t p2, const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, const uint16x8_t q2, + const uint16x8_t q3, const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, + uint16x8_t *op4, uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, + uint16x8_t *op0, uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + uint16x8_t *oq3, uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6) { + uint16x8_t sum; + sum = vshlq_n_u16(p7, 3); // 8*p7 + sum = vsubq_u16(sum, p7); // 7*p7 + sum = vaddq_u16(sum, p6); // 7*p7+p6 + sum = vaddq_u16(sum, p6); // 7*p7+2*p6 + sum = vaddq_u16(sum, p5); // 7*p7+2*p6+p5 + sum = vaddq_u16(sum, p4); // 7*p7+2*p6+p5+p4 + sum = vaddq_u16(sum, p3); // 7*p7+2*p6+p5+p4+p3 + sum = vaddq_u16(sum, p2); // 7*p7+2*p6+p5+p4+p3+p2 + sum = vaddq_u16(sum, p1); // 7*p7+2*p6+p5+p4+p3+p2+p1 + sum = vaddq_u16(sum, p0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0 + sum = vaddq_u16(sum, q0); // 7*p7+2*p6+p5+p4+p3+p2+p1+p0+q0 + *op6 = vbslq_u16(flat2, vrshrq_n_u16(sum, 4), p6); + *op5 = apply_15_tap_filter_kernel(flat2, p7, p6, p5, q1, p5, &sum); + *op4 = apply_15_tap_filter_kernel(flat2, p7, p5, p4, q2, p4, &sum); + *op3 = apply_15_tap_filter_kernel(flat2, p7, p4, p3, q3, p3, &sum); + *op2 = apply_15_tap_filter_kernel(flat2, p7, p3, p2, q4, *op2, &sum); + *op1 = apply_15_tap_filter_kernel(flat2, p7, p2, p1, q5, *op1, &sum); + *op0 = apply_15_tap_filter_kernel(flat2, p7, p1, p0, q6, *op0, &sum); + *oq0 = apply_15_tap_filter_kernel(flat2, p7, p0, q0, q7, *oq0, &sum); + *oq1 = apply_15_tap_filter_kernel(flat2, p6, q0, q1, q7, *oq1, &sum); + *oq2 = apply_15_tap_filter_kernel(flat2, p5, q1, q2, q7, *oq2, &sum); + *oq3 = apply_15_tap_filter_kernel(flat2, p4, q2, q3, q7, q3, &sum); + *oq4 = apply_15_tap_filter_kernel(flat2, p3, q3, q4, q7, q4, &sum); + *oq5 = apply_15_tap_filter_kernel(flat2, p2, q4, q5, q7, q5, &sum); + *oq6 = apply_15_tap_filter_kernel(flat2, p1, q5, q6, q7, q6, &sum); +} + +static INLINE void filter4(const uint16x8_t mask, const uint16x8_t hev, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + uint16x8_t *op1, uint16x8_t *op0, uint16x8_t *oq0, + uint16x8_t *oq1, const int bd) { + const int16x8_t max = vdupq_n_s16((1 << (bd - 1)) - 1); + const int16x8_t min = vdupq_n_s16((int16_t)(((uint32_t)-1) << (bd - 1))); + int16x8_t filter, filter1, filter2, t; + int16x8_t ps1 = flip_sign(p1, bd); + int16x8_t ps0 = flip_sign(p0, bd); + int16x8_t qs0 = flip_sign(q0, bd); + int16x8_t qs1 = flip_sign(q1, bd); + + /* add outer taps if we have high edge variance */ + filter = vsubq_s16(ps1, qs1); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(hev)); + t = vsubq_s16(qs0, ps0); + + /* inner taps */ + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vaddq_s16(filter, t); + filter = vmaxq_s16(filter, min); + filter = vminq_s16(filter, max); + filter = vandq_s16(filter, vreinterpretq_s16_u16(mask)); + + /* save bottom 3 bits so that we round one side +4 and the other +3 */ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ + /* we'd round it by 3 the other way */ + t = vaddq_s16(filter, vdupq_n_s16(4)); + t = vminq_s16(t, max); 
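+ /* clamping after each bias keeps t within the signed range for this bit depth */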
+ filter1 = vshrq_n_s16(t, 3); + t = vaddq_s16(filter, vdupq_n_s16(3)); + t = vminq_s16(t, max); + filter2 = vshrq_n_s16(t, 3); + + qs0 = vsubq_s16(qs0, filter1); + qs0 = vmaxq_s16(qs0, min); + qs0 = vminq_s16(qs0, max); + ps0 = vaddq_s16(ps0, filter2); + ps0 = vmaxq_s16(ps0, min); + ps0 = vminq_s16(ps0, max); + *oq0 = flip_sign_back(qs0, bd); + *op0 = flip_sign_back(ps0, bd); + + /* outer tap adjustments */ + filter = vrshrq_n_s16(filter1, 1); + filter = vbicq_s16(filter, vreinterpretq_s16_u16(hev)); + + qs1 = vsubq_s16(qs1, filter); + qs1 = vmaxq_s16(qs1, min); + qs1 = vminq_s16(qs1, max); + ps1 = vaddq_s16(ps1, filter); + ps1 = vmaxq_s16(ps1, min); + ps1 = vminq_s16(ps1, max); + *oq1 = flip_sign_back(qs1, bd); + *op1 = flip_sign_back(ps1, bd); +} + +static INLINE void filter8(const uint16x8_t mask, const uint16x8_t flat, + const uint32_t flat_status, const uint16x8_t hev, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, + const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + *op2 = p2; + *oq2 = q2; + if (flat_status) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + } else { + calc_7_tap_filter(p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, oq0, oq1, + oq2); + } +} + +static INLINE void filter16( + const uint16x8_t mask, const uint16x8_t flat, const uint32_t flat_status, + const uint16x8_t flat2, const uint32_t flat2_status, const uint16x8_t hev, + const uint16x8_t p7, const uint16x8_t p6, const uint16x8_t p5, + const uint16x8_t p4, const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1, const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, const uint16x8_t q6, + const uint16x8_t q7, uint16x8_t *op6, uint16x8_t *op5, uint16x8_t *op4, + uint16x8_t *op3, uint16x8_t *op2, uint16x8_t *op1, uint16x8_t *op0, + uint16x8_t *oq0, uint16x8_t *oq1, uint16x8_t *oq2, uint16x8_t *oq3, + uint16x8_t *oq4, uint16x8_t *oq5, uint16x8_t *oq6, const int bd) { + if (flat_status != (uint32_t)-4) { + filter4(mask, hev, p1, p0, q0, q1, op1, op0, oq0, oq1, bd); + } + + if (flat_status) { + *op2 = p2; + *oq2 = q2; + if (flat2_status != (uint32_t)-4) { + apply_7_tap_filter(flat, p3, p2, p1, p0, q0, q1, q2, q3, op2, op1, op0, + oq0, oq1, oq2); + } + if (flat2_status) { + apply_15_tap_filter(flat2, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, + q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, + oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } + } +} + +static INLINE void load_8x8(const uint16_t *s, const int p, uint16x8_t *p3, + uint16x8_t *p2, uint16x8_t *p1, uint16x8_t *p0, + uint16x8_t *q0, uint16x8_t *q1, uint16x8_t *q2, + uint16x8_t *q3) { + *p3 = vld1q_u16(s); + s += p; + *p2 = vld1q_u16(s); + s += p; + *p1 = vld1q_u16(s); + s += p; + *p0 = vld1q_u16(s); + s += p; + *q0 = vld1q_u16(s); + s += p; + *q1 = vld1q_u16(s); + s += p; + *q2 = vld1q_u16(s); + s += p; + *q3 = vld1q_u16(s); +} + +static INLINE void load_8x16(const uint16_t *s, const int p, uint16x8_t *s0, + uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3, + uint16x8_t *s4, uint16x8_t *s5, uint16x8_t *s6, + uint16x8_t *s7, uint16x8_t *s8, uint16x8_t *s9, + uint16x8_t *s10, uint16x8_t *s11, uint16x8_t *s12, + 
uint16x8_t *s13, uint16x8_t *s14, + uint16x8_t *s15) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); + s += p; + *s4 = vld1q_u16(s); + s += p; + *s5 = vld1q_u16(s); + s += p; + *s6 = vld1q_u16(s); + s += p; + *s7 = vld1q_u16(s); + s += p; + *s8 = vld1q_u16(s); + s += p; + *s9 = vld1q_u16(s); + s += p; + *s10 = vld1q_u16(s); + s += p; + *s11 = vld1q_u16(s); + s += p; + *s12 = vld1q_u16(s); + s += p; + *s13 = vld1q_u16(s); + s += p; + *s14 = vld1q_u16(s); + s += p; + *s15 = vld1q_u16(s); +} + +static INLINE void store_8x4(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); +} + +static INLINE void store_8x6(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); +} + +static INLINE void store_4x8(uint16_t *s, const int p, const uint16x8_t p1, + const uint16x8_t p0, const uint16x8_t q0, + const uint16x8_t q1) { + uint16x8x4_t o; + + o.val[0] = p1; + o.val[1] = p0; + o.val[2] = q0; + o.val[3] = q1; + vst4q_lane_u16(s, o, 0); + s += p; + vst4q_lane_u16(s, o, 1); + s += p; + vst4q_lane_u16(s, o, 2); + s += p; + vst4q_lane_u16(s, o, 3); + s += p; + vst4q_lane_u16(s, o, 4); + s += p; + vst4q_lane_u16(s, o, 5); + s += p; + vst4q_lane_u16(s, o, 6); + s += p; + vst4q_lane_u16(s, o, 7); +} + +static INLINE void store_6x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5) { + uint16x8x3_t o0, o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o1.val[0] = s3; + o1.val[1] = s4; + o1.val[2] = s5; + vst3q_lane_u16(s - 3, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst3q_lane_u16(s - 3, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst3q_lane_u16(s - 3, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst3q_lane_u16(s - 3, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst3q_lane_u16(s - 3, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst3q_lane_u16(s - 3, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst3q_lane_u16(s - 3, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst3q_lane_u16(s - 3, o0, 7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_7x8(uint16_t *s, const int p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5, const uint16x8_t s6) { + uint16x8x4_t o0; + uint16x8x3_t o1; + + o0.val[0] = s0; + o0.val[1] = s1; + o0.val[2] = s2; + o0.val[3] = s3; + o1.val[0] = s4; + o1.val[1] = s5; + o1.val[2] = s6; + vst4q_lane_u16(s - 4, o0, 0); + vst3q_lane_u16(s + 0, o1, 0); + s += p; + vst4q_lane_u16(s - 4, o0, 1); + vst3q_lane_u16(s + 0, o1, 1); + s += p; + vst4q_lane_u16(s - 4, o0, 2); + vst3q_lane_u16(s + 0, o1, 2); + s += p; + vst4q_lane_u16(s - 4, o0, 3); + vst3q_lane_u16(s + 0, o1, 3); + s += p; + vst4q_lane_u16(s - 4, o0, 4); + vst3q_lane_u16(s + 0, o1, 4); + s += p; + vst4q_lane_u16(s - 4, o0, 5); + vst3q_lane_u16(s + 0, o1, 5); + s += p; + vst4q_lane_u16(s - 4, o0, 6); + vst3q_lane_u16(s + 0, o1, 6); + s += p; + vst4q_lane_u16(s - 4, o0, 
7); + vst3q_lane_u16(s + 0, o1, 7); +} + +static INLINE void store_8x14(uint16_t *s, const int p, const uint16x8_t p6, + const uint16x8_t p5, const uint16x8_t p4, + const uint16x8_t p3, const uint16x8_t p2, + const uint16x8_t p1, const uint16x8_t p0, + const uint16x8_t q0, const uint16x8_t q1, + const uint16x8_t q2, const uint16x8_t q3, + const uint16x8_t q4, const uint16x8_t q5, + const uint16x8_t q6, const uint32_t flat_status, + const uint32_t flat2_status) { + if (flat_status) { + if (flat2_status) { + vst1q_u16(s - 7 * p, p6); + vst1q_u16(s - 6 * p, p5); + vst1q_u16(s - 5 * p, p4); + vst1q_u16(s - 4 * p, p3); + vst1q_u16(s + 3 * p, q3); + vst1q_u16(s + 4 * p, q4); + vst1q_u16(s + 5 * p, q5); + vst1q_u16(s + 6 * p, q6); + } + vst1q_u16(s - 3 * p, p2); + vst1q_u16(s + 2 * p, q2); + } + vst1q_u16(s - 2 * p, p1); + vst1q_u16(s - 1 * p, p0); + vst1q_u16(s + 0 * p, q0); + vst1q_u16(s + 1 * p, q1); +} + +void vpx_highbd_lpf_horizontal_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_8x4(s - 2 * p, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_horizontal_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_4_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_4_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + mask, hev; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + filter_hev_mask4(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, + q2, q3, &hev, &mask); + filter4(mask, hev, p1, p0, q0, q1, &p1, &p0, &q0, &q1, bd); + store_4x8(s - 2, p, p1, p0, q0, q1); +} + +void vpx_highbd_lpf_vertical_4_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_4_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_4_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_horizontal_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + load_8x8(s - 4 * p, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, 
&flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + store_8x6(s - 3 * p, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_horizontal_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_horizontal_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_horizontal_8_neon(s + 8, p, blimit1, limit1, thresh1, bd); +} + +void vpx_highbd_lpf_vertical_8_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, const uint8_t *thresh, + int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec, p3, p2, p1, p0, q0, q1, q2, q3, + op2, op1, op0, oq0, oq1, oq2, mask, flat, hev; + uint32_t flat_status; + + load_8x8(s - 4, p, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3); + transpose_s16_8x8((int16x8_t *)&p3, (int16x8_t *)&p2, (int16x8_t *)&p1, + (int16x8_t *)&p0, (int16x8_t *)&q0, (int16x8_t *)&q1, + (int16x8_t *)&q2, (int16x8_t *)&q3); + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + filter8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, + &op1, &op0, &oq0, &oq1, &oq2, bd); + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); +} + +void vpx_highbd_lpf_vertical_8_dual_neon( + uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0, + const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, + const uint8_t *thresh1, int bd) { + vpx_highbd_lpf_vertical_8_neon(s, p, blimit0, limit0, thresh0, bd); + vpx_highbd_lpf_vertical_8_neon(s + 8 * p, p, blimit1, limit1, thresh1, bd); +} + +static void lpf_horizontal_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, + const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x16(s - 8 * p, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &q1, &q2, + &q3, &q4, &q5, &q6, &q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + store_8x14(s, p, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, oq4, + oq5, oq6, flat_status, flat2_status); +} + +static void lpf_vertical_16_kernel(uint16_t *s, int p, + const uint16x8_t blimit_vec, + const uint16x8_t limit_vec, + const uint16x8_t thresh_vec, const int bd) { + uint16x8_t mask, flat, flat2, hev, p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, + q3, q4, q5, q6, q7, op6, op5, op4, op3, op2, op1, op0, oq0, oq1, oq2, oq3, + oq4, oq5, oq6; + uint32_t flat_status, flat2_status; + + load_8x8(s - 8, p, &p7, &p6, &p5, &p4, &p3, &p2, &p1, &p0); + transpose_s16_8x8((int16x8_t *)&p7, (int16x8_t *)&p6, (int16x8_t *)&p5, + (int16x8_t *)&p4, (int16x8_t *)&p3, (int16x8_t *)&p2, 
+ (int16x8_t *)&p1, (int16x8_t *)&p0); + load_8x8(s, p, &q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7); + transpose_s16_8x8((int16x8_t *)&q0, (int16x8_t *)&q1, (int16x8_t *)&q2, + (int16x8_t *)&q3, (int16x8_t *)&q4, (int16x8_t *)&q5, + (int16x8_t *)&q6, (int16x8_t *)&q7); + mask = filter_flat_hev_mask(limit_vec, blimit_vec, thresh_vec, p3, p2, p1, p0, + q0, q1, q2, q3, &flat, &flat_status, &hev, bd); + flat2 = flat_mask5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, + &flat2_status, bd); + filter16(mask, flat, flat_status, flat2, flat2_status, hev, p7, p6, p5, p4, + p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7, &op6, &op5, &op4, + &op3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &oq3, &oq4, &oq5, &oq6, + bd); + if (flat_status) { + if (flat2_status) { + store_7x8(s - 3, p, op6, op5, op4, op3, op2, op1, op0); + store_7x8(s + 4, p, oq0, oq1, oq2, oq3, oq4, oq5, oq6); + } else { + // Note: store_6x8() is faster than transpose + store_8x8(). + store_6x8(s, p, op2, op1, op0, oq0, oq1, oq2); + } + } else { + store_4x8(s - 2, p, op1, op0, oq0, oq1); + } +} + +void vpx_highbd_lpf_horizontal_16_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_horizontal_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_horizontal_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_horizontal_16_kernel(s + 8, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_neon(uint16_t *s, int p, const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); +} + +void vpx_highbd_lpf_vertical_16_dual_neon(uint16_t *s, int p, + const uint8_t *blimit, + const uint8_t *limit, + const uint8_t *thresh, int bd) { + uint16x8_t blimit_vec, limit_vec, thresh_vec; + load_thresh(blimit, limit, thresh, &blimit_vec, &limit_vec, &thresh_vec, bd); + lpf_vertical_16_kernel(s, p, blimit_vec, limit_vec, thresh_vec, bd); + lpf_vertical_16_kernel(s + 8 * p, p, blimit_vec, limit_vec, thresh_vec, bd); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c new file mode 100644 index 00000000000..1fde13e8d6d --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -0,0 +1,923 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> +#include <assert.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_ports/mem.h" + +static INLINE void load_4x4(const int16_t *s, ptrdiff_t p, int16x4_t *s0, + int16x4_t *s1, int16x4_t *s2, int16x4_t *s3) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); + s += p; + *s3 = vld1_s16(s); +} + +static INLINE void load_8x4(const uint16_t *s, ptrdiff_t p, uint16x8_t *s0, + uint16x8_t *s1, uint16x8_t *s2, uint16x8_t *s3) { + *s0 = vld1q_u16(s); + s += p; + *s1 = vld1q_u16(s); + s += p; + *s2 = vld1q_u16(s); + s += p; + *s3 = vld1q_u16(s); +} + +static INLINE void load_8x8(const int16_t *s, ptrdiff_t p, int16x8_t *s0, + int16x8_t *s1, int16x8_t *s2, int16x8_t *s3, + int16x8_t *s4, int16x8_t *s5, int16x8_t *s6, + int16x8_t *s7) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); + s += p; + *s3 = vld1q_s16(s); + s += p; + *s4 = vld1q_s16(s); + s += p; + *s5 = vld1q_s16(s); + s += p; + *s6 = vld1q_s16(s); + s += p; + *s7 = vld1q_s16(s); +} + +static INLINE void store_8x8(uint16_t *s, ptrdiff_t p, const uint16x8_t s0, + const uint16x8_t s1, const uint16x8_t s2, + const uint16x8_t s3, const uint16x8_t s4, + const uint16x8_t s5, const uint16x8_t s6, + const uint16x8_t s7) { + vst1q_u16(s, s0); + s += p; + vst1q_u16(s, s1); + s += p; + vst1q_u16(s, s2); + s += p; + vst1q_u16(s, s3); + s += p; + vst1q_u16(s, s4); + s += p; + vst1q_u16(s, s5); + s += p; + vst1q_u16(s, s6); + s += p; + vst1q_u16(s, s7); +} + +static INLINE int32x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1, + const int16x4_t s2, const int16x4_t s3, + const int16x4_t s4, const int16x4_t s5, + const int16x4_t s6, const int16x4_t s7, + const int16x8_t filters) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum = vdupq_n_s32(0); + + sum = vmlal_lane_s16(sum, s0, filters_lo, 0); + sum = vmlal_lane_s16(sum, s1, filters_lo, 1); + sum = vmlal_lane_s16(sum, s2, filters_lo, 2); + sum = vmlal_lane_s16(sum, s3, filters_lo, 3); + sum = vmlal_lane_s16(sum, s4, filters_hi, 0); + sum = vmlal_lane_s16(sum, s5, filters_hi, 1); + sum = vmlal_lane_s16(sum, s6, filters_hi, 2); + sum = vmlal_lane_s16(sum, s7, filters_hi, 3); + return sum; +} + +static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, + const int16x8_t s2, const int16x8_t s3, + const int16x8_t s4, const int16x8_t s5, + const int16x8_t s6, const int16x8_t s7, + const int16x8_t filters, + const uint16x8_t max) { + const int16x4_t filters_lo = vget_low_s16(filters); + const int16x4_t filters_hi = vget_high_s16(filters); + int32x4_t sum0 = vdupq_n_s32(0); + int32x4_t sum1 = vdupq_n_s32(0); + uint16x8_t d; + + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s0), filters_lo, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s1), filters_lo, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s2), filters_lo, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s3), filters_lo, 3); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s4), filters_hi, 0); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s5), filters_hi, 1); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s6), filters_hi, 2); + sum0 = vmlal_lane_s16(sum0, vget_low_s16(s7), filters_hi, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s0), filters_lo, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s1), filters_lo, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s2), filters_lo, 2); 
+ sum1 = vmlal_lane_s16(sum1, vget_high_s16(s3), filters_lo, 3); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s4), filters_hi, 0); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s5), filters_hi, 1); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s6), filters_hi, 2); + sum1 = vmlal_lane_s16(sum1, vget_high_s16(s7), filters_hi, 3); + d = vcombine_u16(vqrshrun_n_s32(sum0, 7), vqrshrun_n_s32(sum1, 7)); + d = vminq_u16(d, max); + return d; +} + +void vpx_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int16x8_t filters = vld1q_s16(filter_x); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + transpose_u16_4x4q(&d01, &d23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * 
dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + transpose_u16_8x4(&d0, &d1, &d2, &d3); + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void 
vpx_highbd_convolve8_avg_horiz_neon(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, // unused + int y_step_q4, // unused + int w, int h, int bd) { + if (x_step_q4 != 16) { + vpx_highbd_convolve8_avg_horiz_c(src8, src_stride, dst8, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h, bd); + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int16x8_t filters = vld1q_s16(filter_x); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t t0, t1, t2, t3; + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3; + + if (h == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + load_8x4(src, src_stride, &t0, &t1, &t2, &t3); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + s0 = vreinterpret_s16_u16(vget_low_u16(t0)); + s1 = vreinterpret_s16_u16(vget_low_u16(t1)); + s2 = vreinterpret_s16_u16(vget_low_u16(t2)); + s3 = vreinterpret_s16_u16(vget_low_u16(t3)); + s4 = vreinterpret_s16_u16(vget_high_u16(t0)); + s5 = vreinterpret_s16_u16(vget_high_u16(t1)); + s6 = vreinterpret_s16_u16(vget_high_u16(t2)); + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + src += 7; + + do { + load_4x4((const int16_t *)src, src_stride, &s7, &s8, &s9, &s10); + transpose_s16_4x4d(&s7, &s8, &s9, &s10); + + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + transpose_u16_4x4q(&t01, &t23); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 2 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst + 0 * dst_stride, vget_low_u16(d01)); + vst1_u16(dst + 1 * dst_stride, vget_low_u16(d23)); + vst1_u16(dst + 2 * dst_stride, vget_high_u16(d01)); + vst1_u16(dst + 3 * dst_stride, vget_high_u16(d23)); + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + src += 4; + dst += 4; + w -= 4; + } while (w > 0); + } else { + int16x8_t t4, t5, t6, t7; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + if (w == 4) { + do { + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + load_8x8((const int16_t *)(src + 7), src_stride, &s7, &s8, &s9, &s10, + &t4, &t5, &t6, &t7); + src += 8 * src_stride; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst 
+ 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &t4, &t5, &t6, &t7); + + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + transpose_u16_8x4(&t0, &t1, &t2, &t3); + + d0 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 4 * dst_stride)); + d1 = vcombine_u16(vld1_u16(dst + 1 * dst_stride), + vld1_u16(dst + 5 * dst_stride)); + d2 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 6 * dst_stride)); + d3 = vcombine_u16(vld1_u16(dst + 3 * dst_stride), + vld1_u16(dst + 7 * dst_stride)); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1_u16(dst, vget_low_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d3)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d0)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d1)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d2)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d3)); + dst += dst_stride; + h -= 8; + } while (h > 0); + } else { + int width; + const uint16_t *s; + uint16_t *d; + int16x8_t s11, s12, s13, s14; + uint16x8_t d4, d5, d6, d7; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + __builtin_prefetch(src + 7 * src_stride); + load_8x8((const int16_t *)src, src_stride, &s0, &s1, &s2, &s3, &s4, + &s5, &s6, &s7); + transpose_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + width = w; + s = src + 7; + d = dst; + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(dst + 4 * dst_stride); + __builtin_prefetch(dst + 5 * dst_stride); + __builtin_prefetch(dst + 6 * dst_stride); + __builtin_prefetch(dst + 7 * dst_stride); + + do { + load_8x8((const int16_t *)s, src_stride, &s7, &s8, &s9, &s10, &s11, + &s12, &s13, &s14); + transpose_s16_8x8(&s7, &s8, &s9, &s10, &s11, &s12, &s13, &s14); + + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filters, max); + d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filters, max); + d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filters, max); + d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filters, max); + + transpose_u16_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); + + d0 = vrhaddq_u16(d0, 
vld1q_u16(d + 0 * dst_stride)); + d1 = vrhaddq_u16(d1, vld1q_u16(d + 1 * dst_stride)); + d2 = vrhaddq_u16(d2, vld1q_u16(d + 2 * dst_stride)); + d3 = vrhaddq_u16(d3, vld1q_u16(d + 3 * dst_stride)); + d4 = vrhaddq_u16(d4, vld1q_u16(d + 4 * dst_stride)); + d5 = vrhaddq_u16(d5, vld1q_u16(d + 5 * dst_stride)); + d6 = vrhaddq_u16(d6, vld1q_u16(d + 6 * dst_stride)); + d7 = vrhaddq_u16(d7, vld1q_u16(d + 7 * dst_stride)); + + store_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); + + s0 = s8; + s1 = s9; + s2 = s10; + s3 = s11; + s4 = s12; + s5 = s13; + s6 = s14; + s += 8; + d += 8; + width -= 8; + } while (width > 0); + src += 8 * src_stride; + dst += 8 * dst_stride; + h -= 8; + } while (h > 0); + } + } + } +} + +void vpx_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x, + x_step_q4, filter_y, y_step_q4, w, h, bd); + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int16x8_t filters = vld1q_s16(filter_y); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + d01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + d23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + d01 = vminq_u16(d01, max); + d23 = vminq_u16(d23, max); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3; + + do { 
+ __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} + +void vpx_highbd_convolve8_avg_vert_neon(const uint8_t *src8, + ptrdiff_t src_stride, uint8_t *dst8, + ptrdiff_t dst_stride, + const int16_t *filter_x, // unused + int x_step_q4, // unused + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + if (y_step_q4 != 16) { + vpx_highbd_convolve8_avg_vert_c(src8, src_stride, dst8, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, w, + h, bd); + } else { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const int16x8_t filters = vld1q_s16(filter_y); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + assert(!((intptr_t)dst & 3)); + assert(!(dst_stride & 3)); + + src -= 3 * src_stride; + + if (w == 4) { + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + int32x4_t d0, d1, d2, d3; + uint16x8_t d01, d23, t01, t23; + + s0 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s1 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s2 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s3 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s4 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s5 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s6 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + + do { + s7 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s8 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s9 = vreinterpret_s16_u16(vld1_u16(src)); + src += src_stride; + s10 = vreinterpret_s16_u16(vld1_u16(src)); + src += 
src_stride; + + __builtin_prefetch(dst + 0 * dst_stride); + __builtin_prefetch(dst + 1 * dst_stride); + __builtin_prefetch(dst + 2 * dst_stride); + __builtin_prefetch(dst + 3 * dst_stride); + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filters); + d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filters); + d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filters); + d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filters); + + t01 = vcombine_u16(vqrshrun_n_s32(d0, 7), vqrshrun_n_s32(d1, 7)); + t23 = vcombine_u16(vqrshrun_n_s32(d2, 7), vqrshrun_n_s32(d3, 7)); + t01 = vminq_u16(t01, max); + t23 = vminq_u16(t23, max); + + d01 = vcombine_u16(vld1_u16(dst + 0 * dst_stride), + vld1_u16(dst + 1 * dst_stride)); + d23 = vcombine_u16(vld1_u16(dst + 2 * dst_stride), + vld1_u16(dst + 3 * dst_stride)); + d01 = vrhaddq_u16(d01, t01); + d23 = vrhaddq_u16(d23, t23); + + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_low_u16(d23)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d23)); + dst += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + h -= 4; + } while (h > 0); + } else { + int height; + const uint16_t *s; + uint16_t *d; + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; + uint16x8_t d0, d1, d2, d3, t0, t1, t2, t3; + + do { + __builtin_prefetch(src + 0 * src_stride); + __builtin_prefetch(src + 1 * src_stride); + __builtin_prefetch(src + 2 * src_stride); + __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(src + 4 * src_stride); + __builtin_prefetch(src + 5 * src_stride); + __builtin_prefetch(src + 6 * src_stride); + s = src; + s0 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s1 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s2 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s3 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s4 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s5 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s6 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + d = dst; + height = h; + + do { + s7 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s8 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s9 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + s10 = vreinterpretq_s16_u16(vld1q_u16(s)); + s += src_stride; + + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); + t0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, max); + t1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, max); + t2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filters, max); + t3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filters, max); + + d0 = vld1q_u16(d + 0 * dst_stride); + d1 = vld1q_u16(d + 1 * dst_stride); + d2 = vld1q_u16(d + 2 * dst_stride); + d3 = vld1q_u16(d + 3 * dst_stride); + d0 = vrhaddq_u16(d0, t0); + d1 = vrhaddq_u16(d1, t1); + d2 = vrhaddq_u16(d2, t2); + d3 = vrhaddq_u16(d3, t3); + + vst1q_u16(d, d0); + d += dst_stride; + vst1q_u16(d, d1); + d += 
dst_stride; + vst1q_u16(d, d2); + d += dst_stride; + vst1q_u16(d, d3); + d += dst_stride; + + s0 = s4; + s1 = s5; + s2 = s6; + s3 = s7; + s4 = s8; + s5 = s9; + s6 = s10; + height -= 4; + } while (height > 0); + src += 8; + dst += 8; + w -= 8; + } while (w > 0); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c new file mode 100644 index 00000000000..f4d70761eb3 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + (void)bd; + + if (w < 8) { // avg4 + uint16x4_t s0, s1, d0, d1; + uint16x8_t s01, d01; + do { + s0 = vld1_u16(src); + d0 = vld1_u16(dst); + src += src_stride; + s1 = vld1_u16(src); + d1 = vld1_u16(dst + dst_stride); + src += src_stride; + s01 = vcombine_u16(s0, s1); + d01 = vcombine_u16(d0, d1); + d01 = vrhaddq_u16(s01, d01); + vst1_u16(dst, vget_low_u16(d01)); + dst += dst_stride; + vst1_u16(dst, vget_high_u16(d01)); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // avg8 + uint16x8_t s0, s1, d0, d1; + do { + s0 = vld1q_u16(src); + d0 = vld1q_u16(dst); + src += src_stride; + s1 = vld1q_u16(src); + d1 = vld1q_u16(dst + dst_stride); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + + vst1q_u16(dst, d0); + dst += dst_stride; + vst1q_u16(dst, d1); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // avg16 + uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h; + do { + s0l = vld1q_u16(src); + s0h = vld1q_u16(src + 8); + d0l = vld1q_u16(dst); + d0h = vld1q_u16(dst + 8); + src += src_stride; + s1l = vld1q_u16(src); + s1h = vld1q_u16(src + 8); + d1l = vld1q_u16(dst + dst_stride); + d1h = vld1q_u16(dst + dst_stride + 8); + src += src_stride; + + d0l = vrhaddq_u16(s0l, d0l); + d0h = vrhaddq_u16(s0h, d0h); + d1l = vrhaddq_u16(s1l, d1l); + d1h = vrhaddq_u16(s1h, d1h); + + vst1q_u16(dst, d0l); + vst1q_u16(dst + 8, d0h); + dst += dst_stride; + vst1q_u16(dst, d1l); + vst1q_u16(dst + 8, d1h); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 32) { // avg32 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + 
vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + src += src_stride; + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + dst += dst_stride; + h -= 2; + } while (h > 0); + } else { // avg64 + uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3; + do { + s0 = vld1q_u16(src); + s1 = vld1q_u16(src + 8); + s2 = vld1q_u16(src + 16); + s3 = vld1q_u16(src + 24); + d0 = vld1q_u16(dst); + d1 = vld1q_u16(dst + 8); + d2 = vld1q_u16(dst + 16); + d3 = vld1q_u16(dst + 24); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst, d0); + vst1q_u16(dst + 8, d1); + vst1q_u16(dst + 16, d2); + vst1q_u16(dst + 24, d3); + + s0 = vld1q_u16(src + 32); + s1 = vld1q_u16(src + 40); + s2 = vld1q_u16(src + 48); + s3 = vld1q_u16(src + 56); + d0 = vld1q_u16(dst + 32); + d1 = vld1q_u16(dst + 40); + d2 = vld1q_u16(dst + 48); + d3 = vld1q_u16(dst + 56); + + d0 = vrhaddq_u16(s0, d0); + d1 = vrhaddq_u16(s1, d1); + d2 = vrhaddq_u16(s2, d2); + d3 = vrhaddq_u16(s3, d3); + + vst1q_u16(dst + 32, d0); + vst1q_u16(dst + 40, d1); + vst1q_u16(dst + 48, d2); + vst1q_u16(dst + 56, d3); + src += src_stride; + dst += dst_stride; + } while (--h); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c new file mode 100644 index 00000000000..a980ab1a380 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +void vpx_highbd_convolve_copy_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int filter_x_stride, + const int16_t *filter_y, int filter_y_stride, + int w, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + (void)filter_x; + (void)filter_x_stride; + (void)filter_y; + (void)filter_y_stride; + (void)bd; + + if (w < 8) { // copy4 + do { + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + vst1_u16(dst, vld1_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w == 8) { // copy8 + do { + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + vst1q_u16(dst, vld1q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 2; + } while (h > 0); + } else if (w < 32) { // copy16 + do { + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + vst2q_u16(dst, vld2q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else if (w == 32) { // copy32 + do { + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } else { // copy64 + do { + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + vst4q_u16(dst, vld4q_u16(src)); + vst4q_u16(dst + 32, vld4q_u16(src + 32)); + src += src_stride; + dst += dst_stride; + h -= 4; + } while (h > 0); + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c new file mode 100644 index 00000000000..4e6e109920a --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/vpx_filter.h" +#include "vpx_ports/mem.h" + +void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, int w, + int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); + // + 1 to make it divisible by 4 + DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* Filter starting 3 lines back. The neon implementation will ignore the given * height and filter a multiple of 4 lines. Since this goes into the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ + vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), + src_stride, CONVERT_TO_BYTEPTR(temp), w, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, bd); + + /* Step into the temp buffer 3 lines to get the actual frame data */ + vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, + dst_stride, filter_x, x_step_q4, filter_y, + y_step_q4, w, h, bd); +} + +void vpx_highbd_convolve8_avg_neon(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h, int bd) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); + // + 1 to make it divisible by 4 + DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]); + const int intermediate_height = + (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; + + /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. 
+ */ + vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3), + src_stride, CONVERT_TO_BYTEPTR(temp), w, + filter_x, x_step_q4, filter_y, y_step_q4, w, + intermediate_height, bd); + vpx_highbd_convolve8_avg_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst, + dst_stride, filter_x, x_step_q4, filter_y, + y_step_q4, w, h, bd); +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm index dc459e20d9c..e3c0c5210d2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm @@ -25,9 +25,8 @@ |vpx_idct16x16_1_add_neon| PROC ldrsh r0, [r0] - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 ; out = dct_const_round_shift(input[0] * cospi_16_64) mul r0, r0, r12 ; input[0] * cospi_16_64 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c index 4035830f3c8..f1e49ff5178 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -21,7 +21,7 @@ void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q0s16; uint8_t *d1, *d2; - int16_t i, j, a1, cospi_16_64 = 11585; + int16_t i, j, a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 6); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm index 22a0c95941a..5e64cea0ae7 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm @@ -60,13 +60,11 @@ vld2.s16 {q1,q2}, [r0]! 
vmov.s16 q15, q1 - ; generate cospi_28_64 = 3196 - mov r3, #0xc00 - add r3, #0x7c + ; cospi_28_64 = 3196 + movw r3, #0x0c7c - ; generate cospi_4_64 = 16069 - mov r12, #0x3e00 - add r12, #0xc5 + ; cospi_4_64 = 16069 + movw r12, #0x3ec5 ; transpose the input data TRANSPOSE8X8 @@ -76,13 +74,11 @@ vdup.16 d1, r12 ; duplicate cospi_4_64 ; preloading to avoid stall - ; generate cospi_12_64 = 13623 - mov r3, #0x3500 - add r3, #0x37 + ; cospi_12_64 = 13623 + movw r3, #0x3537 - ; generate cospi_20_64 = 9102 - mov r12, #0x2300 - add r12, #0x8e + ; cospi_20_64 = 9102 + movw r12, #0x238e ; step2[4] * cospi_28_64 vmull.s16 q2, d18, d0 @@ -112,13 +108,11 @@ vqrshrn.s32 d15, q6, #14 ; >> 14 ; preloading to avoid stall - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 - ; generate cospi_24_64 = 6270 - mov r12, #0x1800 - add r12, #0x7e + ; cospi_24_64 = 6270 + movw r12, #0x187e ; step2[5] * cospi_12_64 vmull.s16 q2, d26, d2 @@ -155,9 +149,8 @@ vmull.s16 q0, d24, d30 vmull.s16 q1, d25, d30 - ; generate cospi_8_64 = 15137 - mov r3, #0x3b00 - add r3, #0x21 + ; cospi_8_64 = 15137 + movw r3, #0x3b21 vdup.16 d30, r12 ; duplicate cospi_24_64 vdup.16 d31, r3 ; duplicate cospi_8_64 @@ -208,9 +201,8 @@ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 ; stage 5 vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; @@ -307,13 +299,11 @@ vld2.s16 {q0,q1}, [r0]! vmov.s16 q15, q0; - ; generate cospi_30_64 = 1606 - mov r3, #0x0600 - add r3, #0x46 + ; cospi_30_64 = 1606 + movw r3, #0x0646 - ; generate cospi_2_64 = 16305 - mov r12, #0x3f00 - add r12, #0xb1 + ; cospi_2_64 = 16305 + movw r12, #0x3fb1 ; transpose the input data TRANSPOSE8X8 @@ -323,13 +313,11 @@ vdup.16 d13, r12 ; duplicate cospi_2_64 ; preloading to avoid stall - ; generate cospi_14_64 = 12665 - mov r3, #0x3100 - add r3, #0x79 + ; cospi_14_64 = 12665 + movw r3, #0x3179 - ; generate cospi_18_64 = 10394 - mov r12, #0x2800 - add r12, #0x9a + ; cospi_18_64 = 10394 + movw r12, #0x289a ; step1[8] * cospi_30_64 vmull.s16 q2, d16, d12 @@ -359,13 +347,11 @@ vqrshrn.s32 d15, q4, #14 ; >> 14 ; preloading to avoid stall - ; generate cospi_22_64 = 7723 - mov r3, #0x1e00 - add r3, #0x2b + ; cospi_22_64 = 7723 + movw r3, #0x1e2b - ; generate cospi_10_64 = 14449 - mov r12, #0x3800 - add r12, #0x71 + ; cospi_10_64 = 14449 + movw r12, #0x3871 ; step1[9] * cospi_14_64 vmull.s16 q2, d24, d30 @@ -411,13 +397,11 @@ vmlal.s16 q5, d27, d30 ; preloading to avoid stall - ; generate cospi_6_64 = 15679 - mov r3, #0x3d00 - add r3, #0x3f + ; cospi_6_64 = 15679 + movw r3, #0x3d3f - ; generate cospi_26_64 = 4756 - mov r12, #0x1200 - add r12, #0x94 + ; cospi_26_64 = 4756 + movw r12, #0x1294 vdup.16 d30, r3 ; duplicate cospi_6_64 vdup.16 d31, r12 ; duplicate cospi_26_64 @@ -466,13 +450,11 @@ vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e + ; cospi_24_64 = 6270 + movw r3, #0x187e - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 + ; cospi_8_64 = 15137 + movw r12, #0x3b21 ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 vdup.16 d30, r12 ; duplicate cospi_8_64 @@ -543,9 +525,8 @@ vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; ; stage 6. 
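On the constant-loading rewrites in the surrounding hunks: movw, available from ARMv6T2 on (and therefore on every NEON-capable core), loads a full 16-bit immediate in a single instruction, so each mov/add pair collapses into one instruction with no change in value; e.g. for cospi_16_64, 0x2d00 + 0x41 = 11520 + 65 = 11585 = 0x2d41.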
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 vdup.16 d14, r12 ; duplicate cospi_16_64 @@ -810,13 +791,11 @@ end_idct16x16_pass2 vld2.s16 {q1,q2}, [r0]! vmov.s16 q15, q1 - ; generate cospi_28_64*2 = 6392 - mov r3, #0x1800 - add r3, #0xf8 + ; cospi_28_64*2 = 6392 + movw r3, #0x18f8 - ; generate cospi_4_64*2 = 32138 - mov r12, #0x7d00 - add r12, #0x8a + ; cospi_4_64*2 = 32138 + movw r12, #0x7d8a ; transpose the input data TRANSPOSE8X8 @@ -833,9 +812,8 @@ end_idct16x16_pass2 vqrdmulh.s16 q4, q9, q0 ; preloading to avoid stall - ; generate cospi_16_64*2 = 23170 - mov r3, #0x5a00 - add r3, #0x82 + ; cospi_16_64*2 = 23170 + movw r3, #0x5a82 ; dct_const_round_shift(step2[4] * cospi_4_64); vqrdmulh.s16 q7, q9, q1 @@ -843,9 +821,8 @@ end_idct16x16_pass2 ; stage 4 vdup.16 q1, r3 ; cospi_16_64*2 - ; generate cospi_16_64 = 11585 - mov r3, #0x2d00 - add r3, #0x41 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 vdup.16 d4, r3; ; duplicate cospi_16_64 @@ -939,13 +916,11 @@ end_idct16x16_pass2 vld2.s16 {q0,q1}, [r0]! vmov.s16 q15, q0; - ; generate 2*cospi_30_64 = 3212 - mov r3, #0xc00 - add r3, #0x8c + ; 2*cospi_30_64 = 3212 + movw r3, #0x0c8c - ; generate 2*cospi_2_64 = 32610 - mov r12, #0x7f00 - add r12, #0x62 + ; 2*cospi_2_64 = 32610 + movw r12, #0x7f62 ; transpose the input data TRANSPOSE8X8 @@ -962,15 +937,13 @@ end_idct16x16_pass2 vqrdmulh.s16 q7, q8, q6 ; preloading to avoid stall - ; generate 2*cospi_26_64 = 9512 - mov r12, #0x2500 - add r12, #0x28 + ; 2*cospi_26_64 = 9512 + movw r12, #0x2528 rsb r12, #0 vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - ; generate 2*cospi_6_64 = 31358 - mov r3, #0x7a00 - add r3, #0x7e + ; 2*cospi_6_64 = 31358 + movw r3, #0x7a7e vdup.16 q14, r3 ; duplicate 2*cospi_6_64 ; dct_const_round_shift(- step1[12] * cospi_26_64) @@ -980,14 +953,12 @@ end_idct16x16_pass2 vqrdmulh.s16 q4, q9, q14 ; stage 4 - ; generate cospi_24_64 = 6270 - mov r3, #0x1800 - add r3, #0x7e + ; cospi_24_64 = 6270 + movw r3, #0x187e vdup.16 d31, r3 ; duplicate cospi_24_64 - ; generate cospi_8_64 = 15137 - mov r12, #0x3b00 - add r12, #0x21 + ; cospi_8_64 = 15137 + movw r12, #0x3b21 vdup.16 d30, r12 ; duplicate cospi_8_64 ; step1[14] * cospi_24_64 @@ -1052,9 +1023,8 @@ end_idct16x16_pass2 vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; ; stage 6. 
- ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 vdup.16 d14, r12 ; duplicate cospi_16_64 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index ce5cbcbcda5..f682afc7bf6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -73,8 +73,8 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); q2s32 = vmull_s16(d18s16, d0s16); q3s32 = vmull_s16(d19s16, d0s16); @@ -86,8 +86,8 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q5s32 = vmlal_s16(q5s32, d30s16, d0s16); q6s32 = vmlal_s16(q6s32, d31s16, d0s16); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); d8s16 = vqrshrn_n_s32(q2s32, 14); d9s16 = vqrshrn_n_s32(q3s32, 14); @@ -114,15 +114,15 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 - d30s16 = vdup_n_s16(cospi_16_64); + d30s16 = vdup_n_s16((int16_t)cospi_16_64); q2s32 = vmull_s16(d16s16, d30s16); q11s32 = vmull_s16(d17s16, d30s16); q0s32 = vmull_s16(d24s16, d30s16); q1s32 = vmull_s16(d25s16, d30s16); - d30s16 = vdup_n_s16(cospi_24_64); - d31s16 = vdup_n_s16(cospi_8_64); + d30s16 = vdup_n_s16((int16_t)cospi_24_64); + d31s16 = vdup_n_s16((int16_t)cospi_8_64); q3s32 = vaddq_s32(q2s32, q0s32); q12s32 = vaddq_s32(q11s32, q1s32); @@ -168,7 +168,7 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q2s16 = vsubq_s16(q9s16, q10s16); q3s16 = vsubq_s16(q8s16, q11s16); - d16s16 = vdup_n_s16(cospi_16_64); + d16s16 = vdup_n_s16((int16_t)cospi_16_64); q11s32 = vmull_s16(d26s16, d16s16); q12s32 = vmull_s16(d27s16, d16s16); @@ -313,8 +313,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d31s16 = vget_high_s16(q15s16); // stage 3 - d12s16 = vdup_n_s16(cospi_30_64); - d13s16 = vdup_n_s16(cospi_2_64); + d12s16 = vdup_n_s16((int16_t)cospi_30_64); + d13s16 = vdup_n_s16((int16_t)cospi_2_64); q2s32 = vmull_s16(d16s16, d12s16); q3s32 = vmull_s16(d17s16, d12s16); @@ -333,8 +333,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q0s16 = vcombine_s16(d0s16, d1s16); q7s16 = vcombine_s16(d14s16, d15s16); - d30s16 = vdup_n_s16(cospi_14_64); - d31s16 = vdup_n_s16(cospi_18_64); + d30s16 = vdup_n_s16((int16_t)cospi_14_64); + d31s16 = vdup_n_s16((int16_t)cospi_18_64); q2s32 = vmull_s16(d24s16, d30s16); q3s32 = vmull_s16(d25s16, d30s16); @@ -353,8 +353,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q1s16 = vcombine_s16(d2s16, d3s16); q6s16 = vcombine_s16(d12s16, d13s16); - d30s16 = vdup_n_s16(cospi_22_64); - d31s16 = vdup_n_s16(cospi_10_64); + d30s16 = vdup_n_s16((int16_t)cospi_22_64); + d31s16 = vdup_n_s16((int16_t)cospi_10_64); q11s32 = vmull_s16(d20s16, d30s16); q12s32 = vmull_s16(d21s16, d30s16); @@ -373,8 +373,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q2s16 = vcombine_s16(d4s16, d5s16); q5s16 = vcombine_s16(d10s16, d11s16); - d30s16 = vdup_n_s16(cospi_6_64); - d31s16 = vdup_n_s16(cospi_26_64); + d30s16 = 
vdup_n_s16((int16_t)cospi_6_64); + d31s16 = vdup_n_s16((int16_t)cospi_26_64); q10s32 = vmull_s16(d28s16, d30s16); q11s32 = vmull_s16(d29s16, d30s16); @@ -413,8 +413,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); q2s32 = vmull_s16(d18s16, d31s16); q3s32 = vmull_s16(d19s16, d31s16); @@ -474,7 +474,7 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); - d14s16 = vdup_n_s16(cospi_16_64); + d14s16 = vdup_n_s16((int16_t)cospi_16_64); q3s32 = vmull_s16(d26s16, d14s16); q4s32 = vmull_s16(d27s16, d14s16); @@ -837,15 +837,15 @@ void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, &q15s16); // stage 3 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); + q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q7s16 = vqrdmulhq_s16(q9s16, q1s16); // stage 4 - q1s16 = vdupq_n_s16(cospi_16_64 * 2); - d4s16 = vdup_n_s16(cospi_16_64); + q1s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); + d4s16 = vdup_n_s16((int16_t)cospi_16_64); q8s16 = vqrdmulhq_s16(q8s16, q1s16); @@ -979,13 +979,13 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, &q15s16); // stage 3 - q6s16 = vdupq_n_s16(cospi_30_64 * 2); + q6s16 = vdupq_n_s16((int16_t)cospi_30_64 * 2); q0s16 = vqrdmulhq_s16(q8s16, q6s16); - q6s16 = vdupq_n_s16(cospi_2_64 * 2); + q6s16 = vdupq_n_s16((int16_t)cospi_2_64 * 2); q7s16 = vqrdmulhq_s16(q8s16, q6s16); - q15s16 = vdupq_n_s16(-cospi_26_64 * 2); - q14s16 = vdupq_n_s16(cospi_6_64 * 2); + q15s16 = vdupq_n_s16((int16_t)-cospi_26_64 * 2); + q14s16 = vdupq_n_s16((int16_t)cospi_6_64 * 2); q3s16 = vqrdmulhq_s16(q9s16, q15s16); q4s16 = vqrdmulhq_s16(q9s16, q14s16); @@ -999,8 +999,8 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, d14s16 = vget_low_s16(q7s16); d15s16 = vget_high_s16(q7s16); - d30s16 = vdup_n_s16(cospi_8_64); - d31s16 = vdup_n_s16(cospi_24_64); + d30s16 = vdup_n_s16((int16_t)cospi_8_64); + d31s16 = vdup_n_s16((int16_t)cospi_24_64); q12s32 = vmull_s16(d14s16, d31s16); q5s32 = vmull_s16(d15s16, d31s16); @@ -1057,7 +1057,7 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); - d14s16 = vdup_n_s16(cospi_16_64); + d14s16 = vdup_n_s16((int16_t)cospi_16_64); q3s32 = vmull_s16(d26s16, d14s16); q4s32 = vmull_s16(d27s16, d14s16); q0s32 = vmull_s16(d20s16, d14s16); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm deleted file mode 100644 index 96d276b4d14..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.asm +++ /dev/null @@ -1,144 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. 
-; - - EXPORT |vpx_idct32x32_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ;TODO(hkuang): put the following macros in a separate - ;file so other idct functions could also use them. - MACRO - LD_16x8 $src, $stride - vld1.8 {q8}, [$src], $stride - vld1.8 {q9}, [$src], $stride - vld1.8 {q10}, [$src], $stride - vld1.8 {q11}, [$src], $stride - vld1.8 {q12}, [$src], $stride - vld1.8 {q13}, [$src], $stride - vld1.8 {q14}, [$src], $stride - vld1.8 {q15}, [$src], $stride - MEND - - MACRO - ADD_DIFF_16x8 $diff - vqadd.u8 q8, q8, $diff - vqadd.u8 q9, q9, $diff - vqadd.u8 q10, q10, $diff - vqadd.u8 q11, q11, $diff - vqadd.u8 q12, q12, $diff - vqadd.u8 q13, q13, $diff - vqadd.u8 q14, q14, $diff - vqadd.u8 q15, q15, $diff - MEND - - MACRO - SUB_DIFF_16x8 $diff - vqsub.u8 q8, q8, $diff - vqsub.u8 q9, q9, $diff - vqsub.u8 q10, q10, $diff - vqsub.u8 q11, q11, $diff - vqsub.u8 q12, q12, $diff - vqsub.u8 q13, q13, $diff - vqsub.u8 q14, q14, $diff - vqsub.u8 q15, q15, $diff - MEND - - MACRO - ST_16x8 $dst, $stride - vst1.8 {q8}, [$dst], $stride - vst1.8 {q9}, [$dst], $stride - vst1.8 {q10},[$dst], $stride - vst1.8 {q11},[$dst], $stride - vst1.8 {q12},[$dst], $stride - vst1.8 {q13},[$dst], $stride - vst1.8 {q14},[$dst], $stride - vst1.8 {q15},[$dst], $stride - MEND - -;void vpx_idct32x32_1_add_neon(int16_t *input, uint8_t *dest, -; int dest_stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int dest_stride - -|vpx_idct32x32_1_add_neon| PROC - push {lr} - pld [r1] - add r3, r1, #16 ; r3 dest + 16 for second loop - ldrsh r0, [r0] - - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asrs r0, r0, #6 ; >> 6 - bge diff_positive_32_32 - -diff_negative_32_32 - neg r0, r0 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_negative_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - SUB_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_negative_32_32_loop - pop {pc} - -diff_positive_32_32 - usat r0, #8, r0 - vdup.u8 q0, r0 - mov r0, #4 - -diff_positive_32_32_loop - sub r0, #1 - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - - LD_16x8 r1, r2 - ADD_DIFF_16x8 q0 - ST_16x8 r12, r2 - cmp r0, #2 - moveq r1, r3 - moveq r12, r3 - cmp r0, #0 - bne diff_positive_32_32_loop - pop {pc} - - ENDP ; |vpx_idct32x32_1_add_neon| - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c index 9dfdf8d6965..6be4b01229b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -94,7 +94,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; int i, j, dest_stride8; uint8_t *d; - int16_t a1, cospi_16_64 = 11585; + int16_t a1; int16_t out = dct_const_round_shift(input[0] * 
cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); @@ -103,7 +103,7 @@ void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, dest_stride8 = dest_stride * 8; if (a1 >= 0) { // diff_positive_32_32 a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop d = dest; for (j = 0; j < 4; j++) { @@ -119,7 +119,7 @@ } else { // diff_negative_32_32 a1 = -a1; a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8(a1); + q0u8 = vdupq_n_u8((uint8_t)a1); for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop d = dest; for (j = 0; j < 4; j++) { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c new file mode 100644 index 00000000000..ebec9df54ad --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_34_add_neon.c @@ -0,0 +1,519 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/txfm_common.h" + +// Only for the first pass of the _34_ variant. Since it only uses values from +// the top left 8x8 it can safely assume all the remaining values are 0 and skip +// an awful lot of calculations. In fact, only the first 6 columns make the cut. +// None of the elements in the 7th or 8th column are used so it skips any calls +// to input[6|7] too. +// In C this does a single row of 32 for each call. Here it transposes the top +// left 8x8 to allow using SIMD. 
+ +// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero +// coefficients as follows: +// 0 1 2 3 4 5 6 7 +// 0 0 2 5 10 17 25 +// 1 1 4 8 15 22 30 +// 2 3 7 12 18 28 +// 3 6 11 16 23 31 +// 4 9 14 19 29 +// 5 13 20 26 +// 6 21 27 33 +// 7 24 32 +static void idct32_6_neon(const int16_t *input, int16_t *output) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 32, &in0, &in1, &in2, &in3, &in4, &in5, + &in6, &in7); + + // stage 1 + // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_17, -cospi_8_64, s1_30, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_17, cospi_24_64, s1_30, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_8_64, s1_31, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_24_64, s1_31, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = 
multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 = vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s2_14, s2_9, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s2_9, s2_14, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s2_15, s2_8, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s2_8, s2_15, cospi_16_64); + + s2_16 = vaddq_s16(s1_16, s2_23); + s2_17 = vaddq_s16(s1_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s2_22 = vsubq_s16(s1_17, s2_22); + s2_23 = vsubq_s16(s1_16, s2_23); + + s3_24 = vsubq_s16(s1_31, s2_24); + s3_25 = vsubq_s16(s1_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s1_30); + s2_31 = vaddq_s16(s2_24, s1_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s2_15); + s1_1 = vaddq_s16(s2_1, s2_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s2_9); + s1_7 = vaddq_s16(s2_7, s2_8); + s1_8 = vsubq_s16(s2_7, s2_8); + s1_9 = vsubq_s16(s2_6, s2_9); + s1_10 = vsubq_s16(s2_5, s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s2_14); + s1_15 = vsubq_s16(s2_0, s2_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s1_22 = sub_multiply_shift_and_narrow_s16(s3_25, s2_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s2_22, s3_25, cospi_16_64); + + s1_23 = sub_multiply_shift_and_narrow_s16(s3_24, s2_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s2_23, s3_24, cospi_16_64); + + // final stage + vst1q_s16(output, vaddq_s16(s1_0, s2_31)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_7, s1_24)); + output += 8; + + vst1q_s16(output, vaddq_s16(s1_8, s1_23)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vaddq_s16(s1_15, s2_16)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_15, 
s2_16)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_14, s2_17)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_13, s2_18)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_12, s2_19)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_11, s1_20)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_10, s1_21)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_9, s1_22)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_8, s1_23)); + output += 8; + + vst1q_s16(output, vsubq_s16(s1_7, s1_24)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_6, s1_25)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_5, s1_26)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_4, s1_27)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_3, s2_28)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_2, s2_29)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_1, s2_30)); + output += 8; + vst1q_s16(output, vsubq_s16(s1_0, s2_31)); +} + +static void idct32_8_neon(const int16_t *input, uint8_t *output, int stride) { + int16x8_t in0, in1, in2, in3, in4, in5, in6, in7; + int16x8_t out0, out1, out2, out3, out4, out5, out6, out7; + int16x8_t s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8, s1_9, s1_10, + s1_11, s1_12, s1_13, s1_14, s1_15, s1_16, s1_17, s1_18, s1_19, s1_20, + s1_21, s1_22, s1_23, s1_24, s1_25, s1_26, s1_27, s1_28, s1_29, s1_30, + s1_31; + int16x8_t s2_0, s2_1, s2_2, s2_3, s2_4, s2_5, s2_6, s2_7, s2_8, s2_9, s2_10, + s2_11, s2_12, s2_13, s2_14, s2_15, s2_16, s2_17, s2_18, s2_19, s2_20, + s2_21, s2_22, s2_23, s2_24, s2_25, s2_26, s2_27, s2_28, s2_29, s2_30, + s2_31; + int16x8_t s3_24, s3_25, s3_26, s3_27; + + load_and_transpose_s16_8x8(input, 8, &in0, &in1, &in2, &in3, &in4, &in5, &in6, + &in7); + + // stage 1 + s1_16 = multiply_shift_and_narrow_s16(in1, cospi_31_64); + s1_31 = multiply_shift_and_narrow_s16(in1, cospi_1_64); + + // Different for _8_ + s1_19 = multiply_shift_and_narrow_s16(in7, -cospi_25_64); + s1_28 = multiply_shift_and_narrow_s16(in7, cospi_7_64); + + s1_20 = multiply_shift_and_narrow_s16(in5, cospi_27_64); + s1_27 = multiply_shift_and_narrow_s16(in5, cospi_5_64); + + s1_23 = multiply_shift_and_narrow_s16(in3, -cospi_29_64); + s1_24 = multiply_shift_and_narrow_s16(in3, cospi_3_64); + + // stage 2 + s2_8 = multiply_shift_and_narrow_s16(in2, cospi_30_64); + s2_15 = multiply_shift_and_narrow_s16(in2, cospi_2_64); + + s2_11 = multiply_shift_and_narrow_s16(in6, -cospi_26_64); + s2_12 = multiply_shift_and_narrow_s16(in6, cospi_6_64); + + // stage 3 + s1_4 = multiply_shift_and_narrow_s16(in4, cospi_28_64); + s1_7 = multiply_shift_and_narrow_s16(in4, cospi_4_64); + + s1_17 = multiply_accumulate_shift_and_narrow_s16(s1_16, -cospi_4_64, s1_31, + cospi_28_64); + s1_30 = multiply_accumulate_shift_and_narrow_s16(s1_16, cospi_28_64, s1_31, + cospi_4_64); + + // Different for _8_ + s1_18 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_28_64, s1_28, + -cospi_4_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s1_19, -cospi_4_64, s1_28, + cospi_28_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s1_20, -cospi_20_64, s1_27, + cospi_12_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s1_20, cospi_12_64, s1_27, + cospi_20_64); + + s1_22 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_12_64, s1_24, + -cospi_20_64); + s1_25 = multiply_accumulate_shift_and_narrow_s16(s1_23, -cospi_20_64, s1_24, + cospi_12_64); + + // stage 4 + s1_0 = multiply_shift_and_narrow_s16(in0, cospi_16_64); + + s2_9 = multiply_accumulate_shift_and_narrow_s16(s2_8, -cospi_8_64, s2_15, + 
cospi_24_64); + s2_14 = multiply_accumulate_shift_and_narrow_s16(s2_8, cospi_24_64, s2_15, + cospi_8_64); + + s2_10 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_24_64, s2_12, + -cospi_8_64); + s2_13 = multiply_accumulate_shift_and_narrow_s16(s2_11, -cospi_8_64, s2_12, + cospi_24_64); + + s2_16 = vaddq_s16(s1_16, s1_19); + + s2_17 = vaddq_s16(s1_17, s1_18); + s2_18 = vsubq_s16(s1_17, s1_18); + + s2_19 = vsubq_s16(s1_16, s1_19); + + s2_20 = vsubq_s16(s1_23, s1_20); + s2_21 = vsubq_s16(s1_22, s1_21); + + s2_22 = vaddq_s16(s1_21, s1_22); + s2_23 = vaddq_s16(s1_20, s1_23); + + s2_24 = vaddq_s16(s1_24, s1_27); + s2_25 = vaddq_s16(s1_25, s1_26); + s2_26 = vsubq_s16(s1_25, s1_26); + s2_27 = vsubq_s16(s1_24, s1_27); + + s2_28 = vsubq_s16(s1_31, s1_28); + s2_29 = vsubq_s16(s1_30, s1_29); + s2_30 = vaddq_s16(s1_29, s1_30); + s2_31 = vaddq_s16(s1_28, s1_31); + + // stage 5 + s1_5 = sub_multiply_shift_and_narrow_s16(s1_7, s1_4, cospi_16_64); + s1_6 = add_multiply_shift_and_narrow_s16(s1_4, s1_7, cospi_16_64); + + s1_8 = vaddq_s16(s2_8, s2_11); + s1_9 = vaddq_s16(s2_9, s2_10); + s1_10 = vsubq_s16(s2_9, s2_10); + s1_11 = vsubq_s16(s2_8, s2_11); + s1_12 = vsubq_s16(s2_15, s2_12); + s1_13 = vsubq_s16(s2_14, s2_13); + s1_14 = vaddq_s16(s2_13, s2_14); + s1_15 = vaddq_s16(s2_12, s2_15); + + s1_18 = multiply_accumulate_shift_and_narrow_s16(s2_18, -cospi_8_64, s2_29, + cospi_24_64); + s1_29 = multiply_accumulate_shift_and_narrow_s16(s2_18, cospi_24_64, s2_29, + cospi_8_64); + + s1_19 = multiply_accumulate_shift_and_narrow_s16(s2_19, -cospi_8_64, s2_28, + cospi_24_64); + s1_28 = multiply_accumulate_shift_and_narrow_s16(s2_19, cospi_24_64, s2_28, + cospi_8_64); + + s1_20 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_24_64, s2_27, + -cospi_8_64); + s1_27 = multiply_accumulate_shift_and_narrow_s16(s2_20, -cospi_8_64, s2_27, + cospi_24_64); + + s1_21 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_24_64, s2_26, + -cospi_8_64); + s1_26 = multiply_accumulate_shift_and_narrow_s16(s2_21, -cospi_8_64, s2_26, + cospi_24_64); + + // stage 6 + s2_0 = vaddq_s16(s1_0, s1_7); + s2_1 = vaddq_s16(s1_0, s1_6); + s2_2 = vaddq_s16(s1_0, s1_5); + s2_3 = vaddq_s16(s1_0, s1_4); + s2_4 = vsubq_s16(s1_0, s1_4); + s2_5 = vsubq_s16(s1_0, s1_5); + s2_6 = vsubq_s16(s1_0, s1_6); + s2_7 = vsubq_s16(s1_0, s1_7); + + s2_10 = sub_multiply_shift_and_narrow_s16(s1_13, s1_10, cospi_16_64); + s2_13 = add_multiply_shift_and_narrow_s16(s1_10, s1_13, cospi_16_64); + + s2_11 = sub_multiply_shift_and_narrow_s16(s1_12, s1_11, cospi_16_64); + s2_12 = add_multiply_shift_and_narrow_s16(s1_11, s1_12, cospi_16_64); + + s1_16 = vaddq_s16(s2_16, s2_23); + s1_17 = vaddq_s16(s2_17, s2_22); + s2_18 = vaddq_s16(s1_18, s1_21); + s2_19 = vaddq_s16(s1_19, s1_20); + s2_20 = vsubq_s16(s1_19, s1_20); + s2_21 = vsubq_s16(s1_18, s1_21); + s1_22 = vsubq_s16(s2_17, s2_22); + s1_23 = vsubq_s16(s2_16, s2_23); + + s3_24 = vsubq_s16(s2_31, s2_24); + s3_25 = vsubq_s16(s2_30, s2_25); + s3_26 = vsubq_s16(s1_29, s1_26); + s3_27 = vsubq_s16(s1_28, s1_27); + s2_28 = vaddq_s16(s1_27, s1_28); + s2_29 = vaddq_s16(s1_26, s1_29); + s2_30 = vaddq_s16(s2_25, s2_30); + s2_31 = vaddq_s16(s2_24, s2_31); + + // stage 7 + s1_0 = vaddq_s16(s2_0, s1_15); + s1_1 = vaddq_s16(s2_1, s1_14); + s1_2 = vaddq_s16(s2_2, s2_13); + s1_3 = vaddq_s16(s2_3, s2_12); + s1_4 = vaddq_s16(s2_4, s2_11); + s1_5 = vaddq_s16(s2_5, s2_10); + s1_6 = vaddq_s16(s2_6, s1_9); + s1_7 = vaddq_s16(s2_7, s1_8); + s1_8 = vsubq_s16(s2_7, s1_8); + s1_9 = vsubq_s16(s2_6, s1_9); + s1_10 = vsubq_s16(s2_5, 
s2_10); + s1_11 = vsubq_s16(s2_4, s2_11); + s1_12 = vsubq_s16(s2_3, s2_12); + s1_13 = vsubq_s16(s2_2, s2_13); + s1_14 = vsubq_s16(s2_1, s1_14); + s1_15 = vsubq_s16(s2_0, s1_15); + + s1_20 = sub_multiply_shift_and_narrow_s16(s3_27, s2_20, cospi_16_64); + s1_27 = add_multiply_shift_and_narrow_s16(s2_20, s3_27, cospi_16_64); + + s1_21 = sub_multiply_shift_and_narrow_s16(s3_26, s2_21, cospi_16_64); + s1_26 = add_multiply_shift_and_narrow_s16(s2_21, s3_26, cospi_16_64); + + s2_22 = sub_multiply_shift_and_narrow_s16(s3_25, s1_22, cospi_16_64); + s1_25 = add_multiply_shift_and_narrow_s16(s1_22, s3_25, cospi_16_64); + + s2_23 = sub_multiply_shift_and_narrow_s16(s3_24, s1_23, cospi_16_64); + s1_24 = add_multiply_shift_and_narrow_s16(s1_23, s3_24, cospi_16_64); + + // final stage + out0 = vaddq_s16(s1_0, s2_31); + out1 = vaddq_s16(s1_1, s2_30); + out2 = vaddq_s16(s1_2, s2_29); + out3 = vaddq_s16(s1_3, s2_28); + out4 = vaddq_s16(s1_4, s1_27); + out5 = vaddq_s16(s1_5, s1_26); + out6 = vaddq_s16(s1_6, s1_25); + out7 = vaddq_s16(s1_7, s1_24); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, output, + stride); + + out0 = vaddq_s16(s1_8, s2_23); + out1 = vaddq_s16(s1_9, s2_22); + out2 = vaddq_s16(s1_10, s1_21); + out3 = vaddq_s16(s1_11, s1_20); + out4 = vaddq_s16(s1_12, s2_19); + out5 = vaddq_s16(s1_13, s2_18); + out6 = vaddq_s16(s1_14, s1_17); + out7 = vaddq_s16(s1_15, s1_16); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (8 * stride), stride); + + out0 = vsubq_s16(s1_15, s1_16); + out1 = vsubq_s16(s1_14, s1_17); + out2 = vsubq_s16(s1_13, s2_18); + out3 = vsubq_s16(s1_12, s2_19); + out4 = vsubq_s16(s1_11, s1_20); + out5 = vsubq_s16(s1_10, s1_21); + out6 = vsubq_s16(s1_9, s2_22); + out7 = vsubq_s16(s1_8, s2_23); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (16 * stride), stride); + + out0 = vsubq_s16(s1_7, s1_24); + out1 = vsubq_s16(s1_6, s1_25); + out2 = vsubq_s16(s1_5, s1_26); + out3 = vsubq_s16(s1_4, s1_27); + out4 = vsubq_s16(s1_3, s2_28); + out5 = vsubq_s16(s1_2, s2_29); + out6 = vsubq_s16(s1_1, s2_30); + out7 = vsubq_s16(s1_0, s2_31); + + add_and_store_u8_s16(out0, out1, out2, out3, out4, out5, out6, out7, + output + (24 * stride), stride); +} + +void vpx_idct32x32_34_add_neon(const int16_t *input, uint8_t *dest, + int stride) { + int i; + int16_t temp[32 * 8]; + int16_t *t = temp; + + idct32_6_neon(input, t); + + for (i = 0; i < 32; i += 8) { + idct32_8_neon(t, dest, stride); + t += (8 * 8); + dest += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm deleted file mode 100644 index 7483ee77e18..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.asm +++ /dev/null @@ -1,1299 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -;TODO(cd): adjust these constant to be able to use vqdmulh for faster -; dct_const_round_shift(a * b) within butterfly calculations. 
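The deleted assembly below opens with a TODO(cd) asking for constants that work with vqdmulh; the new C implementations above already take that route, through helpers in vpx_dsp/arm/idct_neon.h which this diff does not show. A hedged sketch of the central helper, reconstructed from that header:

#include <arm_neon.h>
/* vqrdmulh yields the doubled, rounded high half of the product, i.e. an
 * arithmetic shift right by 15; doubling the cosine constant up front
 * turns that into dct_const_round_shift's shift by DCT_CONST_BITS (14).
 * The largest constant, cospi_1_64 = 16364, still fits in int16_t when
 * doubled (32728). */
static inline int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
                                                      const int16_t a_const) {
  return vqrdmulhq_n_s16(a, (int16_t)(a_const * 2));
}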
-cospi_1_64 EQU 16364 -cospi_2_64 EQU 16305 -cospi_3_64 EQU 16207 -cospi_4_64 EQU 16069 -cospi_5_64 EQU 15893 -cospi_6_64 EQU 15679 -cospi_7_64 EQU 15426 -cospi_8_64 EQU 15137 -cospi_9_64 EQU 14811 -cospi_10_64 EQU 14449 -cospi_11_64 EQU 14053 -cospi_12_64 EQU 13623 -cospi_13_64 EQU 13160 -cospi_14_64 EQU 12665 -cospi_15_64 EQU 12140 -cospi_16_64 EQU 11585 -cospi_17_64 EQU 11003 -cospi_18_64 EQU 10394 -cospi_19_64 EQU 9760 -cospi_20_64 EQU 9102 -cospi_21_64 EQU 8423 -cospi_22_64 EQU 7723 -cospi_23_64 EQU 7005 -cospi_24_64 EQU 6270 -cospi_25_64 EQU 5520 -cospi_26_64 EQU 4756 -cospi_27_64 EQU 3981 -cospi_28_64 EQU 3196 -cospi_29_64 EQU 2404 -cospi_30_64 EQU 1606 -cospi_31_64 EQU 804 - - - EXPORT |vpx_idct32x32_1024_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - AREA Block, CODE, READONLY - - ; -------------------------------------------------------------------------- - ; Load from transposed_buffer - ; q13 = transposed_buffer[first_offset] - ; q14 = transposed_buffer[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; transposed_buffer must be passed in. use 0 for first use. - MACRO - LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset - ; address calculation with proper stride and loading - add r0, #($first_offset - $prev_offset )*8*2 - vld1.s16 {q14}, [r0] - add r0, #($second_offset - $first_offset)*8*2 - vld1.s16 {q13}, [r0] - ; (used) two registers (q14, q13) - MEND - ; -------------------------------------------------------------------------- - ; Load from output (used as temporary storage) - ; reg1 = output[first_offset] - ; reg2 = output[second_offset] - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. - MACRO - LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and loading - add r1, #($first_offset - $prev_offset )*32*2 - vld1.s16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vld1.s16 {$reg2}, [r1] - ; (used) two registers ($reg1, $reg2) - MEND - ; -------------------------------------------------------------------------- - ; Store into output (sometimes as as temporary storage) - ; output[first_offset] = reg1 - ; output[second_offset] = reg2 - ; for proper address calculation, the last offset used when manipulating - ; output, whether reading or storing) must be passed in. use 0 for first - ; use. 
- MACRO - STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2 - ; address calculation with proper stride and storing - add r1, #($first_offset - $prev_offset )*32*2 - vst1.16 {$reg1}, [r1] - add r1, #($second_offset - $first_offset)*32*2 - vst1.16 {$reg2}, [r1] - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10] - vst1.16 {d11}, [r9] - ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q6-q9 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_CENTER_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d8}, [r10], r2 - vld1.s16 {d11}, [r9], r11 - vld1.s16 {d9}, [r10] - vld1.s16 {d10}, [r9] - ; ROUND_POWER_OF_TWO - vrshr.s16 q7, q7, #6 - vrshr.s16 q8, q8, #6 - vrshr.s16 q9, q9, #6 - vrshr.s16 q6, q6, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q7, q7, d9 - vaddw.u8 q8, q8, d10 - vaddw.u8 q9, q9, d11 - vaddw.u8 q6, q6, d8 - ; clip pixel - vqmovun.s16 d9, q7 - vqmovun.s16 d10, q8 - vqmovun.s16 d11, q9 - vqmovun.s16 d8, q6 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d9}, [r10], r11 - vst1.16 {d10}, [r9], r2 - vst1.16 {d8}, [r10]! - vst1.16 {d11}, [r9]! 
- ; update pointers (by dest_stride * 2) - sub r9, r9, r2, lsl #1 - add r10, r10, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6] - vst1.16 {d4}, [r7] - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Combine-add results with current destination content - ; q4-q7 contain the results (out[j * 32 + 0-31]) - MACRO - STORE_COMBINE_EXTREME_RESULTS_LAST - ; load dest[j * dest_stride + 0-31] - vld1.s16 {d4}, [r7], r2 - vld1.s16 {d7}, [r6], r11 - vld1.s16 {d5}, [r7] - vld1.s16 {d6}, [r6] - ; ROUND_POWER_OF_TWO - vrshr.s16 q5, q5, #6 - vrshr.s16 q6, q6, #6 - vrshr.s16 q7, q7, #6 - vrshr.s16 q4, q4, #6 - ; add to dest[j * dest_stride + 0-31] - vaddw.u8 q5, q5, d5 - vaddw.u8 q6, q6, d6 - vaddw.u8 q7, q7, d7 - vaddw.u8 q4, q4, d4 - ; clip pixel - vqmovun.s16 d5, q5 - vqmovun.s16 d6, q6 - vqmovun.s16 d7, q7 - vqmovun.s16 d4, q4 - ; store back into dest[j * dest_stride + 0-31] - vst1.16 {d5}, [r7], r11 - vst1.16 {d6}, [r6], r2 - vst1.16 {d7}, [r6]! - vst1.16 {d4}, [r7]! - ; update pointers (by dest_stride * 2) - sub r6, r6, r2, lsl #1 - add r7, r7, r2, lsl #1 - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - ; TODO(cd): have special case to re-use constants when they are similar for - ; consecutive butterflies - ; TODO(cd): have special case when both constants are the same, do the - ; additions/subtractions before the multiplies. 
- ; generate the constants - ; generate scalar constants - mov r8, #$first_constant & 0xFF00 - mov r12, #$second_constant & 0xFF00 - add r8, #$first_constant & 0x00FF - add r12, #$second_constant & 0x00FF - ; generate vector constants - vdup.16 d30, r8 - vdup.16 d31, r12 - ; (used) two for inputs (regA-regD), one for constants (q15) - ; do some multiplications (ordered for maximum latency hiding) - vmull.s16 q8, $regC, d30 - vmull.s16 q10, $regA, d31 - vmull.s16 q9, $regD, d30 - vmull.s16 q11, $regB, d31 - vmull.s16 q12, $regC, d31 - ; (used) five for intermediate (q8-q12), one for constants (q15) - ; do some addition/subtractions (to get back two registers) - vsub.s32 q8, q8, q10 - vsub.s32 q9, q9, q11 - ; do more multiplications (ordered for maximum latency hiding) - vmull.s16 q10, $regD, d31 - vmull.s16 q11, $regA, d30 - vmull.s16 q15, $regB, d30 - ; (used) six for intermediate (q8-q12, q15) - ; do more addition/subtractions - vadd.s32 q11, q12, q11 - vadd.s32 q10, q10, q15 - ; (used) four for intermediate (q8-q11) - ; dct_const_round_shift - vqrshrn.s32 $reg1, q8, #14 - vqrshrn.s32 $reg2, q9, #14 - vqrshrn.s32 $reg3, q11, #14 - vqrshrn.s32 $reg4, q10, #14 - ; (used) two for results, well four d registers - MEND - ; -------------------------------------------------------------------------- - ; Touches q8-q12, q15 (q13-q14 are preserved) - ; valid output registers are anything but q8-q11 - MACRO - DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4 - MEND - ; -------------------------------------------------------------------------- - -;void vpx_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride); -; -; r0 int16_t *input, -; r1 uint8_t *dest, -; r2 int dest_stride) -; loop counters -; r4 bands loop counter -; r5 pass loop counter -; r8 transpose loop counter -; combine-add pointers -; r6 dest + 31 * dest_stride, descending (30, 29, 28, ...) -; r7 dest + 0 * dest_stride, ascending (1, 2, 3, ...) -; r9 dest + 15 * dest_stride, descending (14, 13, 12, ...) -; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...) - -|vpx_idct32x32_1024_add_neon| PROC - ; This function does one pass of the idct32x32 transform. - ; - ; This is done by transposing the input and then doing a 1d transform on - ; columns. In the first pass, the transposed columns are the original - ; rows. In the second pass, after the transposition, the columns are the - ; original columns. - ; The 1d transform is done by looping over bands of eight columns (the - ; idct32_bands loop). For each band, the transform input transposition - ; is done on demand, one band of four 8x8 matrices at a time. The four - ; matrices are transposed by pairs (the idct32_transpose_pair loop). 
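For reference, the per-lane rotation that DO_BUTTERFLY above implements, and that the ;temp1/;temp2 comments in the stages below spell out, is the standard DCT butterfly. A minimal C restatement (names invented here; DCT_CONST_BITS and the rounding follow vpx_dsp/txfm_common.h):

#include <stdint.h>
#define DCT_CONST_BITS 14
static int16_t dct_const_round_shift(int32_t x) {
  return (int16_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}
/* out1/out2 correspond to DO_BUTTERFLY's $reg1/$reg2 and $reg3/$reg4
 * result pairs; the vqrshrn.s32 #14 in the macro is the rounding shift. */
static void butterfly(int16_t x, int16_t y, int16_t c1, int16_t c2,
                      int16_t *out1, int16_t *out2) {
  *out1 = dct_const_round_shift(x * c1 - y * c2);
  *out2 = dct_const_round_shift(x * c2 + y * c1);
}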
- push {r4-r11} - vpush {d8-d15} - ; stack operation - ; internal buffer used to transpose 8 lines into before transforming them - ; int16_t transpose_buffer[32 * 8]; - ; at sp + [4096, 4607] - ; results of the first pass (transpose and transform rows) - ; int16_t pass1[32 * 32]; - ; at sp + [0, 2047] - ; results of the second pass (transpose and transform columns) - ; int16_t pass2[32 * 32]; - ; at sp + [2048, 4095] - sub sp, sp, #512+2048+2048 - - ; r6 = dest + 31 * dest_stride - ; r7 = dest + 0 * dest_stride - ; r9 = dest + 15 * dest_stride - ; r10 = dest + 16 * dest_stride - rsb r6, r2, r2, lsl #5 - rsb r9, r2, r2, lsl #4 - add r10, r1, r2, lsl #4 - mov r7, r1 - add r6, r6, r1 - add r9, r9, r1 - ; r11 = -dest_stride - neg r11, r2 - ; r3 = input - mov r3, r0 - ; parameters for first pass - ; r0 = transpose_buffer[32 * 8] - add r0, sp, #4096 - ; r1 = pass1[32 * 32] - mov r1, sp - - mov r5, #0 ; initialize pass loop counter -idct32_pass_loop - mov r4, #4 ; initialize bands loop counter -idct32_bands_loop - mov r8, #2 ; initialize transpose loop counter -idct32_transpose_pair_loop - ; Load two horizontally consecutive 8x8 16bit data matrices. The first one - ; into q0-q7 and the second one into q8-q15. There is a stride of 64, - ; adjusted to 32 because of the two post-increments. - vld1.s16 {q8}, [r3]! - vld1.s16 {q0}, [r3]! - add r3, #32 - vld1.s16 {q9}, [r3]! - vld1.s16 {q1}, [r3]! - add r3, #32 - vld1.s16 {q10}, [r3]! - vld1.s16 {q2}, [r3]! - add r3, #32 - vld1.s16 {q11}, [r3]! - vld1.s16 {q3}, [r3]! - add r3, #32 - vld1.s16 {q12}, [r3]! - vld1.s16 {q4}, [r3]! - add r3, #32 - vld1.s16 {q13}, [r3]! - vld1.s16 {q5}, [r3]! - add r3, #32 - vld1.s16 {q14}, [r3]! - vld1.s16 {q6}, [r3]! - add r3, #32 - vld1.s16 {q15}, [r3]! - vld1.s16 {q7}, [r3]! - - ; Transpose the two 8x8 16bit data matrices. - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vswp d1, d8 - vswp d7, d14 - vswp d5, d12 - vswp d3, d10 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.32 q0, q2 - vtrn.32 q1, q3 - vtrn.32 q4, q6 - vtrn.32 q5, q7 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - vtrn.16 q0, q1 - vtrn.16 q2, q3 - vtrn.16 q4, q5 - vtrn.16 q6, q7 - - ; Store both matrices one after the other. There is a stride of 32, which - ; adjusts to nothing because of the post-increments. - vst1.16 {q8}, [r0]! - vst1.16 {q9}, [r0]! - vst1.16 {q10}, [r0]! - vst1.16 {q11}, [r0]! - vst1.16 {q12}, [r0]! - vst1.16 {q13}, [r0]! - vst1.16 {q14}, [r0]! - vst1.16 {q15}, [r0]! - vst1.16 {q0}, [r0]! - vst1.16 {q1}, [r0]! - vst1.16 {q2}, [r0]! - vst1.16 {q3}, [r0]! - vst1.16 {q4}, [r0]! - vst1.16 {q5}, [r0]! - vst1.16 {q6}, [r0]! - vst1.16 {q7}, [r0]! - - ; increment pointers by adjusted stride (not necessary for r0/out) - ; go back by 7*32 for the seven lines moved fully by read and add - ; go back by 32 for the eighth line, which was only read - ; advance by 16*2 to go to the next pair - sub r3, r3, #7*32*2 + 32 - 16*2 - ; transpose pair loop processing - subs r8, r8, #1 - bne idct32_transpose_pair_loop - - ; restore r0/input to its original value - sub r0, r0, #32*8*2 - - ; Instead of doing the transforms stage by stage, it is done by loading - ; some input values and doing as many stages as possible to minimize the - ; storing/loading of intermediate results.
To fit within registers, the - ; final coefficients are cut into four blocks: - ; BLOCK A: 16-19,28-31 - ; BLOCK B: 20-23,24-27 - ; BLOCK C: 8-10,11-15 - ; BLOCK D: 0-3,4-7 - ; Blocks A and C are straight calculation through the various stages. In - ; block B, further calculations are performed using the results from - ; block A. In block D, further calculations are performed using the results - ; from block C and then the final calculations are done using results from - ; block A and B which have been combined at the end of block B. - - ; -------------------------------------------------------------------------- - ; BLOCK A: 16-19,28-31 - ; -------------------------------------------------------------------------- - ; generate 16,17,30,31 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64; - ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64; - ;step1b[16][i] = dct_const_round_shift(temp1); - ;step1b[31][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 0, 1, 31 - DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64; - ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64; - ;step1b[17][i] = dct_const_round_shift(temp1); - ;step1b[30][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 31, 17, 15 - DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[16] = step1b[16][i] + step1b[17][i]; - ;step2[17] = step1b[16][i] - step1b[17][i]; - ;step2[30] = -step1b[30][i] + step1b[31][i]; - ;step2[31] = step1b[30][i] + step1b[31][i]; - vadd.s16 q4, q0, q1 - vsub.s16 q13, q0, q1 - vadd.s16 q6, q2, q3 - vsub.s16 q14, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64; - ;temp2 = step1b[30][i] * cospi_4_64 - step1b[17][i] * cospi_28_64; - ;step3[17] = dct_const_round_shift(temp1); - ;step3[30] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; generate 18,19,28,29 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64; - ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64; - ;step1b[18][i] = dct_const_round_shift(temp1); - ;step1b[29][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 15, 9, 23 - DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64; - ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64; - ;step1b[19][i] = dct_const_round_shift(temp1); - ;step1b[28][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 23, 25, 7 - DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[18] = -step1b[18][i] + step1b[19][i]; - ;step2[19] = step1b[18][i] + step1b[19][i]; - ;step2[28] = 
step1b[28][i] + step1b[29][i]; - ;step2[29] = step1b[28][i] - step1b[29][i]; - vsub.s16 q13, q3, q2 - vadd.s16 q3, q3, q2 - vsub.s16 q14, q1, q0 - vadd.s16 q2, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64); - ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64); - ;step3[29] = dct_const_round_shift(temp1); - ;step3[18] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1 - ; -------------------------------------------------------------------------- - ; combine 16-19,28-31 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[16] = step1b[16][i] + step1b[19][i]; - ;step1[17] = step1b[17][i] + step1b[18][i]; - ;step1[18] = step1b[17][i] - step1b[18][i]; - ;step1[29] = step1b[30][i] - step1b[29][i]; - ;step1[30] = step1b[30][i] + step1b[29][i]; - ;step1[31] = step1b[31][i] + step1b[28][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q0 - vadd.s16 q10, q7, q1 - vadd.s16 q15, q6, q3 - vsub.s16 q13, q5, q0 - vsub.s16 q14, q7, q1 - STORE_IN_OUTPUT 0, 16, 31, q8, q15 - STORE_IN_OUTPUT 31, 17, 30, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64; - ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64; - ;step2[18] = dct_const_round_shift(temp1); - ;step2[29] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3 - STORE_IN_OUTPUT 30, 29, 18, q1, q0 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[19] = step1b[16][i] - step1b[19][i]; - ;step1[28] = step1b[31][i] - step1b[28][i]; - vsub.s16 q13, q4, q2 - vsub.s16 q14, q6, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64; - ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64; - ;step2[19] = dct_const_round_shift(temp1); - ;step2[28] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13 - STORE_IN_OUTPUT 18, 19, 28, q4, q6 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK B: 20-23,24-27 - ; -------------------------------------------------------------------------- - ; generate 20,21,26,27 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64; - ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64; - ;step1b[20][i] = dct_const_round_shift(temp1); - ;step1b[27][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 7, 5, 27 - DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64; - ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64; - ;step1b[21][i] = dct_const_round_shift(temp1); - ;step1b[26][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 27, 21, 11 - DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7 - ; 
-------------------------------------------------------------------------- - ; part of stage 2 - ;step2[20] = step1b[20][i] + step1b[21][i]; - ;step2[21] = step1b[20][i] - step1b[21][i]; - ;step2[26] = -step1b[26][i] + step1b[27][i]; - ;step2[27] = step1b[26][i] + step1b[27][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64; - ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64; - ;step3[21] = dct_const_round_shift(temp1); - ;step3[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 22,23,24,25 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64; - ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64; - ;step1b[22][i] = dct_const_round_shift(temp1); - ;step1b[25][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 11, 13, 19 - DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 1 - ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64; - ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64; - ;step1b[23][i] = dct_const_round_shift(temp1); - ;step1b[24][i] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 19, 29, 3 - DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;step2[22] = -step1b[22][i] + step1b[23][i]; - ;step2[23] = step1b[22][i] + step1b[23][i]; - ;step2[24] = step1b[24][i] + step1b[25][i]; - ;step2[25] = step1b[24][i] - step1b[25][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64); - ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64); - ;step3[25] = dct_const_round_shift(temp1); - ;step3[22] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 20-23,24-27 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[22] = step1b[22][i] + step1b[21][i]; - ;step1[23] = step1b[23][i] + step1b[20][i]; - vadd.s16 q10, q7, q1 - vadd.s16 q11, q5, q0 - ;step1[24] = step1b[24][i] + step1b[27][i]; - ;step1[25] = step1b[25][i] + step1b[26][i]; - vadd.s16 q12, q6, q2 - vadd.s16 q15, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[16] = step1b[16][i] + step1b[23][i]; - ;step3[17] = step1b[17][i] + step1b[22][i]; - ;step3[22] = step1b[17][i] - step1b[22][i]; - ;step3[23] = step1b[16][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 28, 16, 17, q14, q13 - vadd.s16 q8, q14, q11 - vadd.s16 q9, q13, q10 - vsub.s16 q13, q13, q10 - vsub.s16 q11, q14, q11 - STORE_IN_OUTPUT 17, 17, 16, q9, q8 - ; -------------------------------------------------------------------------- - ; part of stage 6 - 
;step3[24] = step1b[31][i] - step1b[24][i]; - ;step3[25] = step1b[30][i] - step1b[25][i]; - ;step3[30] = step1b[30][i] + step1b[25][i]; - ;step3[31] = step1b[31][i] + step1b[24][i]; - LOAD_FROM_OUTPUT 16, 30, 31, q14, q9 - vsub.s16 q8, q9, q12 - vadd.s16 q10, q14, q15 - vsub.s16 q14, q14, q15 - vadd.s16 q12, q9, q12 - STORE_IN_OUTPUT 31, 30, 31, q10, q12 - ; -------------------------------------------------------------------------- - ; TODO(cd) do some register allocation change to remove these push/pop - vpush {q8} ; [24] - vpush {q11} ; [23] - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64; - ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64; - ;step1[22] = dct_const_round_shift(temp1); - ;step1[25] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 31, 25, 22, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64; - ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64; - ;step1[23] = dct_const_round_shift(temp1); - ;step1[24] = dct_const_round_shift(temp2); - ; TODO(cd) do some register allocation change to remove these push/pop - vpop {q13} ; [23] - vpop {q14} ; [24] - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 22, 24, 23, q14, q13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[20] = step1b[23][i] - step1b[20][i]; - ;step1[27] = step1b[24][i] - step1b[27][i]; - vsub.s16 q14, q5, q0 - vsub.s16 q13, q6, q2 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64); - ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64); - ;step2[27] = dct_const_round_shift(temp1); - ;step2[20] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[21] = step1b[22][i] - step1b[21][i]; - ;step1[26] = step1b[25][i] - step1b[26][i]; - vsub.s16 q14, q7, q1 - vsub.s16 q13, q4, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64); - ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64); - ;step2[26] = dct_const_round_shift(temp1); - ;step2[21] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[18] = step1b[18][i] + step1b[21][i]; - ;step3[19] = step1b[19][i] + step1b[20][i]; - ;step3[20] = step1b[19][i] - step1b[20][i]; - ;step3[21] = step1b[18][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 23, 18, 19, q14, q13 - vadd.s16 q8, q14, q1 - vadd.s16 q9, q13, q6 - vsub.s16 q13, q13, q6 - vsub.s16 q1, q14, q1 - STORE_IN_OUTPUT 19, 18, 19, q8, q9 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[27] = step1b[28][i] - step1b[27][i]; - ;step3[28] = step1b[28][i] + step1b[27][i]; - ;step3[29] = step1b[29][i] + step1b[26][i]; - ;step3[26] = step1b[29][i] - step1b[26][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q8, q9 - vsub.s16 q14, 
q8, q5 - vadd.s16 q10, q8, q5 - vadd.s16 q11, q9, q0 - vsub.s16 q0, q9, q0 - STORE_IN_OUTPUT 29, 28, 29, q10, q11 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64; - ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64; - ;step1[20] = dct_const_round_shift(temp1); - ;step1[27] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29 - STORE_IN_OUTPUT 29, 20, 27, q13, q14 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64; - ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64; - ;step1[21] = dct_const_round_shift(temp1); - ;step1[26] = dct_const_round_shift(temp2); - DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1 - STORE_IN_OUTPUT 27, 21, 26, q1, q0 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK C: 8-10,11-15 - ; -------------------------------------------------------------------------- - ; generate 8,9,14,15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64; - ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64; - ;step2[8] = dct_const_round_shift(temp1); - ;step2[15] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 3, 2, 30 - DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64; - ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64; - ;step2[9] = dct_const_round_shift(temp1); - ;step2[14] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 30, 18, 14 - DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[8] = step1b[8][i] + step1b[9][i]; - ;step3[9] = step1b[8][i] - step1b[9][i]; - ;step3[14] = step1b[15][i] - step1b[14][i]; - ;step3[15] = step1b[15][i] + step1b[14][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64; - ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64; - ;step1[9] = dct_const_round_shift(temp1); - ;step1[14] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 10,11,12,13 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64; - ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64; - ;step2[10] = dct_const_round_shift(temp1); - ;step2[13] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 14, 10, 22 - DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 2 - ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64; - ;temp2 = input[26 * 32] * 
cospi_26_64 + input[6 * 32] * cospi_6_64; - ;step2[11] = dct_const_round_shift(temp1); - ;step2[12] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 22, 26, 6 - DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;step3[10] = step1b[11][i] - step1b[10][i]; - ;step3[11] = step1b[11][i] + step1b[10][i]; - ;step3[12] = step1b[12][i] + step1b[13][i]; - ;step3[13] = step1b[12][i] - step1b[13][i]; - vsub.s16 q14, q4, q5 - vadd.s16 q5, q4, q5 - vsub.s16 q13, q6, q7 - vadd.s16 q6, q6, q7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64); - ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64); - ;step1[13] = dct_const_round_shift(temp1); - ;step1[10] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15 - ; -------------------------------------------------------------------------- - ; combine 8-10,11-15 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[8] = step1b[8][i] + step1b[11][i]; - ;step2[9] = step1b[9][i] + step1b[10][i]; - ;step2[10] = step1b[9][i] - step1b[10][i]; - vadd.s16 q8, q0, q5 - vadd.s16 q9, q1, q7 - vsub.s16 q13, q1, q7 - ;step2[13] = step1b[14][i] - step1b[13][i]; - ;step2[14] = step1b[14][i] + step1b[13][i]; - ;step2[15] = step1b[15][i] + step1b[12][i]; - vsub.s16 q14, q3, q4 - vadd.s16 q10, q3, q4 - vadd.s16 q15, q2, q6 - STORE_IN_OUTPUT 26, 8, 15, q8, q15 - STORE_IN_OUTPUT 15, 9, 14, q9, q10 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64; - ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64; - ;step3[10] = dct_const_round_shift(temp1); - ;step3[13] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 14, 13, 10, q3, q1 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[11] = step1b[8][i] - step1b[11][i]; - ;step2[12] = step1b[15][i] - step1b[12][i]; - vsub.s16 q13, q0, q5 - vsub.s16 q14, q2, q6 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64; - ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64; - ;step3[11] = dct_const_round_shift(temp1); - ;step3[12] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - STORE_IN_OUTPUT 10, 11, 12, q1, q3 - ; -------------------------------------------------------------------------- - - - ; -------------------------------------------------------------------------- - ; BLOCK D: 0-3,4-7 - ; -------------------------------------------------------------------------- - ; generate 4,5,6,7 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64; - ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64; - ;step3[4] = dct_const_round_shift(temp1); - ;step3[7] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 6, 4, 28 - DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5 - ; -------------------------------------------------------------------------- - ; part of stage 3 - ;temp1 = input[20 * 32] * 
cospi_12_64 - input[12 * 32] * cospi_20_64; - ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64; - ;step3[5] = dct_const_round_shift(temp1); - ;step3[6] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 28, 20, 12 - DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;step1[4] = step1b[4][i] + step1b[5][i]; - ;step1[5] = step1b[4][i] - step1b[5][i]; - ;step1[6] = step1b[7][i] - step1b[6][i]; - ;step1[7] = step1b[7][i] + step1b[6][i]; - vsub.s16 q13, q0, q1 - vadd.s16 q0, q0, q1 - vsub.s16 q14, q2, q3 - vadd.s16 q2, q2, q3 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64; - ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64; - ;step2[5] = dct_const_round_shift(temp1); - ;step2[6] = dct_const_round_shift(temp2); - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7 - ; -------------------------------------------------------------------------- - ; generate 0,1,2,3 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64; - ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64; - ;step1[1] = dct_const_round_shift(temp1); - ;step1[0] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 12, 0, 16 - DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15 - ; -------------------------------------------------------------------------- - ; part of stage 4 - ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64; - ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64; - ;step1[2] = dct_const_round_shift(temp1); - ;step1[3] = dct_const_round_shift(temp2); - LOAD_FROM_TRANSPOSED 16, 8, 24 - DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13 - ; -------------------------------------------------------------------------- - ; part of stage 5 - ;step2[0] = step1b[0][i] + step1b[3][i]; - ;step2[1] = step1b[1][i] + step1b[2][i]; - ;step2[2] = step1b[1][i] - step1b[2][i]; - ;step2[3] = step1b[0][i] - step1b[3][i]; - vadd.s16 q4, q7, q6 - vsub.s16 q7, q7, q6 - vsub.s16 q6, q5, q14 - vadd.s16 q5, q5, q14 - ; -------------------------------------------------------------------------- - ; combine 0-3,4-7 - ; -------------------------------------------------------------------------- - ; part of stage 6 - ;step3[0] = step1b[0][i] + step1b[7][i]; - ;step3[1] = step1b[1][i] + step1b[6][i]; - ;step3[2] = step1b[2][i] + step1b[5][i]; - ;step3[3] = step1b[3][i] + step1b[4][i]; - vadd.s16 q8, q4, q2 - vadd.s16 q9, q5, q3 - vadd.s16 q10, q6, q1 - vadd.s16 q11, q7, q0 - ;step3[4] = step1b[3][i] - step1b[4][i]; - ;step3[5] = step1b[2][i] - step1b[5][i]; - ;step3[6] = step1b[1][i] - step1b[6][i]; - ;step3[7] = step1b[0][i] - step1b[7][i]; - vsub.s16 q12, q7, q0 - vsub.s16 q13, q6, q1 - vsub.s16 q14, q5, q3 - vsub.s16 q15, q4, q2 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[0] = step1b[0][i] + step1b[15][i]; - ;step1[1] = step1b[1][i] + step1b[14][i]; - ;step1[14] = step1b[1][i] - step1b[14][i]; - ;step1[15] = step1b[0][i] - step1b[15][i]; - LOAD_FROM_OUTPUT 12, 14, 15, q0, q1 - vadd.s16 q2, q8, q1 - vadd.s16 q3, q9, q0 - vsub.s16 q4, q9, q0 - vsub.s16 q5, q8, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[14 * 32] = 
step1b[14][i] + step1b[17][i]; - ;output[15 * 32] = step1b[15][i] + step1b[16][i]; - ;output[16 * 32] = step1b[15][i] - step1b[16][i]; - ;output[17 * 32] = step1b[14][i] - step1b[17][i]; - LOAD_FROM_OUTPUT 15, 16, 17, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - - cmp r5, #0 - bgt idct32_bands_end_2nd_pass - -idct32_bands_end_1st_pass - STORE_IN_OUTPUT 17, 16, 17, q6, q7 - STORE_IN_OUTPUT 17, 14, 15, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 31, 30, 31, q6, q7 - STORE_IN_OUTPUT 31, 0, 1, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 19, 18, 19, q6, q7 - STORE_IN_OUTPUT 19, 12, 13, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 29, 28, 29, q6, q7 - STORE_IN_OUTPUT 29, 2, 3, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 21, 20, 21, q6, q7 - STORE_IN_OUTPUT 21, 10, 11, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - 
step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 11, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 27, 26, 27, q6, q7 - STORE_IN_OUTPUT 27, 4, 5, q4, q5 - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 5, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_IN_OUTPUT 23, 22, 23, q6, q7 - STORE_IN_OUTPUT 23, 8, 9, q8, q9 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 9, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_IN_OUTPUT 25, 24, 25, q6, q7 - STORE_IN_OUTPUT 25, 6, 7, q4, q5 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (STORE_IN_OUTPUT 24, 6, 7) => 7*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #7*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; parameters for second pass - ; the input of pass2 is the result of pass1. 
we have to remove the offset - ; of 32 columns induced by the above idct32_bands_loop - sub r3, r1, #32*2 - ; r1 = pass2[32 * 32] - add r1, sp, #2048 - - ; pass loop processing - add r5, r5, #1 - b idct32_pass_loop - -idct32_bands_end_2nd_pass - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; - ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; - ;output[30 * 32] = step1b[1][i] - step1b[30][i]; - ;output[31 * 32] = step1b[0][i] - step1b[31][i]; - LOAD_FROM_OUTPUT 17, 30, 31, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[2] = step1b[2][i] + step1b[13][i]; - ;step1[3] = step1b[3][i] + step1b[12][i]; - ;step1[12] = step1b[3][i] - step1b[12][i]; - ;step1[13] = step1b[2][i] - step1b[13][i]; - LOAD_FROM_OUTPUT 31, 12, 13, q0, q1 - vadd.s16 q2, q10, q1 - vadd.s16 q3, q11, q0 - vsub.s16 q4, q11, q0 - vsub.s16 q5, q10, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[12 * 32] = step1b[12][i] + step1b[19][i]; - ;output[13 * 32] = step1b[13][i] + step1b[18][i]; - ;output[18 * 32] = step1b[13][i] - step1b[18][i]; - ;output[19 * 32] = step1b[12][i] - step1b[19][i]; - LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; - ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; - ;output[28 * 32] = step1b[3][i] - step1b[28][i]; - ;output[29 * 32] = step1b[2][i] - step1b[29][i]; - LOAD_FROM_OUTPUT 19, 28, 29, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[4] = step1b[4][i] + step1b[11][i]; - ;step1[5] = step1b[5][i] + step1b[10][i]; - ;step1[10] = step1b[5][i] - step1b[10][i]; - ;step1[11] = step1b[4][i] - step1b[11][i]; - LOAD_FROM_OUTPUT 29, 10, 11, q0, q1 - vadd.s16 q2, q12, q1 - vadd.s16 q3, q13, q0 - vsub.s16 q4, q13, q0 - vsub.s16 q5, q12, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[10 * 32] = step1b[10][i] + step1b[21][i]; - ;output[11 * 32] = step1b[11][i] + step1b[20][i]; - ;output[20 * 32] = step1b[11][i] - step1b[20][i]; - ;output[21 * 32] = step1b[10][i] - step1b[21][i]; - LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 4 * 32] = step1b[4][i] + step1b[27][i]; - ;output[ 5 * 32] = step1b[5][i] + step1b[26][i]; - ;output[26 * 32] = step1b[5][i] - step1b[26][i]; - ;output[27 * 32] = step1b[4][i] - step1b[27][i]; - LOAD_FROM_OUTPUT 21, 26, 27, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS - ; -------------------------------------------------------------------------- - ; part of stage 7 - ;step1[6] = step1b[6][i] + 
step1b[9][i]; - ;step1[7] = step1b[7][i] + step1b[8][i]; - ;step1[8] = step1b[7][i] - step1b[8][i]; - ;step1[9] = step1b[6][i] - step1b[9][i]; - LOAD_FROM_OUTPUT 27, 8, 9, q0, q1 - vadd.s16 q2, q14, q1 - vadd.s16 q3, q15, q0 - vsub.s16 q4, q15, q0 - vsub.s16 q5, q14, q1 - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 8 * 32] = step1b[8][i] + step1b[23][i]; - ;output[ 9 * 32] = step1b[9][i] + step1b[22][i]; - ;output[22 * 32] = step1b[9][i] - step1b[22][i]; - ;output[23 * 32] = step1b[8][i] - step1b[23][i]; - LOAD_FROM_OUTPUT 9, 22, 23, q0, q1 - vadd.s16 q8, q4, q1 - vadd.s16 q9, q5, q0 - vsub.s16 q6, q5, q0 - vsub.s16 q7, q4, q1 - STORE_COMBINE_CENTER_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; part of final stage - ;output[ 6 * 32] = step1b[6][i] + step1b[25][i]; - ;output[ 7 * 32] = step1b[7][i] + step1b[24][i]; - ;output[24 * 32] = step1b[7][i] - step1b[24][i]; - ;output[25 * 32] = step1b[6][i] - step1b[25][i]; - LOAD_FROM_OUTPUT 23, 24, 25, q0, q1 - vadd.s16 q4, q2, q1 - vadd.s16 q5, q3, q0 - vsub.s16 q6, q3, q0 - vsub.s16 q7, q2, q1 - STORE_COMBINE_EXTREME_RESULTS_LAST - ; -------------------------------------------------------------------------- - ; restore pointers to their initial indices for next band pass by - ; removing/adding dest_stride * 8. The actual increment by eight - ; is taken care of within the _LAST macros. - add r6, r6, r2, lsl #3 - add r9, r9, r2, lsl #3 - sub r7, r7, r2, lsl #3 - sub r10, r10, r2, lsl #3 - - ; restore r0 by removing the last offset from the last - ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2 - sub r0, r0, #24*8*2 - ; restore r1 by removing the last offset from the last - ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2 - ; advance by 8 columns => 8*2 - sub r1, r1, #25*32*2 - 8*2 - ; advance by 8 lines (8*32*2) - ; go back by the two pairs from the loop (32*2) - add r3, r3, #8*32*2 - 32*2 - - ; bands loop processing - subs r4, r4, #1 - bne idct32_bands_loop - - ; stack operation - add sp, sp, #512+2048+2048 - vpop {d8-d15} - pop {r4-r11} - bx lr - ENDP ; |vpx_idct32x32_1024_add_neon| - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm index adab715dde5..cbfab361af8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.asm @@ -25,9 +25,8 @@ |vpx_idct4x4_1_add_neon| PROC ldrsh r0, [r0] - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw r12, #0x2d41 ; out = dct_const_round_shift(input[0] * cospi_16_64) mul r0, r0, r12 ; input[0] * cospi_16_64 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c index b37cb51a1a7..525aac05a84 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -21,7 +21,7 @@ void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16; int16x8_t q0s16; uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; + int16_t i, a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); diff --git 
a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm index 877fbd63435..bd4e86ded25 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm @@ -15,6 +15,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + AREA Block, CODE, READONLY ; name this block of code ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) ; @@ -33,18 +35,15 @@ ; So, two passes of a transpose followed by a column transform. ; load the inputs into q8-q9, d16-d19 - vld1.s16 {q8,q9}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 ; generate scalar constants - ; cospi_8_64 = 15137 = 0x3b21 - mov r0, #0x3b00 - add r0, #0x21 - ; cospi_16_64 = 11585 = 0x2d41 - mov r3, #0x2d00 - add r3, #0x41 - ; cospi_24_64 = 6270 = 0x 187e - mov r12, #0x1800 - add r12, #0x7e + ; cospi_8_64 = 15137 + movw r0, #0x3b21 + ; cospi_16_64 = 11585 + movw r3, #0x2d41 + ; cospi_24_64 = 6270 + movw r12, #0x187e ; transpose the input data ; 00 01 02 03 d16 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c index 1caa456987d..8f669c90765 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct4x4_add_neon.c @@ -11,6 +11,8 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/txfm_common.h" void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride) { @@ -24,14 +26,11 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int16x4x2_t d0x2s16, d1x2s16; int32x4x2_t q0x2s32; uint8_t *d; - int16_t cospi_8_64 = 15137; - int16_t cospi_16_64 = 11585; - int16_t cospi_24_64 = 6270; d26u32 = d27u32 = vdup_n_u32(0); - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); @@ -43,8 +42,8 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); - d20s16 = vdup_n_s16(cospi_8_64); - d21s16 = vdup_n_s16(cospi_16_64); + d20s16 = vdup_n_s16((int16_t)cospi_8_64); + d21s16 = vdup_n_s16((int16_t)cospi_16_64); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); @@ -53,7 +52,7 @@ void vpx_idct4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); - d22s16 = vdup_n_s16(cospi_24_64); + d22s16 = vdup_n_s16((int16_t)cospi_24_64); // stage 1 d23s16 = vadd_s16(d16s16, d18s16); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm index dbbff364f37..e4531c6e97f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm @@ -25,9 +25,8 @@ |vpx_idct8x8_1_add_neon| PROC ldrsh r0, [r0] - ; generate cospi_16_64 = 11585 - mov r12, #0x2d00 - add r12, #0x41 + ; cospi_16_64 = 11585 + movw 
r12, #0x2d41 ; out = dct_const_round_shift(input[0] * cospi_16_64) mul r0, r0, r12 ; input[0] * cospi_16_64 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c index df557de8187..eee41e6c6b1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -21,7 +21,7 @@ void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q0s16; uint8_t *d1, *d2; - int16_t i, a1, cospi_16_64 = 11585; + int16_t i, a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 5); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm index 6ab59b41b74..a5c9c927d67 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm @@ -16,6 +16,8 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 + INCLUDE vpx_dsp/arm/idct_neon.asm.S + ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are ; loaded in q8-q15. The output will be stored back into q8-q15 registers. ; This macro will touch q0-q7 registers and use them as buffer during @@ -207,41 +209,34 @@ |vpx_idct8x8_64_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! + LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c + ; cospi_28_64 = 3196 + movw r3, #0x0c7c - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 + ; cospi_4_64 = 16069 + movw r4, #0x3ec5 - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 + ; cospi_12_64 = 13623 + movw r5, #0x3537 - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e + ; cospi_20_64 = 9102 + movw r6, #0x238e - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 + ; cospi_16_64 = 11585 + movw r7, #0x2d41 - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e + ; cospi_24_64 = 6270 + movw r8, #0x187e - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 + ; cospi_8_64 = 15137 + movw r9, #0x3b21 ; First transform rows IDCT8x8_1D @@ -319,41 +314,34 @@ |vpx_idct8x8_12_add_neon| PROC push {r4-r9} vpush {d8-d15} - vld1.s16 {q8,q9}, [r0]! - vld1.s16 {q10,q11}, [r0]! - vld1.s16 {q12,q13}, [r0]! - vld1.s16 {q14,q15}, [r0]! 
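The movw rewrites in these hunks collapse each two-instruction mov/add pair into a single 16-bit immediate load. The immediates themselves are the DCT cosine table: cos(N * pi / 64) scaled by 2^14 and rounded. A quick standalone check (assuming a math.h that provides M_PI):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      /* Prints 16069, 15137, 13623, 11585, 9102, 6270 and 3196 for
         n = 4, 8, ..., 28 -- the movw immediates used above
         (0x3ec5, 0x3b21, 0x3537, 0x2d41, 0x238e, 0x187e, 0x0c7c). */
      int n;
      for (n = 4; n <= 28; n += 4) {
        printf("cospi_%d_64 = %.0f\n", n, round(16384.0 * cos(n * M_PI / 64.0)));
      }
      return 0;
    }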
+ LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 + LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 + LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 + LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 ; transpose the input data TRANSPOSE8X8 - ; generate cospi_28_64 = 3196 - mov r3, #0x0c00 - add r3, #0x7c + ; cospi_28_64 = 3196 + movw r3, #0x0c7c - ; generate cospi_4_64 = 16069 - mov r4, #0x3e00 - add r4, #0xc5 + ; cospi_4_64 = 16069 + movw r4, #0x3ec5 - ; generate cospi_12_64 = 13623 - mov r5, #0x3500 - add r5, #0x37 + ; cospi_12_64 = 13623 + movw r5, #0x3537 - ; generate cospi_20_64 = 9102 - mov r6, #0x2300 - add r6, #0x8e + ; cospi_20_64 = 9102 + movw r6, #0x238e - ; generate cospi_16_64 = 11585 - mov r7, #0x2d00 - add r7, #0x41 + ; cospi_16_64 = 11585 + movw r7, #0x2d41 - ; generate cospi_24_64 = 6270 - mov r8, #0x1800 - add r8, #0x7e + ; cospi_24_64 = 6270 + movw r8, #0x187e - ; generate cospi_8_64 = 15137 - mov r9, #0x3b00 - add r9, #0x21 + ; cospi_8_64 = 15137 + movw r9, #0x3b21 ; First transform rows ; stage 1 diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c index 7d65612417c..159a6ec9891 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.c @@ -12,6 +12,7 @@ #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" @@ -27,10 +28,10 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32; int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32; - d0s16 = vdup_n_s16(cospi_28_64); - d1s16 = vdup_n_s16(cospi_4_64); - d2s16 = vdup_n_s16(cospi_12_64); - d3s16 = vdup_n_s16(cospi_20_64); + d0s16 = vdup_n_s16((int16_t)cospi_28_64); + d1s16 = vdup_n_s16((int16_t)cospi_4_64); + d2s16 = vdup_n_s16((int16_t)cospi_12_64); + d3s16 = vdup_n_s16((int16_t)cospi_20_64); d16s16 = vget_low_s16(*q8s16); d17s16 = vget_high_s16(*q8s16); @@ -83,7 +84,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q6s16 = vcombine_s16(d12s16, d13s16); q7s16 = vcombine_s16(d14s16, d15s16); - d0s16 = vdup_n_s16(cospi_16_64); + d0s16 = vdup_n_s16((int16_t)cospi_16_64); q2s32 = vmull_s16(d16s16, d0s16); q3s32 = vmull_s16(d17s16, d0s16); @@ -95,8 +96,8 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, q13s32 = vmlsl_s16(q13s32, d24s16, d0s16); q15s32 = vmlsl_s16(q15s32, d25s16, d0s16); - d0s16 = vdup_n_s16(cospi_24_64); - d1s16 = vdup_n_s16(cospi_8_64); + d0s16 = vdup_n_s16((int16_t)cospi_24_64); + d1s16 = vdup_n_s16((int16_t)cospi_8_64); d18s16 = vqrshrn_n_s32(q2s32, 14); d19s16 = vqrshrn_n_s32(q3s32, 14); @@ -136,7 +137,7 @@ static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, d28s16 = vget_low_s16(*q14s16); d29s16 = vget_high_s16(*q14s16); - d16s16 = vdup_n_s16(cospi_16_64); + d16s16 = vdup_n_s16((int16_t)cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); @@ -173,14 +174,14 @@ void vpx_idct8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 
48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -279,43 +280,43 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; - q8s16 = vld1q_s16(input); - q9s16 = vld1q_s16(input + 8); - q10s16 = vld1q_s16(input + 16); - q11s16 = vld1q_s16(input + 24); - q12s16 = vld1q_s16(input + 32); - q13s16 = vld1q_s16(input + 40); - q14s16 = vld1q_s16(input + 48); - q15s16 = vld1q_s16(input + 56); + q8s16 = load_tran_low_to_s16(input); + q9s16 = load_tran_low_to_s16(input + 8); + q10s16 = load_tran_low_to_s16(input + 16); + q11s16 = load_tran_low_to_s16(input + 24); + q12s16 = load_tran_low_to_s16(input + 32); + q13s16 = load_tran_low_to_s16(input + 40); + q14s16 = load_tran_low_to_s16(input + 48); + q15s16 = load_tran_low_to_s16(input + 56); transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // First transform rows // stage 1 - q0s16 = vdupq_n_s16(cospi_28_64 * 2); - q1s16 = vdupq_n_s16(cospi_4_64 * 2); + q0s16 = vdupq_n_s16((int16_t)cospi_28_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); - q0s16 = vdupq_n_s16(-cospi_20_64 * 2); + q0s16 = vdupq_n_s16(-(int16_t)cospi_20_64 * 2); q7s16 = vqrdmulhq_s16(q9s16, q1s16); - q1s16 = vdupq_n_s16(cospi_12_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_12_64 * 2); q5s16 = vqrdmulhq_s16(q11s16, q0s16); - q0s16 = vdupq_n_s16(cospi_16_64 * 2); + q0s16 = vdupq_n_s16((int16_t)cospi_16_64 * 2); q6s16 = vqrdmulhq_s16(q11s16, q1s16); // stage 2 & stage 3 - even half - q1s16 = vdupq_n_s16(cospi_24_64 * 2); + q1s16 = vdupq_n_s16((int16_t)cospi_24_64 * 2); q9s16 = vqrdmulhq_s16(q8s16, q0s16); - q0s16 = vdupq_n_s16(cospi_8_64 * 2); + q0s16 = vdupq_n_s16((int16_t)cospi_8_64 * 2); q13s16 = vqrdmulhq_s16(q10s16, q1s16); @@ -337,7 +338,7 @@ void vpx_idct8x8_12_add_neon(const tran_low_t *input, uint8_t *dest, d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); - d16s16 = vdup_n_s16(cospi_16_64); + d16s16 = vdup_n_s16((int16_t)cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); q11s32 = vmull_s16(d28s16, d16s16); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm new file mode 100644 index 00000000000..f39e8ddd4b4 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.asm @@ -0,0 +1,30 @@ +; +; Copyright (c) 2016 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + INCLUDE ./vpx_config.asm + + ; Helper function used to load tran_low_t into int16, narrowing if + ; necessary. + ; $dst0..3 are d registers with the pairs assumed to be contiguous in + ; non-high-bitdepth builds. 
q0-q3 are used as temporaries in high-bitdepth. + MACRO + LOAD_TRAN_LOW_TO_S16 $dst0, $dst1, $dst2, $dst3, $src + IF CONFIG_VP9_HIGHBITDEPTH + vld1.s32 {q0,q1}, [$src]! + vld1.s32 {q2,q3}, [$src]! + vmovn.i32 $dst0, q0 + vmovn.i32 $dst1, q1 + vmovn.i32 $dst2, q2 + vmovn.i32 $dst3, q3 + ELSE + vld1.s16 {$dst0-$dst1,$dst2-$dst3}, [$src]! + ENDIF + MEND + END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h new file mode 100644 index 00000000000..5c2a53c034f --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_IDCT_NEON_H_ +#define VPX_DSP_ARM_IDCT_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/vpx_dsp_common.h" + +//------------------------------------------------------------------------------ + +// Helper function used to load tran_low_t into int16, narrowing if necessary. +static INLINE int16x8_t load_tran_low_to_s16(const tran_low_t *buf) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vld1q_s32(buf); + const int32x4_t v1 = vld1q_s32(buf + 4); + const int16x4_t s0 = vmovn_s32(v0); + const int16x4_t s1 = vmovn_s32(v1); + return vcombine_s16(s0, s1); +#else + return vld1q_s16(buf); +#endif +} + +// Multiply a by a_const. Saturate, shift and narrow by 14. +static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, + const int16_t a_const) { + // Shift by 14 + rounding will be within 16 bits for well-formed streams. + // See WRAPLOW and dct_const_round_shift for details. + // This instruction doubles the result and returns the high half, essentially + // resulting in a right shift by 15. By multiplying the constant first, that + // becomes a right shift by 14. + // The largest possible value used here is + // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728), which falls *just* + // within the range of int16_t (+32767 / -32768) even when negated. + return vqrdmulhq_n_s16(a, a_const * 2); +} + +// Add a and b, then multiply by ab_const. Shift and narrow by 14. +static INLINE int16x8_t add_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + // In both add_ and its pair, sub_, the input for well-formed streams will be + // well within 16 bits (input to the idct is the difference between two frames + // and will be within -255 to 255, or 9 bits). + // However, for inputs over about 25,000 (valid for int16_t, but not for idct + // input) this function cannot use vaddq_s16. + // In order to match existing behavior and intentionally out-of-range tests, + // expand the addition up to 32 bits to prevent truncation. + int32x4_t temp_low = vaddl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +// Subtract b from a, then multiply by ab_const. 
Shift and narrow by 14. +static INLINE int16x8_t sub_multiply_shift_and_narrow_s16( + const int16x8_t a, const int16x8_t b, const int16_t ab_const) { + int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b)); + int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b)); + temp_low = vmulq_n_s32(temp_low, ab_const); + temp_high = vmulq_n_s32(temp_high, ab_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by +// 14. +static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16( + const int16x8_t a, const int16_t a_const, const int16x8_t b, + const int16_t b_const) { + int32x4_t temp_low = vmull_n_s16(vget_low_s16(a), a_const); + int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const); + temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const); + temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const); + return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14)); +} + +static INLINE void load_and_transpose_s16_8x8(const int16_t *a, int a_stride, + int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + *a0 = vld1q_s16(a); + a += a_stride; + *a1 = vld1q_s16(a); + a += a_stride; + *a2 = vld1q_s16(a); + a += a_stride; + *a3 = vld1q_s16(a); + a += a_stride; + *a4 = vld1q_s16(a); + a += a_stride; + *a5 = vld1q_s16(a); + a += a_stride; + *a6 = vld1q_s16(a); + a += a_stride; + *a7 = vld1q_s16(a); + + transpose_s16_8x8(a0, a1, a2, a3, a4, a5, a6, a7); +} + +// Shift the output down by 6 and add it to the destination buffer. +static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, + const int16x8_t a2, const int16x8_t a3, + const int16x8_t a4, const int16x8_t a5, + const int16x8_t a6, const int16x8_t a7, + uint8_t *b, const int b_stride) { + uint8x8_t b0, b1, b2, b3, b4, b5, b6, b7; + int16x8_t c0, c1, c2, c3, c4, c5, c6, c7; + b0 = vld1_u8(b); + b += b_stride; + b1 = vld1_u8(b); + b += b_stride; + b2 = vld1_u8(b); + b += b_stride; + b3 = vld1_u8(b); + b += b_stride; + b4 = vld1_u8(b); + b += b_stride; + b5 = vld1_u8(b); + b += b_stride; + b6 = vld1_u8(b); + b += b_stride; + b7 = vld1_u8(b); + b -= (7 * b_stride); + + // c = b + (a >> 6) + c0 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b0)), a0, 6); + c1 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b1)), a1, 6); + c2 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b2)), a2, 6); + c3 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b3)), a3, 6); + c4 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b4)), a4, 6); + c5 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b5)), a5, 6); + c6 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b6)), a6, 6); + c7 = vrsraq_n_s16(vreinterpretq_s16_u16(vmovl_u8(b7)), a7, 6); + + b0 = vqmovun_s16(c0); + b1 = vqmovun_s16(c1); + b2 = vqmovun_s16(c2); + b3 = vqmovun_s16(c3); + b4 = vqmovun_s16(c4); + b5 = vqmovun_s16(c5); + b6 = vqmovun_s16(c6); + b7 = vqmovun_s16(c7); + + vst1_u8(b, b0); + b += b_stride; + vst1_u8(b, b1); + b += b_stride; + vst1_u8(b, b2); + b += b_stride; + vst1_u8(b, b3); + b += b_stride; + vst1_u8(b, b4); + b += b_stride; + vst1_u8(b, b5); + b += b_stride; + vst1_u8(b, b6); + b += b_stride; + vst1_u8(b, b7); +} +#endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c index 38e79ed69dd..e150a5302d5 100644 --- 
a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/intrapred_neon.c @@ -17,306 +17,254 @@ //------------------------------------------------------------------------------ // DC 4x4 -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x4_t sum_top; - uint16x4_t sum_left; - uint16x4_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - sum_top = vpadd_u16(p0, p0); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - sum_left = vpadd_u16(p0, p0); - } - - if (do_above && do_left) { - const uint16x4_t sum = vadd_u16(sum_left, sum_top); - dc0 = vrshr_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshr_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshr_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u16(0x80); - } +static INLINE uint16x4_t dc_sum_4(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + const uint16x4_t p0 = vpaddl_u8(ref_u8); + return vpadd_u16(p0, p0); +} - { - const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } +static INLINE void dc_store_4x4(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 4; ++i, dst += stride) { + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(dc_dup), 0); } } void vpx_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); + const uint8x8_t a = vld1_u8(above); + const uint8x8_t l = vld1_u8(left); + const uint16x8_t al = vaddl_u8(a, l); + uint16x4_t sum; + uint8x8_t dc; + sum = vpadd_u16(vget_low_u16(al), vget_low_u16(al)); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); + dc_store_4x4(dst, stride, dc); } void vpx_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); + dc_store_4x4(dst, stride, dc); } void vpx_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_4(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 2)); (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); + dc_store_4x4(dst, stride, dc); } void vpx_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); + dc_store_4x4(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 8x8 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
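Before moving on to the 8x8 variant: the refactor above splits the old branchy dc_4x4() into a sum helper and a store helper without changing the arithmetic. A minimal scalar sketch of what the 4x4 DC predictor computes (reference only; dc_4x4_ref is an illustrative name, not part of the patch):

#include <stddef.h>
#include <stdint.h>

/* dc = round-half-up average of the 4 above and 4 left pixels; the NEON
 * version forms the same sum with vpaddl_u8/vpadd_u16 and applies the same
 * rounding with vrshr_n_u16(sum, 3). */
static void dc_4x4_ref(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                       const uint8_t *left) {
  int sum = 0, r, c;
  for (c = 0; c < 4; ++c) sum += above[c] + left[c];
  {
    const uint8_t dc = (uint8_t)((sum + 4) >> 3);
    for (r = 0; r < 4; ++r, dst += stride) {
      for (c = 0; c < 4; ++c) dst[c] = dc;
    }
  }
}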
-static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_8(const uint8_t *ref) { + const uint8x8_t ref_u8 = vld1_u8(ref); + uint16x4_t sum = vpaddl_u8(ref_u8); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc)); - } +static INLINE void dc_store_8x8(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x8_t dc_dup = vdup_lane_u8(dc, 0); + int i; + for (i = 0; i < 8; ++i, dst += stride) { + vst1_u8(dst, dc_dup); } } void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x16_t above_and_left = vcombine_u8(above_u8, left_u8); + const uint16x8_t p0 = vpaddlq_u8(above_and_left); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); + dc_store_8x8(dst, stride, dc); } void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); + dc_store_8x8(dst, stride, dc); } void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_8(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 3)); (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); + dc_store_8x8(dst, stride, dc); } void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); + dc_store_8x8(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 16x16 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
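dc_sum_8() just above is a three-level pairwise-add tree: one vpaddl_u8 widens and adds neighbours, then two vpadd_u16 calls halve the count again. A scalar sketch of the same tree, assuming 8 reference pixels (illustrative only):

#include <stdint.h>

static uint16_t dc_sum_8_ref(const uint8_t *ref) {
  /* Level 1: vpaddl_u8 -- widen to 16 bits and add adjacent pairs. */
  const uint16_t l1_0 = (uint16_t)(ref[0] + ref[1]);
  const uint16_t l1_1 = (uint16_t)(ref[2] + ref[3]);
  const uint16_t l1_2 = (uint16_t)(ref[4] + ref[5]);
  const uint16_t l1_3 = (uint16_t)(ref[6] + ref[7]);
  /* Levels 2 and 3: vpadd_u16 twice. */
  return (uint16_t)((l1_0 + l1_1) + (l1_2 + l1_3));
}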
-static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE uint16x4_t dc_sum_16(const uint8_t *ref) { + const uint8x16_t ref_u8 = vld1q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(ref_u8); + uint16x4_t sum = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } +static INLINE void dc_store_16x16(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + const uint8x16_t dc_dup = vdupq_lane_u8(dc, 0); + int i; + for (i = 0; i < 16; ++i, dst += stride) { + vst1q_u8(dst, dc_dup); } } void vpx_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); + const uint8x16_t ref0 = vld1q_u8(above); + const uint8x16_t ref1 = vld1q_u8(left); + const uint16x8_t p0 = vpaddlq_u8(ref0); + const uint16x8_t p1 = vpaddlq_u8(ref1); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); + dc_store_16x16(dst, stride, dc); } void vpx_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); + dc_store_16x16(dst, stride, dc); } void vpx_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_16(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 4)); (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); + dc_store_16x16(dst, stride, dc); } void vpx_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); + dc_store_16x16(dst, stride, dc); } //------------------------------------------------------------------------------ // DC 32x32 -// 'do_above' and 'do_left' facilitate branch removal when inlined. 
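Across the four block sizes (the 32x32 rewrite below follows the same pattern) only the reduction width and the rounding shift change: averaging n pixels is a round-half-up divide by n, so the one-sided dc_left/dc_top variants shift by log2(n) and the combined predictor, which sums 2n pixels, shifts by log2(2n); hence the 2/3 shifts for 4x4 up to the 5/6 shifts for 32x32. In scalar form (illustrative helper, not part of the patch):

#include <stdint.h>

/* sum holds n or 2n pixel values; log2_count is log2 of that count. */
static uint8_t dc_round_ref(uint32_t sum, int log2_count) {
  return (uint8_t)((sum + (1u << (log2_count - 1))) >> log2_count);
}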
-static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } +static INLINE uint16x4_t dc_sum_32(const uint8_t *ref) { + const uint8x16x2_t r = vld2q_u8(ref); + const uint16x8_t p0 = vpaddlq_u8(r.val[0]); + const uint16x8_t p1 = vpaddlq_u8(r.val[1]); + const uint16x8_t p2 = vaddq_u16(p0, p1); + uint16x4_t sum = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); + sum = vpadd_u16(sum, sum); + return vpadd_u16(sum, sum); +} - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } +static INLINE void dc_store_32x32(uint8_t *dst, ptrdiff_t stride, + const uint8x8_t dc) { + uint8x16x2_t dc_dup; + int i; + dc_dup.val[0] = dc_dup.val[1] = vdupq_lane_u8(dc, 0); - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } + for (i = 0; i < 32; ++i, dst += stride) { + vst2q_u8(dst, dc_dup); } } void vpx_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); + const uint8x16x2_t a = vld2q_u8(above); + const uint8x16x2_t l = vld2q_u8(left); + const uint16x8_t pa0 = vpaddlq_u8(a.val[0]); + const uint16x8_t pl0 = vpaddlq_u8(l.val[0]); + const uint16x8_t pa1 = vpaddlq_u8(a.val[1]); + const uint16x8_t pl1 = vpaddlq_u8(l.val[1]); + const uint16x8_t pa = vaddq_u16(pa0, pa1); + const uint16x8_t pl = vaddq_u16(pl0, pl1); + const uint16x8_t pal = vaddq_u16(pa, pl); + uint16x4_t sum = vadd_u16(vget_low_u16(pal), vget_high_u16(pal)); + uint8x8_t dc; + sum = vpadd_u16(sum, sum); + sum = vpadd_u16(sum, sum); + dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 6)); + dc_store_32x32(dst, stride, dc); } void vpx_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(left); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); + dc_store_32x32(dst, stride, dc); } void vpx_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint16x4_t sum = dc_sum_32(above); + const uint8x8_t dc = vreinterpret_u8_u16(vrshr_n_u16(sum, 5)); (void)left; - dc_32x32(dst, stride, 
above, NULL, 1, 0); + dc_store_32x32(dst, stride, dc); } void vpx_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t dc = vdup_n_u8(0x80); (void)above; (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); + dc_store_32x32(dst, stride, dc); } // ----------------------------------------------------------------------------- void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); + const uint8x8_t ABCDEFGH = vld1_u8(above); + const uint64x1_t A1 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 8); + const uint64x1_t A2 = vshr_n_u64(vreinterpret_u64_u8(ABCDEFGH), 16); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); @@ -331,485 +279,506 @@ void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; + vst1_lane_u8(dst + 3 * stride + 3, ABCDEFGH, 7); +} + +static INLINE void d45_store_8(uint8_t **dst, const ptrdiff_t stride, + const uint8x8_t above_right, uint8x8_t *row) { + *row = vext_u8(*row, above_right, 1); + vst1_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); + const uint8x8_t A0 = vld1_u8(above); + const uint8x8_t above_right = vdup_lane_u8(A0, 7); + const uint8x8_t A1 = vext_u8(A0, above_right, 1); + const uint8x8_t A2 = vext_u8(A0, above_right, 2); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); + + vst1_u8(dst, row); + dst += stride; + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + d45_store_8(&dst, stride, above_right, &row); + vst1_u8(dst, above_right); +} + +static INLINE void d45_store_16(uint8_t **dst, const ptrdiff_t stride, + const uint8x16_t above_right, uint8x16_t *row) { + *row = vextq_u8(*row, above_right, 1); + vst1q_u8(*dst, *row); + *dst += stride; } void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); + const uint8x16_t A0 = vld1q_u8(above); + const uint8x16_t above_right = vdupq_lane_u8(vget_high_u8(A0), 7); const uint8x16_t A1 = vextq_u8(A0, above_right, 1); const uint8x16_t A2 = vextq_u8(A0, above_right, 2); const uint8x16_t avg1 
= vhaddq_u8(A0, A2); uint8x16_t row = vrhaddq_u8(avg1, A1); - int i; (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); - } - vst1q_u8(dst + i * stride, row); + + vst1q_u8(dst, row); + dst += stride; + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + d45_store_16(&dst, stride, above_right, &row); + vst1q_u8(dst, above_right); } // ----------------------------------------------------------------------------- void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); + const uint8x8_t XABCD = vld1_u8(above - 1); const uint32x2_t zero = vdup_n_u32(0); const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); + const uint8x8_t LKJI = vrev64_u8(vreinterpret_u8_u32(IJKL)); + const uint8x8_t LKJIXABC = vext_u8(LKJI, XABCD, 4); + const uint8x8_t KJIXABCD = vext_u8(LKJI, XABCD, 5); + const uint8x8_t JIXABCD0 = + vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(KJIXABCD), 8)); + const uint8x8_t avg1 = vhadd_u8(JIXABCD0, LKJIXABC); + const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABCD); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); + vst1_lane_u32((uint32_t *)dst, r0, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r1, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r2, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, r3, 0); } +// ----------------------------------------------------------------------------- + #if !HAVE_NEON_ASM void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint32_t d = *(const uint32_t *)above; int i; - uint32x2_t d0u32 = vdup_n_u32(0); (void)left; - 
d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); + for (i = 0; i < 4; i++, dst += stride) { + *(uint32_t *)dst = d; + } } void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x8_t d = vld1_u8(above); int i; - uint8x8_t d0u8 = vdup_n_u8(0); (void)left; - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); + for (i = 0; i < 8; i++, dst += stride) { + vst1_u8(dst, d); + } } void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d = vld1q_u8(above); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) vst1q_u8(dst, q0u8); + for (i = 0; i < 16; i++, dst += stride) { + vst1q_u8(dst, d); + } } void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { + const uint8x16_t d0 = vld1q_u8(above); + const uint8x16_t d1 = vld1q_u8(above + 16); int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); (void)left; - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); + for (i = 0; i < 32; i++) { + // Note: performance was worse using vst2q_u8 under gcc-4.9 & clang-3.8. + // clang-3.8 unrolled the loop fully with no filler so the cause is likely + // the latency of the instruction. + vst1q_u8(dst, d0); + dst += 16; + vst1q_u8(dst, d1); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); + const uint32x2_t zero = vdup_n_u32(0); + const uint8x8_t left_u8 = + vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)left, zero, 0)); + uint8x8_t d; (void)above; - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 0); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 1); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 2); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); + d = vdup_lane_u8(left_u8, 3); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d), 0); } void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); + const uint8x8_t left_u8 = vld1_u8(left); + uint8x8_t d; (void)above; - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 0); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, 
d0u8); + d = vdup_lane_u8(left_u8, 1); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 2); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 3); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 4); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 5); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 6); + vst1_u8(dst, d); dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); + d = vdup_lane_u8(left_u8, 7); + vst1_u8(dst, d); } void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + const uint8x16_t left_u8q = vld1q_u8(left); + uint8x8_t left_u8d = vget_low_u8(left_u8q); + uint8x16_t d; + int i; (void)above; - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); + for (i = 0; i < 2; i++, left_u8d = vget_high_u8(left_u8q)) { + d = vdupq_lane_u8(left_u8d, 0); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 1); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 2); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 3); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 4); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 5); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 6); + vst1q_u8(dst, d); dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); + d = vdupq_lane_u8(left_u8d, 7); + vst1q_u8(dst, d); dst += stride; } } void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); + uint8x16_t d; + int i; (void)above; - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; 
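To restate what this pair of h-predictor hunks implements: every output row is a broadcast of one left-column pixel, which is why the rewrite replaces the generic dup-and-loop code with straight-line vdup_lane/vdupq_lane stores. A scalar model for an NxN block (reference only; h_predictor_ref is an illustrative name):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void h_predictor_ref(uint8_t *dst, ptrdiff_t stride, int n,
                            const uint8_t *left) {
  int r;
  /* Row r is n copies of left[r]. */
  for (r = 0; r < n; ++r, dst += stride) memset(dst, left[r], n);
}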
- q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } + for (i = 0; i < 2; i++, left += 16) { + const uint8x16_t left_u8 = vld1q_u8(left); + const uint8x8_t left_low = vget_low_u8(left_u8); + const uint8x8_t left_high = vget_high_u8(left_u8); + d = vdupq_lane_u8(left_low, 0); + vst1q_u8(dst, d); // Note clang-3.8 produced poor code w/vst2q_u8 + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_low, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + + d = vdupq_lane_u8(left_high, 0); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 1); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 2); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 3); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 4); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 5); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 6); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; + d = vdupq_lane_u8(left_high, 7); + vst1q_u8(dst, d); + dst += 16; + vst1q_u8(dst, d); + dst += stride - 16; } } +// ----------------------------------------------------------------------------- + +static INLINE int16x8_t convert_u8_to_s16(uint8x8_t v) { + return vreinterpretq_s16_u16(vmovl_u8(v)); +} + void vpx_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q1u16), vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t left_u8 = vld1_u8(left); + const uint8x8_t above_u8 = vld1_u8(above); + const int16x4_t left_s16 = vget_low_s16(convert_u8_to_s16(left_u8)); + int16x8_t sub, sum; + uint32x2_t d; + + sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + // Avoid vcombine_s16() which generates lots of redundant code with clang-3.8. 
+ sub = vreinterpretq_s16_s64( + vdupq_lane_s64(vreinterpret_s64_s16(vget_low_s16(sub)), 0)); + + sum = vcombine_s16(vdup_lane_s16(left_s16, 0), vdup_lane_s16(left_s16, 1)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); + dst += stride; + + sum = vcombine_s16(vdup_lane_s16(left_s16, 2), vdup_lane_s16(left_s16, 3)); + sum = vaddq_s16(sum, sub); + d = vreinterpret_u32_u8(vqmovun_s16(sum)); + vst1_lane_u32((uint32_t *)dst, d, 0); + dst += stride; + vst1_lane_u32((uint32_t *)dst, d, 1); +} + +static INLINE void tm_8_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub) { + const int16x8_t sum = vaddq_s16(left_dup, sub); + const uint8x8_t d = vqmovun_s16(sum); + vst1_u8(*dst, d); + *dst += stride; } void vpx_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; + const uint8x8_t top_left = vld1_dup_u8(above - 1); + const uint8x8_t above_u8 = vld1_u8(above); + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x8_t sub = vreinterpretq_s16_u16(vsubl_u8(above_u8, top_left)); + int16x4_t left_s16d = vget_low_s16(left_s16q); + int i; + + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + int16x8_t left_dup; + + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_8_kernel(&dst, stride, left_dup, sub); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_8_kernel(&dst, stride, left_dup, sub); } } +static INLINE void tm_16_kernel(uint8_t **dst, const ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + vst1_u8(*dst, d0); + *dst += 8; + vst1_u8(*dst, d1); + *dst += stride - 8; +} + void vpx_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t 
q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q2u16)); - q0s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q3u16)); - q11s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q2u16)); - q8s16 = - vaddq_s16(vreinterpretq_s16_u16(q8u16), vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_u8 = vld1q_u8(above); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_u8), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_u8), vget_high_u8(top_left))); + int16x8_t left_dup; + int i; + + for (i = 0; i < 2; i++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + const int16x4_t left_low = vget_low_s16(left_s16q); + const int16x4_t left_high = vget_high_s16(left_s16q); + + left_dup = vdupq_lane_s16(left_low, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_low, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + + left_dup = vdupq_lane_s16(left_high, 0); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 1); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 2); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); + left_dup = vdupq_lane_s16(left_high, 3); + tm_16_kernel(&dst, stride, left_dup, sub0, sub1); } } +static INLINE void tm_32_kernel(uint8_t **dst, const 
ptrdiff_t stride, + const int16x8_t left_dup, const int16x8_t sub0, + const int16x8_t sub1, const int16x8_t sub2, + const int16x8_t sub3) { + const int16x8_t sum0 = vaddq_s16(left_dup, sub0); + const int16x8_t sum1 = vaddq_s16(left_dup, sub1); + const int16x8_t sum2 = vaddq_s16(left_dup, sub2); + const int16x8_t sum3 = vaddq_s16(left_dup, sub3); + const uint8x8_t d0 = vqmovun_s16(sum0); + const uint8x8_t d1 = vqmovun_s16(sum1); + const uint8x8_t d2 = vqmovun_s16(sum2); + const uint8x8_t d3 = vqmovun_s16(sum3); + + vst1q_u8(*dst, vcombine_u8(d0, d1)); + *dst += 16; + vst1q_u8(*dst, vcombine_u8(d2, d3)); + *dst += stride - 16; +} + void vpx_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), 
vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q8u16)); - q13s16 = - vaddq_s16(vreinterpretq_s16_u16(q0u16), vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; + const uint8x16_t top_left = vld1q_dup_u8(above - 1); + const uint8x16_t above_low = vld1q_u8(above); + const uint8x16_t above_high = vld1q_u8(above + 16); + const int16x8_t sub0 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_low), vget_low_u8(top_left))); + const int16x8_t sub1 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_low), vget_high_u8(top_left))); + const int16x8_t sub2 = vreinterpretq_s16_u16( + vsubl_u8(vget_low_u8(above_high), vget_low_u8(top_left))); + const int16x8_t sub3 = vreinterpretq_s16_u16( + vsubl_u8(vget_high_u8(above_high), vget_high_u8(top_left))); + int16x8_t left_dup; + int i, j; + + for (j = 0; j < 4; j++, left += 8) { + const uint8x8_t left_u8 = vld1_u8(left); + const int16x8_t left_s16q = convert_u8_to_s16(left_u8); + int16x4_t left_s16d = vget_low_s16(left_s16q); + for (i = 0; i < 2; i++, left_s16d = vget_high_s16(left_s16q)) { + left_dup = vdupq_lane_s16(left_s16d, 0); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 1); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 2); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); + left_dup = vdupq_lane_s16(left_s16d, 3); + tm_32_kernel(&dst, stride, left_dup, sub0, sub1, sub2, sub3); } } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c index fc080163bb4..7419cea022d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/loopfilter_neon.c @@ -423,8 +423,8 @@ static INLINE void apply_15_tap_filter_16( filter = vand##r##s8(filter, vreinterpret##r##s8_u8(mask)); \ \ /* save bottom 3 bits so that we round one side +4 and the other +3 */ \ - /* if it equals 4 we'll set to adjust by -1 to account for the fact */ \ - /* we'd round 3 the other way */ \ + /* if it equals 4 we'll set it to adjust by -1 to account for the fact */ \ + /* we'd round it by 3 the other way */ \ filter1 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(4)), 3); \ filter2 = vshr##r##n_s8(vqadd##r##s8(filter, vdup##r##n_s8(3)), 3); \ \ @@ -909,7 +909,7 @@ void vpx_lpf_vertical_8_neon(uint8_t *s, int p, const uint8_t *blimit, p0, q0, q1, q2, q3, &flat, &flat_status, &hev); filter8_8(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, &op1, &op0, &oq0, &oq1, &oq2); - // Note: tranpose + store_8x8() is faster than store_6x8(). + // Note: transpose + store_8x8() is faster than store_6x8(). 
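Backing up to the tm_*_kernel helpers earlier in this hunk: all four TM (true motion) block sizes compute dst[r][c] = clamp(left[r] + above[c] - top_left), with the row of (above - top_left) differences hoisted out of the loop and vqmovun_s16 providing the clamp to [0, 255]. A scalar sketch (reference only; names are illustrative):

#include <stddef.h>
#include <stdint.h>

static uint8_t clamp_u8_ref(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int n,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1]; /* pixel above and to the left of the block */
  int r, c;
  for (r = 0; r < n; ++r, dst += stride) {
    for (c = 0; c < n; ++c) {
      dst[c] = clamp_u8_ref(left[r] + above[c] - top_left);
    }
  }
}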
transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); store_8x8(s - 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); } @@ -934,7 +934,7 @@ void vpx_lpf_vertical_8_dual_neon(uint8_t *s, int p, const uint8_t *blimit0, p0, q0, q1, q2, q3, &flat, &flat_status, &hev); filter8_16(mask, flat, flat_status, hev, p3, p2, p1, p0, q0, q1, q2, q3, &op2, &op1, &op0, &oq0, &oq1, &oq2); - // Note: store_6x8() twice is faster than tranpose + store_8x16(). + // Note: store_6x8() twice is faster than transpose + store_8x16(). store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); store_6x8(s + 8 * p, p, vget_high_u8(op2), vget_high_u8(op1), @@ -1037,7 +1037,7 @@ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit, &s6, &s7); store_16x8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); } else { - // Note: tranpose + store_8x8() is faster than store_6x8(). + // Note: transpose + store_8x8() is faster than store_6x8(). transpose_u8_8x8(&p3, &op2, &op1, &op0, &oq0, &oq1, &oq2, &q3); store_8x8(s + 4, p, p3, op2, op1, op0, oq0, oq1, oq2, q3); } @@ -1074,7 +1074,7 @@ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit, store_16x16(s, p, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15); } else { - // Note: store_6x8() twice is faster than tranpose + store_8x16(). + // Note: store_6x8() twice is faster than transpose + store_8x16(). s += 8; store_6x8(s, p, vget_low_u8(op2), vget_low_u8(op1), vget_low_u8(op0), vget_low_u8(oq0), vget_low_u8(oq1), vget_low_u8(oq2)); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h index 55188c5bc21..445add29689 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h @@ -39,6 +39,15 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64(uint32x4_t a0, uint32x4_t a1) { return b0; } +static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { + uint16x8x2_t b0; + b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), + vreinterpret_u16_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), + vreinterpret_u16_u32(vget_high_u32(a1))); + return b0; +} + static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { // Swap 16 bit elements. Goes from: // a0: 00 01 02 03 10 11 12 13 @@ -68,6 +77,70 @@ static INLINE void transpose_u8_4x4(uint8x8_t *a0, uint8x8_t *a1) { *a1 = d0.val[1]; } +static INLINE void transpose_s16_4x4d(int16x4_t *a0, int16x4_t *a1, + int16x4_t *a2, int16x4_t *a3) { + // Swap 16 bit elements. 
Goes from: + // a0: 00 01 02 03 + // a1: 10 11 12 13 + // a2: 20 21 22 23 + // a3: 30 31 32 33 + // to: + // b0.val[0]: 00 10 02 12 + // b0.val[1]: 01 11 03 13 + // b1.val[0]: 20 30 22 32 + // b1.val[1]: 21 31 23 33 + + const int16x4x2_t b0 = vtrn_s16(*a0, *a1); + const int16x4x2_t b1 = vtrn_s16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 + // c0.val[1]: 02 12 22 32 + // c1.val[0]: 01 11 21 31 + // c1.val[1]: 03 13 23 33 + + const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]), + vreinterpret_s32_s16(b1.val[0])); + const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]), + vreinterpret_s32_s16(b1.val[1])); + + *a0 = vreinterpret_s16_s32(c0.val[0]); + *a1 = vreinterpret_s16_s32(c1.val[0]); + *a2 = vreinterpret_s16_s32(c0.val[1]); + *a3 = vreinterpret_s16_s32(c1.val[1]); +} + +static INLINE void transpose_u16_4x4q(uint16x8_t *a0, uint16x8_t *a1) { + // Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 10 11 12 13 + // a1: 20 21 22 23 30 31 32 33 + // to: + // b0.val[0]: 00 01 20 21 10 11 30 31 + // b0.val[1]: 02 03 22 23 12 13 32 33 + + const uint32x4x2_t b0 = + vtrnq_u32(vreinterpretq_u32_u16(*a0), vreinterpretq_u32_u16(*a1)); + + // Swap 64 bit elements resulting in: + // c0.val[0]: 00 01 20 21 02 03 22 23 + // c0.val[1]: 10 11 30 31 12 13 32 33 + + const uint32x4_t c0 = + vcombine_u32(vget_low_u32(b0.val[0]), vget_low_u32(b0.val[1])); + const uint32x4_t c1 = + vcombine_u32(vget_high_u32(b0.val[0]), vget_high_u32(b0.val[1])); + + // Swap 16 bit elements resulting in: + // d0.val[0]: 00 10 20 30 02 12 22 32 + // d0.val[1]: 01 11 21 31 03 13 23 33 + + const uint16x8x2_t d0 = + vtrnq_u16(vreinterpretq_u16_u32(c0), vreinterpretq_u16_u32(c1)); + + *a0 = d0.val[0]; + *a1 = d0.val[1]; +} + static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, uint8x8_t *a3) { // Swap 8 bit elements. Goes from: @@ -101,6 +174,39 @@ static INLINE void transpose_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, *a3 = vreinterpret_u8_u16(c1.val[1]); } +static INLINE void transpose_u16_8x4(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + + *a0 = vreinterpretq_u16_u32(c0.val[0]); + *a1 = vreinterpretq_u16_u32(c1.val[0]); + *a2 = vreinterpretq_u16_u32(c0.val[1]); + *a3 = vreinterpretq_u16_u32(c1.val[1]); +} + // Note: Using 'd' registers or 'q' registers has almost identical speed. We use // 'q' registers here to save some instructions. 
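All of these transpose helpers use the same butterfly: a vtrn at the element width, then another vtrn at twice the width on reinterpreted registers, doubling the swap distance at each stage. A scalar emulation of the 4x4 case (illustrative only):

#include <stdint.h>

/* vtrn_s16(a, b) is equivalent to swapping a[1]<->b[0] and a[3]<->b[2]. */
static void vtrn16_ref(int16_t *a, int16_t *b) {
  int16_t t;
  t = a[1]; a[1] = b[0]; b[0] = t;
  t = a[3]; a[3] = b[2]; b[2] = t;
}

static void transpose_s16_4x4_ref(int16_t m[4][4]) {
  int16_t t0, t1;
  /* Stage 1: 16-bit trn on row pairs. */
  vtrn16_ref(m[0], m[1]);
  vtrn16_ref(m[2], m[3]);
  /* Stage 2: 32-bit trn, i.e. swap element pairs across (row0, row2)
   * and (row1, row3). */
  t0 = m[0][2]; t1 = m[0][3];
  m[0][2] = m[2][0]; m[0][3] = m[2][1];
  m[2][0] = t0; m[2][1] = t1;
  t0 = m[1][2]; t1 = m[1][3];
  m[1][2] = m[3][0]; m[1][3] = m[3][1];
  m[3][0] = t0; m[3][1] = t1;
}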
static INLINE void transpose_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, uint8x8_t *a2, @@ -228,6 +334,77 @@ static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, + uint16x8_t *a2, uint16x8_t *a3, + uint16x8_t *a4, uint16x8_t *a5, + uint16x8_t *a6, uint16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0.val[0]: 00 10 02 12 04 14 06 16 + // b0.val[1]: 01 11 03 13 05 15 07 17 + // b1.val[0]: 20 30 22 32 24 34 26 36 + // b1.val[1]: 21 31 23 33 25 35 27 37 + // b2.val[0]: 40 50 42 52 44 54 46 56 + // b2.val[1]: 41 51 43 53 45 55 47 57 + // b3.val[0]: 60 70 62 72 64 74 66 76 + // b3.val[1]: 61 71 63 73 65 75 67 77 + + const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1); + const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3); + const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5); + const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 10 20 30 04 14 24 34 + // c0.val[1]: 02 12 22 32 06 16 26 36 + // c1.val[0]: 01 11 21 31 05 15 25 35 + // c1.val[1]: 03 13 23 33 07 17 27 37 + // c2.val[0]: 40 50 60 70 44 54 64 74 + // c2.val[1]: 42 52 62 72 46 56 66 76 + // c3.val[0]: 41 51 61 71 45 55 65 75 + // c3.val[1]: 43 53 63 73 47 57 67 77 + + const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]), + vreinterpretq_u32_u16(b1.val[0])); + const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]), + vreinterpretq_u32_u16(b1.val[1])); + const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]), + vreinterpretq_u32_u16(b3.val[0])); + const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]), + vreinterpretq_u32_u16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 10 20 30 40 50 60 70 + // d0.val[1]: 04 14 24 34 44 54 64 74 + // d1.val[0]: 01 11 21 31 41 51 61 71 + // d1.val[1]: 05 15 25 35 45 55 65 75 + // d2.val[0]: 02 12 22 32 42 52 62 72 + // d2.val[1]: 06 16 26 36 46 56 66 76 + // d3.val[0]: 03 13 23 33 43 53 63 73 + // d3.val[1]: 07 17 27 37 47 57 67 77 + const uint16x8x2_t d0 = vpx_vtrnq_u64_to_u16(c0.val[0], c2.val[0]); + const uint16x8x2_t d1 = vpx_vtrnq_u64_to_u16(c1.val[0], c3.val[0]); + const uint16x8x2_t d2 = vpx_vtrnq_u64_to_u16(c0.val[1], c2.val[1]); + const uint16x8x2_t d3 = vpx_vtrnq_u64_to_u16(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c index e16d33718aa..1386838eea6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve8_neon.c @@ -820,14 +820,14 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); 
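The convolve hunk in progress here is a correctness-of-intent fix: dst and src are the un-advanced base pointers, while d and s are the row pointers the loop actually walks, so the old prefetches warmed stale addresses after the first iteration. The intended pattern, sketched with an illustrative helper:

#include <stddef.h>
#include <stdint.h>

/* Prefetch the four rows the next iteration will touch, using the advanced
 * row pointer rather than the function's base pointer. */
static void prefetch_rows_ref(const uint8_t *p, ptrdiff_t stride) {
  __builtin_prefetch(p + 0 * stride);
  __builtin_prefetch(p + 1 * stride);
  __builtin_prefetch(p + 2 * stride);
  __builtin_prefetch(p + 3 * stride);
}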
- __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, @@ -1002,14 +1002,14 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, s10 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(s))); s += src_stride; - __builtin_prefetch(dst + 0 * dst_stride); - __builtin_prefetch(dst + 1 * dst_stride); - __builtin_prefetch(dst + 2 * dst_stride); - __builtin_prefetch(dst + 3 * dst_stride); - __builtin_prefetch(src + 0 * src_stride); - __builtin_prefetch(src + 1 * src_stride); - __builtin_prefetch(src + 2 * src_stride); - __builtin_prefetch(src + 3 * src_stride); + __builtin_prefetch(d + 0 * dst_stride); + __builtin_prefetch(d + 1 * dst_stride); + __builtin_prefetch(d + 2 * dst_stride); + __builtin_prefetch(d + 3 * dst_stride); + __builtin_prefetch(s + 0 * src_stride); + __builtin_prefetch(s + 1 * src_stride); + __builtin_prefetch(s + 2 * src_stride); + __builtin_prefetch(s + 3 * src_stride); d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filters, filter3, filter4); d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filters, filter3, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c index 5d7fa54fcd4..6ca0e501b3c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/vpx_convolve_neon.c @@ -24,16 +24,15 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); // Account for the vertical phase needing 3 lines prior and 4 lines post - int intermediate_height = h + 7; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); - /* Filter starting 3 lines back. The neon implementation will ignore the - * given height and filter a multiple of 4 lines. Since this goes in to - * the temp buffer which has lots of extra room and is subsequently discarded - * this is safe if somewhat less than ideal. - */ + /* Filter starting 3 lines back. The neon implementation will ignore the given + * height and filter a multiple of 4 lines. Since this goes in to the temp + * buffer which has lots of extra room and is subsequently discarded this is + * safe if somewhat less than ideal. 
*/ vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, x_step_q4, filter_y, y_step_q4, w, intermediate_height); @@ -49,7 +48,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, const int16_t *filter_y, int y_step_q4, int w, int h) { DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]); - int intermediate_height = h + 7; + const int intermediate_height = h + 7; assert(y_step_q4 == 16); assert(x_step_q4 == 16); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c index 4e7d4053ea9..aa59601094d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/fwd_txfm.c @@ -8,6 +8,7 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <assert.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/fwd_txfm.h" @@ -21,36 +22,37 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. tran_low_t intermediate[4 * 4]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { - tran_high_t input[4]; // canbe16 + tran_high_t in_high[4]; // canbe16 tran_high_t step[4]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 4; ++i) { // Load inputs. - if (0 == pass) { - input[0] = in_pass0[0 * stride] * 16; - input[1] = in_pass0[1 * stride] * 16; - input[2] = in_pass0[2 * stride] * 16; - input[3] = in_pass0[3 * stride] * 16; - if (i == 0 && input[0]) { - input[0] += 1; + if (pass == 0) { + in_high[0] = input[0 * stride] * 16; + in_high[1] = input[1 * stride] * 16; + in_high[2] = input[2 * stride] * 16; + in_high[3] = input[3 * stride] * 16; + if (i == 0 && in_high[0]) { + ++in_high[0]; } } else { - input[0] = in[0 * 4]; - input[1] = in[1 * 4]; - input[2] = in[2 * 4]; - input[3] = in[3 * 4]; + assert(in_low != NULL); + in_high[0] = in_low[0 * 4]; + in_high[1] = in_low[1 * 4]; + in_high[2] = in_low[2 * 4]; + in_high[3] = in_low[3 * 4]; + ++in_low; } // Transform. - step[0] = input[0] + input[3]; - step[1] = input[1] + input[2]; - step[2] = input[1] - input[2]; - step[3] = input[0] - input[3]; + step[0] = in_high[0] + in_high[3]; + step[1] = in_high[1] + in_high[2]; + step[2] = in_high[1] - in_high[2]; + step[3] = in_high[0] - in_high[3]; temp1 = (step[0] + step[1]) * cospi_16_64; temp2 = (step[0] - step[1]) * cospi_16_64; out[0] = (tran_low_t)fdct_round_shift(temp1); @@ -60,12 +62,11 @@ void vpx_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) { out[1] = (tran_low_t)fdct_round_shift(temp1); out[3] = (tran_low_t)fdct_round_shift(temp2); // Do next column (which is a transposed row in second/horizontal pass) - in_pass0++; - in++; + ++input; out += 4; } // Setup in/out for next pass. - in = intermediate; + in_low = intermediate; out = output; } @@ -99,7 +100,6 @@ void vpx_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) { tran_high_t t0, t1, t2, t3; // needs32 tran_high_t x0, x1, x2, x3; // canbe16 - int i; for (i = 0; i < 8; i++) { // stage 1 if (pass == 0) { @@ -190,56 +190,57 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { int pass; // We need an intermediate buffer between passes. 
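Buffer sizing for the vpx_convolve8_neon hunk above: the 8-tap vertical phase reads 3 rows before and 4 rows after each output row, hence h + 7 intermediate rows, and temp is 64 * 72 because 64 is the widest block while the NEON code filters a multiple of 4 lines. A small sanity check of that arithmetic (assuming a maximum block size of 64, as the buffer implies):

#include <assert.h>

int main(void) {
  const int max_w = 64, max_h = 64;
  const int intermediate_height = max_h + 7;                   // 3 before + 4 after
  const int filtered_height = (intermediate_height + 3) & ~3;  // multiple of 4
  assert(filtered_height == 72);                 // worst case rounds 71 up to 72
  assert(max_w * filtered_height <= 64 * 72);    // fits temp[64 * 72]
  return 0;
}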
tran_low_t intermediate[256]; - const int16_t *in_pass0 = input; - const tran_low_t *in = NULL; + const tran_low_t *in_low = NULL; tran_low_t *out = intermediate; // Do the two transform/transpose passes for (pass = 0; pass < 2; ++pass) { tran_high_t step1[8]; // canbe16 tran_high_t step2[8]; // canbe16 tran_high_t step3[8]; // canbe16 - tran_high_t input[8]; // canbe16 + tran_high_t in_high[8]; // canbe16 tran_high_t temp1, temp2; // needs32 int i; for (i = 0; i < 16; i++) { if (0 == pass) { // Calculate input for the first 8 results. - input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4; - input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4; - input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4; - input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4; - input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4; - input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4; - input[6] = (in_pass0[6 * stride] + in_pass0[9 * stride]) * 4; - input[7] = (in_pass0[7 * stride] + in_pass0[8 * stride]) * 4; + in_high[0] = (input[0 * stride] + input[15 * stride]) * 4; + in_high[1] = (input[1 * stride] + input[14 * stride]) * 4; + in_high[2] = (input[2 * stride] + input[13 * stride]) * 4; + in_high[3] = (input[3 * stride] + input[12 * stride]) * 4; + in_high[4] = (input[4 * stride] + input[11 * stride]) * 4; + in_high[5] = (input[5 * stride] + input[10 * stride]) * 4; + in_high[6] = (input[6 * stride] + input[9 * stride]) * 4; + in_high[7] = (input[7 * stride] + input[8 * stride]) * 4; // Calculate input for the next 8 results. - step1[0] = (in_pass0[7 * stride] - in_pass0[8 * stride]) * 4; - step1[1] = (in_pass0[6 * stride] - in_pass0[9 * stride]) * 4; - step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4; - step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4; - step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4; - step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4; - step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4; - step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4; + step1[0] = (input[7 * stride] - input[8 * stride]) * 4; + step1[1] = (input[6 * stride] - input[9 * stride]) * 4; + step1[2] = (input[5 * stride] - input[10 * stride]) * 4; + step1[3] = (input[4 * stride] - input[11 * stride]) * 4; + step1[4] = (input[3 * stride] - input[12 * stride]) * 4; + step1[5] = (input[2 * stride] - input[13 * stride]) * 4; + step1[6] = (input[1 * stride] - input[14 * stride]) * 4; + step1[7] = (input[0 * stride] - input[15 * stride]) * 4; } else { // Calculate input for the first 8 results. 
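Pass 0 above is the first butterfly of a 16-point fast DCT: the even half (in_high) receives sums of mirrored samples, the odd half (step1) their differences, both scaled by 4 to carry precision into the second pass. A reduced sketch of just that load step; int32_t stands in for libvpx's wider tran_high_t:

#include <stdint.h>

static void fdct16_stage1(const int16_t *in, int stride,
                          int32_t even[8], int32_t odd[8]) {
  for (int k = 0; k < 8; ++k) {
    even[k] = (in[k * stride] + in[(15 - k) * stride]) * 4;      // in_high[k]
    odd[k] = (in[(7 - k) * stride] - in[(8 + k) * stride]) * 4;  // step1[k]
  }
}

int main(void) {
  int16_t col[16];
  int32_t even[8], odd[8];
  for (int k = 0; k < 16; ++k) col[k] = (int16_t)k;
  fdct16_stage1(col, 1, even, odd);
  return (even[0] == 60 && odd[7] == -60) ? 0 : 1;  // (0+15)*4 and (0-15)*4
}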
- input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); - input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); - input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); - input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); - input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); - input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); - input[6] = ((in[6 * 16] + 1) >> 2) + ((in[9 * 16] + 1) >> 2); - input[7] = ((in[7 * 16] + 1) >> 2) + ((in[8 * 16] + 1) >> 2); + assert(in_low != NULL); + in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2); + in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2); + in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2); + in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2); + in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2); + in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2); + in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2); + in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2); // Calculate input for the next 8 results. - step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[8 * 16] + 1) >> 2); - step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[9 * 16] + 1) >> 2); - step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); - step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); - step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); - step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); - step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); - step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2); + step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2); + step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2); + step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2); + step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2); + step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2); + step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2); + step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2); + in_low++; } // Work on the first eight values; fdct8(input, even_results); { @@ -248,14 +249,14 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { tran_high_t x0, x1, x2, x3; // canbe16 // stage 1 - s0 = input[0] + input[7]; - s1 = input[1] + input[6]; - s2 = input[2] + input[5]; - s3 = input[3] + input[4]; - s4 = input[3] - input[4]; - s5 = input[2] - input[5]; - s6 = input[1] - input[6]; - s7 = input[0] - input[7]; + s0 = in_high[0] + in_high[7]; + s1 = in_high[1] + in_high[6]; + s2 = in_high[2] + in_high[5]; + s3 = in_high[3] + in_high[4]; + s4 = in_high[3] - in_high[4]; + s5 = in_high[2] - in_high[5]; + s6 = in_high[1] - in_high[6]; + s7 = in_high[0] - in_high[7]; // fdct4(step, step); x0 = s0 + s3; @@ -350,12 +351,11 @@ void vpx_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) { out[15] = (tran_low_t)fdct_round_shift(temp2); } // Do next column (which is a transposed row in second/horizontal pass) - in++; - in_pass0++; + input++; out += 16; } // Setup in/out for next pass. 
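The (x + 1) >> 2 in pass 1 above undoes the * 4 scaling of pass 0 with a cheap rounding bias; with the arithmetic right shift libvpx relies on, negative values shift toward minus infinity and the + 1 partially offsets that. A quick demonstration:

#include <stdio.h>

int main(void) {
  const int xs[] = { 7, 8, -7, -8 };
  for (int i = 0; i < 4; ++i)
    printf("(%2d + 1) >> 2 = %2d\n", xs[i], (xs[i] + 1) >> 2);
  // prints 2, 2, -2, -2; truncating division by 4 would give 1, 2, -1, -2
  return 0;
}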
- in = intermediate; + in_low = intermediate; out = output; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c index 46ddd1da0d0..f3f543ddfe8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c @@ -96,6 +96,7 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) { void idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; + // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; @@ -114,9 +115,9 @@ void idct4_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; // Rows @@ -142,6 +143,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int i; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 4); @@ -157,6 +159,7 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, void idct8_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; + // stage 1 step1[0] = input[0]; step1[2] = input[4]; @@ -209,9 +212,9 @@ void idct8_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; // First transform rows @@ -236,6 +239,7 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { @@ -246,14 +250,13 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { void iadst4_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[0]; tran_low_t x1 = input[1]; tran_low_t x2 = input[2]; tran_low_t x3 = input[3]; if (!(x0 | x1 | x2 | x3)) { - output[0] = output[1] = output[2] = output[3] = 0; + memset(output, 0, 4 * sizeof(*output)); return; } @@ -283,7 +286,6 @@ void iadst4_c(const tran_low_t *input, tran_low_t *output) { void iadst8_c(const tran_low_t *input, tran_low_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; - tran_high_t x0 = input[7]; tran_high_t x1 = input[0]; tran_high_t x2 = input[5]; @@ -294,8 +296,7 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { tran_high_t x7 = input[6]; if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = 0; + memset(output, 0, 8 * sizeof(*output)); return; } @@ -359,13 +360,13 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { } void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; // First transform rows - // only first 4 row has non-zero coefs + // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { idct8_c(input, 
outptr); input += 8; @@ -550,9 +551,9 @@ void idct16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows @@ -576,7 +577,6 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, void iadst16_c(const tran_low_t *input, tran_low_t *output) { tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; tran_high_t s9, s10, s11, s12, s13, s14, s15; - tran_high_t x0 = input[15]; tran_high_t x1 = input[0]; tran_high_t x2 = input[13]; @@ -596,9 +596,7 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15)) { - output[0] = output[1] = output[2] = output[3] = output[4] = output[5] = - output[6] = output[7] = output[8] = output[9] = output[10] = - output[11] = output[12] = output[13] = output[14] = output[15] = 0; + memset(output, 0, 16 * sizeof(*output)); return; } @@ -746,9 +744,9 @@ void iadst16_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; // First transform rows. Since all non-zero dct coefficients are in @@ -774,6 +772,7 @@ void vpx_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { @@ -1151,9 +1150,9 @@ void idct32_c(const tran_low_t *input, tran_low_t *output) { void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows @@ -1188,13 +1187,13 @@ void vpx_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 16x16 has non-zero coeff + // Only upper-left 16x16 has non-zero coeff for (i = 0; i < 16; ++i) { idct32_c(input, outptr); input += 32; @@ -1214,13 +1213,13 @@ void vpx_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, int stride) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; // Rows - // only upper-left 8x8 has non-zero coeff + // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { idct32_c(input, outptr); input += 32; @@ -1241,8 +1240,8 @@ void vpx_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest, void vpx_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; tran_high_t a1; - tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + out = WRAPLOW(dct_const_round_shift(out * cospi_16_64)); a1 = ROUND_POWER_OF_TWO(out, 6); @@ -1373,12 +1372,12 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 1 temp1 = (input[0] + input[2]) * cospi_16_64; temp2 = (input[0] - input[2]) * cospi_16_64; - step[0] = 
HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64; - step[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd); @@ -1389,9 +1388,9 @@ void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[4 * 4]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[4], temp_out[4]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -1418,10 +1417,10 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, int i; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 4); for (i = 0; i < 4; i++) { @@ -1452,12 +1451,12 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[3] = input[6]; temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64; temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64; temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 & stage 3 - even half vpx_highbd_idct4_c(step1, step1, bd); @@ -1472,8 +1471,8 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; // stage 4 @@ -1489,20 +1488,20 @@ void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[8 * 8]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. 
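Every highbd_dct_const_round_shift in these hunks collapses into the plain dct_const_round_shift because both reduce to ROUND_POWER_OF_TWO(x, DCT_CONST_BITS); the redundant highbd wrapper is deleted from inv_txfm.h further down. The shift exists because the cospi constants are Q14 fixed point. A self-contained model of the rounding, with the constant value taken from txfm_common.h as best I recall (treat the exact figure as an assumption):

#include <stdint.h>
#include <stdio.h>

#define DCT_CONST_BITS 14
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

typedef int64_t tran_high_t;

static tran_high_t dct_const_round_shift(tran_high_t input) {
  return ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
}

int main(void) {
  const tran_high_t cospi_16_64 = 11585;  // ~ 16384 * cos(pi/4), Q14
  tran_high_t x = dct_const_round_shift(100 * cospi_16_64);
  x = dct_const_round_shift(x * cospi_16_64);
  printf("%lld\n", (long long)x);  // two cos(pi/4) scalings halve: prints 50
  return 0;
}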
+ // First transform rows for (i = 0; i < 8; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. + // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1518,9 +1517,10 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 5); for (j = 0; j < 8; ++j) { for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -1567,10 +1567,10 @@ void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { // The overall dynamic range is 14b (input) + 14b (multiplication scaling) // + 1b (addition) = 29b. // Hence the output bit depth is 15b. - output[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s1 - s3), bd); + output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); } void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1608,14 +1608,14 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_26_64 * x6 + cospi_6_64 * x7; s7 = cospi_6_64 * x6 - cospi_26_64 * x7; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s7), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); // stage 2 s0 = x0; @@ -1631,10 +1631,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); // stage 3 s2 = cospi_16_64 * (x2 + x3); @@ -1642,10 +1642,10 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { s6 = cospi_16_64 * (x6 + x7); s7 = cospi_16_64 * (x6 - x7); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); output[1] = HIGHBD_WRAPLOW(-x4, bd); @@ -1657,22 +1657,23 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { output[7] = HIGHBD_WRAPLOW(-x1, bd); } -void vpx_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[8], temp_out[8]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. - // Only first 4 row has non-zero coefs. + // First transform rows + // Only first 4 row has non-zero coefs for (i = 0; i < 4; ++i) { vpx_highbd_idct8_c(input, outptr, bd); input += 8; outptr += 8; } - // Then transform columns. + + // Then transform columns for (i = 0; i < 8; ++i) { for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; vpx_highbd_idct8_c(temp_in, temp_out, bd); @@ -1726,23 +1727,23 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 3 step1[0] = step2[0]; @@ -1752,12 +1753,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t 
*output, int bd) { temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -1771,12 +1772,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -1786,12 +1787,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -1803,8 +1804,8 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -1829,12 +1830,12 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * 
cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -1859,20 +1860,20 @@ void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[16 * 16]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - // First transform rows. + // First transform rows for (i = 0; i < 16; ++i) { vpx_highbd_idct16_c(input, outptr, bd); input += 16; outptr += 16; } - // Then transform columns. + // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -1936,22 +1937,22 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = x14 * cospi_29_64 + x15 * cospi_3_64; s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7 - s15), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; @@ -1979,14 +1980,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x5 = HIGHBD_WRAPLOW(s1 - s5, bd); x6 = HIGHBD_WRAPLOW(s2 - s6, bd); x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s9 - s13), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11 - s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); // stage 3 s0 = x0; @@ -2010,18 +2011,18 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { x1 = HIGHBD_WRAPLOW(s1 + s3, bd); x2 = HIGHBD_WRAPLOW(s0 - s2, bd); x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s5 - s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); x8 = HIGHBD_WRAPLOW(s8 + s10, bd); x9 = HIGHBD_WRAPLOW(s9 + s11, bd); x10 = HIGHBD_WRAPLOW(s8 - s10, bd); x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s13 - s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); // stage 4 s2 = (-cospi_16_64) * (x2 + x3); @@ -2033,14 +2034,14 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { s14 = (-cospi_16_64) * (x14 + x15); s15 = cospi_16_64 * (x14 - x15); - x2 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(s15), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); output[1] = HIGHBD_WRAPLOW(-x8, bd); @@ -2062,9 +2063,9 @@ void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[16 * 16] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[16], temp_out[16]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -2076,7 +2077,7 @@ void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, outptr += 16; } - // Then transform columns. + // Then transform columns for (i = 0; i < 16; ++i) { for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; vpx_highbd_idct16_c(temp_in, temp_out, bd); @@ -2092,10 +2093,10 @@ void vpx_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; tran_high_t a1; tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 16; ++j) { for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd); @@ -2137,43 +2138,43 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64; temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64; - step1[16] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[31] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64; temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64; temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64; temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64; temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - 
step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64; temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64; temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64; temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); // stage 2 step2[0] = step1[0]; @@ -2187,23 +2188,23 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64; - step2[8] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[15] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd); step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd); @@ -2230,12 +2231,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64; - step1[4] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[7] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + 
step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd); step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd); @@ -2250,22 +2251,22 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[31] = step2[31]; temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64; temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64; - step1[17] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[30] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64; temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[19] = step2[19]; step1[20] = step2[20]; temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64; temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64; temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[23] = step2[23]; step1[24] = step2[24]; step1[27] = step2[27]; @@ -2274,12 +2275,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, // stage 4 temp1 = (step1[0] + step1[1]) * cospi_16_64; temp2 = (step1[0] - step1[1]) * cospi_16_64; - step2[0] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[1] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64; temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64; - step2[2] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[3] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd); step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd); step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd); @@ -2289,12 +2290,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[15] = step1[15]; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64; temp2 = step1[9] * cospi_24_64 + step1[14] 
* cospi_8_64; - step2[9] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[14] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[11] = step1[11]; step2[12] = step1[12]; @@ -2324,8 +2325,8 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[4] = step2[4]; temp1 = (step2[6] - step2[5]) * cospi_16_64; temp2 = (step2[5] + step2[6]) * cospi_16_64; - step1[5] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[6] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[7] = step2[7]; step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd); @@ -2341,20 +2342,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[17] = step2[17]; temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64; temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64; - step1[18] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[29] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64; temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64; - step1[19] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[28] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64; temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64; temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[22] = step2[22]; step1[23] = step2[23]; step1[24] = step2[24]; @@ -2375,12 +2376,12 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step2[9] = step1[9]; temp1 = (-step1[10] + step1[13]) * cospi_16_64; temp2 = (step1[10] + step1[13]) * cospi_16_64; - step2[10] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[13] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step1[11] + step1[12]) * cospi_16_64; temp2 = (step1[11] + 
step1[12]) * cospi_16_64; - step2[11] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step2[12] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step2[14] = step1[14]; step2[15] = step1[15]; @@ -2426,20 +2427,20 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, step1[19] = step2[19]; temp1 = (-step2[20] + step2[27]) * cospi_16_64; temp2 = (step2[20] + step2[27]) * cospi_16_64; - step1[20] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[27] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[21] + step2[26]) * cospi_16_64; temp2 = (step2[21] + step2[26]) * cospi_16_64; - step1[21] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[26] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[22] + step2[25]) * cospi_16_64; temp2 = (step2[22] + step2[25]) * cospi_16_64; - step1[22] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[25] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); temp1 = (-step2[23] + step2[24]) * cospi_16_64; temp2 = (step2[23] + step2[24]) * cospi_16_64; - step1[23] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp1), bd); - step1[24] = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(temp2), bd); + step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd); + step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd); step1[28] = step2[28]; step1[29] = step2[29]; step1[30] = step2[30]; @@ -2482,9 +2483,9 @@ static void highbd_idct32_c(const tran_low_t *input, tran_low_t *output, void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[32 * 32]; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); @@ -2520,19 +2521,20 @@ void vpx_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8, void vpx_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { + int i, j; tran_low_t out[32 * 32] = { 0 }; tran_low_t *outptr = out; - int i, j; tran_low_t temp_in[32], temp_out[32]; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); // Rows - // Only upper-left 8x8 has non-zero coeff. 
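vpx_highbd_idct32x32_34_add_c here, like the 8-bit _135/_34 variants earlier, exploits coefficient sparsity: with at most 34 non-zero coefficients the scan never leaves the upper-left 8x8, so only the first 8 row transforms can produce non-zero data and the remaining rows stay zeroed. A shape-only sketch of that driver, with an identity stub standing in for the real 32-point transform:

#include <string.h>

#define N 32

// Identity stand-in for idct32_c / highbd_idct32_c; only the loop
// structure is meaningful here.
static void idct1d_stub(const int *in, int *out) {
  for (int j = 0; j < N; ++j) out[j] = in[j];
}

static void idct32x32_34_sketch(const int *input, int *out) {
  int tmp[N * N];
  memset(tmp, 0, sizeof(tmp));        // rows 8..31 remain zero
  for (int i = 0; i < 8; ++i)         // rows: only the first 8 can be non-zero
    idct1d_stub(&input[i * N], &tmp[i * N]);
  for (int i = 0; i < N; ++i) {       // columns: all 32, gathered with stride
    int col_in[N], col_out[N];
    for (int j = 0; j < N; ++j) col_in[j] = tmp[j * N + i];
    idct1d_stub(col_in, col_out);
    for (int j = 0; j < N; ++j) out[j * N + i] = col_out[j];
  }
}

int main(void) {
  static int input[N * N], out[N * N];
  input[0] = 64;  // lone DC coefficient
  idct32x32_34_sketch(input, out);
  return out[0] == 64 ? 0 : 1;  // the identity stub passes DC through
}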
+ // Only upper-left 8x8 has non-zero coeff for (i = 0; i < 8; ++i) { highbd_idct32_c(input, outptr, bd); input += 32; outptr += 32; } + // Columns for (i = 0; i < 32; ++i) { for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i]; @@ -2549,10 +2551,10 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, int i, j; int a1; uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - tran_low_t out = - HIGHBD_WRAPLOW(highbd_dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(highbd_dct_const_round_shift(out * cospi_16_64), bd); + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); a1 = ROUND_POWER_OF_TWO(out, 6); for (j = 0; j < 32; ++j) { @@ -2560,4 +2562,5 @@ void vpx_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8, dest += stride; } } + #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h index e530730d575..13137659fae 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.h @@ -57,11 +57,6 @@ static INLINE tran_high_t highbd_check_range(tran_high_t input, int bd) { (void)bd; return input; } - -static INLINE tran_high_t highbd_dct_const_round_shift(tran_high_t input) { - tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS); - return (tran_high_t)rv; -} #endif // CONFIG_VP9_HIGHBITDEPTH #if CONFIG_EMULATE_HARDWARE diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c index 60a15e23bcf..9866ea37d6d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/loopfilter.c @@ -94,8 +94,8 @@ static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1, filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; // save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way filter1 = signed_char_clamp(filter + 4) >> 3; filter2 = signed_char_clamp(filter + 3) >> 3; @@ -425,8 +425,8 @@ static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1, filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask; // Save bottom 3 bits so that we round one side +4 and the other +3 - // if it equals 4 we'll set to adjust by -1 to account for the fact - // we'd round 3 the other way. + // if it equals 4 we'll set it to adjust by -1 to account for the fact + // we'd round it by 3 the other way. 
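The comment retouched above documents a deliberate rounding asymmetry in filter4: the same filter value is rounded once with bias +4 (subtracted from q0) and once with bias +3 (added to p0), so for a borderline value only one of the two pixels moves. A minimal standalone illustration in scalar C (not part of the patch; signed_char_clamp mirrors the libvpx helper):

static int signed_char_clamp(int t) {
  return t < -128 ? -128 : (t > 127 ? 127 : t);
}

/* For filter == 4, the borderline case named in the comment: */
int filter1 = signed_char_clamp(4 + 4) >> 3; /* 8 >> 3 == 1, adjusts q0 */
int filter2 = signed_char_clamp(4 + 3) >> 3; /* 7 >> 3 == 0, p0 stays put */
/* One side rounds up, the other down, so the edge is not over-corrected. */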
filter1 = signed_char_clamp_high(filter + 4, bd) >> 3; filter2 = signed_char_clamp_high(filter + 3, bd) >> 3; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c index 402d7ed9979..cc633c6698d 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c @@ -454,7 +454,7 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, v16u8 tmp = { 0 }; v16i8 zero = { 0 }; v8u16 sum_h, src_r_h, src_l_h; - v4u32 src_r_w, src_l_w; + v4u32 src_r_w; v4i32 flimit_vec; flimit_vec = __msa_fill_w(flimit); @@ -473,9 +473,8 @@ void vpx_mbpost_proc_across_ip_msa(uint8_t *src_ptr, int32_t pitch, src[15] = 0; ILVRL_B2_UH(zero, src, src_r_h, src_l_h); src_r_w = __msa_dotp_u_w(src_r_h, src_r_h); - src_l_w = __msa_dotp_u_w(src_l_h, src_l_h); + src_r_w += __msa_dotp_u_w(src_l_h, src_l_h); sum_sq = HADD_SW_S32(src_r_w); - sum_sq += HADD_SW_S32(src_l_w); sum_h = __msa_hadd_u_h(src, src); sum = HADD_UH_U32(sum_h); { diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h index 8b8a4173d2f..1fe9b28e8ad 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/inv_txfm_msa.h @@ -196,18 +196,18 @@ out2, out3) \ { \ v8i16 madd_s0_m, madd_s1_m, madd_s2_m, madd_s3_m; \ - v4i32 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ + v4i32 tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd; \ \ ILVRL_H2_SH(inp1, inp0, madd_s1_m, madd_s0_m); \ ILVRL_H2_SH(inp3, inp2, madd_s3_m, madd_s2_m); \ DOTP_SH4_SW(madd_s1_m, madd_s0_m, madd_s1_m, madd_s0_m, cst0, cst0, cst1, \ - cst1, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out1); \ + cst1, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out0, out1); \ DOTP_SH4_SW(madd_s3_m, madd_s2_m, madd_s3_m, madd_s2_m, cst2, cst2, cst3, \ - cst3, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \ - SRARI_W4_SW(tmp0_m, tmp1_m, tmp2_m, tmp3_m, DCT_CONST_BITS); \ - PCKEV_H2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out2, out3); \ + cst3, tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd); \ + SRARI_W4_SW(tmp0_madd, tmp1_madd, tmp2_madd, tmp3_madd, DCT_CONST_BITS); \ + PCKEV_H2_SH(tmp1_madd, tmp0_madd, tmp3_madd, tmp2_madd, out2, out3); \ } /* idct 8x8 macro */ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c index 6ee2456ca5d..b73d56bd558 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_16_msa.c @@ -449,7 +449,7 @@ static void mb_lpf_horizontal_edge(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -779,7 +779,7 @@ int32_t vpx_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48, /* flat4 */ 
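The vpx_mbpost_proc_across_ip_msa hunk a little further up makes a small reduction-strength change: because horizontal addition distributes over lane-wise addition (absent overflow, and these lanes are modest sums of squared bytes), hadd(a) + hadd(b) == hadd(a + b), so the second dot product is accumulated into the first vector and only one horizontal add remains. The identity in scalar form (hadd4 and sum_sq_combined are illustrative names, not libvpx API):

static uint32_t hadd4(const uint32_t v[4]) { /* horizontal reduction */
  return v[0] + v[1] + v[2] + v[3];
}

static uint32_t sum_sq_combined(const uint32_t lo[4], const uint32_t hi[4]) {
  uint32_t acc[4];
  int i;
  for (i = 0; i < 4; ++i) acc[i] = lo[i] + hi[i]; /* one vector add */
  return hadd4(acc); /* one reduction instead of two */
}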
VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c index e0079665f71..9500cd2fd86 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_4_msa.c @@ -27,7 +27,7 @@ void vpx_lpf_horizontal_4_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); p1_d = __msa_copy_u_d((v2i64)p1_out, 0); p0_d = __msa_copy_u_d((v2i64)p0_out, 0); @@ -86,7 +86,7 @@ void vpx_lpf_vertical_4_msa(uint8_t *src, int32_t pitch, q3); LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1); ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1); ILVRL_H2_SH(vec1, vec0, vec2, vec3); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c index 403e5dc51b3..a22c62bb3a3 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_8_msa.c @@ -32,7 +32,7 @@ void vpx_lpf_horizontal_8_msa(uint8_t *src, int32_t pitch, LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev, mask, flat); VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); @@ -177,7 +177,7 @@ void vpx_lpf_vertical_8_msa(uint8_t *src, int32_t pitch, /* flat4 */ VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat); /* filter4 */ - VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); + VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out); flat = (v16u8)__msa_ilvr_d((v2i64)zero, (v2i64)flat); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h index 4063e5e6b87..49fd74c25a4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/loopfilter_msa.h @@ -13,144 +13,71 @@ #include "vpx_dsp/mips/macros_msa.h" -#define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - filt = filt & (v16i8)hev_in; \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ 
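For context on the VP9_LPF_FILTER4_8W macro being deleted here: the lines that follow widen the byte difference q0 - p0 to 16 bits before scaling, because 3 * (q0 - p0) can reach +/-381 and overflow a signed byte. Each difference byte is duplicated (ilvr), a byte-pair dot product against the halfword constant 3 (bytes 3 and 0 on this target, as assumed below) yields 3*d in 16 bits, and the result is saturated back to the byte range. A scalar sketch of one lane (illustrative names):

static int16_t sat_to_int8(int16_t v) { /* __msa_sat_s_h(v, 7) per lane */
  return v < -128 ? -128 : (v > 127 ? 127 : v);
}

static int16_t widen_times3(int8_t d) {
  int16_t wide = (int16_t)d * 3 + (int16_t)d * 0; /* dot of (d, d) with (3, 0) */
  return sat_to_int8(wide); /* clamp after the 16-bit multiply */
}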
- \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - /* combine left and right part */ \ - filt = __msa_pckev_b((v16i8)filt_r, (v16i8)filt_r); \ - \ - filt = filt & (v16i8)mask_in; \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ - } - -#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \ - p1_out, p0_out, q0_out, q1_out) \ - { \ - v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \ - v16i8 filt, filt1, filt2, cnst4b, cnst3b; \ - v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \ - \ - p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ - p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ - q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ - q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ - \ - filt = __msa_subs_s_b(p1_m, q1_m); \ - \ - filt = filt & (v16i8)hev_in; \ - \ - q0_sub_p0 = q0_m - p0_m; \ - filt_sign = __msa_clti_s_b(filt, 0); \ - \ - cnst3h = __msa_ldi_h(3); \ - q0_sub_p0_r = (v8i16)__msa_ilvr_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_r = __msa_dotp_s_h((v16i8)q0_sub_p0_r, (v16i8)cnst3h); \ - filt_r = (v8i16)__msa_ilvr_b(filt_sign, filt); \ - filt_r += q0_sub_p0_r; \ - filt_r = __msa_sat_s_h(filt_r, 7); \ - \ - q0_sub_p0_l = (v8i16)__msa_ilvl_b(q0_sub_p0, q0_sub_p0); \ - q0_sub_p0_l = __msa_dotp_s_h((v16i8)q0_sub_p0_l, (v16i8)cnst3h); \ - filt_l = (v8i16)__msa_ilvl_b(filt_sign, filt); \ - filt_l += q0_sub_p0_l; \ - filt_l = __msa_sat_s_h(filt_l, 7); \ - \ - filt = __msa_pckev_b((v16i8)filt_l, (v16i8)filt_r); \ - filt = filt & (v16i8)mask_in; \ - \ - cnst4b = __msa_ldi_b(4); \ - filt1 = __msa_adds_s_b(filt, cnst4b); \ - filt1 >>= 3; \ - \ - cnst3b = __msa_ldi_b(3); \ - filt2 = __msa_adds_s_b(filt, cnst3b); \ - filt2 >>= 3; \ - \ - q0_m = __msa_subs_s_b(q0_m, filt1); \ - q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ - p0_m = __msa_adds_s_b(p0_m, filt2); \ - p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ - \ - filt = __msa_srari_b(filt1, 1); \ - hev_in = __msa_xori_b((v16u8)hev_in, 0xff); \ - filt = filt & (v16i8)hev_in; \ - \ - q1_m = __msa_subs_s_b(q1_m, filt); \ - q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ - p1_m = __msa_adds_s_b(p1_m, filt); \ - p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ +#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out, \ + p0_out, q0_out, q1_out) \ + { \ + v16i8 p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \ + const v16i8 cnst4b = __msa_ldi_b(4); \ + const v16i8 cnst3b = __msa_ldi_b(3); \ + \ + p1_m = (v16i8)__msa_xori_b(p1_in, 0x80); \ + p0_m = (v16i8)__msa_xori_b(p0_in, 0x80); \ + q0_m = (v16i8)__msa_xori_b(q0_in, 0x80); \ + q1_m = (v16i8)__msa_xori_b(q1_in, 0x80); \ + \ + filt = __msa_subs_s_b(p1_m, q1_m); \ + filt &= hev; \ + q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ 
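The replacement macro (its remaining lines follow) drops the widening entirely: q0 - p0 comes from a saturating byte subtract and is folded in with three saturating byte adds. Saturating repeatedly while adding a same-signed value is equivalent to clamping 3*d once at the end, because a lane pinned at +127 or -128 stays pinned under further same-signed adds. Scalar equivalent (adds_s8 and add3x are illustrative names):

static int8_t adds_s8(int8_t a, int8_t b) { /* saturating signed byte add */
  int16_t s = (int16_t)a + (int16_t)b;
  return s < -128 ? (int8_t)-128 : (s > 127 ? (int8_t)127 : (int8_t)s);
}

static int8_t add3x(int8_t f, int8_t d) { /* clamp(f + 3*d) kept in 8 bits */
  f = adds_s8(f, d);
  f = adds_s8(f, d); /* saturation is sticky for same-signed addends */
  f = adds_s8(f, d);
  return f;
}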
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt = __msa_adds_s_b(filt, q0_sub_p0); \ + filt &= mask; \ + t1 = __msa_adds_s_b(filt, cnst4b); \ + t1 >>= cnst3b; \ + t2 = __msa_adds_s_b(filt, cnst3b); \ + t2 >>= cnst3b; \ + q0_m = __msa_subs_s_b(q0_m, t1); \ + q0_out = __msa_xori_b((v16u8)q0_m, 0x80); \ + p0_m = __msa_adds_s_b(p0_m, t2); \ + p0_out = __msa_xori_b((v16u8)p0_m, 0x80); \ + filt = __msa_srari_b(t1, 1); \ + hev = __msa_xori_b(hev, 0xff); \ + filt &= hev; \ + q1_m = __msa_subs_s_b(q1_m, filt); \ + q1_out = __msa_xori_b((v16u8)q1_m, 0x80); \ + p1_m = __msa_adds_s_b(p1_m, filt); \ + p1_out = __msa_xori_b((v16u8)p1_m, 0x80); \ } -#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ - { \ - v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ - v16u8 zero_in = { 0 }; \ - \ - tmp = __msa_ori_b(zero_in, 1); \ - p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ - q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ - p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ - q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ - \ - p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ - flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ - p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ - flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ - \ - flat_out = (tmp < (v16u8)flat_out); \ - flat_out = __msa_xori_b(flat_out, 0xff); \ - flat_out = flat_out & (mask); \ +#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \ + { \ + v16u8 tmp_flat4, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \ + v16u8 zero_in = { 0 }; \ + \ + tmp_flat4 = __msa_ori_b(zero_in, 1); \ + p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \ + q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \ + p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \ + q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \ + \ + p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \ + flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \ + p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \ + flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \ + \ + flat_out = (tmp_flat4 < (v16u8)flat_out); \ + flat_out = __msa_xori_b(flat_out, 0xff); \ + flat_out = flat_out & (mask); \ } #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in, \ q6_in, q7_in, flat_in, flat2_out) \ { \ - v16u8 tmp, zero_in = { 0 }; \ + v16u8 tmp_flat5, zero_in = { 0 }; \ v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \ v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \ \ - tmp = __msa_ori_b(zero_in, 1); \ + tmp_flat5 = __msa_ori_b(zero_in, 1); \ p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \ q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \ p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \ @@ -168,7 +95,7 @@ p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \ flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \ \ - flat2_out = (tmp < (v16u8)flat2_out); \ + flat2_out = (tmp_flat5 < (v16u8)flat2_out); \ flat2_out = __msa_xori_b(flat2_out, 0xff); \ flat2_out = flat2_out & flat_in; \ } @@ -177,38 +104,38 @@ p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \ q1_filt8_out, q2_filt8_out) \ { \ - v8u16 tmp0, tmp1, tmp2; \ + v8u16 tmp_filt8_0, tmp_filt8_1, tmp_filt8_2; \ \ - tmp2 = p2_in + p1_in + p0_in; \ - tmp0 = p3_in << 1; \ + tmp_filt8_2 = p2_in + p1_in + p0_in; \ + tmp_filt8_0 = p3_in << 1; \ \ - tmp0 = tmp0 + tmp2 + q0_in; \ - tmp1 = tmp0 + p3_in + p2_in; \ - p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_0 + tmp_filt8_2 + q0_in; \ + tmp_filt8_1 = tmp_filt8_0 + p3_in + p2_in; \ + 
p2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 + p1_in + q1_in; \ - p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 + p1_in + q1_in; \ + p1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = q2_in + q1_in + q0_in; \ - tmp2 = tmp2 + tmp1; \ - tmp0 = tmp2 + (p0_in); \ - tmp0 = tmp0 + (p3_in); \ - p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp0, 3); \ + tmp_filt8_1 = q2_in + q1_in + q0_in; \ + tmp_filt8_2 = tmp_filt8_2 + tmp_filt8_1; \ + tmp_filt8_0 = tmp_filt8_2 + (p0_in); \ + tmp_filt8_0 = tmp_filt8_0 + (p3_in); \ + p0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_0, 3); \ \ - tmp0 = q2_in + q3_in; \ - tmp0 = p0_in + tmp1 + tmp0; \ - tmp1 = q3_in + q3_in; \ - tmp1 = tmp1 + tmp0; \ - q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = q2_in + q3_in; \ + tmp_filt8_0 = p0_in + tmp_filt8_1 + tmp_filt8_0; \ + tmp_filt8_1 = q3_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_1 + tmp_filt8_0; \ + q2_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp0 = tmp2 + q3_in; \ - tmp1 = tmp0 + q0_in; \ - q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_0 = tmp_filt8_2 + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + q0_in; \ + q0_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ \ - tmp1 = tmp0 - p2_in; \ - tmp0 = q1_in + q3_in; \ - tmp1 = tmp0 + tmp1; \ - q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp1, 3); \ + tmp_filt8_1 = tmp_filt8_0 - p2_in; \ + tmp_filt8_0 = q1_in + q3_in; \ + tmp_filt8_1 = tmp_filt8_0 + tmp_filt8_1; \ + q1_filt8_out = (v8i16)__msa_srari_h((v8i16)tmp_filt8_1, 3); \ } #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h index f498fbe9de2..002e574aa8f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h @@ -168,20 +168,20 @@ val_m; \ }) #else // !(__mips == 64) -#define LD(psrc) \ - ({ \ - const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ - uint32_t val0_m, val1_m; \ - uint64_t val_m = 0; \ - \ - val0_m = LW(psrc_m1); \ - val1_m = LW(psrc_m1 + 4); \ - \ - val_m = (uint64_t)(val1_m); \ - val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000); \ - val_m = (uint64_t)(val_m | (uint64_t)val0_m); \ - \ - val_m; \ +#define LD(psrc) \ + ({ \ + const uint8_t *psrc_m1 = (const uint8_t *)(psrc); \ + uint32_t val0_m, val1_m; \ + uint64_t val_m_combined = 0; \ + \ + val0_m = LW(psrc_m1); \ + val1_m = LW(psrc_m1 + 4); \ + \ + val_m_combined = (uint64_t)(val1_m); \ + val_m_combined = (uint64_t)((val_m_combined << 32) & 0xFFFFFFFF00000000); \ + val_m_combined = (uint64_t)(val_m_combined | (uint64_t)val0_m); \ + \ + val_m_combined; \ }) #endif // (__mips == 64) @@ -909,27 +909,42 @@ sum_m; \ }) -/* Description : Horizontal addition of 8 unsigned halfword elements - Arguments : Inputs - in (unsigned halfword vector) - Outputs - sum_m (u32 sum) - Return Type - unsigned word - Details : 8 unsigned halfword elements of input vector are added - together and the resulting integer sum is returned +/* Description : Horizontal addition of 4 unsigned word elements + Arguments : Input - in (unsigned word vector) + Output - sum_m (u32 sum) + Return Type - unsigned word (GP) + Details : 4 unsigned word elements of 'in' vector are added together and + the resulting integer sum is returned */ -#define 
HADD_UH_U32(in) \ +#define HADD_UW_U32(in) \ ({ \ - v4u32 res_m; \ v2u64 res0_m, res1_m; \ uint32_t sum_m; \ \ - res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ - res0_m = __msa_hadd_u_d(res_m, res_m); \ + res0_m = __msa_hadd_u_d((v4u32)in, (v4u32)in); \ res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1); \ - res0_m = res0_m + res1_m; \ + res0_m += res1_m; \ sum_m = __msa_copy_u_w((v4i32)res0_m, 0); \ sum_m; \ }) +/* Description : Horizontal addition of 8 unsigned halfword elements + Arguments : Input - in (unsigned halfword vector) + Output - sum_m (u32 sum) + Return Type - unsigned word + Details : 8 unsigned halfword elements of 'in' vector are added + together and the resulting integer sum is returned +*/ +#define HADD_UH_U32(in) \ + ({ \ + v4u32 res_m; \ + uint32_t sum_m; \ + \ + res_m = __msa_hadd_u_w((v8u16)in, (v8u16)in); \ + sum_m = HADD_UW_U32(res_m); \ + sum_m; \ + }) + /* Description : Horizontal addition of unsigned byte vector elements Arguments : Inputs - in0, in1 Outputs - out0, out1 @@ -2019,13 +2034,12 @@ pdst, stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ tmp0_m = PCKEV_XORI128_UB(in0, in1); \ tmp1_m = PCKEV_XORI128_UB(in2, in3); \ ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } /* Description : Pack even byte elements and store byte vector in destination diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c index 6455814e1b8..e295123acf0 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sad_msa.c @@ -1030,6 +1030,7 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, v8u16 sad2_1 = { 0 }; v8u16 sad3_0 = { 0 }; v8u16 sad3_1 = { 0 }; + v4u32 sad; ref0_ptr = aref_ptr[0]; ref1_ptr = aref_ptr[1]; @@ -1061,14 +1062,21 @@ static void sad_64width_x4d_msa(const uint8_t *src, int32_t src_stride, sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3); } - sad_array[0] = HADD_UH_U32(sad0_0); - sad_array[0] += HADD_UH_U32(sad0_1); - sad_array[1] = HADD_UH_U32(sad1_0); - sad_array[1] += HADD_UH_U32(sad1_1); - sad_array[2] = HADD_UH_U32(sad2_0); - sad_array[2] += HADD_UH_U32(sad2_1); - sad_array[3] = HADD_UH_U32(sad3_0); - sad_array[3] += HADD_UH_U32(sad3_1); + sad = __msa_hadd_u_w(sad0_0, sad0_0); + sad += __msa_hadd_u_w(sad0_1, sad0_1); + sad_array[0] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad1_0, sad1_0); + sad += __msa_hadd_u_w(sad1_1, sad1_1); + sad_array[1] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad2_0, sad2_0); + sad += __msa_hadd_u_w(sad2_1, sad2_1); + sad_array[2] = HADD_UW_U32(sad); + + sad = __msa_hadd_u_w(sad3_0, sad3_0); + sad += __msa_hadd_u_w(sad3_1, sad3_1); + sad_array[3] = HADD_UW_U32(sad); } static uint32_t avgsad_4width_msa(const uint8_t *src_ptr, int32_t src_stride, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c index 085990e4845..49b2f99230f 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/variance_msa.c @@ -489,27 +489,19 @@ static uint32_t sse_64width_msa(const uint8_t *src_ptr, int32_t src_stride, uint32_t vpx_get4x4sse_cs_msa(const uint8_t *src_ptr, 
int32_t src_stride, const uint8_t *ref_ptr, int32_t ref_stride) { - uint32_t err = 0; uint32_t src0, src1, src2, src3; uint32_t ref0, ref1, ref2, ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; - v16u8 src_vec0, src_vec1; - v8i16 diff0, diff1; v4i32 err0 = { 0 }; - v4i32 err1 = { 0 }; LW4(src_ptr, src_stride, src0, src1, src2, src3); LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3); INSERT_W4_SB(src0, src1, src2, src3, src); INSERT_W4_SB(ref0, ref1, ref2, ref3, ref); - ILVRL_B2_UB(src, ref, src_vec0, src_vec1); - HSUB_UB2_SH(src_vec0, src_vec1, diff0, diff1); - DPADD_SH2_SW(diff0, diff1, diff0, diff1, err0, err1); - err = HADD_SW_S32(err0); - err += HADD_SW_S32(err1); + CALC_MSE_B(src, ref, err0); - return err; + return HADD_SW_S32(err0); } #define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h index 198c21ed20a..f75679521a4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/vpx_convolve_msa.h @@ -16,18 +16,18 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; -#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ - filt3) \ - ({ \ - v8i16 tmp0, tmp1; \ - \ - tmp0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ - tmp0 = __msa_dpadd_s_h(tmp0, (v16i8)vec1, (v16i8)filt1); \ - tmp1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ - tmp1 = __msa_dpadd_s_h(tmp1, (v16i8)vec3, (v16i8)filt3); \ - tmp0 = __msa_adds_s_h(tmp0, tmp1); \ - \ - tmp0; \ +#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt0, filt1, filt2, \ + filt3) \ + ({ \ + v8i16 tmp_dpadd_0, tmp_dpadd_1; \ + \ + tmp_dpadd_0 = __msa_dotp_s_h((v16i8)vec0, (v16i8)filt0); \ + tmp_dpadd_0 = __msa_dpadd_s_h(tmp_dpadd_0, (v16i8)vec1, (v16i8)filt1); \ + tmp_dpadd_1 = __msa_dotp_s_h((v16i8)vec2, (v16i8)filt2); \ + tmp_dpadd_1 = __msa_dpadd_s_h(tmp_dpadd_1, (v16i8)vec3, (v16i8)filt3); \ + tmp_dpadd_0 = __msa_adds_s_h(tmp_dpadd_0, tmp_dpadd_1); \ + \ + tmp_dpadd_0; \ }) #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_h0, \ @@ -114,11 +114,10 @@ extern const uint8_t mc_filt_mask_arr[16 * 3]; stride) \ { \ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ - uint8_t *pdst_m = (uint8_t *)(pdst); \ \ PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \ PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \ AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \ - ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \ + ST8x4_UB(tmp0_m, tmp1_m, pdst, stride); \ } #endif /* VPX_DSP_MIPS_VPX_CONVOLVE_MSA_H_ */ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c index f17281b3698..cab6368e606 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_convolve.c @@ -25,6 +25,7 @@ static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -46,6 +47,7 @@ static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, int x_step_q4, int w, int h) { int x, y; src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -72,7 +74,7 @@ static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride, for 
(x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -95,7 +97,7 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { - const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; + const uint8_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK]; int k, sum = 0; for (k = 0; k < SUBPEL_TAPS; ++k) @@ -128,8 +130,8 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, // --Must round-up because block may be located at sub-pixel position. // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. - uint8_t temp[135 * 64]; - int intermediate_height = + uint8_t temp[64 * 135]; + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); @@ -143,16 +145,6 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, y_filters, y0_q4, y_step_q4, w, h); } -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -219,7 +211,6 @@ void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, int w, int h) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); @@ -231,7 +222,7 @@ void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { - /* Fixed size intermediate buffer places limits on parameters. */ + // Fixed size intermediate buffer places limits on parameters. 
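The comment just above refers to vpx_convolve8_avg_c's strategy: it runs the full two-pass filter into a fixed 64x64 scratch block and then averages that block into dst, which is why both w and h are asserted to be at most 64. A simplified sketch of its shape (the call sequence is illustrative, not the verbatim body):

uint8_t temp[64 * 64];
/* Passes 1 and 2: plain 8-tap convolution into the scratch block. */
vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
                y_step_q4, w, h);
/* Average the filtered block into the destination. */
vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);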
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); @@ -272,7 +263,6 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, for (y = 0; y < h; ++y) { for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - src += src_stride; dst += dst_stride; } @@ -334,9 +324,10 @@ static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -357,9 +348,10 @@ static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *x_filters, int x0_q4, int x_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= SUBPEL_TAPS / 2 - 1; + for (y = 0; y < h; ++y) { int x_q4 = x0_q4; for (x = 0; x < w; ++x) { @@ -382,9 +374,10 @@ static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -407,9 +400,10 @@ static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride, const InterpKernel *y_filters, int y0_q4, int y_step_q4, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); src -= src_stride * (SUBPEL_TAPS / 2 - 1); + for (x = 0; x < w; ++x) { int y_q4 = y0_q4; for (y = 0; y < h; ++y) { @@ -447,7 +441,7 @@ static void highbd_convolve(const uint8_t *src, ptrdiff_t src_stride, // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 
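The 135 in the buffer declared just below falls out of the worst case listed in the comment: 64 output rows at the maximum step of 32/16 (2x downscale) with the largest sub-pixel offset 15/16 span ceil(((64 - 1) * 32 + 15) / 16) = 127 source rows, plus SUBPEL_TAPS = 8 rows of 8-tap filter tail, giving 135. Worked out in code (illustrative):

const int h = 64, y_step_q4 = 32, y0_q4 = 15; /* worst case */
const int span = (h - 1) * y_step_q4 + y0_q4; /* 2016 + 15 == 2031 */
const int rows = ((span + 15) >> 4) + 8;      /* ceil(2031/16) + taps */
/* rows == 127 + 8 == 135, matching temp[64 * 135] */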
uint16_t temp[64 * 135]; - int intermediate_height = + const int intermediate_height = (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; assert(w <= 64); @@ -470,6 +464,7 @@ void vpx_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; (void)y_step_q4; @@ -484,6 +479,7 @@ void vpx_highbd_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); + (void)filter_y; (void)y_step_q4; @@ -498,6 +494,7 @@ void vpx_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; (void)x_step_q4; @@ -512,6 +509,7 @@ void vpx_highbd_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, int w, int h, int bd) { const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); + (void)filter_x; (void)x_step_q4; @@ -526,7 +524,6 @@ void vpx_highbd_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, int h, int bd) { const InterpKernel *const filters_x = get_filter_base(filter_x); const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); const int y0_q4 = get_filter_offset(filter_y, filters_y); @@ -556,11 +553,12 @@ void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int r; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; - (void)filter_y; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; (void)bd; @@ -577,18 +575,17 @@ void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride, const int16_t *filter_y, int filter_y_stride, int w, int h, int bd) { int x, y; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + (void)filter_x; - (void)filter_y; (void)filter_x_stride; + (void)filter_y; (void)filter_y_stride; (void)bd; for (y = 0; y < h; ++y) { - for (x = 0; x < w; ++x) { - dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); - } + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); src += src_stride; dst += dst_stride; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index 66062b6e78e..2909beb0f6c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -86,6 +86,10 @@ DSP_SRCS-$(HAVE_SSSE3) += x86/vpx_subpixel_8t_intrin_ssse3.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_8t_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/vpx_high_subpixel_bilinear_sse2.asm +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_copy_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_avg_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve8_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_vpx_convolve_neon.c endif DSP_SRCS-$(HAVE_SSE2) += x86/vpx_convolve_copy_sse2.asm @@ 
-159,6 +163,7 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_horiz_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/loopfilter_mb_vert_dspr2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +DSP_SRCS-$(HAVE_NEON) += arm/highbd_loopfilter_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/highbd_loopfilter_sse2.c endif # CONFIG_VP9_HIGHBITDEPTH @@ -199,27 +204,15 @@ DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) ifeq ($(HAVE_NEON_ASM),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct32x32_add_neon$(ASM) else ifeq ($(HAVE_NEON),yes) -DSP_SRCS-yes += arm/idct4x4_1_add_neon.c -DSP_SRCS-yes += arm/idct4x4_add_neon.c -DSP_SRCS-yes += arm/idct8x8_1_add_neon.c -DSP_SRCS-yes += arm/idct8x8_add_neon.c -DSP_SRCS-yes += arm/idct16x16_1_add_neon.c DSP_SRCS-yes += arm/idct16x16_add_neon.c -DSP_SRCS-yes += arm/idct32x32_1_add_neon.c -DSP_SRCS-yes += arm/idct32x32_add_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_MSA) += mips/inv_txfm_msa.h DSP_SRCS-$(HAVE_MSA) += mips/idct4x4_msa.c @@ -233,7 +226,25 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans8_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c -endif # CONFIG_VP9_HIGHBITDEPTH +endif # !CONFIG_VP9_HIGHBITDEPTH + +ifeq ($(HAVE_NEON_ASM),yes) +DSP_SRCS-yes += arm/idct_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) +DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) +DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) +else +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h +DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c + endif # CONFIG_VP9 # quantization @@ -241,6 +252,7 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h +DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index d148642e37b..ee403be3975 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -392,28 +392,28 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # Sub Pixel Filters # add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve_copy sse2/; + specialize qw/vpx_highbd_convolve_copy sse2 neon/; add_proto qw/void 
vpx_highbd_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve_avg sse2/; + specialize qw/vpx_highbd_convolve_avg sse2 neon/; add_proto qw/void vpx_highbd_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8 neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_horiz/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_horiz neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_vert/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_vert neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg_horiz/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_horiz neon/, "$sse2_x86_64"; add_proto qw/void vpx_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; - specialize qw/vpx_highbd_convolve8_avg_vert/, "$sse2_x86_64"; + specialize qw/vpx_highbd_convolve8_avg_vert neon/, "$sse2_x86_64"; } # CONFIG_VP9_HIGHBITDEPTH # @@ -457,40 +457,40 @@ specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_lpf_vertical_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_vertical_16 sse2/; + specialize qw/vpx_highbd_lpf_vertical_16 sse2 neon/; add_proto qw/void vpx_highbd_lpf_vertical_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_vertical_16_dual sse2/; + specialize qw/vpx_highbd_lpf_vertical_16_dual sse2 neon/; add_proto qw/void vpx_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_vertical_8 sse2/; + specialize qw/vpx_highbd_lpf_vertical_8 sse2 neon/; add_proto qw/void vpx_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t 
*blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/vpx_highbd_lpf_vertical_8_dual sse2/; + specialize qw/vpx_highbd_lpf_vertical_8_dual sse2 neon/; add_proto qw/void vpx_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_vertical_4 sse2/; + specialize qw/vpx_highbd_lpf_vertical_4 sse2 neon/; add_proto qw/void vpx_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/vpx_highbd_lpf_vertical_4_dual sse2/; + specialize qw/vpx_highbd_lpf_vertical_4_dual sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_16/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_16 sse2/; + specialize qw/vpx_highbd_lpf_horizontal_16 sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_16_dual/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_16_dual sse2/; + specialize qw/vpx_highbd_lpf_horizontal_16_dual sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_8 sse2/; + specialize qw/vpx_highbd_lpf_horizontal_8 sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2/; + specialize qw/vpx_highbd_lpf_horizontal_8_dual sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_4 sse2/; + specialize qw/vpx_highbd_lpf_horizontal_4 sse2 neon/; add_proto qw/void vpx_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd"; - specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2/; + specialize qw/vpx_highbd_lpf_horizontal_4_dual sse2 neon/; } # CONFIG_VP9_HIGHBITDEPTH # @@ -637,26 +637,26 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; } else { add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct4x4_16_add sse2/; + specialize qw/vpx_idct4x4_16_add neon sse2/; add_proto qw/void vpx_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct4x4_1_add sse2/; + specialize 
qw/vpx_idct4x4_1_add neon sse2/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_64_add sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_12_add sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct8x8_1_add sse2/; + specialize qw/vpx_idct8x8_1_add neon sse2/; add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct16x16_256_add sse2/; @@ -665,7 +665,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_10_add sse2/; add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct16x16_1_add sse2/; + specialize qw/vpx_idct16x16_1_add neon sse2/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1024_add sse2/, "$ssse3_x86_64"; @@ -679,7 +679,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct32x32_34_add sse2/, "$ssse3_x86_64"; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; - specialize qw/vpx_idct32x32_1_add sse2/; + specialize qw/vpx_idct32x32_1_add neon sse2/; add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct4x4_16_add sse2/; @@ -687,8 +687,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct8x8_64_add sse2/; - add_proto qw/void vpx_highbd_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; - specialize qw/vpx_highbd_idct8x8_10_add sse2/; + add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vpx_highbd_idct8x8_12_add sse2/; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; specialize qw/vpx_highbd_idct16x16_256_add sse2/; @@ -764,8 +764,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; - # Need to add 34 eob idct32x32 neon implementation. - $vpx_idct32x32_34_add_neon=vpx_idct32x32_1024_add_neon; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride"; specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h index 6cea251bcca..26d690501b6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_filter.h @@ -26,6 +26,17 @@ extern "C" { typedef int16_t InterpKernel[SUBPEL_TAPS]; +static INLINE const InterpKernel *get_filter_base(const int16_t *filter) { + // NOTE: This assumes that the filter table is 256-byte aligned. 
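The get_filter_base/get_filter_offset pair now lives here in vpx_filter.h (deleted from two .c files elsewhere in this patch) and leans on table layout: an interpolation table holds 16 InterpKernel entries of SUBPEL_TAPS = 8 int16 coefficients, i.e. 16 * 8 * 2 = 256 bytes, and is declared 256-byte aligned. A pointer to any kernel inside it therefore differs from the base only in its low 8 bits, so masking recovers the table and pointer subtraction recovers the kernel index. Usage sketch (filter_x stands for any pointer into such a table):

const InterpKernel *base = get_filter_base(filter_x); /* table start */
const int x0_q4 = get_filter_offset(filter_x, base);  /* kernel index, 0..15 */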
+ // TODO(agrange) Modify to make independent of table alignment. + return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); +} + +static INLINE int get_filter_offset(const int16_t *f, + const InterpKernel *base) { + return (int)((const InterpKernel *)(intptr_t)f - base); +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h new file mode 100644 index 00000000000..54a6d81fcbc --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef VPX_DSP_X86_FDCT_H_ +#define VPX_DSP_X86_FDCT_H_ + +#include <xmmintrin.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/vpx_dsp_common.h" + +// Load 8 16 bit values. If the source is 32 bits then cast down. +// This does not saturate values. It only truncates. +static INLINE __m128i load_tran_low(const tran_low_t *a) { +#if CONFIG_VP9_HIGHBITDEPTH + return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2], + (int16_t)a[3], (int16_t)a[4], (int16_t)a[5], + (int16_t)a[6], (int16_t)a[7]); +#else + return _mm_load_si128((const __m128i *)a); +#endif +} + +// Store 8 16 bit values. If the destination is 32 bits then sign extend the +// values by multiplying by 1. +static INLINE void store_tran_low(__m128i a, tran_low_t *b) { +#if CONFIG_VP9_HIGHBITDEPTH + const __m128i one = _mm_set1_epi16(1); + const __m128i a_hi = _mm_mulhi_epi16(a, one); + const __m128i a_lo = _mm_mullo_epi16(a, one); + const __m128i a_1 = _mm_unpacklo_epi16(a_lo, a_hi); + const __m128i a_2 = _mm_unpackhi_epi16(a_lo, a_hi); + _mm_store_si128((__m128i *)(b), a_1); + _mm_store_si128((__m128i *)(b + 4), a_2); +#else + _mm_store_si128((__m128i *)(b), a); +#endif +} + +// Zero fill 8 positions in the output buffer. 
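store_tran_low above sign-extends eight int16 lanes to int32 with SSE2 only (there is no _mm_cvtepi16_epi32 before SSE4.1): multiplying by 1 yields the low and high halves of each 32-bit product, and interleaving them rebuilds sign-extended words. The high half can equally come from an arithmetic shift; a sketch of that equivalent (store_16to32_sse2 is an illustrative name, not the file's API):

#include <emmintrin.h>
#include <stdint.h>

static void store_16to32_sse2(__m128i a, int32_t *b) {
  const __m128i sign = _mm_srai_epi16(a, 15); /* 0x0000 or 0xFFFF per lane */
  /* Unaligned stores, so the sketch makes no alignment assumption on b. */
  _mm_storeu_si128((__m128i *)(b + 0), _mm_unpacklo_epi16(a, sign));
  _mm_storeu_si128((__m128i *)(b + 4), _mm_unpackhi_epi16(a, sign));
}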
+static INLINE void store_zero_tran_low(tran_low_t *a) { + const __m128i zero = _mm_setzero_si128(); +#if CONFIG_VP9_HIGHBITDEPTH + _mm_store_si128((__m128i *)(a), zero); + _mm_store_si128((__m128i *)(a + 4), zero); +#else + _mm_store_si128((__m128i *)(a), zero); +#endif +} +#endif // VPX_DSP_X86_FDCT_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c index 64b56223ede..2362476c1f1 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c @@ -77,10 +77,10 @@ void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, for (j = 0; j < 4; j++) { if (test & (1 << (4 * j))) { int k = 4 * i + j; - const int64_t tmp1 = abs_coeff[j] + round_ptr[k != 0]; - const int64_t tmp2 = ((tmp1 * quant_ptr[k != 0]) >> 16) + tmp1; + const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0]; + const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3; const uint32_t abs_qcoeff = - (uint32_t)((tmp2 * quant_shift_ptr[k != 0]) >> 16); + (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16); qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j]; dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0]; if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c index cb56ad0789c..d5fc1440c41 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c @@ -2379,7 +2379,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, #define IDCT32_34 \ /* Stage1 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], zero); \ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], zero); \ \ @@ -2404,7 +2403,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage2 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], zero); \ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], zero); \ \ @@ -2431,7 +2429,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage3 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], zero); \ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], zero); \ \ @@ -2472,7 +2469,6 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, \ /* Stage4 */ \ { \ - const __m128i zero = _mm_setzero_si128(); \ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], zero); \ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], zero); \ \ @@ -3009,6 +3005,7 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, // Only upper-left 8x8 has non-zero coeff void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { + const __m128i zero = _mm_setzero_si128(); const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); const __m128i final_rounding = _mm_set1_epi16(1 << 5); @@ -3104,7 +3101,6 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, col[31] = _mm_sub_epi16(stp1_0, stp1_31); for (i = 0; i < 4; i++) { int j; - const __m128i zero = _mm_setzero_si128(); // Transpose 32x8 block to 8x32 block 
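The IDCT32_34 hunks above are a plain hoist: each stage block used to declare its own const __m128i zero, and the macro expands inside a loop, so the constant is now created once in the enclosing function and a single register serves every stage. The pattern in miniature (illustrative):

#include <emmintrin.h>

static void two_stages_hoisted(__m128i *v) {
  const __m128i zero = _mm_setzero_si128(); /* declared once, reused below */
  v[0] = _mm_unpacklo_epi16(v[0], zero);    /* stage 1 */
  v[1] = _mm_unpackhi_epi16(v[1], zero);    /* stage 2 */
}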
array_transpose_8x8(col + i * 8, in); IDCT32_34 @@ -3677,7 +3673,7 @@ void vpx_highbd_idct8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_idct8x8_10_add_sse2(const tran_low_t *input, uint8_t *dest8, +void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { tran_low_t out[8 * 8] = { 0 }; tran_low_t *outptr = out; @@ -4021,8 +4017,8 @@ void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint8_t *dest8, uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); tran_low_t out; - out = highbd_dct_const_round_shift(input[0] * cospi_16_64); - out = highbd_dct_const_round_shift(out * cospi_16_64); + out = dct_const_round_shift(input[0] * cospi_16_64); + out = dct_const_round_shift(out * cospi_16_64); a = ROUND_POWER_OF_TWO(out, 6); d = _mm_set1_epi32(a); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c index 2c7e431c745..0580a7bd7b6 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c @@ -13,32 +13,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" - -static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif -} - -static INLINE void store_coefficients(__m128i coeff_vals, - tran_low_t *coeff_ptr) { -#if CONFIG_VP9_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif -} +#include "vpx_dsp/x86/fdct.h" void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, @@ -81,8 +56,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; // Do DC and first 15 AC - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -117,15 +92,15 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); dequant = _mm_unpackhi_epi64(dequant, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + 
store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -159,8 +134,8 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, __m128i qtmp0, qtmp1; __m128i cmp_mask0, cmp_mask1; - coeff0 = load_coefficients(coeff_ptr + n_coeffs); - coeff1 = load_coefficients(coeff_ptr + n_coeffs + 8); + coeff0 = load_tran_low(coeff_ptr + n_coeffs); + coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8); // Poor man's sign extract coeff0_sign = _mm_srai_epi16(coeff0, 15); @@ -191,14 +166,14 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0); qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1); - store_coefficients(qcoeff0, qcoeff_ptr + n_coeffs); - store_coefficients(qcoeff1, qcoeff_ptr + n_coeffs + 8); + store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs); + store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8); coeff0 = _mm_mullo_epi16(qcoeff0, dequant); coeff1 = _mm_mullo_epi16(qcoeff1, dequant); - store_coefficients(coeff0, dqcoeff_ptr + n_coeffs); - store_coefficients(coeff1, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(coeff0, dqcoeff_ptr + n_coeffs); + store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8); } { @@ -237,10 +212,10 @@ void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, } } else { do { - store_coefficients(zero, dqcoeff_ptr + n_coeffs); - store_coefficients(zero, dqcoeff_ptr + n_coeffs + 8); - store_coefficients(zero, qcoeff_ptr + n_coeffs); - store_coefficients(zero, qcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, dqcoeff_ptr + n_coeffs); + store_tran_low(zero, dqcoeff_ptr + n_coeffs + 8); + store_tran_low(zero, qcoeff_ptr + n_coeffs); + store_tran_low(zero, qcoeff_ptr + n_coeffs + 8); n_coeffs += 8 * 2; } while (n_coeffs < 0); *eob_ptr = 0; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index b26d97b4551..09c75d455ca 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -860,16 +860,6 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, } } -static const InterpKernel *get_filter_base(const int16_t *filter) { - // NOTE: This assumes that the filter table is 256-byte aligned. - // TODO(agrange) Modify to make independent of table alignment. - return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF)); -} - -static int get_filter_offset(const int16_t *f, const InterpKernel *base) { - return (int)((const InterpKernel *)(intptr_t)f - base); -} - void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c index c94ed52d16d..a9be0868066 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.c @@ -76,38 +76,6 @@ void *vpx_calloc(size_t num, size_t size) { return x; } -void *vpx_realloc(void *memblk, size_t size) { - void *new_addr = NULL; - - /* - The realloc() function changes the size of the object pointed to by - ptr to the size specified by size, and returns a pointer to the - possibly moved block. 
The contents are unchanged up to the lesser - of the new and old sizes. If ptr is null, realloc() behaves like - malloc() for the specified size. If size is zero (0) and ptr is - not a null pointer, the object pointed to is freed. - */ - if (!memblk) - new_addr = vpx_malloc(size); - else if (!size) - vpx_free(memblk); - else { - void *addr = get_actual_malloc_address(memblk); - const uint64_t aligned_size = - get_aligned_malloc_size(size, DEFAULT_ALIGNMENT); - if (!check_size_argument_overflow(1, aligned_size)) return NULL; - - addr = realloc(addr, (size_t)aligned_size); - if (addr) { - new_addr = align_addr((unsigned char *)addr + ADDRESS_STORAGE_SIZE, - DEFAULT_ALIGNMENT); - set_actual_malloc_address(new_addr, addr); - } - } - - return new_addr; -} - void vpx_free(void *memblk) { if (memblk) { void *addr = get_actual_malloc_address(memblk); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h index c14f288b895..733aff4885c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_mem/vpx_mem.h @@ -26,7 +26,6 @@ extern "C" { void *vpx_memalign(size_t align, size_t size); void *vpx_malloc(size_t size); void *vpx_calloc(size_t num, size_t size); -void *vpx_realloc(void *memblk, size_t size); void vpx_free(void *memblk); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/chromium/third_party/libvpx/source/libvpx/vpxdec.c b/chromium/third_party/libvpx/source/libvpx/vpxdec.c index d1ed3e6cae0..2cdb69d5a31 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxdec.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxdec.c @@ -9,11 +9,11 @@ */ #include <assert.h> +#include <limits.h> +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> -#include <stdarg.h> #include <string.h> -#include <limits.h> #include "./vpx_config.h" @@ -92,31 +92,19 @@ static const arg_def_t md5arg = static const arg_def_t outbitdeptharg = ARG_DEF(NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames"); #endif - -static const arg_def_t *all_args[] = { &codecarg, - &use_yv12, - &use_i420, - &flipuvarg, - &rawvideo, - &noblitarg, - &progressarg, - &limitarg, - &skiparg, - &postprocarg, - &summaryarg, - &outputfile, - &threadsarg, - &frameparallelarg, - &verbosearg, - &scalearg, - &fb_arg, - &md5arg, - &error_concealment, - &continuearg, +static const arg_def_t svcdecodingarg = ARG_DEF( + NULL, "svc-decode-layer", 1, "Decode SVC stream up to given spatial layer"); + +static const arg_def_t *all_args[] = { + &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, + &noblitarg, &progressarg, &limitarg, &skiparg, &postprocarg, + &summaryarg, &outputfile, &threadsarg, &frameparallelarg, &verbosearg, + &scalearg, &fb_arg, &md5arg, &error_concealment, &continuearg, #if CONFIG_VP9_HIGHBITDEPTH - &outbitdeptharg, + &outbitdeptharg, #endif - NULL }; + &svcdecodingarg, NULL +}; #if CONFIG_VP8_DECODER static const arg_def_t addnoise_level = @@ -519,6 +507,8 @@ static int main_loop(int argc, const char **argv_) { #if CONFIG_VP9_HIGHBITDEPTH unsigned int output_bit_depth = 0; #endif + int svc_decoding = 0; + int svc_spatial_layer = 0; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = { 0, 0, 0 }; #endif @@ -610,6 +600,10 @@ static int main_loop(int argc, const char **argv_) { output_bit_depth = arg_parse_uint(&arg); } #endif + else if (arg_match(&arg, &svcdecodingarg, argi)) { + svc_decoding = 1; + svc_spatial_layer = arg_parse_uint(&arg); + } #if CONFIG_VP8_DECODER else if 
(arg_match(&arg, &addnoise_level, argi)) { postproc = 1; @@ -726,7 +720,14 @@ static int main_loop(int argc, const char **argv_) { vpx_codec_error(&decoder)); goto fail2; } - + if (svc_decoding) { + if (vpx_codec_control(&decoder, VP9_DECODE_SVC_SPATIAL_LAYER, + svc_spatial_layer)) { + fprintf(stderr, "Failed to set spatial layer for svc decode: %s\n", + vpx_codec_error(&decoder)); + goto fail; + } + } if (!quiet) fprintf(stderr, "%s\n", decoder.name); #if CONFIG_VP8_DECODER @@ -780,8 +781,8 @@ static int main_loop(int argc, const char **argv_) { const char *detail = vpx_codec_error_detail(&decoder); warn("Failed to decode frame %d: %s", frame_in, vpx_codec_error(&decoder)); - if (detail) warn("Additional information: %s", detail); + corrupted = 1; if (!keep_going) goto fail; } @@ -800,6 +801,8 @@ static int main_loop(int argc, const char **argv_) { // Flush the decoder in frame parallel decode. if (vpx_codec_decode(&decoder, NULL, 0, NULL, 0)) { warn("Failed to flush decoder: %s", vpx_codec_error(&decoder)); + corrupted = 1; + if (!keep_going) goto fail; } } @@ -812,7 +815,7 @@ static int main_loop(int argc, const char **argv_) { vpx_usec_timer_mark(&timer); dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer); - if (!frame_parallel && + if (!frame_parallel && !corrupted && vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) { warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder)); if (!keep_going) goto fail; diff --git a/chromium/third_party/libvpx/source/libvpx/vpxenc.c b/chromium/third_party/libvpx/source/libvpx/vpxenc.c index 6e0af57a42c..a0f760574c8 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpxenc.c +++ b/chromium/third_party/libvpx/source/libvpx/vpxenc.c @@ -355,6 +355,8 @@ static const arg_def_t cq_level = ARG_DEF(NULL, "cq-level", 1, "Constant/Constrained Quality level"); static const arg_def_t max_intra_rate_pct = ARG_DEF(NULL, "max-intra-rate", 1, "Max I-frame bitrate (pct)"); +static const arg_def_t gf_cbr_boost_pct = ARG_DEF( + NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); #if CONFIG_VP8_ENCODER static const arg_def_t cpu_used_vp8 = @@ -363,12 +365,21 @@ static const arg_def_t token_parts = ARG_DEF(NULL, "token-parts", 1, "Number of token partitions to use, log2"); static const arg_def_t screen_content_mode = ARG_DEF(NULL, "screen-content-mode", 1, "Screen content mode"); -static const arg_def_t *vp8_args[] = { - &cpu_used_vp8, &auto_altref, &noise_sens, &sharpness, - &static_thresh, &token_parts, &arnr_maxframes, &arnr_strength, - &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, - &screen_content_mode, NULL -}; +static const arg_def_t *vp8_args[] = { &cpu_used_vp8, + &auto_altref, + &noise_sens, + &sharpness, + &static_thresh, + &token_parts, + &arnr_maxframes, + &arnr_strength, + &arnr_type, + &tune_ssim, + &cq_level, + &max_intra_rate_pct, + &gf_cbr_boost_pct, + &screen_content_mode, + NULL }; static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, VP8E_SET_NOISE_SENSITIVITY, @@ -381,6 +392,7 @@ static const int vp8_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, + VP8E_SET_GF_CBR_BOOST_PCT, VP8E_SET_SCREEN_CONTENT_MODE, 0 }; #endif @@ -407,8 +419,6 @@ static const arg_def_t alt_ref_aq = ARG_DEF(NULL, "alt-ref-aq", 1, static const arg_def_t frame_periodic_boost = ARG_DEF(NULL, "frame-boost", 1, "Enable frame periodic boost (0: off (default), 1: on)"); -static const arg_def_t gf_cbr_boost_pct = ARG_DEF( - NULL, 
"gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)"); static const arg_def_t max_inter_rate_pct = ARG_DEF(NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)"); static const arg_def_t min_gf_interval = ARG_DEF( |