Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vpx_dsp')
40 files changed, 6334 insertions, 5318 deletions
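Editor's note: the bulk of this diffstat is mechanical. Coefficient buffers in the encoder-facing DSP entry points (vpx_satd_neon, vpx_fdct8x8_neon, vpx_hadamard_8x8_neon, and friends) change from int16_t * to tran_low_t *, so the same prototypes serve the CONFIG_VP9_HIGHBITDEPTH build, where tran_low_t is int32_t instead of int16_t. The conversion is hidden behind the load_tran_low_to_s16q / store_s16q_to_tran_low helpers pulled in from vpx_dsp/arm/idct_neon.h. A sketch of how those helpers are plausibly defined (assumes libvpx's tran_low_t typedef and INLINE macro; not the verbatim header):

#include <arm_neon.h>

static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
#if CONFIG_VP9_HIGHBITDEPTH
  /* tran_low_t is int32_t: load two s32 quads and narrow to s16. */
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  return vcombine_s16(vmovn_s32(v0), vmovn_s32(v1));
#else
  /* tran_low_t is int16_t: a plain load. */
  return vld1q_s16(buf);
#endif
}

static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
#if CONFIG_VP9_HIGHBITDEPTH
  /* Widen each half to s32 before storing. */
  vst1q_s32(buf, vmovl_s16(vget_low_s16(a)));
  vst1q_s32(buf + 4, vmovl_s16(vget_high_s16(a)));
#else
  vst1q_s16(buf, a);
#endif
}

In the non-high-bitdepth build both helpers collapse to plain vld1q_s16 / vst1q_s16, so the 8-bit path compiles to exactly the code it replaces.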
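The new highbd_idct16x16_add_neon.c below widens the inverse-transform butterflies to 64-bit arithmetic: for 10/12-bit input, 32-bit coefficients multiplied by the 14-bit cosine constants no longer reliably fit in int32, so every product is formed with vmull_lane_s32 / vmlal_lane_s32 into int64x2_t pairs and rounded back down with vrshrn_n_s64(..., DCT_CONST_BITS) in the *_wrap_low_* helpers at the top of that file. A minimal sketch of the pattern, assuming DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h (idct_mul_round is a hypothetical name, not an upstream helper):

#include <arm_neon.h>

#define DCT_CONST_BITS 14

/* Multiply four s32 lanes by one cosine constant and apply the idct
 * rounding shift: (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS. */
static inline int32x4_t idct_mul_round(const int32x4_t s, const int32_t cospi) {
  const int32x2_t c = vdup_n_s32(cospi);
  const int64x2_t lo = vmull_s32(vget_low_s32(s), c);  /* 64-bit products */
  const int64x2_t hi = vmull_s32(vget_high_s32(s), c);
  return vcombine_s32(vrshrn_n_s64(lo, DCT_CONST_BITS),
                      vrshrn_n_s64(hi, DCT_CONST_BITS));
}

vrshrn_n_s64 rounds and narrows in a single instruction, which is why the kernels can keep the whole butterfly in registers instead of spilling 64-bit intermediates.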
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c index 001517d33ee..cca9a932423 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) { const uint32x4_t a = vpaddlq_u16(v_16x8); @@ -64,13 +65,13 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) { // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. -int vpx_satd_neon(const int16_t *coeff, int length) { +int vpx_satd_neon(const tran_low_t *coeff, int length) { const int16x4_t zero = vdup_n_s16(0); int32x4_t accum = vdupq_n_s32(0); do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); + const int16x8_t src0 = load_tran_low_to_s16q(coeff); + const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8); accum = vabal_s16(accum, vget_low_s16(src0), zero); accum = vabal_s16(accum, vget_high_s16(src0), zero); accum = vabal_s16(accum, vget_low_s16(src8), zero); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c index e9503f13d70..96f6de1be95 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c @@ -12,8 +12,11 @@ #include "./vpx_config.h" #include "vpx_dsp/txfm_common.h" +#include "vpx_dsp/vpx_dsp_common.h" +#include "vpx_dsp/arm/idct_neon.h" -void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { +void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output, + int stride) { int i; // stage 1 int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); @@ -191,18 +194,18 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { input_6 = vhsubq_s16(input_6, sign_in6); input_7 = vhsubq_s16(input_7, sign_in7); // store results - vst1q_s16(&final_output[0 * 8], input_0); - vst1q_s16(&final_output[1 * 8], input_1); - vst1q_s16(&final_output[2 * 8], input_2); - vst1q_s16(&final_output[3 * 8], input_3); - vst1q_s16(&final_output[4 * 8], input_4); - vst1q_s16(&final_output[5 * 8], input_5); - vst1q_s16(&final_output[6 * 8], input_6); - vst1q_s16(&final_output[7 * 8], input_7); + store_s16q_to_tran_low(final_output + 0 * 8, input_0); + store_s16q_to_tran_low(final_output + 1 * 8, input_1); + store_s16q_to_tran_low(final_output + 2 * 8, input_2); + store_s16q_to_tran_low(final_output + 3 * 8, input_3); + store_s16q_to_tran_low(final_output + 4 * 8, input_4); + store_s16q_to_tran_low(final_output + 5 * 8, input_5); + store_s16q_to_tran_low(final_output + 6 * 8, input_6); + store_s16q_to_tran_low(final_output + 7 * 8, input_7); } } -void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { +void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); for (r = 1; r < 8; ++r) { @@ -214,7 +217,11 @@ void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { const int64x2_t b = vpaddlq_s32(a); const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), vreinterpret_s32_s64(vget_high_s64(b))); +#if CONFIG_VP9_HIGHBITDEPTH + output[0] = 
vget_lane_s32(c, 0); +#else output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); +#endif output[1] = 0; } } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c index 977323497a8..ebeafed31fd 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c @@ -11,6 +11,8 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, @@ -45,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, } void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); int16x8_t a1 = vld1q_s16(src_diff + src_stride); int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride); @@ -63,18 +65,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, // Skip the second transpose because it is not required. - vst1q_s16(coeff + 0, a0); - vst1q_s16(coeff + 8, a1); - vst1q_s16(coeff + 16, a2); - vst1q_s16(coeff + 24, a3); - vst1q_s16(coeff + 32, a4); - vst1q_s16(coeff + 40, a5); - vst1q_s16(coeff + 48, a6); - vst1q_s16(coeff + 56, a7); + store_s16q_to_tran_low(coeff + 0, a0); + store_s16q_to_tran_low(coeff + 8, a1); + store_s16q_to_tran_low(coeff + 16, a2); + store_s16q_to_tran_low(coeff + 24, a3); + store_s16q_to_tran_low(coeff + 32, a4); + store_s16q_to_tran_low(coeff + 40, a5); + store_s16q_to_tran_low(coeff + 48, a6); + store_s16q_to_tran_low(coeff + 56, a7); } void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int i; /* Rearrange 16x16 to 8x32 and remove stride. @@ -88,10 +90,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192); for (i = 0; i < 64; i += 8) { - const int16x8_t a0 = vld1q_s16(coeff + 0); - const int16x8_t a1 = vld1q_s16(coeff + 64); - const int16x8_t a2 = vld1q_s16(coeff + 128); - const int16x8_t a3 = vld1q_s16(coeff + 192); + const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0); + const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64); + const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128); + const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192); const int16x8_t b0 = vhaddq_s16(a0, a1); const int16x8_t b1 = vhsubq_s16(a0, a1); @@ -103,10 +105,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride, const int16x8_t c2 = vsubq_s16(b0, b2); const int16x8_t c3 = vsubq_s16(b1, b3); - vst1q_s16(coeff + 0, c0); - vst1q_s16(coeff + 64, c1); - vst1q_s16(coeff + 128, c2); - vst1q_s16(coeff + 192, c3); + store_s16q_to_tran_low(coeff + 0, c0); + store_s16q_to_tran_low(coeff + 64, c1); + store_s16q_to_tran_low(coeff + 128, c2); + store_s16q_to_tran_low(coeff + 192, c3); coeff += 8; } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c new file mode 100644 index 00000000000..d361c8263a8 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c @@ -0,0 +1,1512 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int32x2x2_t t32[4]; + + t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); + t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); + t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); + t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); + t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS); + t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS); + t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS); + t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS); + d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); + d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); + d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]); + d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]); +} + +static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t, + int32x4_t *const d0, + int32x4_t *const d1) { + int32x2x2_t t32[2]; + + t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); + t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); + t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); + t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); + *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]); + *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]); +} + +static INLINE int32x4x2_t +highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) { + int32x2x2_t t32[2]; + int32x4x2_t d; + + t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS); + t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS); + t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS); + t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS); + d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]); + d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]); + return d; +} + +static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) { + int32x2x2_t t32; + + t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS); + t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS); + return vcombine_s32(t32.val[0], t32.val[1]); +} + +static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + 
vget_low_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[2].val[1] = 
vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_6_26_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_6_26_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_2_30_10_22, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_2_30_10_22), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_4_12_20N_28, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + 
vget_low_s32(cospi_4_12_20N_28), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_low_s32(cospi_4_12_20N_28), 1); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_4_12_20N_28), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_6_26_14_18N, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26_14_18N), 0); + t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_6_26_14_18N), 1); + t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_6_26_14_18N), 1); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_q_kernel( + const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = 
vmull_lane_s32(vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); + t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_d_kernel( + const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24, + int64x2x2_t *const t) { + t[0].val[0] = + vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[1] = + vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[1].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1); + t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); + t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0), + vget_low_s32(cospi_0_8_16_24), 1); +} + +static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[4]; + + highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t); + t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]); + t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]); + t[3].val[0] = vsubq_s64(vdupq_n_s64(0), 
t[3].val[0]); + t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[2]; + + highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t); + t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]); + t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0, + const int32x4x2_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4x2_t *const d0, + int32x4x2_t *const d1) { + int64x2x2_t t[6]; + + t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_8x2(t, d0, d1); +} + +static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0, + const int32x4_t s1, + const int32x4_t cospi_0_8_16_24, + int32x4_t *const d0, + int32x4_t *const d1) { + int64x2x2_t t[3]; + + t[2].val[0] = + vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[2].val[1] = + vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0), + vget_high_s32(cospi_0_8_16_24), 0); + highbd_idct16x16_add_wrap_low_4x2(t, d0, d1); +} + +static INLINE void highbd_idct16x16_add_stage7_dual( + const int32x4x2_t *const step2, int32x4x2_t *const out) { + out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]); + out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]); + out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]); + out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]); + out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]); + out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]); + out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]); + out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]); + out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]); + 
out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]); + out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]); + out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]); + out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]); + out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]); + out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]); + out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]); + out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]); + out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]); + out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]); + out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]); + out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]); + out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]); + out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]); + out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]); + out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]); + out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]); + out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]); + out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]); + out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]); + out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]); + out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]); + out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2, + int32x4_t *const out) { + out[0] = vaddq_s32(step2[0], step2[15]); + out[1] = vaddq_s32(step2[1], step2[14]); + out[2] = vaddq_s32(step2[2], step2[13]); + out[3] = vaddq_s32(step2[3], step2[12]); + out[4] = vaddq_s32(step2[4], step2[11]); + out[5] = vaddq_s32(step2[5], step2[10]); + out[6] = vaddq_s32(step2[6], step2[9]); + out[7] = vaddq_s32(step2[7], step2[8]); + out[8] = vsubq_s32(step2[7], step2[8]); + out[9] = vsubq_s32(step2[6], step2[9]); + out[10] = vsubq_s32(step2[5], step2[10]); + out[11] = vsubq_s32(step2[4], step2[11]); + out[12] = vsubq_s32(step2[3], step2[12]); + out[13] = vsubq_s32(step2[2], step2[13]); + out[14] = vsubq_s32(step2[1], step2[14]); + out[15] = vsubq_s32(step2[0], step2[15]); +} + +static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out, + int32_t *output) { + // Save the result into output + vst1q_s32(output + 0, out[0].val[0]); + vst1q_s32(output + 4, out[0].val[1]); + output += 16; + vst1q_s32(output + 0, out[1].val[0]); + vst1q_s32(output + 4, out[1].val[1]); + output += 16; + vst1q_s32(output + 0, out[2].val[0]); + vst1q_s32(output + 4, out[2].val[1]); + output += 16; + vst1q_s32(output + 0, out[3].val[0]); + vst1q_s32(output + 4, out[3].val[1]); + output += 16; + vst1q_s32(output + 0, out[4].val[0]); + vst1q_s32(output + 4, out[4].val[1]); + output += 16; + vst1q_s32(output + 0, out[5].val[0]); + vst1q_s32(output + 4, out[5].val[1]); + output += 16; + vst1q_s32(output + 0, out[6].val[0]); + vst1q_s32(output + 4, out[6].val[1]); + output += 16; + vst1q_s32(output + 0, out[7].val[0]); + vst1q_s32(output + 4, out[7].val[1]); + output += 16; + vst1q_s32(output + 0, out[8].val[0]); + vst1q_s32(output + 4, out[8].val[1]); + output += 16; + vst1q_s32(output + 0, out[9].val[0]); + vst1q_s32(output + 4, out[9].val[1]); + output += 16; + vst1q_s32(output + 0, out[10].val[0]); + vst1q_s32(output + 4, out[10].val[1]); + output += 16; + vst1q_s32(output + 0, out[11].val[0]); + vst1q_s32(output + 4, out[11].val[1]); + 
output += 16; + vst1q_s32(output + 0, out[12].val[0]); + vst1q_s32(output + 4, out[12].val[1]); + output += 16; + vst1q_s32(output + 0, out[13].val[0]); + vst1q_s32(output + 4, out[13].val[1]); + output += 16; + vst1q_s32(output + 0, out[14].val[0]); + vst1q_s32(output + 4, out[14].val[1]); + output += 16; + vst1q_s32(output + 0, out[15].val[0]); + vst1q_s32(output + 4, out[15].val[1]); +} + +static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out, + uint16_t *dest, const int stride, + const int bd) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + int16x8_t o[16]; + o[0] = vcombine_s16(vrshrn_n_s32(out[0].val[0], 6), + vrshrn_n_s32(out[0].val[1], 6)); + o[1] = vcombine_s16(vrshrn_n_s32(out[1].val[0], 6), + vrshrn_n_s32(out[1].val[1], 6)); + o[2] = vcombine_s16(vrshrn_n_s32(out[2].val[0], 6), + vrshrn_n_s32(out[2].val[1], 6)); + o[3] = vcombine_s16(vrshrn_n_s32(out[3].val[0], 6), + vrshrn_n_s32(out[3].val[1], 6)); + o[4] = vcombine_s16(vrshrn_n_s32(out[4].val[0], 6), + vrshrn_n_s32(out[4].val[1], 6)); + o[5] = vcombine_s16(vrshrn_n_s32(out[5].val[0], 6), + vrshrn_n_s32(out[5].val[1], 6)); + o[6] = vcombine_s16(vrshrn_n_s32(out[6].val[0], 6), + vrshrn_n_s32(out[6].val[1], 6)); + o[7] = vcombine_s16(vrshrn_n_s32(out[7].val[0], 6), + vrshrn_n_s32(out[7].val[1], 6)); + o[8] = vcombine_s16(vrshrn_n_s32(out[8].val[0], 6), + vrshrn_n_s32(out[8].val[1], 6)); + o[9] = vcombine_s16(vrshrn_n_s32(out[9].val[0], 6), + vrshrn_n_s32(out[9].val[1], 6)); + o[10] = vcombine_s16(vrshrn_n_s32(out[10].val[0], 6), + vrshrn_n_s32(out[10].val[1], 6)); + o[11] = vcombine_s16(vrshrn_n_s32(out[11].val[0], 6), + vrshrn_n_s32(out[11].val[1], 6)); + o[12] = vcombine_s16(vrshrn_n_s32(out[12].val[0], 6), + vrshrn_n_s32(out[12].val[1], 6)); + o[13] = vcombine_s16(vrshrn_n_s32(out[13].val[0], 6), + vrshrn_n_s32(out[13].val[1], 6)); + o[14] = vcombine_s16(vrshrn_n_s32(out[14].val[0], 6), + vrshrn_n_s32(out[14].val[1], 6)); + o[15] = vcombine_s16(vrshrn_n_s32(out[15].val[0], 6), + vrshrn_n_s32(out[15].val[1], 6)); + highbd_idct16x16_add8x1(o[0], max, &dest, stride); + highbd_idct16x16_add8x1(o[1], max, &dest, stride); + highbd_idct16x16_add8x1(o[2], max, &dest, stride); + highbd_idct16x16_add8x1(o[3], max, &dest, stride); + highbd_idct16x16_add8x1(o[4], max, &dest, stride); + highbd_idct16x16_add8x1(o[5], max, &dest, stride); + highbd_idct16x16_add8x1(o[6], max, &dest, stride); + highbd_idct16x16_add8x1(o[7], max, &dest, stride); + highbd_idct16x16_add8x1(o[8], max, &dest, stride); + highbd_idct16x16_add8x1(o[9], max, &dest, stride); + highbd_idct16x16_add8x1(o[10], max, &dest, stride); + highbd_idct16x16_add8x1(o[11], max, &dest, stride); + highbd_idct16x16_add8x1(o[12], max, &dest, stride); + highbd_idct16x16_add8x1(o[13], max, &dest, stride); + highbd_idct16x16_add8x1(o[14], max, &dest, stride); + highbd_idct16x16_add8x1(o[15], max, &dest, stride); +} + +static void highbd_idct16x16_256_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[16], step1[16], step2[16], out[16]; + + // Load input (16x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 8; + in[8].val[0] = vld1q_s32(input); + in[8].val[1] = vld1q_s32(input + 4); 
+ input += 8; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 8; + in[9].val[0] = vld1q_s32(input); + in[9].val[1] = vld1q_s32(input + 4); + input += 8; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = vld1q_s32(input + 4); + input += 8; + in[10].val[0] = vld1q_s32(input); + in[10].val[1] = vld1q_s32(input + 4); + input += 8; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 8; + in[11].val[0] = vld1q_s32(input); + in[11].val[1] = vld1q_s32(input + 4); + input += 8; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 8; + in[12].val[0] = vld1q_s32(input); + in[12].val[1] = vld1q_s32(input + 4); + input += 8; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 8; + in[13].val[0] = vld1q_s32(input); + in[13].val[1] = vld1q_s32(input + 4); + input += 8; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 8; + in[14].val[0] = vld1q_s32(input); + in[14].val[1] = vld1q_s32(input + 4); + input += 8; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + input += 8; + in[15].val[0] = vld1q_s32(input); + in[15].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14], + &in[15]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[1] = in[16 / 2]; + step1[2] = in[8 / 2]; + step1[3] = in[24 / 2]; + step1[4] = in[4 / 2]; + step1[5] = in[20 / 2]; + step1[6] = in[12 / 2]; + step1[7] = in[28 / 2]; + step1[8] = in[2 / 2]; + step1[9] = in[18 / 2]; + step1[10] = in[10 / 2]; + step1[11] = in[26 / 2]; + step1[12] = in[6 / 2]; + step1[13] = in[22 / 2]; + step1[14] = in[14 / 2]; + step1[15] = in[30 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[1] = step1[1]; + step2[2] = step1[2]; + step2[3] = step1[3]; + step2[4] = step1[4]; + step2[5] = step1[5]; + step2[6] = step1[6]; + step2[7] = step1[7]; + highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8], + &step2[15]); + highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9], + &step2[14]); + highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10], + &step2[13]); + highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11], + &step2[12]); + + // stage 3 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[2]; + step1[3] = step2[3]; + highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4], + &step1[7]); + highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5], + &step1[6]); + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]); + step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]); + step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]); + step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]); + step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]); + step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]); + step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]); + step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]); + step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]); + step1[14].val[0] = 
vsubq_s32(step2[15].val[0], step2[14].val[0]); + step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]); + + // stage 4 + highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1], + &step2[0]); + highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2], + &step2[3]); + step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]); + step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]); + step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]); + step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]); + step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]); + step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]); + step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]); + step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]); + step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]); + step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]); + step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]); + step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]); + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]); + step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]); + step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]); + step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]); + step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]); + step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]); + step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]); + step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]); + step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]); + step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]); + step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]); + step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]); + step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]); + step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]); + step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]); + step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]); + + // stage 6 + step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]); + step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]); + step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]); + step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]); + step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]); + step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]); + step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]); + step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]); + step2[4].val[0] = 
vsubq_s32(step1[3].val[0], step1[4].val[0]); + step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]); + step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]); + step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]); + step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]); + step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]); + step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]); + step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 0); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 0); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 0); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 0); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s, + const int32x2_t coef) { + int64x2x2_t t[2]; + + t[0].val[0] = vmull_lane_s32(vget_low_s32(s.val[0]), coef, 1); + t[0].val[1] = vmull_lane_s32(vget_high_s32(s.val[0]), coef, 1); + t[1].val[0] = vmull_lane_s32(vget_low_s32(s.val[1]), coef, 1); + t[1].val[1] = vmull_lane_s32(vget_high_s32(s.val[1]), coef, 1); + return highbd_idct16x16_add_wrap_low_8x1(t); +} + +static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s, + const int32x2_t coef) { + int64x2x2_t t; + + t.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1); + t.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1); + return highbd_idct16x16_add_wrap_low_4x1(t); +} + +static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vaddq_s32(s0.val[0], s1.val[0]); + t.val[1] = vaddq_s32(s0.val[1], s1.val[1]); + return t; +} + +static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0, + const int32x4x2_t s1) { + int32x4x2_t t; + t.val[0] = vsubq_s32(s0.val[0], s1.val[0]); + t.val[1] = vsubq_s32(s0.val[1], s1.val[1]); + return t; +} + +static void highbd_idct16x16_38_add_half1d(const int32_t *input, + int32_t *output, uint16_t *dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + in[0].val[0] = vld1q_s32(input); + in[0].val[1] = vld1q_s32(input + 4); + input += 16; + in[1].val[0] = vld1q_s32(input); + in[1].val[1] = vld1q_s32(input + 4); + input += 16; + in[2].val[0] = vld1q_s32(input); + in[2].val[1] = 
vld1q_s32(input + 4); + input += 16; + in[3].val[0] = vld1q_s32(input); + in[3].val[1] = vld1q_s32(input + 4); + input += 16; + in[4].val[0] = vld1q_s32(input); + in[4].val[1] = vld1q_s32(input + 4); + input += 16; + in[5].val[0] = vld1q_s32(input); + in[5].val[1] = vld1q_s32(input + 4); + input += 16; + in[6].val[0] = vld1q_s32(input); + in[6].val[1] = vld1q_s32(input + 4); + input += 16; + in[7].val[0] = vld1q_s32(input); + in[7].val[1] = vld1q_s32(input + 4); + + // Transpose + transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[9] = + highbd_idct_cospi_lane1_dual(step1[14], vget_high_s32(cospi_6_26_14_18N)); + step2[10] = + highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[13] = + highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22)); + step2[14] = + highbd_idct_cospi_lane0_dual(step1[14], vget_high_s32(cospi_6_26_14_18N)); + step2[15] = + highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[5] = + highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28)); + step1[6] = + highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = highbd_idct_add_dual(step2[8], step2[9]); + step1[9] = highbd_idct_sub_dual(step2[8], step2[9]); + step1[10] = highbd_idct_sub_dual(step2[11], step2[10]); + step1[11] = highbd_idct_add_dual(step2[11], step2[10]); + step1[12] = highbd_idct_add_dual(step2[12], step2[13]); + step1[13] = highbd_idct_sub_dual(step2[12], step2[13]); + step1[14] = highbd_idct_sub_dual(step2[15], step2[14]); + step1[15] = highbd_idct_add_dual(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[2] = + highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24)); + step2[3] = + highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24)); + step2[4] = highbd_idct_add_dual(step1[4], step1[5]); + step2[5] = highbd_idct_sub_dual(step1[4], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[7], step1[6]); + step2[7] = highbd_idct_add_dual(step1[7], step1[6]); + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = highbd_idct_add_dual(step2[0], step2[3]); + step1[1] = highbd_idct_add_dual(step2[1], step2[2]); + step1[2] = highbd_idct_sub_dual(step2[1], step2[2]); + step1[3] = highbd_idct_sub_dual(step2[0], step2[3]); + step1[4] = step2[4]; + 
highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input, + int32_t *output) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12); + int32x4_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x4) + in[0] = vld1q_s32(input); + input += 16; + in[1] = vld1q_s32(input); + input += 16; + in[2] = vld1q_s32(input); + input += 16; + in[3] = vld1q_s32(input); + + // Transpose + transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[12] = + highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + 
step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s32(step2[8], step2[11]); + step1[9] = vaddq_s32(step2[9], step2[10]); + step1[10] = vsubq_s32(step2[9], step2[10]); + step1[11] = vsubq_s32(step2[8], step2[11]); + step1[12] = vsubq_s32(step2[15], step2[12]); + step1[13] = vsubq_s32(step2[14], step2[13]); + step1[14] = vaddq_s32(step2[14], step2[13]); + step1[15] = vaddq_s32(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s32(step1[0], step1[7]); + step2[1] = vaddq_s32(step1[1], step1[6]); + step2[2] = vaddq_s32(step1[2], step1[5]); + step2[3] = vaddq_s32(step1[3], step1[4]); + step2[4] = vsubq_s32(step1[3], step1[4]); + step2[5] = vsubq_s32(step1[2], step1[5]); + step2[6] = vsubq_s32(step1[1], step1[6]); + step2[7] = vsubq_s32(step1[0], step1[7]); + highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7(step2, out); + + // pass 1: save the result into output + vst1q_s32(output, out[0]); + output += 4; + vst1q_s32(output, out[1]); + output += 4; + vst1q_s32(output, out[2]); + output += 4; + vst1q_s32(output, out[3]); + output += 4; + vst1q_s32(output, out[4]); + output += 4; + vst1q_s32(output, out[5]); + output += 4; + vst1q_s32(output, out[6]); + output += 4; + vst1q_s32(output, out[7]); + output += 4; + vst1q_s32(output, out[8]); + output += 4; + vst1q_s32(output, out[9]); + output += 4; + vst1q_s32(output, out[10]); + output += 4; + vst1q_s32(output, out[11]); + output += 4; + vst1q_s32(output, out[12]); + output += 4; + vst1q_s32(output, out[13]); + output += 4; + vst1q_s32(output, out[14]); + output += 4; + vst1q_s32(output, out[15]); +} + +void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input, + int32_t *const output, + uint16_t *const dest, + const int stride, const int bd) { + const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0); + const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4); + const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8); + const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12); + int32x4x2_t in[4], step1[16], step2[16], out[16]; + + // Load input (4x8) + in[0].val[0] = vld1q_s32(input); + input += 4; + in[0].val[1] = vld1q_s32(input); + input += 4; + in[1].val[0] = vld1q_s32(input); + input += 4; + in[1].val[1] = vld1q_s32(input); + input += 4; + in[2].val[0] = vld1q_s32(input); + input += 4; + in[2].val[1] = vld1q_s32(input); + input += 4; + in[3].val[0] = vld1q_s32(input); + input += 4; + in[3].val[1] = vld1q_s32(input); + + // Transpose + transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1], + &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[4] = in[4 / 2]; + step1[8] = in[2 / 2]; + step1[12] = in[6 / 2]; + + // stage 2 + step2[0] = step1[0]; + step2[4] = step1[4]; + step2[8] = + highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + step2[11] = + highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[12] = + highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26_14_18N)); + step2[15] = + 
highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22)); + + // stage 3 + step1[0] = step2[0]; + step1[4] = + highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28)); + step1[7] = + highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28)); + step1[8] = step2[8]; + step1[9] = step2[8]; + step1[10] = step2[11]; + step1[11] = step2[11]; + step1[12] = step2[12]; + step1[13] = step2[12]; + step1[14] = step2[15]; + step1[15] = step2[15]; + + // stage 4 + step2[0] = step2[1] = + highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24)); + step2[4] = step1[4]; + step2[5] = step1[4]; + step2[6] = step1[7]; + step2[7] = step1[7]; + step2[8] = step1[8]; + highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, + &step2[13], &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = step2[0]; + step1[1] = step2[1]; + step1[2] = step2[1]; + step1[3] = step2[0]; + step1[4] = step2[4]; + highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], + &step1[6]); + step1[7] = step2[7]; + step1[8] = highbd_idct_add_dual(step2[8], step2[11]); + step1[9] = highbd_idct_add_dual(step2[9], step2[10]); + step1[10] = highbd_idct_sub_dual(step2[9], step2[10]); + step1[11] = highbd_idct_sub_dual(step2[8], step2[11]); + step1[12] = highbd_idct_sub_dual(step2[15], step2[12]); + step1[13] = highbd_idct_sub_dual(step2[14], step2[13]); + step1[14] = highbd_idct_add_dual(step2[14], step2[13]); + step1[15] = highbd_idct_add_dual(step2[15], step2[12]); + + // stage 6 + step2[0] = highbd_idct_add_dual(step1[0], step1[7]); + step2[1] = highbd_idct_add_dual(step1[1], step1[6]); + step2[2] = highbd_idct_add_dual(step1[2], step1[5]); + step2[3] = highbd_idct_add_dual(step1[3], step1[4]); + step2[4] = highbd_idct_sub_dual(step1[3], step1[4]); + step2[5] = highbd_idct_sub_dual(step1[2], step1[5]); + step2[6] = highbd_idct_sub_dual(step1[1], step1[6]); + step2[7] = highbd_idct_sub_dual(step1[0], step1[7]); + highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + highbd_idct16x16_add_stage7_dual(step2, out); + + if (output) { + highbd_idct16x16_store_pass1(out, output); + } else { + highbd_idct16x16_add_store(out, dest, stride, bd); + } +} + +void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 1); + + // Parallel idct on the lower 8 rows + idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride, + 1); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8, stride, + 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + highbd_idct16x16_256_add_half1d(input, row_idct_output, dest, stride, bd); + + // Parallel idct on the lower 8 rows + 
highbd_idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, + stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + highbd_idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + highbd_idct16x16_256_add_half1d(row_idct_output + 8 * 16, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + if (bd == 8) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, + 1); + } else { + int32_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, bd); + + // pass 2 + // Parallel idct to get the left 8 columns + highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd); + + // Parallel idct to get the right 8 columns + highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, + stride, bd); + } +} + +void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + if (bd == 8) { + int16_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1); + + // Parallel idct to get the right 8 columns + idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, + stride, 1); + } else { + int32_t row_idct_output[4 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output); + + // pass 2 + // Parallel idct to get the left 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, + bd); + + // Parallel idct to get the right 8 columns + highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, + dest + 8, stride, bd); + } +} + +static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + *dest += stride; +} + +static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest + 0); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + vst1q_u16(*dest + 0, c0); + vst1q_u16(*dest + 8, c1); + *dest += stride; +} + +void 
vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 4; ++i) { + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c new file mode 100644 index 00000000000..d74331f8031 --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <arm_neon.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" +#include "vpx_dsp/inv_txfm.h" + +static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const int16x8_t c0 = vminq_s16(b0, max); + const int16x8_t c1 = vminq_s16(b1, max); + const int16x8_t c2 = vminq_s16(b2, max); + const int16x8_t c3 = vminq_s16(b3, max); + vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); + vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); + vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); + vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); + *dest += stride; +} + +static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a0 = vld1q_u16(*dest); + const uint16x8_t a1 = vld1q_u16(*dest + 8); + const uint16x8_t a2 = vld1q_u16(*dest + 16); + const uint16x8_t a3 = vld1q_u16(*dest + 24); + const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); + const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); + const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); + const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); + const uint16x8_t c0 = vqshluq_n_s16(b0, 0); + const uint16x8_t c1 = vqshluq_n_s16(b1, 0); + const uint16x8_t c2 = vqshluq_n_s16(b2, 0); + const uint16x8_t c3 = vqshluq_n_s16(b3, 0); + vst1q_u16(*dest, c0); + vst1q_u16(*dest + 8, c1); + vst1q_u16(*dest + 16, c2); + vst1q_u16(*dest + 24, c3); + *dest += stride; +} + +void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + const tran_low_t out0 = + HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + const tran_low_t out1 = + HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + const int16x8_t dc = vdupq_n_s16(a1); + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + int i; + + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); + } + } else { + for (i = 0; i < 8; ++i) { + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); + } + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c index 26fa3e216bb..128f72b9c96 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c @@ -82,10 +82,10 @@ static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis, b3 = vmulq_lane_s32(*a1, 
vget_low_s32(cospis), 1); b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1); b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1); - b0 = vrshrq_n_s32(b0, 14); - b1 = vrshrq_n_s32(b1, 14); - b2 = vrshrq_n_s32(b2, 14); - b3 = vrshrq_n_s32(b3, 14); + b0 = vrshrq_n_s32(b0, DCT_CONST_BITS); + b1 = vrshrq_n_s32(b1, DCT_CONST_BITS); + b2 = vrshrq_n_s32(b2, DCT_CONST_BITS); + b3 = vrshrq_n_s32(b3, DCT_CONST_BITS); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); @@ -119,10 +119,14 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis, c5 = vsubq_s64(c5, c9); c6 = vaddq_s64(c6, c10); c7 = vaddq_s64(c7, c11); - b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14)); - b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14)); - b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14)); - b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14)); + b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS), + vrshrn_n_s64(c1, DCT_CONST_BITS)); + b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS), + vrshrn_n_s64(c3, DCT_CONST_BITS)); + b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS), + vrshrn_n_s64(c5, DCT_CONST_BITS)); + b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS), + vrshrn_n_s64(c7, DCT_CONST_BITS)); *a0 = vaddq_s32(b0, b3); *a1 = vaddq_s32(b1, b2); *a2 = vsubq_s32(b1, b2); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c index c1c0f645d18..f53f4c7fcad 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c @@ -15,21 +15,29 @@ #include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/inv_txfm.h" -static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest, - const int stride, - const int16x8_t res, - const int16x8_t max) { +static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest, + const int stride, + const int16x8_t res, + const int16x8_t max) { const uint16x8_t a = vld1q_u16(*dest); const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); const int16x8_t c = vminq_s16(b, max); - const uint16x8_t d = vqshluq_n_s16(c, 0); - vst1q_u16(*dest, d); + vst1q_u16(*dest, vreinterpretq_u16_s16(c)); + *dest += stride; +} + +static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest, + const int stride, + const int16x8_t res) { + const uint16x8_t a = vld1q_u16(*dest); + const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a)); + const uint16x8_t c = vqshluq_n_s16(b, 0); + vst1q_u16(*dest, c); *dest += stride; } void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { - const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const tran_low_t out0 = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); const tran_low_t out1 = @@ -38,14 +46,26 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8, const int16x8_t dc = vdupq_n_s16(a1); uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, dc, max); - highbd_idct8x8_1_add_kernel(&dest, stride, 
dc, max); + if (a1 >= 0) { + const int16x8_t max = vdupq_n_s16((1 << bd) - 1); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max); + } else { + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc); + } } static INLINE void idct8x8_12_half1d_bd10( @@ -62,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10( step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0); step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1); step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0); - step1[4] = vrshrq_n_s32(step1[4], 14); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); - step1[7] = vrshrq_n_s32(step1[7], 14); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); // stage 2 step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1); step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1); - step2[1] = vrshrq_n_s32(step2[1], 14); - step2[2] = vrshrq_n_s32(step2[2], 14); - step2[3] = vrshrq_n_s32(step2[3], 14); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); step2[4] = vaddq_s32(step1[4], step1[5]); step2[5] = vsubq_s32(step1[4], step1[5]); @@ -89,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10( step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); // stage 4 *io0 = vaddq_s32(step1[0], step2[7]); @@ -134,14 +154,14 @@ static INLINE void idct8x8_12_half1d_bd12( t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1); t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0); t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], 
DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step1[4] = vcombine_s32(t32[0], t32[1]); step1[5] = vcombine_s32(t32[2], t32[3]); step1[6] = vcombine_s32(t32[4], t32[5]); @@ -154,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12( t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1); t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1); t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step2[1] = vcombine_s32(t32[2], t32[3]); step2[2] = vcombine_s32(t32[4], t32[5]); step2[3] = vcombine_s32(t32[6], t32[7]); @@ -185,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12( vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); step1[5] = vcombine_s32(t32[0], t32[1]); step1[6] = vcombine_s32(t32[2], t32[3]); @@ -357,10 +377,10 @@ static INLINE void idct8x8_64_half1d_bd10( step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0); step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1); - step1[4] = vrshrq_n_s32(step1[4], 14); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); - step1[7] = vrshrq_n_s32(step1[7], 14); + step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS); + step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS); // stage 2 step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0); @@ -372,10 +392,10 @@ static INLINE void idct8x8_64_half1d_bd10( step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1); step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1); - step2[0] = vrshrq_n_s32(step2[0], 14); - step2[1] = vrshrq_n_s32(step2[1], 14); - step2[2] = vrshrq_n_s32(step2[2], 14); - step2[3] = vrshrq_n_s32(step2[3], 14); + step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS); + step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS); + step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS); + step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS); step2[4] = vaddq_s32(step1[4], step1[5]); step2[5] = vsubq_s32(step1[4], step1[5]); @@ -391,8 +411,8 @@ static INLINE void idct8x8_64_half1d_bd10( step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0); step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0); - step1[5] = vrshrq_n_s32(step1[5], 14); - step1[6] = vrshrq_n_s32(step1[6], 14); + step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS); + step1[6] = vrshrq_n_s32(step1[6], 
DCT_CONST_BITS); // stage 4 *io0 = vaddq_s32(step1[0], step2[7]); @@ -453,14 +473,14 @@ static INLINE void idct8x8_64_half1d_bd12( t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0); t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1); t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step1[4] = vcombine_s32(t32[0], t32[1]); step1[5] = vcombine_s32(t32[2], t32[3]); step1[6] = vcombine_s32(t32[4], t32[5]); @@ -481,14 +501,14 @@ static INLINE void idct8x8_64_half1d_bd12( t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1); t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1); t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); - t32[4] = vrshrn_n_s64(t64[4], 14); - t32[5] = vrshrn_n_s64(t64[5], 14); - t32[6] = vrshrn_n_s64(t64[6], 14); - t32[7] = vrshrn_n_s64(t64[7], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); + t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS); + t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS); + t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS); + t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS); step2[0] = vcombine_s32(t32[0], t32[1]); step2[1] = vcombine_s32(t32[2], t32[3]); step2[2] = vcombine_s32(t32[4], t32[5]); @@ -515,10 +535,10 @@ static INLINE void idct8x8_64_half1d_bd12( vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0); t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]), vget_high_s32(cospis0), 0); - t32[0] = vrshrn_n_s64(t64[0], 14); - t32[1] = vrshrn_n_s64(t64[1], 14); - t32[2] = vrshrn_n_s64(t64[2], 14); - t32[3] = vrshrn_n_s64(t64[3], 14); + t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS); + t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS); + t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS); + t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS); step1[5] = vcombine_s32(t32[0], t32[1]); step1[6] = vcombine_s32(t32[2], t32[3]); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm deleted file mode 100644 index d648840df40..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm +++ /dev/null @@ -1,196 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. 
All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vpx_idct16x16_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct16x16_1_add_neon| PROC - ldrsh r0, [r0] - - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 6) - add r0, r0, #32 ; + (1 <<((6) - 1)) - asr r0, r0, #6 ; >> 6 - - vdup.s16 q0, r0 ; duplicate a1 - mov r0, #8 - sub r2, #8 - - ; load destination data row0 - row3 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row4 - row7 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row8 - row11 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, 
q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - ; load destination data row12 - row15 - vld1.64 {d2}, [r1], r0 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r0 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r0 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r0 - vld1.64 {d17}, [r1], r2 - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r0 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r0 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct16x16_1_add_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm deleted file mode 100644 index ea6b099d3bb..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm +++ /dev/null @@ -1,1176 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - INCLUDE vpx_dsp/arm/idct_neon.asm.S - - EXPORT |vpx_idct16x16_256_add_neon_pass1| - EXPORT |vpx_idct16x16_256_add_neon_pass2| - IF CONFIG_VP9_HIGHBITDEPTH - EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low| - EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low| - ENDIF - EXPORT |vpx_idct16x16_10_add_neon_pass1| - EXPORT |vpx_idct16x16_10_add_neon_pass2| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15. - MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output) -; -; r0 const int16_t *input -; r1 int16_t *output - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. 
This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q1,q2}, [r0]! - vmov.s16 q15, q1 - -idct16x16_256_add_neon_pass1 - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r12, #0x3ec5 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r12 ; duplicate cospi_4_64 - - ; preloading to avoid stall - ; cospi_12_64 = 13623 - movw r3, #0x3537 - - ; cospi_20_64 = 9102 - movw r12, #0x238e - - ; step2[4] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; step2[4] * cospi_4_64 - vmull.s16 q5, d18, d1 - vmull.s16 q6, d19, d1 - - ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64 - vmlal.s16 q5, d30, d0 - vmlal.s16 q6, d31, d0 - - vdup.16 d2, r3 ; duplicate cospi_12_64 - vdup.16 d3, r12 ; duplicate cospi_20_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d8, q2, #14 ; >> 14 - vrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d14, q5, #14 ; >> 14 - vrshrn.s32 d15, q6, #14 ; >> 14 - - ; preloading to avoid stall - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - ; cospi_24_64 = 6270 - movw r12, #0x187e - - ; step2[5] * cospi_12_64 - vmull.s16 q2, d26, d2 - vmull.s16 q3, d27, d2 - - ; step2[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q15, d27, d3 - - ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q2, d22, d3 - vmlsl.s16 q3, d23, d3 - - ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q15, d23, d2 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q2, #14 ; >> 14 - vrshrn.s32 d11, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q15, #14 ; >> 14 - - ; stage 4 - vdup.16 d30, r3 ; cospi_16_64 - - ; step1[0] * cospi_16_64 - vmull.s16 q2, d16, d30 - vmull.s16 q11, d17, d30 - - ; step1[1] * cospi_16_64 - vmull.s16 q0, d24, d30 - vmull.s16 q1, d25, d30 - - ; cospi_8_64 = 15137 - movw r3, #0x3b21 - - vdup.16 d30, r12 ; duplicate cospi_24_64 - vdup.16 d31, r3 ; duplicate cospi_8_64 - - ; temp1 = (step1[0] + step1[1]) * cospi_16_64 - vadd.s32 q3, q2, q0 - vadd.s32 q12, q11, q1 - - ; temp2 = (step1[0] - step1[1]) * cospi_16_64 - vsub.s32 q13, q2, q0 - vsub.s32 q1, q11, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d16, q3, #14 ; >> 14 - vrshrn.s32 d17, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d18, q13, #14 ; >> 14 - vrshrn.s32 d19, q1, #14 ; >> 14 - - ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64; - ; step1[2] * cospi_8_64 - vmull.s16 q0, d20, d31 - vmull.s16 q1, d21, d31 - - ; step1[2] * cospi_24_64 - vmull.s16 q12, d20, d30 - vmull.s16 q13, d21, d30 - - ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q0, d28, d30 - vmlal.s16 q1, d29, d30 - - ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q12, d28, d31 - vmlsl.s16 q13, d29, d31 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d22, q0, #14 ; >> 14 - vrshrn.s32 d23, q1, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 
d20, q12, #14 ; >> 14 - vrshrn.s32 d21, q13, #14 ; >> 14 - - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]; - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]; - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]; - vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7]; - - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - ; stage 5 - vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3]; - vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2]; - vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2]; - vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3]; - - vdup.16 d16, r3; ; duplicate cospi_16_64 - - ; step2[5] * cospi_16_64 - vmull.s16 q11, d26, d16 - vmull.s16 q12, d27, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q6, q9, q11 - vsub.s32 q13, q10, q12 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q6, #14 ; >> 14 - vrshrn.s32 d11, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {q8-q9}, [r1]! - vst1.64 {q10-q11}, [r1]! - vst1.64 {q12-q13}, [r1]! - vst1.64 {q14-q15}, [r1] - - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass1| - - IF CONFIG_VP9_HIGHBITDEPTH -;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input, -; int16_t *output) -; -; r0 const tran_low_t *input -; r1 int16_t *output - -|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 - vmov.s16 q15, q1 - - b idct16x16_256_add_neon_pass1 - ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low| - ENDIF ; CONFIG_VP9_HIGHBITDEPTH - -;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, -; int16_t *output, -; int16_t *pass1_output, -; int16_t skip_adding, -; uint8_t *dest, -; int stride) -; -; r0 const int16_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output -; r3 int16_t skip_adding -; r4 uint8_t *dest -; r5 int stride - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_256_add_neon_pass2| PROC - ; TODO(hkuang): Find a better way to load the elements. - ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - vld2.s16 {q8,q9}, [r0]! - vld2.s16 {q9,q10}, [r0]! - vld2.s16 {q10,q11}, [r0]! - vld2.s16 {q11,q12}, [r0]! - vld2.s16 {q12,q13}, [r0]! - vld2.s16 {q13,q14}, [r0]! - vld2.s16 {q14,q15}, [r0]! - vld2.s16 {q0,q1}, [r0]! 
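
The overlapping vld2.s16 loads above de-interleave as they go: each instruction splits 16 contiguous coefficients into even- and odd-indexed halves, so after the chain only the odd columns (1, 3, ..., 15) survive in q8-q15. A minimal C sketch of the same split, under a hypothetical helper name:

#include <arm_neon.h>
#include <stdint.h>

/* Split 16 contiguous int16 coefficients into even- and odd-indexed
   halves, as a single vld2.s16 above does. */
static inline void load_even_odd_s16(const int16_t *coeff, int16x8_t *even,
                                     int16x8_t *odd) {
  const int16x8x2_t v = vld2q_s16(coeff);
  *even = v.val[0]; /* elements 0, 2, 4, ..., 14 */
  *odd = v.val[1];  /* elements 1, 3, 5, ..., 15 */
}
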
- vmov.s16 q15, q0; - -idct16x16_256_add_neon_pass2 - push {r3-r9} - - ; cospi_30_64 = 1606 - movw r3, #0x0646 - - ; cospi_2_64 = 16305 - movw r12, #0x3fb1 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 d12, r3 ; duplicate cospi_30_64 - vdup.16 d13, r12 ; duplicate cospi_2_64 - - ; preloading to avoid stall - ; cospi_14_64 = 12665 - movw r3, #0x3179 - - ; cospi_18_64 = 10394 - movw r12, #0x289a - - ; step1[8] * cospi_30_64 - vmull.s16 q2, d16, d12 - vmull.s16 q3, d17, d12 - - ; step1[8] * cospi_2_64 - vmull.s16 q1, d16, d13 - vmull.s16 q4, d17, d13 - - ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64 - vmlsl.s16 q2, d30, d13 - vmlsl.s16 q3, d31, d13 - - ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64 - vmlal.s16 q1, d30, d12 - vmlal.s16 q4, d31, d12 - - vdup.16 d30, r3 ; duplicate cospi_14_64 - vdup.16 d31, r12 ; duplicate cospi_18_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d0, q2, #14 ; >> 14 - vrshrn.s32 d1, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d14, q1, #14 ; >> 14 - vrshrn.s32 d15, q4, #14 ; >> 14 - - ; preloading to avoid stall - ; cospi_22_64 = 7723 - movw r3, #0x1e2b - - ; cospi_10_64 = 14449 - movw r12, #0x3871 - - ; step1[9] * cospi_14_64 - vmull.s16 q2, d24, d30 - vmull.s16 q3, d25, d30 - - ; step1[9] * cospi_18_64 - vmull.s16 q4, d24, d31 - vmull.s16 q5, d25, d31 - - ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64 - vmlsl.s16 q2, d22, d31 - vmlsl.s16 q3, d23, d31 - - ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64 - vmlal.s16 q4, d22, d30 - vmlal.s16 q5, d23, d30 - - vdup.16 d30, r3 ; duplicate cospi_22_64 - vdup.16 d31, r12 ; duplicate cospi_10_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q2, #14 ; >> 14 - vrshrn.s32 d3, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q4, #14 ; >> 14 - vrshrn.s32 d13, q5, #14 ; >> 14 - - ; step1[10] * cospi_22_64 - vmull.s16 q11, d20, d30 - vmull.s16 q12, d21, d30 - - ; step1[10] * cospi_10_64 - vmull.s16 q4, d20, d31 - vmull.s16 q5, d21, d31 - - ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64 - vmlsl.s16 q11, d26, d31 - vmlsl.s16 q12, d27, d31 - - ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64 - vmlal.s16 q4, d26, d30 - vmlal.s16 q5, d27, d30 - - ; preloading to avoid stall - ; cospi_6_64 = 15679 - movw r3, #0x3d3f - - ; cospi_26_64 = 4756 - movw r12, #0x1294 - - vdup.16 d30, r3 ; duplicate cospi_6_64 - vdup.16 d31, r12 ; duplicate cospi_26_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q11, #14 ; >> 14 - vrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d11, q5, #14 ; >> 14 - vrshrn.s32 d10, q4, #14 ; >> 14 - - ; step1[11] * cospi_6_64 - vmull.s16 q10, d28, d30 - vmull.s16 q11, d29, d30 - - ; step1[11] * cospi_26_64 - vmull.s16 q12, d28, d31 - vmull.s16 q13, d29, d31 - - ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64 - vmlsl.s16 q10, d18, d31 - vmlsl.s16 q11, d19, d31 - - ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64 - vmlal.s16 q12, d18, d30 - vmlal.s16 q13, d19, d30 - - vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9] - vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9] - - ; dct_const_round_shift(temp1) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q11, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d8, q12, #14 ; >> 14 - vrshrn.s32 d9, q13, #14 ; >> 14 - - ; stage 3 - vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11] - vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11] - vadd.s16 q12, q4, q5 ; 
step1[12]=step2[12]+step2[13] - vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13] - vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15] - vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15] - - ; stage 4 - ; cospi_24_64 = 6270 - movw r3, #0x187e - - ; cospi_8_64 = 15137 - movw r12, #0x3b21 - - ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vdup.16 d30, r12 ; duplicate cospi_8_64 - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d18, d31 - vmull.s16 q3, d19, d31 - - ; step1[14] * cospi_24_64 - vmull.s16 q4, d28, d31 - vmull.s16 q5, d29, d31 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d28, d30 - vmlal.s16 q3, d29, d30 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q4, d18, d30 - vmlsl.s16 q5, d19, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q2, #14 ; >> 14 - vrshrn.s32 d13, q3, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q4, #14 ; >> 14 - vrshrn.s32 d3, q5, #14 ; >> 14 - - vmov.s16 q3, q11 - vmov.s16 q4, q12 - - ; - step1[13] * cospi_8_64 - vmull.s16 q11, d26, d30 - vmull.s16 q12, d27, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d20, d30 - vmull.s16 q9, d21, d30 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlsl.s16 q11, d20, d31 - vmlsl.s16 q12, d21, d31 - - ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d26, d31 - vmlal.s16 q9, d27, d31 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d4, q11, #14 ; >> 14 - vrshrn.s32 d5, q12, #14 ; >> 14 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d10, q8, #14 ; >> 14 - vrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
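
Stage 6, below, recombines step1[10..13] through cospi_16_64: one output is (step1[13] - step1[10]) * cospi_16_64 and the other (step1[10] + step1[13]) * cospi_16_64, each followed by dct_const_round_shift(), i.e. a rounded right shift by DCT_CONST_BITS (14). A minimal intrinsics sketch for one 4-lane half, under a hypothetical name:

#include <arm_neon.h>
#include <stdint.h>

#define DCT_CONST_BITS 14

/* *out_lo = round(((b - a) * cospi_16_64) >> 14)
   *out_hi = round(((a + b) * cospi_16_64) >> 14) */
static inline void idct_cospi_16_16_sketch(const int16x4_t a,
                                           const int16x4_t b,
                                           int16x4_t *out_lo,
                                           int16x4_t *out_hi) {
  const int16_t cospi_16_64 = 11585;
  const int32x4_t ta = vmull_n_s16(a, cospi_16_64);
  const int32x4_t tb = vmull_n_s16(b, cospi_16_64);
  *out_lo = vrshrn_n_s32(vsubq_s32(tb, ta), DCT_CONST_BITS);
  *out_hi = vrshrn_n_s32(vaddq_s32(tb, ta), DCT_CONST_BITS);
}
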
- ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q10, q3, q0 - vadd.s32 q4, q4, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q5, #14 ; >> 14 - vrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q10, #14 ; >> 14 - vrshrn.s32 d11, q4, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d8, q13, #14 ; >> 14 - vrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1_output stride - ldr r3, [sp] ; load skip_adding - cmp r3, #0 ; check if need adding dest data - beq skip_adding_dest - - ldr r7, [sp, #28] ; dest used to save element 0-7 - mov r9, r7 ; save dest pointer for later use - ldr r8, [sp, #32] ; load stride - - ; stage 7 - ; load the data in pass1 - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * 
stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO - vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q12, q12, d12 ; + dest[j * stride + i] - vaddw.u8 q13, q13, d13 ; + dest[j * stride + i] - vqmovun.s16 d12, q12 ; clip pixel - vqmovun.s16 d13, q13 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - - ; store the data output 8,9,10,11,12,13,14,15 - vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO - vaddw.u8 q8, q8, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q8 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q9, q9, #6 - vaddw.u8 q9, q9, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q9 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q2, q2, #6 - vaddw.u8 q2, q2, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q2 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q3, q3, #6 - vaddw.u8 q3, q3, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q3 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q4, q4, #6 - vaddw.u8 q4, q4, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q4 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q5, q5, #6 - vaddw.u8 q5, q5, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q5 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - vld1.64 {d13}, [r7], r8 ; load destinatoin data - vrshr.s16 q14, q14, #6 - vaddw.u8 q14, q14, d12 ; + dest[j * stride + i] - vqmovun.s16 d12, q14 ; clip pixel - vst1.64 {d12}, [r9], r8 ; store the data - vld1.64 {d12}, [r7], r8 ; load destinatoin data - vrshr.s16 q15, q15, #6 - vaddw.u8 q15, q15, d13 ; + dest[j * stride + i] - vqmovun.s16 d13, q15 ; clip pixel - vst1.64 {d13}, [r9], r8 ; store the data - b end_idct16x16_pass2 - -skip_adding_dest - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; 
step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 -end_idct16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_256_add_neon_pass2| - - IF CONFIG_VP9_HIGHBITDEPTH -;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, -; int16_t *output, -; int16_t *pass1_output, -; int16_t skip_adding, -; uint8_t *dest, -; int stride) -; -; r0 const tran_low_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output -; r3 int16_t skip_adding -; r4 uint8_t *dest -; r5 int stride - -|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 - vmov.s16 q15, q0 - - b idct16x16_256_add_neon_pass2 - ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low| - ENDIF ; CONFIG_VP9_HIGHBITDEPTH - -;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input, -; int16_t *output) -; -; r0 const tran_low_t *input -; r1 int16_t *output - -; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass1| PROC - - ; TODO(hkuang): Find a better way to load the elements. 
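
With CONFIG_VP9_HIGHBITDEPTH enabled, tran_low_t is int32_t, so the *_tran_low entry points above (and the loads just below) narrow each coefficient to 16 bits while loading, via the LOAD_TRAN_LOW_TO_S16X2 macro. The C helper used elsewhere in this patch, load_tran_low_to_s16q(), does the equivalent of this sketch:

#include <arm_neon.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* assuming CONFIG_VP9_HIGHBITDEPTH */

/* Load 8 coefficients and narrow them from 32 to 16 bits. */
static inline int16x8_t load_tran_low_to_s16q_sketch(const tran_low_t *buf) {
  const int32x4_t v0 = vld1q_s32(buf);
  const int32x4_t v1 = vld1q_s32(buf + 4);
  return vcombine_s16(vmovn_s32(v0), vmovn_s32(v1));
}
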
- ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15 - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0 - vmov.s16 q15, q1 - - ; cospi_28_64*2 = 6392 - movw r3, #0x18f8 - - ; cospi_4_64*2 = 32138 - movw r12, #0x7d8a - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q0, r3 ; duplicate cospi_28_64*2 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply, - ; double, and return the high 16 bits, effectively giving >> 15. Doubling - ; the constant will change this to >> 14. - ; dct_const_round_shift(step2[4] * cospi_28_64); - vqrdmulh.s16 q4, q9, q0 - - ; preloading to avoid stall - ; cospi_16_64*2 = 23170 - movw r3, #0x5a82 - - ; dct_const_round_shift(step2[4] * cospi_4_64); - vqrdmulh.s16 q7, q9, q1 - - ; stage 4 - vdup.16 q1, r3 ; cospi_16_64*2 - - ; cospi_16_64 = 11585 - movw r3, #0x2d41 - - vdup.16 d4, r3; ; duplicate cospi_16_64 - - ; dct_const_round_shift(step1[0] * cospi_16_64) - vqrdmulh.s16 q8, q8, q1 - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d14, d4 - vmull.s16 q10, d15, d4 - - ; step2[5] * cospi_16_64 - vmull.s16 q12, d9, d4 - vmull.s16 q11, d8, d4 - - ; temp1 = (step2[6] - step2[5]) * cospi_16_64 - vsub.s32 q15, q10, q12 - vsub.s32 q6, q9, q11 - - ; temp2 = (step2[5] + step2[6]) * cospi_16_64 - vadd.s32 q9, q9, q11 - vadd.s32 q10, q10, q12 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d11, q15, #14 ; >> 14 - vrshrn.s32 d10, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q10, #14 ; >> 14 - - ; stage 6 - vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7]; - vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5]; - vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4]; - vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6]; - vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4]; - vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5]; - vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6]; - vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7]; - - ; store the data - vst1.64 {q2}, [r1]! - vst1.64 {q9-q10}, [r1]! - vst1.64 {q11-q12}, [r1]! - vst1.64 {q13-q14}, [r1]! - vst1.64 {q15}, [r1] - - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass1| - -;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, -; int16_t *pass1_output) -; -; r0 const tran_low_t *src -; r1 int16_t *output -; r2 int16_t *pass1_output - -; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output -; will be stored back into q8-q15 registers. This function will touch q0-q7 -; registers and use them as buffer during calculation. -|vpx_idct16x16_10_add_neon_pass2| PROC - push {r3-r9} - - ; TODO(hkuang): Find a better way to load the elements. 
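
The vqrdmulh idiom described in pass 1 above carries this pass as well: vqrdmulh returns round((a * b) >> 15), so multiplying by a doubled constant yields the round((x * c) >> 14) that dct_const_round_shift() needs, with no widening multiply or narrowing shift. It only applies when a term uses a single constant and 2 * c still fits in int16_t. A sketch:

#include <arm_neon.h>
#include <stdint.h>

/* round((x * cospi_28_64) >> 14) in all 8 lanes; 2 * 3196 = 6392. */
static inline int16x8_t dct_mul_round_shift_sketch(const int16x8_t x) {
  const int16_t cospi_28_64 = 3196;
  return vqrdmulhq_n_s16(x, (int16_t)(2 * cospi_28_64));
}
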
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15 - LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0 - LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0 - LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0 - LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0 - LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0 - vmov.s16 q15, q0; - - ; 2*cospi_30_64 = 3212 - movw r3, #0x0c8c - - ; 2*cospi_2_64 = 32610 - movw r12, #0x7f62 - - ; transpose the input data - TRANSPOSE8X8 - - ; stage 3 - vdup.16 q6, r3 ; duplicate 2*cospi_30_64 - - ; dct_const_round_shift(step1[8] * cospi_30_64) - vqrdmulh.s16 q0, q8, q6 - - vdup.16 q6, r12 ; duplicate 2*cospi_2_64 - - ; dct_const_round_shift(step1[8] * cospi_2_64) - vqrdmulh.s16 q7, q8, q6 - - ; preloading to avoid stall - ; 2*cospi_26_64 = 9512 - movw r12, #0x2528 - rsb r12, #0 - vdup.16 q15, r12 ; duplicate -2*cospi_26_64 - - ; 2*cospi_6_64 = 31358 - movw r3, #0x7a7e - vdup.16 q14, r3 ; duplicate 2*cospi_6_64 - - ; dct_const_round_shift(- step1[12] * cospi_26_64) - vqrdmulh.s16 q3, q9, q15 - - ; dct_const_round_shift(step1[12] * cospi_6_64) - vqrdmulh.s16 q4, q9, q14 - - ; stage 4 - ; cospi_24_64 = 6270 - movw r3, #0x187e - vdup.16 d31, r3 ; duplicate cospi_24_64 - - ; cospi_8_64 = 15137 - movw r12, #0x3b21 - vdup.16 d30, r12 ; duplicate cospi_8_64 - - ; step1[14] * cospi_24_64 - vmull.s16 q12, d14, d31 - vmull.s16 q5, d15, d31 - - ; step1[9] * cospi_24_64 - vmull.s16 q2, d0, d31 - vmull.s16 q11, d1, d31 - - ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64 - vmlsl.s16 q12, d0, d30 - vmlsl.s16 q5, d1, d30 - - ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64 - vmlal.s16 q2, d14, d30 - vmlal.s16 q11, d15, d30 - - rsb r12, #0 - vdup.16 d30, r12 ; duplicate -cospi_8_64 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d2, q12, #14 ; >> 14 - vrshrn.s32 d3, q5, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d12, q2, #14 ; >> 14 - vrshrn.s32 d13, q11, #14 ; >> 14 - - ; - step1[13] * cospi_8_64 - vmull.s16 q10, d8, d30 - vmull.s16 q13, d9, d30 - - ; -step1[10] * cospi_8_64 - vmull.s16 q8, d6, d30 - vmull.s16 q9, d7, d30 - - ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64 - vmlsl.s16 q10, d6, d31 - vmlsl.s16 q13, d7, d31 - - ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64 - vmlal.s16 q8, d8, d31 - vmlal.s16 q9, d9, d31 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q10, #14 ; >> 14 - vrshrn.s32 d5, q13, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q8, #14 ; >> 14 - vrshrn.s32 d11, q9, #14 ; >> 14 - - ; stage 5 - vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11]; - vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10]; - vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10]; - vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11]; - vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15]; - vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14]; - vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14]; - vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15]; - - ; stage 6. 
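
In the 10-coefficient path earlier in this function, each stage-3 rotation has only one nonzero input, so both outputs collapse to single products; the negative product reuses the doubled-constant vqrdmulh trick with the constant negated (the rsb r12, #0 above). A sketch producing step2[11] and step2[12] from the single live input row:

#include <arm_neon.h>
#include <stdint.h>

static inline void idct_one_input_rotation_sketch(const int16x8_t x,
                                                  int16x8_t *s11,
                                                  int16x8_t *s12) {
  const int16_t cospi_6_64 = 15679;  /* 2 * 15679 = 31358, as above */
  const int16_t cospi_26_64 = 4756;  /* 2 * 4756 = 9512, as above */
  /* round((-x * cospi_26_64) >> 14) and round((x * cospi_6_64) >> 14). */
  *s11 = vqrdmulhq_n_s16(x, (int16_t)(-2 * cospi_26_64));
  *s12 = vqrdmulhq_n_s16(x, (int16_t)(2 * cospi_6_64));
}
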
- ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - vdup.16 d14, r12 ; duplicate cospi_16_64 - - ; step1[13] * cospi_16_64 - vmull.s16 q3, d26, d14 - vmull.s16 q4, d27, d14 - - ; step1[10] * cospi_16_64 - vmull.s16 q0, d20, d14 - vmull.s16 q1, d21, d14 - - ; temp1 = (-step1[10] + step1[13]) * cospi_16_64 - vsub.s32 q5, q3, q0 - vsub.s32 q6, q4, q1 - - ; temp2 = (step1[10] + step1[13]) * cospi_16_64 - vadd.s32 q0, q3, q0 - vadd.s32 q1, q4, q1 - - ; dct_const_round_shift(temp1) - vrshrn.s32 d4, q5, #14 ; >> 14 - vrshrn.s32 d5, q6, #14 ; >> 14 - - ; dct_const_round_shift(temp2) - vrshrn.s32 d10, q0, #14 ; >> 14 - vrshrn.s32 d11, q1, #14 ; >> 14 - - ; step1[11] * cospi_16_64 - vmull.s16 q0, d22, d14 - vmull.s16 q1, d23, d14 - - ; step1[12] * cospi_16_64 - vmull.s16 q13, d24, d14 - vmull.s16 q6, d25, d14 - - ; temp1 = (-step1[11] + step1[12]) * cospi_16_64 - vsub.s32 q10, q13, q0 - vsub.s32 q4, q6, q1 - - ; temp2 = (step1[11] + step1[12]) * cospi_16_64 - vadd.s32 q13, q13, q0 - vadd.s32 q6, q6, q1 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d6, q10, #14 ; >> 14 - vrshrn.s32 d7, q4, #14 ; >> 14 - - ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64); - vrshrn.s32 d8, q13, #14 ; >> 14 - vrshrn.s32 d9, q6, #14 ; >> 14 - - mov r4, #16 ; pass1_output stride - ldr r3, [sp] ; load skip_adding - - ; stage 7 - ; load the data in pass1 - mov r5, #24 - mov r3, #8 - - vld1.s16 {q0}, [r2], r4 ; load data step2[0] - vld1.s16 {q1}, [r2], r4 ; load data step2[1] - vadd.s16 q12, q0, q15 ; step2[0] + step2[15] - vadd.s16 q13, q1, q14 ; step2[1] + step2[14] - vld1.s16 {q10}, [r2], r4 ; load data step2[2] - vld1.s16 {q11}, [r2], r4 ; load data step2[3] - vst1.64 {d24}, [r1], r3 ; store output[0] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[1] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q5 ; step2[2] + step2[13] - vadd.s16 q13, q11, q4 ; step2[3] + step2[12] - vsub.s16 q14, q1, q14 ; step2[1] - step2[14] - vsub.s16 q15, q0, q15 ; step2[0] - step2[15] - vst1.64 {d24}, [r1], r3 ; store output[2] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[3] - vst1.64 {d27}, [r1], r5 - vsub.s16 q4, q11, q4 ; step2[3] - step2[12] - vsub.s16 q5, q10, q5 ; step2[2] - step2[13] - vld1.s16 {q0}, [r2], r4 ; load data step2[4] - vld1.s16 {q1}, [r2], r4 ; load data step2[5] - vadd.s16 q12, q0, q3 ; step2[4] + step2[11] - vadd.s16 q13, q1, q2 ; step2[5] + step2[10] - vld1.s16 {q10}, [r2], r4 ; load data step2[6] - vld1.s16 {q11}, [r2], r4 ; load data step2[7] - vst1.64 {d24}, [r1], r3 ; store output[4] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[5] - vst1.64 {d27}, [r1], r5 - vadd.s16 q12, q10, q9 ; step2[6] + step2[9] - vadd.s16 q13, q11, q8 ; step2[7] + step2[8] - vsub.s16 q2, q1, q2 ; step2[5] - step2[10] - vsub.s16 q3, q0, q3 ; step2[4] - step2[11] - vsub.s16 q8, q11, q8 ; step2[7] - step2[8] - vsub.s16 q9, q10, q9 ; step2[6] - step2[9] - vst1.64 {d24}, [r1], r3 ; store output[6] - vst1.64 {d25}, [r1], r5 - vst1.64 {d26}, [r1], r3 ; store output[7] - vst1.64 {d27}, [r1], r5 - - ; store the data output 8,9,10,11,12,13,14,15 - vst1.64 {d16}, [r1], r3 - vst1.64 {d17}, [r1], r5 - vst1.64 {d18}, [r1], r3 - vst1.64 {d19}, [r1], r5 - vst1.64 {d4}, [r1], r3 - vst1.64 {d5}, [r1], r5 - vst1.64 {d6}, [r1], r3 - vst1.64 {d7}, [r1], r5 - vst1.64 {d8}, [r1], r3 - vst1.64 {d9}, [r1], r5 - vst1.64 {d10}, [r1], r3 - vst1.64 {d11}, [r1], r5 - vst1.64 {d28}, [r1], r3 - vst1.64 {d29}, [r1], r5 - vst1.64 {d30}, [r1], r3 - vst1.64 {d31}, [r1], r5 
-end_idct10_16x16_pass2 - pop {r3-r9} - bx lr - ENDP ; |vpx_idct16x16_10_add_neon_pass2| - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c index 0c891919b76..4259cd8cadc 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c @@ -14,106 +14,10 @@ #include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/txfm_common.h" -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE void idct16x16_256_add_load_tran_low_kernel( - const tran_low_t **input, int16_t **out) { - int16x8_t s; - - s = load_tran_low_to_s16q(*input); - vst1q_s16(*out, s); - *input += 8; - *out += 8; -} - -static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input, - int16_t *out) { - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); - idct16x16_256_add_load_tran_low_kernel(&input, &out); -} -#endif // CONFIG_VP9_HIGHBITDEPTH - static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0, int16x4_t *const d1) { - *d0 = vrshrn_n_s32(t32[0], 14); - *d1 = vrshrn_n_s32(t32[1], 14); -} - -static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_2_30_10_22, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); - t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); - t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); - t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), 
cospi_2_30_10_22, 0); - t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); -} - -static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_4_12_20N_28, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); - t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); - t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); - t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); - t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); - idct16x16_add_wrap_low_8x2(t32, d0, d1); -} - -static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_6_26_14_18N, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0); - t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1); - t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1); - t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1); - t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1); - idct16x16_add_wrap_low_8x2(t32, d0, d1); + *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS); } static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0, @@ -146,54 +50,6 @@ static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1, wrap_low_4x2(t32, d0, d1); } -static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_2_30_10_22, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); - t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); - t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); - t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); - t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); -} - -static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_4_12_20N_28, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); - t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); - t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); - t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); - t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), 
cospi_4_12_20N_28, 2); - idct16x16_add_wrap_low_8x2(t32, d0, d1); -} - -static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, - const int16x4_t cospi_6_26_14_18N, - int16x8_t *const d0, int16x8_t *const d1) { - int32x4_t t32[6]; - - t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2); - t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2); - t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2); - t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2); - t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3); - t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3); - t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3); - t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3); - idct16x16_add_wrap_low_8x2(t32, d0, d1); -} - static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, const int16x4_t cospi_0_8_16_24, int16x4_t *const d0, @@ -206,8 +62,68 @@ static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1, wrap_low_4x2(t32, d0, d1); } -static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output, - uint8_t *dest, int stride) { +static INLINE void idct16x16_add_store(const int16x8_t *const out, + uint8_t *dest, const int stride) { + // Add the result to dest + idct16x16_add8x1(out[0], &dest, stride); + idct16x16_add8x1(out[1], &dest, stride); + idct16x16_add8x1(out[2], &dest, stride); + idct16x16_add8x1(out[3], &dest, stride); + idct16x16_add8x1(out[4], &dest, stride); + idct16x16_add8x1(out[5], &dest, stride); + idct16x16_add8x1(out[6], &dest, stride); + idct16x16_add8x1(out[7], &dest, stride); + idct16x16_add8x1(out[8], &dest, stride); + idct16x16_add8x1(out[9], &dest, stride); + idct16x16_add8x1(out[10], &dest, stride); + idct16x16_add8x1(out[11], &dest, stride); + idct16x16_add8x1(out[12], &dest, stride); + idct16x16_add8x1(out[13], &dest, stride); + idct16x16_add8x1(out[14], &dest, stride); + idct16x16_add8x1(out[15], &dest, stride); +} + +static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest, + const int stride) { + // Add the result to dest + const int16x8_t max = vdupq_n_s16((1 << 8) - 1); + out[0] = vrshrq_n_s16(out[0], 6); + out[1] = vrshrq_n_s16(out[1], 6); + out[2] = vrshrq_n_s16(out[2], 6); + out[3] = vrshrq_n_s16(out[3], 6); + out[4] = vrshrq_n_s16(out[4], 6); + out[5] = vrshrq_n_s16(out[5], 6); + out[6] = vrshrq_n_s16(out[6], 6); + out[7] = vrshrq_n_s16(out[7], 6); + out[8] = vrshrq_n_s16(out[8], 6); + out[9] = vrshrq_n_s16(out[9], 6); + out[10] = vrshrq_n_s16(out[10], 6); + out[11] = vrshrq_n_s16(out[11], 6); + out[12] = vrshrq_n_s16(out[12], 6); + out[13] = vrshrq_n_s16(out[13], 6); + out[14] = vrshrq_n_s16(out[14], 6); + out[15] = vrshrq_n_s16(out[15], 6); + highbd_idct16x16_add8x1(out[0], max, &dest, stride); + highbd_idct16x16_add8x1(out[1], max, &dest, stride); + highbd_idct16x16_add8x1(out[2], max, &dest, stride); + highbd_idct16x16_add8x1(out[3], max, &dest, stride); + highbd_idct16x16_add8x1(out[4], max, &dest, stride); + highbd_idct16x16_add8x1(out[5], max, &dest, stride); + highbd_idct16x16_add8x1(out[6], max, &dest, stride); + highbd_idct16x16_add8x1(out[7], max, &dest, stride); + highbd_idct16x16_add8x1(out[8], max, &dest, stride); + highbd_idct16x16_add8x1(out[9], max, &dest, stride); + highbd_idct16x16_add8x1(out[10], max, &dest, stride); + highbd_idct16x16_add8x1(out[11], max, &dest, stride); + highbd_idct16x16_add8x1(out[12], max, &dest, 
stride); + highbd_idct16x16_add8x1(out[13], max, &dest, stride); + highbd_idct16x16_add8x1(out[14], max, &dest, stride); + highbd_idct16x16_add8x1(out[15], max, &dest, stride); +} + +void idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag) { const int16x8_t cospis0 = vld1q_s16(kCospi); const int16x8_t cospis1 = vld1q_s16(kCospi + 8); const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); @@ -217,37 +133,73 @@ static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output, int16x8_t in[16], step1[16], step2[16], out[16]; // Load input (16x8) - in[0] = vld1q_s16(input); - input += 8; - in[8] = vld1q_s16(input); - input += 8; - in[1] = vld1q_s16(input); - input += 8; - in[9] = vld1q_s16(input); - input += 8; - in[2] = vld1q_s16(input); - input += 8; - in[10] = vld1q_s16(input); - input += 8; - in[3] = vld1q_s16(input); - input += 8; - in[11] = vld1q_s16(input); - input += 8; - in[4] = vld1q_s16(input); - input += 8; - in[12] = vld1q_s16(input); - input += 8; - in[5] = vld1q_s16(input); - input += 8; - in[13] = vld1q_s16(input); - input += 8; - in[6] = vld1q_s16(input); - input += 8; - in[14] = vld1q_s16(input); - input += 8; - in[7] = vld1q_s16(input); - input += 8; - in[15] = vld1q_s16(input); + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[8] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[9] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[10] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[11] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[12] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[13] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[14] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[7] = load_tran_low_to_s16q(inputT); + inputT += 8; + in[15] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 8; + in[8] = vld1q_s16(inputT); + inputT += 8; + in[1] = vld1q_s16(inputT); + inputT += 8; + in[9] = vld1q_s16(inputT); + inputT += 8; + in[2] = vld1q_s16(inputT); + inputT += 8; + in[10] = vld1q_s16(inputT); + inputT += 8; + in[3] = vld1q_s16(inputT); + inputT += 8; + in[11] = vld1q_s16(inputT); + inputT += 8; + in[4] = vld1q_s16(inputT); + inputT += 8; + in[12] = vld1q_s16(inputT); + inputT += 8; + in[5] = vld1q_s16(inputT); + inputT += 8; + in[13] = vld1q_s16(inputT); + inputT += 8; + in[6] = vld1q_s16(inputT); + inputT += 8; + in[14] = vld1q_s16(inputT); + inputT += 8; + in[7] = vld1q_s16(inputT); + inputT += 8; + in[15] = vld1q_s16(inputT); + } // Transpose transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], @@ -358,79 +310,181 @@ static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output, step2[15] = step1[15]; // stage 7 - out[0] = vaddq_s16(step2[0], step2[15]); - out[1] = vaddq_s16(step2[1], step2[14]); - out[2] = vaddq_s16(step2[2], step2[13]); - out[3] = vaddq_s16(step2[3], step2[12]); - out[4] = vaddq_s16(step2[4], step2[11]); - out[5] = vaddq_s16(step2[5], step2[10]); - out[6] = vaddq_s16(step2[6], step2[9]); - out[7] = 
vaddq_s16(step2[7], step2[8]); - out[8] = vsubq_s16(step2[7], step2[8]); - out[9] = vsubq_s16(step2[6], step2[9]); - out[10] = vsubq_s16(step2[5], step2[10]); - out[11] = vsubq_s16(step2[4], step2[11]); - out[12] = vsubq_s16(step2[3], step2[12]); - out[13] = vsubq_s16(step2[2], step2[13]); - out[14] = vsubq_s16(step2[1], step2[14]); - out[15] = vsubq_s16(step2[0], step2[15]); + idct16x16_add_stage7(step2, out); if (output) { - // pass 1: save the result into output - vst1q_s16(output, out[0]); - output += 16; - vst1q_s16(output, out[1]); - output += 16; - vst1q_s16(output, out[2]); - output += 16; - vst1q_s16(output, out[3]); - output += 16; - vst1q_s16(output, out[4]); - output += 16; - vst1q_s16(output, out[5]); - output += 16; - vst1q_s16(output, out[6]); - output += 16; - vst1q_s16(output, out[7]); - output += 16; - vst1q_s16(output, out[8]); - output += 16; - vst1q_s16(output, out[9]); - output += 16; - vst1q_s16(output, out[10]); - output += 16; - vst1q_s16(output, out[11]); - output += 16; - vst1q_s16(output, out[12]); - output += 16; - vst1q_s16(output, out[13]); - output += 16; - vst1q_s16(output, out[14]); - output += 16; - vst1q_s16(output, out[15]); + idct16x16_store_pass1(out, output); } else { - // pass 2: add the result to dest. - idct16x16_add8x1(out[0], &dest, stride); - idct16x16_add8x1(out[1], &dest, stride); - idct16x16_add8x1(out[2], &dest, stride); - idct16x16_add8x1(out[3], &dest, stride); - idct16x16_add8x1(out[4], &dest, stride); - idct16x16_add8x1(out[5], &dest, stride); - idct16x16_add8x1(out[6], &dest, stride); - idct16x16_add8x1(out[7], &dest, stride); - idct16x16_add8x1(out[8], &dest, stride); - idct16x16_add8x1(out[9], &dest, stride); - idct16x16_add8x1(out[10], &dest, stride); - idct16x16_add8x1(out[11], &dest, stride); - idct16x16_add8x1(out[12], &dest, stride); - idct16x16_add8x1(out[13], &dest, stride); - idct16x16_add8x1(out[14], &dest, stride); - idct16x16_add8x1(out[15], &dest, stride); + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } } } -static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, - int16_t *output) { +void idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { + const int16x8_t cospis0 = vld1q_s16(kCospi); + const int16x8_t cospis1 = vld1q_s16(kCospi + 8); + const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); + const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1); + const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0); + const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0); + const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0); + const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1); + const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); + int16x8_t in[8], step1[16], step2[16], out[16]; + + // Load input (8x8) + if (output) { + const tran_low_t *inputT = (const tran_low_t *)input; + in[0] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[1] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[2] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[3] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[4] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[5] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[6] = load_tran_low_to_s16q(inputT); + inputT += 16; + in[7] = load_tran_low_to_s16q(inputT); + } else { + const int16_t *inputT = (const int16_t *)input; + in[0] = vld1q_s16(inputT); + inputT += 16; + in[1] = 
vld1q_s16(inputT); + inputT += 16; + in[2] = vld1q_s16(inputT); + inputT += 16; + in[3] = vld1q_s16(inputT); + inputT += 16; + in[4] = vld1q_s16(inputT); + inputT += 16; + in[5] = vld1q_s16(inputT); + inputT += 16; + in[6] = vld1q_s16(inputT); + inputT += 16; + in[7] = vld1q_s16(inputT); + } + + // Transpose + transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], + &in[7]); + + // stage 1 + step1[0] = in[0 / 2]; + step1[2] = in[8 / 2]; + step1[4] = in[4 / 2]; + step1[6] = in[12 / 2]; + step1[8] = in[2 / 2]; + step1[10] = in[10 / 2]; + step1[12] = in[6 / 2]; + step1[14] = in[14 / 2]; // 0 in pass 1 + + // stage 2 + step2[0] = step1[0]; + step2[2] = step1[2]; + step2[4] = step1[4]; + step2[6] = step1[6]; + step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1); + step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3); + step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3); + step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1); + step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0); + step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2); + step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2); + step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0); + + // stage 3 + step1[0] = step2[0]; + step1[2] = step2[2]; + step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3); + step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2); + step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1); + step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0); + step1[8] = vaddq_s16(step2[8], step2[9]); + step1[9] = vsubq_s16(step2[8], step2[9]); + step1[10] = vsubq_s16(step2[11], step2[10]); + step1[11] = vaddq_s16(step2[11], step2[10]); + step1[12] = vaddq_s16(step2[12], step2[13]); + step1[13] = vsubq_s16(step2[12], step2[13]); + step1[14] = vsubq_s16(step2[15], step2[14]); + step1[15] = vaddq_s16(step2[15], step2[14]); + + // stage 4 + step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2); + step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3); + step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1); + step2[4] = vaddq_s16(step1[4], step1[5]); + step2[5] = vsubq_s16(step1[4], step1[5]); + step2[6] = vsubq_s16(step1[7], step1[6]); + step2[7] = vaddq_s16(step1[7], step1[6]); + step2[8] = step1[8]; + idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9], + &step2[14]); + idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13], + &step2[10]); + step2[11] = step1[11]; + step2[12] = step1[12]; + step2[15] = step1[15]; + + // stage 5 + step1[0] = vaddq_s16(step2[0], step2[3]); + step1[1] = vaddq_s16(step2[1], step2[2]); + step1[2] = vsubq_s16(step2[1], step2[2]); + step1[3] = vsubq_s16(step2[0], step2[3]); + step1[4] = step2[4]; + idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]); + step1[7] = step2[7]; + step1[8] = vaddq_s16(step2[8], step2[11]); + step1[9] = vaddq_s16(step2[9], step2[10]); + step1[10] = vsubq_s16(step2[9], step2[10]); + step1[11] = vsubq_s16(step2[8], step2[11]); + step1[12] = vsubq_s16(step2[15], step2[12]); + step1[13] = vsubq_s16(step2[14], step2[13]); + step1[14] = vaddq_s16(step2[14], step2[13]); + step1[15] = vaddq_s16(step2[15], step2[12]); + + // stage 6 + step2[0] = vaddq_s16(step1[0], step1[7]); + step2[1] = vaddq_s16(step1[1], step1[6]); + step2[2] = vaddq_s16(step1[2], step1[5]); + step2[3] = vaddq_s16(step1[3], step1[4]); + step2[4] = vsubq_s16(step1[3], step1[4]); + 
step2[5] = vsubq_s16(step1[2], step1[5]); + step2[6] = vsubq_s16(step1[1], step1[6]); + step2[7] = vsubq_s16(step1[0], step1[7]); + idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10], + &step2[13]); + idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11], + &step2[12]); + step2[8] = step1[8]; + step2[9] = step1[9]; + step2[14] = step1[14]; + step2[15] = step1[15]; + + // stage 7 + idct16x16_add_stage7(step2, out); + + if (output) { + idct16x16_store_pass1(out, output); + } else { + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } + } +} + +void idct16x16_10_add_half1d_pass1(const tran_low_t *input, int16_t *output) { const int16x8_t cospis0 = vld1q_s16(kCospi); const int16x8_t cospis1 = vld1q_s16(kCospi + 8); const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); @@ -442,8 +496,7 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1); int16x4_t in[4], step1[16], step2[16], out[16]; -// Load input (4x4) -#if CONFIG_VP9_HIGHBITDEPTH + // Load input (4x4) in[0] = load_tran_low_to_s16d(input); input += 16; in[1] = load_tran_low_to_s16d(input); @@ -451,15 +504,6 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, in[2] = load_tran_low_to_s16d(input); input += 16; in[3] = load_tran_low_to_s16d(input); -#else - in[0] = vld1_s16(input); - input += 16; - in[1] = vld1_s16(input); - input += 16; - in[2] = vld1_s16(input); - input += 16; - in[3] = vld1_s16(input); -#endif // CONFIG_VP9_HIGHBITDEPTH // Transpose transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]); @@ -593,8 +637,9 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input, vst1_s16(output, out[15]); } -static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output, - uint8_t *dest, int stride) { +void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag) { const int16x8_t cospis0 = vld1q_s16(kCospi); const int16x8_t cospis1 = vld1q_s16(kCospi + 8); const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0); @@ -706,74 +751,16 @@ static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output, step2[15] = step1[15]; // stage 7 - out[0] = vaddq_s16(step2[0], step2[15]); - out[1] = vaddq_s16(step2[1], step2[14]); - out[2] = vaddq_s16(step2[2], step2[13]); - out[3] = vaddq_s16(step2[3], step2[12]); - out[4] = vaddq_s16(step2[4], step2[11]); - out[5] = vaddq_s16(step2[5], step2[10]); - out[6] = vaddq_s16(step2[6], step2[9]); - out[7] = vaddq_s16(step2[7], step2[8]); - out[8] = vsubq_s16(step2[7], step2[8]); - out[9] = vsubq_s16(step2[6], step2[9]); - out[10] = vsubq_s16(step2[5], step2[10]); - out[11] = vsubq_s16(step2[4], step2[11]); - out[12] = vsubq_s16(step2[3], step2[12]); - out[13] = vsubq_s16(step2[2], step2[13]); - out[14] = vsubq_s16(step2[1], step2[14]); - out[15] = vsubq_s16(step2[0], step2[15]); + idct16x16_add_stage7(step2, out); if (output) { - // pass 1: save the result into output - vst1q_s16(output, out[0]); - output += 16; - vst1q_s16(output, out[1]); - output += 16; - vst1q_s16(output, out[2]); - output += 16; - vst1q_s16(output, out[3]); - output += 16; - vst1q_s16(output, out[4]); - output += 16; - vst1q_s16(output, out[5]); - output += 16; - vst1q_s16(output, out[6]); - output += 16; - vst1q_s16(output, out[7]); - output += 16; - vst1q_s16(output, out[8]); - output += 16; - vst1q_s16(output, out[9]); - 
output += 16; - vst1q_s16(output, out[10]); - output += 16; - vst1q_s16(output, out[11]); - output += 16; - vst1q_s16(output, out[12]); - output += 16; - vst1q_s16(output, out[13]); - output += 16; - vst1q_s16(output, out[14]); - output += 16; - vst1q_s16(output, out[15]); + idct16x16_store_pass1(out, output); } else { - // pass 2: add the result to dest. - idct16x16_add8x1(out[0], &dest, stride); - idct16x16_add8x1(out[1], &dest, stride); - idct16x16_add8x1(out[2], &dest, stride); - idct16x16_add8x1(out[3], &dest, stride); - idct16x16_add8x1(out[4], &dest, stride); - idct16x16_add8x1(out[5], &dest, stride); - idct16x16_add8x1(out[6], &dest, stride); - idct16x16_add8x1(out[7], &dest, stride); - idct16x16_add8x1(out[8], &dest, stride); - idct16x16_add8x1(out[9], &dest, stride); - idct16x16_add8x1(out[10], &dest, stride); - idct16x16_add8x1(out[11], &dest, stride); - idct16x16_add8x1(out[12], &dest, stride); - idct16x16_add8x1(out[13], &dest, stride); - idct16x16_add8x1(out[14], &dest, stride); - idct16x16_add8x1(out[15], &dest, stride); + if (highbd_flag) { + idct16x16_add_store_bd8(out, dest, stride); + } else { + idct16x16_add_store(out, dest, stride); + } } } @@ -781,27 +768,36 @@ void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { int16_t row_idct_output[16 * 16]; -#if CONFIG_VP9_HIGHBITDEPTH - int16_t pass1_input[16 * 16]; - idct16x16_256_add_load_tran_low(input, pass1_input); -#else - const int16_t *pass1_input = input; -#endif // CONFIG_VP9_HIGHBITDEPTH - // pass 1 // Parallel idct on the upper 8 rows - idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride); + idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0); // Parallel idct on the lower 8 rows - idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest, - stride); + idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride, + 0); + + // pass 2 + // Parallel idct to get the left 8 columns + idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0); + + // Parallel idct to get the right 8 columns + idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0); +} + +void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + int16_t row_idct_output[16 * 16]; + + // pass 1 + // Parallel idct on the upper 8 rows + idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0); // pass 2 // Parallel idct to get the left 8 columns - idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride); + idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0); // Parallel idct to get the right 8 columns - idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride); + idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0); } void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, @@ -814,9 +810,9 @@ void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, // pass 2 // Parallel idct to get the left 8 columns - idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride); + idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0); // Parallel idct to get the right 8 columns - idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, - stride); + idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, stride, + 0); } diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c deleted 
file mode 100644 index 47366bcb7d6..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_dsp_rtcd.h" -#include "vpx_dsp/vpx_dsp_common.h" - -void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output); -void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output, - int16_t *pass1_output, - int16_t skip_adding, uint8_t *dest, - int stride); -#if CONFIG_VP9_HIGHBITDEPTH -void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input, - int16_t *output); -void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src, - int16_t *output, - int16_t *pass1_output, - int16_t skip_adding, - uint8_t *dest, int stride); -#else -#define vpx_idct16x16_256_add_neon_pass1_tran_low \ - vpx_idct16x16_256_add_neon_pass1 -#define vpx_idct16x16_256_add_neon_pass2_tran_low \ - vpx_idct16x16_256_add_neon_pass2 -#endif - -void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output); -void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output, - int16_t *pass1_output); - -#if HAVE_NEON_ASM -/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */ -extern void vpx_push_neon(int64_t *store); -extern void vpx_pop_neon(int64_t *store); -#endif // HAVE_NEON_ASM - -void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output, - pass1_output, 0, dest, stride); - - /* Parallel idct on the lower 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_256_add_neon_pass2_tran_low( - input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride); - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. 
- // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. - vpx_pop_neon(store_reg); -#endif -} - -void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { -#if HAVE_NEON_ASM - int64_t store_reg[8]; -#endif - int16_t pass1_output[16 * 16] = { 0 }; - int16_t row_idct_output[16 * 16] = { 0 }; - -#if HAVE_NEON_ASM - // save d8-d15 register values. - vpx_push_neon(store_reg); -#endif - - /* Parallel idct on the upper 8 rows */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_10_add_neon_pass1(input, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7 - // which will be saved into row_idct_output. - vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output); - - /* Skip Parallel idct on the lower 8 rows as they are all 0s */ - - /* Parallel idct on the left 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output, - pass1_output, 1, dest, stride); - - /* Parallel idct on the right 8 columns */ - // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the - // stage 6 result in pass1_output. - vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output); - - // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines - // with result in pass1(pass1_output) to calculate final result in stage 7. - // Then add the result to the destination data. - vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1, - row_idct_output + 8, pass1_output, 1, - dest + 8, stride); - -#if HAVE_NEON_ASM - // restore d8-d15 register values. 
- vpx_pop_neon(store_reg); -#endif -} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c index de1bf978750..ae9457e18ee 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c @@ -147,8 +147,10 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16, q11s32 = vaddq_s32(q12s32, q11s32); q10s32 = vaddq_s32(q10s32, q15s32); - *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14)); - *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14)); + *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS), + vrshrn_n_s32(q9s32, DCT_CONST_BITS)); + *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS), + vrshrn_n_s32(q10s32, DCT_CONST_BITS)); } static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm deleted file mode 100644 index 29f678a0382..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm +++ /dev/null @@ -1,86 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - - EXPORT |vpx_idct8x8_1_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_1_add_neon| PROC - ldrsh r0, [r0] - - ; cospi_16_64 = 11585 - movw r12, #0x2d41 - - ; out = dct_const_round_shift(input[0] * cospi_16_64) - mul r0, r0, r12 ; input[0] * cospi_16_64 - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; out = dct_const_round_shift(out * cospi_16_64) - mul r0, r0, r12 ; out * cospi_16_64 - mov r12, r1 ; save dest - add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1)) - asr r0, r0, #14 ; >> DCT_CONST_BITS - - ; a1 = ROUND_POWER_OF_TWO(out, 5) - add r0, r0, #16 ; + (1 <<((5) - 1)) - asr r0, r0, #5 ; >> 5 - - vdup.s16 q0, r0 ; duplicate a1 - - ; load destination data - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1], r2 - vld1.64 {d16}, [r1], r2 - vld1.64 {d17}, [r1] - - vaddw.u8 q9, q0, d2 ; dest[x] + a1 - vaddw.u8 q10, q0, d3 ; dest[x] + a1 - vaddw.u8 q11, q0, d4 ; dest[x] + a1 - vaddw.u8 q12, q0, d5 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], r2 - vst1.64 {d31}, [r12], r2 - - vaddw.u8 q9, q0, d6 ; dest[x] + a1 - vaddw.u8 q10, q0, d7 ; dest[x] + a1 - vaddw.u8 q11, q0, d16 ; dest[x] + a1 - vaddw.u8 q12, q0, d17 ; dest[x] + a1 - vqmovun.s16 d2, q9 ; clip_pixel - vqmovun.s16 d3, q10 ; clip_pixel - vqmovun.s16 d30, q11 ; clip_pixel - vqmovun.s16 d31, q12 ; clip_pixel - vst1.64 {d2}, [r12], r2 - vst1.64 {d3}, [r12], r2 - vst1.64 {d30}, [r12], 
r2 - vst1.64 {d31}, [r12], r2 - - bx lr - ENDP ; |vpx_idct8x8_1_add_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm deleted file mode 100644 index 2bfbcc5a52c..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm +++ /dev/null @@ -1,507 +0,0 @@ -; -; Copyright (c) 2013 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vpx_idct8x8_64_add_neon| - EXPORT |vpx_idct8x8_12_add_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - - INCLUDE vpx_dsp/arm/idct_neon.asm.S - - ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are - ; loaded in q8-q15. The output will be stored back into q8-q15 registers. - ; This macro will touch q0-q7 registers and use them as buffer during - ; calculation. - MACRO - IDCT8x8_1D - ; stage 1 - vdup.16 d0, r3 ; duplicate cospi_28_64 - vdup.16 d1, r4 ; duplicate cospi_4_64 - vdup.16 d2, r5 ; duplicate cospi_12_64 - vdup.16 d3, r6 ; duplicate cospi_20_64 - - ; input[1] * cospi_28_64 - vmull.s16 q2, d18, d0 - vmull.s16 q3, d19, d0 - - ; input[5] * cospi_12_64 - vmull.s16 q5, d26, d2 - vmull.s16 q6, d27, d2 - - ; input[1]*cospi_28_64-input[7]*cospi_4_64 - vmlsl.s16 q2, d30, d1 - vmlsl.s16 q3, d31, d1 - - ; input[5] * cospi_12_64 - input[3] * cospi_20_64 - vmlsl.s16 q5, d22, d3 - vmlsl.s16 q6, d23, d3 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d8, q2, #14 ; >> 14 - vrshrn.s32 d9, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q5, #14 ; >> 14 - vrshrn.s32 d11, q6, #14 ; >> 14 - - ; input[1] * cospi_4_64 - vmull.s16 q2, d18, d1 - vmull.s16 q3, d19, d1 - - ; input[5] * cospi_20_64 - vmull.s16 q9, d26, d3 - vmull.s16 q13, d27, d3 - - ; input[1]*cospi_4_64+input[7]*cospi_28_64 - vmlal.s16 q2, d30, d0 - vmlal.s16 q3, d31, d0 - - ; input[5] * cospi_20_64 + input[3] * cospi_12_64 - vmlal.s16 q9, d22, d2 - vmlal.s16 q13, d23, d2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d14, q2, #14 ; >> 14 - vrshrn.s32 d15, q3, #14 ; >> 14 - - ; stage 2 & stage 3 - even half - vdup.16 d0, r7 ; duplicate cospi_16_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q9, #14 ; >> 14 - vrshrn.s32 d13, q13, #14 ; >> 14 - - ; input[0] * cospi_16_64 - vmull.s16 q2, d16, d0 - vmull.s16 q3, d17, d0 - - ; input[0] * cospi_16_64 - vmull.s16 q13, d16, d0 - vmull.s16 q15, d17, d0 - - ; (input[0] + input[2]) * cospi_16_64 - vmlal.s16 q2, d24, d0 - vmlal.s16 q3, d25, d0 - - ; (input[0] - input[2]) * cospi_16_64 - vmlsl.s16 q13, d24, d0 - vmlsl.s16 q15, d25, d0 - - vdup.16 d0, r8 ; duplicate cospi_24_64 - vdup.16 d1, r9 ; duplicate cospi_8_64 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d18, q2, #14 ; >> 14 - vrshrn.s32 d19, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d22, q13, #14 ; >> 14 - vrshrn.s32 d23, q15, #14 ; >> 14 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - ; input[1] * cospi_24_64 - vmull.s16 q2, d20, d0 - vmull.s16 q3, d21, d0 - - ; input[1] * cospi_8_64 - vmull.s16 q8, d20, d1 - 
vmull.s16 q12, d21, d1 - - ; input[1] * cospi_24_64 - input[3] * cospi_8_64 - vmlsl.s16 q2, d28, d1 - vmlsl.s16 q3, d29, d1 - - ; input[1] * cospi_8_64 + input[3] * cospi_24_64 - vmlal.s16 q8, d28, d0 - vmlal.s16 q12, d29, d0 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d26, q2, #14 ; >> 14 - vrshrn.s32 d27, q3, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d30, q8, #14 ; >> 14 - vrshrn.s32 d31, q12, #14 ; >> 14 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q9, #14 ; >> 14 - vrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q11, #14 ; >> 14 - vrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - MEND - - ; Transpose an 8x8 16bit data matrix. Data is loaded in q8-q15.
- MACRO - TRANSPOSE8X8 - vswp d17, d24 - vswp d23, d30 - vswp d21, d28 - vswp d19, d26 - vtrn.32 q8, q10 - vtrn.32 q9, q11 - vtrn.32 q12, q14 - vtrn.32 q13, q15 - vtrn.16 q8, q9 - vtrn.16 q10, q11 - vtrn.16 q12, q13 - vtrn.16 q14, q15 - MEND - - AREA Block, CODE, READONLY ; name this block of code -;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_64_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 - - ; transpose the input data - TRANSPOSE8X8 - - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r4, #0x3ec5 - - ; cospi_12_64 = 13623 - movw r5, #0x3537 - - ; cospi_20_64 = 9102 - movw r6, #0x238e - - ; cospi_16_64 = 11585 - movw r7, #0x2d41 - - ; cospi_24_64 = 6270 - movw r8, #0x187e - - ; cospi_8_64 = 15137 - movw r9, #0x3b21 - - ; First transform rows - IDCT8x8_1D - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_64_add_neon| - -;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride) -; -; r0 int16_t input -; r1 uint8_t *dest -; r2 int stride) - -|vpx_idct8x8_12_add_neon| PROC - push {r4-r9} - vpush {d8-d15} - LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0 - LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0 - LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0 - LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0 - - ; transpose the input data - TRANSPOSE8X8 - - ; cospi_28_64 = 3196 - movw r3, #0x0c7c - - ; cospi_4_64 = 16069 - movw r4, #0x3ec5 - - ; cospi_12_64 = 13623 - movw r5, #0x3537 - - ; cospi_20_64 = 9102 - movw r6, #0x238e - - ; cospi_16_64 = 11585 - movw r7, #0x2d41 - - ; cospi_24_64 = 6270 - movw r8, #0x187e - - ; cospi_8_64 = 15137 - movw r9, #0x3b21 - - ; First transform rows - ; stage 1 - ; The following instructions use vqrdmulh to do the - ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling - ; multiply and shift the result by 16 bits instead of 14 bits. So we need - ; to double the constants before multiplying to compensate this. 
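The comment above describes the same doubling trick used by multiply_shift_and_narrow_s16() in idct_neon.h later in this patch, and by the idct16x16 pass1 assembly earlier. A minimal C sketch of why the two forms are equivalent; the helper name here is illustrative only, while dct_const_round_shift() and DCT_CONST_BITS are the definitions from vpx_dsp/txfm_common.h:

#include <arm_neon.h>
#include <stdint.h>

#define DCT_CONST_BITS 14

/* Scalar reference, as in vpx_dsp/txfm_common.h:
 * dct_const_round_shift(x) == ROUND_POWER_OF_TWO(x, DCT_CONST_BITS). */
static int32_t dct_const_round_shift(int64_t input) {
  return (int32_t)((input + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* vqrdmulh computes (2 * a * b + (1 << 15)) >> 16 with saturation. Passing a
 * pre-doubled constant gives (4 * a * b + (1 << 15)) >> 16, which equals
 * (a * b + (1 << 13)) >> 14, i.e. dct_const_round_shift(a * b), whenever the
 * result does not saturate. */
static int16x8_t mul_round_shift_s16(const int16x8_t a, const int16_t c) {
  return vqrdmulhq_n_s16(a, c * 2); /* c: a cospi_*_64 constant */
}

This is why the code below doubles each cosine constant (mov r12, rN, lsl #1) before the vqrdmulh.s16, rather than using the longer vmull/vrshrn sequence seen elsewhere in these files.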
- mov r12, r3, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_28_64*2 - mov r12, r4, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_4_64*2 - - ; dct_const_round_shift(input[1] * cospi_28_64) - vqrdmulh.s16 q4, q9, q0 - - mov r12, r6, lsl #1 - rsb r12, #0 - vdup.16 q0, r12 ; duplicate -cospi_20_64*2 - - ; dct_const_round_shift(input[1] * cospi_4_64) - vqrdmulh.s16 q7, q9, q1 - - mov r12, r5, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_12_64*2 - - ; dct_const_round_shift(- input[3] * cospi_20_64) - vqrdmulh.s16 q5, q11, q0 - - mov r12, r7, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_16_64*2 - - ; dct_const_round_shift(input[3] * cospi_12_64) - vqrdmulh.s16 q6, q11, q1 - - ; stage 2 & stage 3 - even half - mov r12, r8, lsl #1 - vdup.16 q1, r12 ; duplicate cospi_24_64*2 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vqrdmulh.s16 q9, q8, q0 - - mov r12, r9, lsl #1 - vdup.16 q0, r12 ; duplicate cospi_8_64*2 - - ; dct_const_round_shift(input[1] * cospi_24_64) - vqrdmulh.s16 q13, q10, q1 - - ; dct_const_round_shift(input[1] * cospi_8_64) - vqrdmulh.s16 q15, q10, q0 - - ; stage 3 -odd half - vdup.16 d16, r7 ; duplicate cospi_16_64 - - vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3] - vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2] - vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2] - vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3] - - ; stage 2 - odd half - vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5] - vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5] - vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7] - vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7] - - ; step2[6] * cospi_16_64 - vmull.s16 q9, d28, d16 - vmull.s16 q10, d29, d16 - - ; step2[6] * cospi_16_64 - vmull.s16 q11, d28, d16 - vmull.s16 q12, d29, d16 - - ; (step2[6] - step2[5]) * cospi_16_64 - vmlsl.s16 q9, d26, d16 - vmlsl.s16 q10, d27, d16 - - ; (step2[5] + step2[6]) * cospi_16_64 - vmlal.s16 q11, d26, d16 - vmlal.s16 q12, d27, d16 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d10, q9, #14 ; >> 14 - vrshrn.s32 d11, q10, #14 ; >> 14 - - ; dct_const_round_shift(input_dc * cospi_16_64) - vrshrn.s32 d12, q11, #14 ; >> 14 - vrshrn.s32 d13, q12, #14 ; >> 14 - - ; stage 4 - vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7]; - vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6]; - vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5]; - vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4]; - vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4]; - vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5]; - vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6]; - vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7]; - - ; Transpose the matrix - TRANSPOSE8X8 - - ; Then transform columns - IDCT8x8_1D - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) - vrshr.s16 q8, q8, #5 - vrshr.s16 q9, q9, #5 - vrshr.s16 q10, q10, #5 - vrshr.s16 q11, q11, #5 - vrshr.s16 q12, q12, #5 - vrshr.s16 q13, q13, #5 - vrshr.s16 q14, q14, #5 - vrshr.s16 q15, q15, #5 - - ; save dest pointer - mov r0, r1 - - ; load destination data - vld1.64 {d0}, [r1], r2 - vld1.64 {d1}, [r1], r2 - vld1.64 {d2}, [r1], r2 - vld1.64 {d3}, [r1], r2 - vld1.64 {d4}, [r1], r2 - vld1.64 {d5}, [r1], r2 - vld1.64 {d6}, [r1], r2 - vld1.64 {d7}, [r1] - - ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i] - vaddw.u8 q8, q8, d0 - vaddw.u8 q9, q9, d1 - vaddw.u8 q10, q10, d2 - vaddw.u8 q11, q11, d3 - vaddw.u8 q12, q12, d4 - vaddw.u8 q13, q13, d5 - vaddw.u8 q14, q14, d6 - vaddw.u8 q15, q15, d7 - - ; clip_pixel - vqmovun.s16 d0, q8 - 
vqmovun.s16 d1, q9 - vqmovun.s16 d2, q10 - vqmovun.s16 d3, q11 - vqmovun.s16 d4, q12 - vqmovun.s16 d5, q13 - vqmovun.s16 d6, q14 - vqmovun.s16 d7, q15 - - ; store the data - vst1.64 {d0}, [r0], r2 - vst1.64 {d1}, [r0], r2 - vst1.64 {d2}, [r0], r2 - vst1.64 {d3}, [r0], r2 - vst1.64 {d4}, [r0], r2 - vst1.64 {d5}, [r0], r2 - vst1.64 {d6}, [r0], r2 - vst1.64 {d7}, [r0], r2 - - vpop {d8-d15} - pop {r4-r9} - bx lr - ENDP ; |vpx_idct8x8_12_add_neon| - - END diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h index d9b85223c76..fe5b603e21b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h @@ -15,6 +15,7 @@ #include "./vpx_config.h" #include "vpx_dsp/arm/transpose_neon.h" +#include "vpx_dsp/txfm_common.h" #include "vpx_dsp/vpx_dsp_common.h" DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { @@ -28,11 +29,15 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = { 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; -DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = { - 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, - 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, - 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, - -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */ +DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = { + 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */, + 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */, + 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */, + -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */, + 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */, + 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */, + 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */, + 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */ }; //------------------------------------------------------------------------------ @@ -76,23 +81,34 @@ static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) { #endif } +static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) { +#if CONFIG_VP9_HIGHBITDEPTH + const int32x4_t v0 = vmovl_s16(vget_low_s16(a)); + const int32x4_t v1 = vmovl_s16(vget_high_s16(a)); + vst1q_s32(buf, v0); + vst1q_s32(buf + 4, v1); +#else + vst1q_s16(buf, a); +#endif +} + //------------------------------------------------------------------------------ -// Multiply a by a_const. Saturate, shift and narrow by 14. +// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS. static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a, const int16_t a_const) { - // Shift by 14 + rounding will be within 16 bits for well formed streams. - // See WRAPLOW and dct_const_round_shift for details. + // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed + // streams. See WRAPLOW and dct_const_round_shift for details. // This instruction doubles the result and returns the high half, essentially // resulting in a right shift by 15. By multiplying the constant first that - // becomes a right shift by 14. + // becomes a right shift by DCT_CONST_BITS. // The largest possible value used here is // vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just* // within the range of int16_t (+32767 / -32768) even when negated. return vqrdmulhq_n_s16(a, a_const * 2); } -// Add a and b, then multiply by ab_const. Shift and narrow by 14. +// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS. 
 static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   // In both add_ and its pair, sub_, the input for well-formed streams will be
@@ -106,21 +122,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
   int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
 static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
     const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
   int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
   int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
   temp_low = vmulq_n_s32(temp_low, ab_const);
   temp_high = vmulq_n_s32(temp_high, ab_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
 static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
     const int16x8_t a, const int16_t a_const, const int16x8_t b,
     const int16_t b_const) {
@@ -128,7 +147,8 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
   int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
   temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
   temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
-  return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+  return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+                      vrshrn_n_s32(temp_high, DCT_CONST_BITS));
 }
 
 // Shift the output down by 6 and add it to the destination buffer.
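The doubled-constant vqrdmulh idiom used by these helpers is easy to misread, so a scalar model may help. This is an illustrative sketch, not libvpx code: model_vqrdmulh() and ref_round_shift() are hypothetical names, and it assumes DCT_CONST_BITS == 14 as defined in vpx_dsp/txfm_common.h.

#include <stdint.h>

// Scalar model of the ARM VQRDMULH instruction:
// saturate((2 * a * c + (1 << 15)) >> 16).
// (Assumes |2 * a * c| stays below 2^31, which holds for the cospi
// constants used in these transforms.)
static int16_t model_vqrdmulh(int16_t a, int16_t c) {
  const int32_t r = (2 * (int32_t)a * c + (1 << 15)) >> 16;
  return (int16_t)(r > INT16_MAX ? INT16_MAX : r < INT16_MIN ? INT16_MIN : r);
}

// Scalar model of dct_const_round_shift(a * a_const):
// round, then shift right by DCT_CONST_BITS.
static int16_t ref_round_shift(int16_t a, int16_t a_const) {
  return (int16_t)(((int32_t)a * a_const + (1 << 13)) >> 14);
}

Calling model_vqrdmulh(a, a_const * 2) evaluates (4 * a * a_const + (1 << 15)) >> 16, which equals (a * a_const + (1 << 13)) >> 14 exactly, because the two low bits of the quadrupled product are zero. That is why doubling the constant turns the instruction's built-in shift by 15 into the shift by DCT_CONST_BITS that the transforms need, for any product that stays in range.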
@@ -218,10 +238,10 @@ static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, c3 = vmull_lane_s16(b2, cospis, 1); c2 = vmlsl_lane_s16(c2, b3, cospis, 1); c3 = vmlal_lane_s16(c3, b3, cospis, 3); - b0 = vrshrn_n_s32(c0, 14); - b1 = vrshrn_n_s32(c1, 14); - b2 = vrshrn_n_s32(c2, 14); - b3 = vrshrn_n_s32(c3, 14); + b0 = vrshrn_n_s32(c0, DCT_CONST_BITS); + b1 = vrshrn_n_s32(c1, DCT_CONST_BITS); + b2 = vrshrn_n_s32(c2, DCT_CONST_BITS); + b3 = vrshrn_n_s32(c3, DCT_CONST_BITS); d0 = vcombine_s16(b0, b1); d1 = vcombine_s16(b3, b2); *a0 = vaddq_s16(d0, d1); @@ -263,8 +283,8 @@ static INLINE void idct8x8_12_pass1_bd8( t32[1] = vmull_lane_s16(step2[6], cospis0, 2); t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2); t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2); - step1[5] = vrshrn_n_s32(t32[0], 14); - step1[6] = vrshrn_n_s32(t32[1], 14); + step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); // stage 4 *io0 = vadd_s16(step1[0], step2[7]); @@ -322,10 +342,10 @@ static INLINE void idct8x8_12_pass2_bd8( t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); step1[5] = vcombine_s16(t16[0], t16[1]); step1[6] = vcombine_s16(t16[2], t16[3]); @@ -390,14 +410,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2); t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3); t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - t16[4] = vrshrn_n_s32(t32[4], 14); - t16[5] = vrshrn_n_s32(t32[5], 14); - t16[6] = vrshrn_n_s32(t32[6], 14); - t16[7] = vrshrn_n_s32(t32[7], 14); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); step1[4] = vcombine_s16(t16[0], t16[1]); step1[5] = vcombine_s16(t16[2], t16[3]); step1[6] = vcombine_s16(t16[4], t16[5]); @@ -418,14 +438,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1); t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3); t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); - t16[4] = vrshrn_n_s32(t32[4], 14); - t16[5] = vrshrn_n_s32(t32[5], 14); - t16[6] = vrshrn_n_s32(t32[6], 14); - t16[7] = vrshrn_n_s32(t32[7], 14); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); + t16[4] = vrshrn_n_s32(t32[4], 
DCT_CONST_BITS); + t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS); + t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS); + t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS); step2[0] = vcombine_s16(t16[0], t16[1]); step2[1] = vcombine_s16(t16[2], t16[3]); step2[2] = vcombine_s16(t16[4], t16[5]); @@ -448,10 +468,10 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0, t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2); t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2); - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); step1[5] = vcombine_s16(t16[0], t16[1]); step1[6] = vcombine_s16(t16[2], t16[3]); @@ -471,10 +491,10 @@ static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32, int16x8_t *const d1) { int16x4_t t16[4]; - t16[0] = vrshrn_n_s32(t32[0], 14); - t16[1] = vrshrn_n_s32(t32[1], 14); - t16[2] = vrshrn_n_s32(t32[2], 14); - t16[3] = vrshrn_n_s32(t32[3], 14); + t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS); + t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS); + t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS); + t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS); *d0 = vcombine_s16(t16[0], t16[1]); *d1 = vcombine_s16(t16[2], t16[3]); } @@ -529,6 +549,178 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1, idct16x16_add_wrap_low_8x2(t32, d0, d1); } +static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0); + t32[1] 
= vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_2_30_10_22, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3); + t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2); + t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2); + t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2); + t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_4_12_20N_28, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1, + const int16x4_t cospi_6_26_14_18N, + int16x8_t *const d0, int16x8_t *const d1) { + int32x4_t t32[4]; + + t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2); + t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2); + t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2); + t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2); + t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3); + t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3); + t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3); + t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3); + idct16x16_add_wrap_low_8x2(t32, d0, d1); +} + +static INLINE void idct16x16_add_stage7(const int16x8_t *const step2, + int16x8_t *const out) { +#if CONFIG_VP9_HIGHBITDEPTH + // Use saturating add/sub to avoid overflow in 2nd pass + out[0] = vqaddq_s16(step2[0], step2[15]); + out[1] = vqaddq_s16(step2[1], step2[14]); + out[2] = vqaddq_s16(step2[2], step2[13]); + out[3] = vqaddq_s16(step2[3], step2[12]); + out[4] = vqaddq_s16(step2[4], step2[11]); + out[5] = vqaddq_s16(step2[5], step2[10]); + out[6] = vqaddq_s16(step2[6], step2[9]); + out[7] = vqaddq_s16(step2[7], step2[8]); + out[8] = vqsubq_s16(step2[7], step2[8]); + out[9] = vqsubq_s16(step2[6], 
step2[9]); + out[10] = vqsubq_s16(step2[5], step2[10]); + out[11] = vqsubq_s16(step2[4], step2[11]); + out[12] = vqsubq_s16(step2[3], step2[12]); + out[13] = vqsubq_s16(step2[2], step2[13]); + out[14] = vqsubq_s16(step2[1], step2[14]); + out[15] = vqsubq_s16(step2[0], step2[15]); +#else + out[0] = vaddq_s16(step2[0], step2[15]); + out[1] = vaddq_s16(step2[1], step2[14]); + out[2] = vaddq_s16(step2[2], step2[13]); + out[3] = vaddq_s16(step2[3], step2[12]); + out[4] = vaddq_s16(step2[4], step2[11]); + out[5] = vaddq_s16(step2[5], step2[10]); + out[6] = vaddq_s16(step2[6], step2[9]); + out[7] = vaddq_s16(step2[7], step2[8]); + out[8] = vsubq_s16(step2[7], step2[8]); + out[9] = vsubq_s16(step2[6], step2[9]); + out[10] = vsubq_s16(step2[5], step2[10]); + out[11] = vsubq_s16(step2[4], step2[11]); + out[12] = vsubq_s16(step2[3], step2[12]); + out[13] = vsubq_s16(step2[2], step2[13]); + out[14] = vsubq_s16(step2[1], step2[14]); + out[15] = vsubq_s16(step2[0], step2[15]); +#endif +} + +static INLINE void idct16x16_store_pass1(const int16x8_t *const out, + int16_t *output) { + // Save the result into output + vst1q_s16(output, out[0]); + output += 16; + vst1q_s16(output, out[1]); + output += 16; + vst1q_s16(output, out[2]); + output += 16; + vst1q_s16(output, out[3]); + output += 16; + vst1q_s16(output, out[4]); + output += 16; + vst1q_s16(output, out[5]); + output += 16; + vst1q_s16(output, out[6]); + output += 16; + vst1q_s16(output, out[7]); + output += 16; + vst1q_s16(output, out[8]); + output += 16; + vst1q_s16(output, out[9]); + output += 16; + vst1q_s16(output, out[10]); + output += 16; + vst1q_s16(output, out[11]); + output += 16; + vst1q_s16(output, out[12]); + output += 16; + vst1q_s16(output, out[13]); + output += 16; + vst1q_s16(output, out[14]); + output += 16; + vst1q_s16(output, out[15]); +} + static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, const int stride) { uint8x8_t d = vld1_u8(*dest); @@ -541,4 +733,29 @@ static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest, *dest += stride; } +static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max, + uint16_t **dest, const int stride) { + uint16x8_t d = vld1q_u16(*dest); + + res = vqaddq_s16(res, vreinterpretq_s16_u16(d)); + res = vminq_s16(res, max); + d = vqshluq_n_s16(res, 0); + vst1q_u16(*dest, d); + *dest += stride; +} + +void idct16x16_256_add_half1d(const void *const input, int16_t *output, + void *const dest, const int stride, + const int highbd_flag); + +void idct16x16_38_add_half1d(const void *const input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + +void idct16x16_10_add_half1d_pass1(const tran_low_t *input, int16_t *output); + +void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *const output, + void *const dest, const int stride, + const int highbd_flag); + #endif // VPX_DSP_ARM_IDCT_NEON_H_ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h index 8366ce50b87..434c20ca21c 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h @@ -710,6 +710,83 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1, *a7 = d3.val[1]; } +static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1, + int32x4x2_t *a2, int32x4x2_t *a3, + int32x4x2_t *a4, int32x4x2_t *a5, + int32x4x2_t *a6, int32x4x2_t *a7) { + 
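+  // Each a_i holds one row of eight 32-bit values: val[0] carries elements
+  // 0-3 and val[1] carries elements 4-7, as the diagrams below trace.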
// Swap 32 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 10 11 12 13 14 15 16 17 + // a2: 20 21 22 23 24 25 26 27 + // a3: 30 31 32 33 34 35 36 37 + // a4: 40 41 42 43 44 45 46 47 + // a5: 50 51 52 53 54 55 56 57 + // a6: 60 61 62 63 64 65 66 67 + // a7: 70 71 72 73 74 75 76 77 + // to: + // b0: 00 10 02 12 01 11 03 13 + // b1: 20 30 22 32 21 31 23 33 + // b2: 40 50 42 52 41 51 43 53 + // b3: 60 70 62 72 61 71 63 73 + // b4: 04 14 06 16 05 15 07 17 + // b5: 24 34 26 36 25 35 27 37 + // b6: 44 54 46 56 45 55 47 57 + // b7: 64 74 66 76 65 75 67 77 + + const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]); + const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]); + const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]); + const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]); + const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]); + const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]); + const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]); + const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]); + + // Swap 64 bit elements resulting in: + // c0: 00 10 20 30 02 12 22 32 + // c1: 01 11 21 31 03 13 23 33 + // c2: 40 50 60 70 42 52 62 72 + // c3: 41 51 61 71 43 53 63 73 + // c4: 04 14 24 34 06 16 26 36 + // c5: 05 15 25 35 07 17 27 37 + // c6: 44 54 64 74 46 56 66 76 + // c7: 45 55 65 75 47 57 67 77 + const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]); + const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]); + const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]); + const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]); + const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]); + const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]); + const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]); + const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]); + + // Swap 128 bit elements resulting in: + // a0: 00 10 20 30 40 50 60 70 + // a1: 01 11 21 31 41 51 61 71 + // a2: 02 12 22 32 42 52 62 72 + // a3: 03 13 23 33 43 53 63 73 + // a4: 04 14 24 34 44 54 64 74 + // a5: 05 15 25 35 45 55 65 75 + // a6: 06 16 26 36 46 56 66 76 + // a7: 07 17 27 37 47 57 67 77 + a0->val[0] = c0.val[0]; + a0->val[1] = c2.val[0]; + a1->val[0] = c1.val[0]; + a1->val[1] = c3.val[0]; + a2->val[0] = c0.val[1]; + a2->val[1] = c2.val[1]; + a3->val[0] = c1.val[1]; + a3->val[1] = c3.val[1]; + a4->val[0] = c4.val[0]; + a4->val[1] = c6.val[0]; + a5->val[0] = c5.val[0]; + a5->val[1] = c7.val[0]; + a6->val[0] = c4.val[1]; + a6->val[1] = c6.val[1]; + a7->val[0] = c5.val[1]; + a7->val[1] = c7.val[1]; +} + static INLINE void transpose_u8_16x8( const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2, const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c index 4d9abb8de36..e4cd6cca78b 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c @@ -67,9 +67,10 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride, // The order of the output coeff of the hadamard is not important. For // optimization purposes the final transpose may be skipped. 
void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int idx; int16_t buffer[64]; + int16_t buffer2[64]; int16_t *tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit @@ -80,17 +81,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride, tmp_buf = &buffer[0]; for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] + hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit + // dynamic range [-2040, 2040] + // buffer2: 15 bit + // dynamic range [-16320, 16320] ++tmp_buf; } + + for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx]; } // In place 16x16 2D Hadamard transform void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, - int16_t *coeff) { + tran_low_t *coeff) { int idx; for (idx = 0; idx < 4; ++idx) { // src_diff: 9 bit, dynamic range [-255, 255] @@ -101,15 +104,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 15 bit, dynamic range [-16320, 16320] for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; + tran_low_t a0 = coeff[0]; + tran_low_t a1 = coeff[64]; + tran_low_t a2 = coeff[128]; + tran_low_t a3 = coeff[192]; - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; + tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] + tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range + tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320] + tran_low_t b3 = (a2 - a3) >> 1; coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] coeff[64] = b1 + b3; @@ -122,7 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride, // coeff: 16 bits, dynamic range [-32640, 32640]. // length: value range {16, 64, 256, 1024}. -int vpx_satd_c(const int16_t *coeff, int length) { +int vpx_satd_c(const tran_low_t *coeff, int length) { int i; int satd = 0; for (i = 0; i < length; ++i) satd += abs(coeff[i]); diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c index 6c27484979a..a0db1e40c98 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <assert.h> #include <stdlib.h> #include "vpx/vpx_integer.h" @@ -48,6 +49,9 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr, unsigned char v; unsigned char d[4]; + assert(size >= 8); + assert(cols >= 8); + for (row = 0; row < size; row++) { /* post_proc_down for one row */ p_src = src_ptr; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c index 0f9aff1892a..f99ded57a85 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c @@ -93,6 +93,42 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) { } } +void iadst4_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = WRAPLOW(x0 - x2 + x3); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. + output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); + output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); + output[2] = WRAPLOW(dct_const_round_shift(s2)); + output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); +} + void idct4_c(const tran_low_t *input, tran_low_t *output) { tran_low_t step[4]; tran_high_t temp1, temp2; @@ -155,6 +191,81 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) { } } +void iadst8_c(const tran_low_t *input, tran_low_t *output) { + int s0, s1, s2, s3, s4, s5, s6, s7; + tran_high_t x0 = input[7]; + tran_high_t x1 = input[0]; + tran_high_t x2 = input[5]; + tran_high_t x3 = input[2]; + tran_high_t x4 = input[3]; + tran_high_t x5 = input[4]; + tran_high_t x6 = input[1]; + tran_high_t x7 = input[6]; + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); + s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); + s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); + s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); + s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); + s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); + s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); + s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + + x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); + x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); + x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); + x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); + x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + + // stage 2 + s0 = (int)x0; + s1 = (int)x1; + s2 = (int)x2; + s3 = (int)x3; + s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); + s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); + s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); + s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + + x0 = WRAPLOW(s0 + s2); + x1 = WRAPLOW(s1 + s3); + x2 = WRAPLOW(s0 - s2); + x3 = 
WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+  // stage 3
+  s2 = (int)(cospi_16_64 * (x2 + x3));
+  s3 = (int)(cospi_16_64 * (x2 - x3));
+  s6 = (int)(cospi_16_64 * (x6 + x7));
+  s7 = (int)(cospi_16_64 * (x6 - x7));
+
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
+}
+
 void idct8_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[8], step2[8];
   tran_high_t temp1, temp2;
@@ -234,6 +345,31 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  int i, j;
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+
+  // First transform rows
+  // Only the first 4 rows have non-zero coefs
+  for (i = 0; i < 4; ++i) {
+    idct8_c(input, outptr);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+    idct8_c(temp_in, temp_out);
+    for (j = 0; j < 8; ++j) {
+      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+    }
+  }
+}
+
 void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   int i, j;
   tran_high_t a1;
@@ -247,86 +383,119 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
   }
 }
 
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
-  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
-  tran_low_t x0 = input[0];
-  tran_low_t x1 = input[1];
-  tran_low_t x2 = input[2];
-  tran_low_t x3 = input[3];
-
-  if (!(x0 | x1 | x2 | x3)) {
-    memset(output, 0, 4 * sizeof(*output));
-    return;
-  }
-
-  s0 = sinpi_1_9 * x0;
-  s1 = sinpi_2_9 * x0;
-  s2 = sinpi_3_9 * x1;
-  s3 = sinpi_4_9 * x2;
-  s4 = sinpi_1_9 * x2;
-  s5 = sinpi_2_9 * x3;
-  s6 = sinpi_4_9 * x3;
-  s7 = WRAPLOW(x0 - x2 + x3);
-
-  s0 = s0 + s3 + s5;
-  s1 = s1 - s4 - s6;
-  s3 = s2;
-  s2 = sinpi_3_9 * s7;
-
-  // 1-D transform scaling factor is sqrt(2).
-  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
-  // + 1b (addition) = 29b.
-  // Hence the output bit depth is 15b.
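The vpx_idct8x8_12_add_c() added above transforms only the first four rows because callers select it by end-of-block (eob) count. A minimal sketch of that selection, assuming the eob thresholds vp9 uses in vp9/common/vp9_idct.c; the wrapper name idct8x8_add_sketch is hypothetical:

#include "./vpx_dsp_rtcd.h"

// Route an 8x8 block to the cheapest inverse transform its eob allows
// (thresholds are illustrative).
static void idct8x8_add_sketch(const tran_low_t *input, uint8_t *dest,
                               int stride, int eob) {
  if (eob == 1) {
    // DC-only block: one rounded value is added to every pixel.
    vpx_idct8x8_1_add_c(input, dest, stride);
  } else if (eob <= 12) {
    // Non-zero coefficients confined to the top rows of the block.
    vpx_idct8x8_12_add_c(input, dest, stride);
  } else {
    vpx_idct8x8_64_add_c(input, dest, stride);
  }
}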
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3)); - output[1] = WRAPLOW(dct_const_round_shift(s1 + s3)); - output[2] = WRAPLOW(dct_const_round_shift(s2)); - output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3)); -} - -void iadst8_c(const tran_low_t *input, tran_low_t *output) { - int s0, s1, s2, s3, s4, s5, s6, s7; - tran_high_t x0 = input[7]; +void iadst16_c(const tran_low_t *input, tran_low_t *output) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_high_t x0 = input[15]; tran_high_t x1 = input[0]; - tran_high_t x2 = input[5]; + tran_high_t x2 = input[13]; tran_high_t x3 = input[2]; - tran_high_t x4 = input[3]; + tran_high_t x4 = input[11]; tran_high_t x5 = input[4]; - tran_high_t x6 = input[1]; + tran_high_t x6 = input[9]; tran_high_t x7 = input[6]; + tran_high_t x8 = input[7]; + tran_high_t x9 = input[8]; + tran_high_t x10 = input[5]; + tran_high_t x11 = input[10]; + tran_high_t x12 = input[3]; + tran_high_t x13 = input[12]; + tran_high_t x14 = input[1]; + tran_high_t x15 = input[14]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1); - s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1); - s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3); - s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3); - s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5); - s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5); - s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7); - s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7); + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = WRAPLOW(dct_const_round_shift(s0 + s4)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s5)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s6)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s7)); - x4 = WRAPLOW(dct_const_round_shift(s0 - s4)); - x5 = WRAPLOW(dct_const_round_shift(s1 - s5)); - x6 = WRAPLOW(dct_const_round_shift(s2 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s3 - s7)); + x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); + x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); + x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); + x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); + x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); + x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); + x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); + x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); + x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); + x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); + x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); + x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); + x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s6 - 
s14)); + x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); // stage 2 - s0 = (int)x0; - s1 = (int)x1; - s2 = (int)x2; - s3 = (int)x3; - s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5); - s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5); - s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7); - s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7); + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = WRAPLOW(s0 + s4); + x1 = WRAPLOW(s1 + s5); + x2 = WRAPLOW(s2 + s6); + x3 = WRAPLOW(s3 + s7); + x4 = WRAPLOW(s0 - s4); + x5 = WRAPLOW(s1 - s5); + x6 = WRAPLOW(s2 - s6); + x7 = WRAPLOW(s3 - s7); + x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); + x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); + x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); + x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); + x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); + x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); + x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = WRAPLOW(s0 + s2); x1 = WRAPLOW(s1 + s3); @@ -336,51 +505,50 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) { x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); + x8 = WRAPLOW(s8 + s10); + x9 = WRAPLOW(s9 + s11); + x10 = WRAPLOW(s8 - s10); + x11 = WRAPLOW(s9 - s11); + x12 = WRAPLOW(dct_const_round_shift(s12 + s14)); + x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); + x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); + x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - // stage 3 - s2 = (int)(cospi_16_64 * (x2 + x3)); - s3 = (int)(cospi_16_64 * (x2 - x3)); - s6 = (int)(cospi_16_64 * (x6 + x7)); - s7 = (int)(cospi_16_64 * (x6 - x7)); + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * (-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); x2 = WRAPLOW(dct_const_round_shift(s2)); x3 = WRAPLOW(dct_const_round_shift(s3)); x6 = WRAPLOW(dct_const_round_shift(s6)); x7 = WRAPLOW(dct_const_round_shift(s7)); + x10 = WRAPLOW(dct_const_round_shift(s10)); + x11 = WRAPLOW(dct_const_round_shift(s11)); + x14 = WRAPLOW(dct_const_round_shift(s14)); + x15 = WRAPLOW(dct_const_round_shift(s15)); output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x4); - output[2] = WRAPLOW(x6); - output[3] = WRAPLOW(-x2); - output[4] = WRAPLOW(x3); - output[5] = WRAPLOW(-x7); - output[6] = WRAPLOW(x5); - output[7] = WRAPLOW(-x1); -} - -void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) { - int i, j; - 
tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - tran_low_t temp_in[8], temp_out[8]; - - // First transform rows - // Only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - idct8_c(input, outptr); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - idct8_c(temp_in, temp_out); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], - ROUND_POWER_OF_TWO(temp_out[j], 5)); - } - } + output[1] = WRAPLOW(-x8); + output[2] = WRAPLOW(x12); + output[3] = WRAPLOW(-x4); + output[4] = WRAPLOW(x6); + output[5] = WRAPLOW(x14); + output[6] = WRAPLOW(x10); + output[7] = WRAPLOW(x2); + output[8] = WRAPLOW(x3); + output[9] = WRAPLOW(x11); + output[10] = WRAPLOW(x15); + output[11] = WRAPLOW(x7); + output[12] = WRAPLOW(x5); + output[13] = WRAPLOW(-x13); + output[14] = WRAPLOW(x9); + output[15] = WRAPLOW(-x1); } void idct16_c(const tran_low_t *input, tran_low_t *output) { @@ -573,172 +741,30 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest, } } -void iadst16_c(const tran_low_t *input, tran_low_t *output) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - tran_high_t x0 = input[15]; - tran_high_t x1 = input[0]; - tran_high_t x2 = input[13]; - tran_high_t x3 = input[2]; - tran_high_t x4 = input[11]; - tran_high_t x5 = input[4]; - tran_high_t x6 = input[9]; - tran_high_t x7 = input[6]; - tran_high_t x8 = input[7]; - tran_high_t x9 = input[8]; - tran_high_t x10 = input[5]; - tran_high_t x11 = input[10]; - tran_high_t x12 = input[3]; - tran_high_t x13 = input[12]; - tran_high_t x14 = input[1]; - tran_high_t x15 = input[14]; +void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest, + int stride) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; + // First transform rows. Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. 
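+  // (The 38 in the function name is the largest eob routed to this path:
+  // with the default scan order, the first 38 coefficients all land in
+  // that upper-left 8x8 corner, so rows 8..15 of the input stay zero.)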
+ for (i = 0; i < 8; ++i) { + idct16_c(input, outptr); + input += 16; + outptr += 16; } - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = WRAPLOW(dct_const_round_shift(s0 + s8)); - x1 = WRAPLOW(dct_const_round_shift(s1 + s9)); - x2 = WRAPLOW(dct_const_round_shift(s2 + s10)); - x3 = WRAPLOW(dct_const_round_shift(s3 + s11)); - x4 = WRAPLOW(dct_const_round_shift(s4 + s12)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s13)); - x6 = WRAPLOW(dct_const_round_shift(s6 + s14)); - x7 = WRAPLOW(dct_const_round_shift(s7 + s15)); - x8 = WRAPLOW(dct_const_round_shift(s0 - s8)); - x9 = WRAPLOW(dct_const_round_shift(s1 - s9)); - x10 = WRAPLOW(dct_const_round_shift(s2 - s10)); - x11 = WRAPLOW(dct_const_round_shift(s3 - s11)); - x12 = WRAPLOW(dct_const_round_shift(s4 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s5 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s6 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s7 - s15)); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = WRAPLOW(s0 + s4); - x1 = WRAPLOW(s1 + s5); - x2 = WRAPLOW(s2 + s6); - x3 = WRAPLOW(s3 + s7); - x4 = WRAPLOW(s0 - s4); - x5 = WRAPLOW(s1 - s5); - x6 = WRAPLOW(s2 - s6); - x7 = WRAPLOW(s3 - s7); - x8 = WRAPLOW(dct_const_round_shift(s8 + s12)); - x9 = WRAPLOW(dct_const_round_shift(s9 + s13)); - x10 = WRAPLOW(dct_const_round_shift(s10 + s14)); - x11 = WRAPLOW(dct_const_round_shift(s11 + s15)); - x12 = WRAPLOW(dct_const_round_shift(s8 - s12)); - x13 = WRAPLOW(dct_const_round_shift(s9 - s13)); - x14 = WRAPLOW(dct_const_round_shift(s10 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s11 - s15)); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = WRAPLOW(s0 + s2); - x1 = WRAPLOW(s1 + s3); - x2 = WRAPLOW(s0 - s2); - x3 = WRAPLOW(s1 - s3); - x4 = WRAPLOW(dct_const_round_shift(s4 + s6)); - x5 = WRAPLOW(dct_const_round_shift(s5 + s7)); - x6 = WRAPLOW(dct_const_round_shift(s4 - s6)); - x7 = WRAPLOW(dct_const_round_shift(s5 - s7)); - x8 = WRAPLOW(s8 + s10); - x9 = WRAPLOW(s9 + s11); - x10 = WRAPLOW(s8 - s10); - x11 = WRAPLOW(s9 - s11); - x12 = 
WRAPLOW(dct_const_round_shift(s12 + s14)); - x13 = WRAPLOW(dct_const_round_shift(s13 + s15)); - x14 = WRAPLOW(dct_const_round_shift(s12 - s14)); - x15 = WRAPLOW(dct_const_round_shift(s13 - s15)); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = WRAPLOW(dct_const_round_shift(s2)); - x3 = WRAPLOW(dct_const_round_shift(s3)); - x6 = WRAPLOW(dct_const_round_shift(s6)); - x7 = WRAPLOW(dct_const_round_shift(s7)); - x10 = WRAPLOW(dct_const_round_shift(s10)); - x11 = WRAPLOW(dct_const_round_shift(s11)); - x14 = WRAPLOW(dct_const_round_shift(s14)); - x15 = WRAPLOW(dct_const_round_shift(s15)); - - output[0] = WRAPLOW(x0); - output[1] = WRAPLOW(-x8); - output[2] = WRAPLOW(x12); - output[3] = WRAPLOW(-x4); - output[4] = WRAPLOW(x6); - output[5] = WRAPLOW(x14); - output[6] = WRAPLOW(x10); - output[7] = WRAPLOW(x2); - output[8] = WRAPLOW(x3); - output[9] = WRAPLOW(x11); - output[10] = WRAPLOW(x15); - output[11] = WRAPLOW(x7); - output[12] = WRAPLOW(x5); - output[13] = WRAPLOW(-x13); - output[14] = WRAPLOW(x9); - output[15] = WRAPLOW(-x1); + // Then transform columns + for (i = 0; i < 16; ++i) { + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + idct16_c(temp_in, temp_out); + for (j = 0; j < 16; ++j) { + dest[j * stride + i] = clip_pixel_add(dest[j * stride + i], + ROUND_POWER_OF_TWO(temp_out[j], 6)); + } + } } void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest, @@ -1351,6 +1377,51 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8, } } +void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[0]; + tran_low_t x1 = input[1]; + tran_low_t x2 = input[2]; + tran_low_t x3 = input[3]; + (void)bd; + + if (detect_invalid_highbd_input(input, 4)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 4); + return; + } + + if (!(x0 | x1 | x2 | x3)) { + memset(output, 0, 4 * sizeof(*output)); + return; + } + + s0 = sinpi_1_9 * x0; + s1 = sinpi_2_9 * x0; + s2 = sinpi_3_9 * x1; + s3 = sinpi_4_9 * x2; + s4 = sinpi_1_9 * x2; + s5 = sinpi_2_9 * x3; + s6 = sinpi_4_9 * x3; + s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); + + s0 = s0 + s3 + s5; + s1 = s1 - s4 - s6; + s3 = s2; + s2 = sinpi_3_9 * s7; + + // 1-D transform scaling factor is sqrt(2). + // The overall dynamic range is 14b (input) + 14b (multiplication scaling) + // + 1b (addition) = 29b. + // Hence the output bit depth is 15b. 
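+  // (Concretely: 14b inputs times the 2^14-scaled sinpi constants give 28b
+  // products, the additions contribute the extra bit, and
+  // dct_const_round_shift() divides by 2^DCT_CONST_BITS, so 29 - 14 = 15.)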
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); + output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); + output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); +} + void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step[4]; tran_high_t temp1, temp2; @@ -1427,6 +1498,90 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8, } } +void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; + tran_low_t x0 = input[7]; + tran_low_t x1 = input[0]; + tran_low_t x2 = input[5]; + tran_low_t x3 = input[2]; + tran_low_t x4 = input[3]; + tran_low_t x5 = input[4]; + tran_low_t x6 = input[1]; + tran_low_t x7 = input[6]; + (void)bd; + + if (detect_invalid_highbd_input(input, 8)) { +#if CONFIG_COEFFICIENT_RANGE_CHECKING + assert(0 && "invalid highbd txfm input"); +#endif // CONFIG_COEFFICIENT_RANGE_CHECKING + memset(output, 0, sizeof(*output) * 8); + return; + } + + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { + memset(output, 0, 8 * sizeof(*output)); + return; + } + + // stage 1 + s0 = cospi_2_64 * x0 + cospi_30_64 * x1; + s1 = cospi_30_64 * x0 - cospi_2_64 * x1; + s2 = cospi_10_64 * x2 + cospi_22_64 * x3; + s3 = cospi_22_64 * x2 - cospi_10_64 * x3; + s4 = cospi_18_64 * x4 + cospi_14_64 * x5; + s5 = cospi_14_64 * x4 - cospi_18_64 * x5; + s6 = cospi_26_64 * x6 + cospi_6_64 * x7; + s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + + // stage 2 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = cospi_8_64 * x4 + cospi_24_64 * x5; + s5 = cospi_24_64 * x4 - cospi_8_64 * x5; + s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; + s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + + x0 = HIGHBD_WRAPLOW(s0 + s2, bd); + x1 = HIGHBD_WRAPLOW(s1 + s3, bd); + x2 = HIGHBD_WRAPLOW(s0 - s2, bd); + x3 = HIGHBD_WRAPLOW(s1 - s3, bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); + x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + + // stage 3 + s2 = cospi_16_64 * (x2 + x3); + s3 = cospi_16_64 * (x2 - x3); + s6 = cospi_16_64 * (x6 + x7); + s7 = cospi_16_64 * (x6 - x7); + + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + + output[0] = HIGHBD_WRAPLOW(x0, bd); + output[1] = HIGHBD_WRAPLOW(-x4, bd); + output[2] = HIGHBD_WRAPLOW(x6, bd); + output[3] = HIGHBD_WRAPLOW(-x2, bd); + output[4] = HIGHBD_WRAPLOW(x3, bd); + output[5] = HIGHBD_WRAPLOW(-x7, bd); + output[6] = HIGHBD_WRAPLOW(x5, bd); + output[7] = HIGHBD_WRAPLOW(-x1, bd); +} + void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) { tran_low_t step1[8], step2[8]; tran_high_t temp1, temp2; @@ -1507,6 +1662,33 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t 
*dest8, } } +void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_low_t out[8 * 8] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[8], temp_out[8]; + uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); + + // First transform rows + // Only first 4 row has non-zero coefs + for (i = 0; i < 4; ++i) { + vpx_highbd_idct8_c(input, outptr, bd); + input += 8; + outptr += 8; + } + + // Then transform columns + for (i = 0; i < 8; ++i) { + for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; + vpx_highbd_idct8_c(temp_in, temp_out, bd); + for (j = 0; j < 8; ++j) { + dest[j * stride + i] = highbd_clip_pixel_add( + dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); + } + } +} + void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, int stride, int bd) { int i, j; @@ -1523,104 +1705,128 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[0]; - tran_low_t x1 = input[1]; - tran_low_t x2 = input[2]; - tran_low_t x3 = input[3]; - (void)bd; - - if (detect_invalid_highbd_input(input, 4)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd txfm input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 4); - return; - } - - if (!(x0 | x1 | x2 | x3)) { - memset(output, 0, 4 * sizeof(*output)); - return; - } - - s0 = sinpi_1_9 * x0; - s1 = sinpi_2_9 * x0; - s2 = sinpi_3_9 * x1; - s3 = sinpi_4_9 * x2; - s4 = sinpi_1_9 * x2; - s5 = sinpi_2_9 * x3; - s6 = sinpi_4_9 * x3; - s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd); - - s0 = s0 + s3 + s5; - s1 = s1 - s4 - s6; - s3 = s2; - s2 = sinpi_3_9 * s7; - - // 1-D transform scaling factor is sqrt(2). - // The overall dynamic range is 14b (input) + 14b (multiplication scaling) - // + 1b (addition) = 29b. - // Hence the output bit depth is 15b. 
- output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd); - output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd); - output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd); -} - -void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; - tran_low_t x0 = input[7]; +void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { + tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; + tran_high_t s9, s10, s11, s12, s13, s14, s15; + tran_low_t x0 = input[15]; tran_low_t x1 = input[0]; - tran_low_t x2 = input[5]; + tran_low_t x2 = input[13]; tran_low_t x3 = input[2]; - tran_low_t x4 = input[3]; + tran_low_t x4 = input[11]; tran_low_t x5 = input[4]; - tran_low_t x6 = input[1]; + tran_low_t x6 = input[9]; tran_low_t x7 = input[6]; + tran_low_t x8 = input[7]; + tran_low_t x9 = input[8]; + tran_low_t x10 = input[5]; + tran_low_t x11 = input[10]; + tran_low_t x12 = input[3]; + tran_low_t x13 = input[12]; + tran_low_t x14 = input[1]; + tran_low_t x15 = input[14]; (void)bd; - if (detect_invalid_highbd_input(input, 8)) { + if (detect_invalid_highbd_input(input, 16)) { #if CONFIG_COEFFICIENT_RANGE_CHECKING assert(0 && "invalid highbd txfm input"); #endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 8); + memset(output, 0, sizeof(*output) * 16); return; } - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) { - memset(output, 0, 8 * sizeof(*output)); + if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | + x13 | x14 | x15)) { + memset(output, 0, 16 * sizeof(*output)); return; } // stage 1 - s0 = cospi_2_64 * x0 + cospi_30_64 * x1; - s1 = cospi_30_64 * x0 - cospi_2_64 * x1; - s2 = cospi_10_64 * x2 + cospi_22_64 * x3; - s3 = cospi_22_64 * x2 - cospi_10_64 * x3; - s4 = cospi_18_64 * x4 + cospi_14_64 * x5; - s5 = cospi_14_64 * x4 - cospi_18_64 * x5; - s6 = cospi_26_64 * x6 + cospi_6_64 * x7; - s7 = cospi_6_64 * x6 - cospi_26_64 * x7; + s0 = x0 * cospi_1_64 + x1 * cospi_31_64; + s1 = x0 * cospi_31_64 - x1 * cospi_1_64; + s2 = x2 * cospi_5_64 + x3 * cospi_27_64; + s3 = x2 * cospi_27_64 - x3 * cospi_5_64; + s4 = x4 * cospi_9_64 + x5 * cospi_23_64; + s5 = x4 * cospi_23_64 - x5 * cospi_9_64; + s6 = x6 * cospi_13_64 + x7 * cospi_19_64; + s7 = x6 * cospi_19_64 - x7 * cospi_13_64; + s8 = x8 * cospi_17_64 + x9 * cospi_15_64; + s9 = x8 * cospi_15_64 - x9 * cospi_17_64; + s10 = x10 * cospi_21_64 + x11 * cospi_11_64; + s11 = x10 * cospi_11_64 - x11 * cospi_21_64; + s12 = x12 * cospi_25_64 + x13 * cospi_7_64; + s13 = x12 * cospi_7_64 - x13 * cospi_25_64; + s14 = x14 * cospi_29_64 + x15 * cospi_3_64; + s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd); - x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd); - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd); + x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); + x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); + x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); + x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); + x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); + x5 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); + x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); + x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); // stage 2 s0 = x0; s1 = x1; s2 = x2; s3 = x3; - s4 = cospi_8_64 * x4 + cospi_24_64 * x5; - s5 = cospi_24_64 * x4 - cospi_8_64 * x5; - s6 = -cospi_24_64 * x6 + cospi_8_64 * x7; - s7 = cospi_8_64 * x6 + cospi_24_64 * x7; + s4 = x4; + s5 = x5; + s6 = x6; + s7 = x7; + s8 = x8 * cospi_4_64 + x9 * cospi_28_64; + s9 = x8 * cospi_28_64 - x9 * cospi_4_64; + s10 = x10 * cospi_20_64 + x11 * cospi_12_64; + s11 = x10 * cospi_12_64 - x11 * cospi_20_64; + s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; + s13 = x12 * cospi_4_64 + x13 * cospi_28_64; + s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; + s15 = x14 * cospi_20_64 + x15 * cospi_12_64; + + x0 = HIGHBD_WRAPLOW(s0 + s4, bd); + x1 = HIGHBD_WRAPLOW(s1 + s5, bd); + x2 = HIGHBD_WRAPLOW(s2 + s6, bd); + x3 = HIGHBD_WRAPLOW(s3 + s7, bd); + x4 = HIGHBD_WRAPLOW(s0 - s4, bd); + x5 = HIGHBD_WRAPLOW(s1 - s5, bd); + x6 = HIGHBD_WRAPLOW(s2 - s6, bd); + x7 = HIGHBD_WRAPLOW(s3 - s7, bd); + x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); + x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); + + // stage 3 + s0 = x0; + s1 = x1; + s2 = x2; + s3 = x3; + s4 = x4 * cospi_8_64 + x5 * cospi_24_64; + s5 = x4 * cospi_24_64 - x5 * cospi_8_64; + s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; + s7 = x6 * cospi_8_64 + x7 * cospi_24_64; + s8 = x8; + s9 = x9; + s10 = x10; + s11 = x11; + s12 = x12 * cospi_8_64 + x13 * cospi_24_64; + s13 = x12 * cospi_24_64 - x13 * cospi_8_64; + s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; + s15 = x14 * cospi_8_64 + x15 * cospi_24_64; x0 = HIGHBD_WRAPLOW(s0 + s2, bd); x1 = HIGHBD_WRAPLOW(s1 + s3, bd); @@ -1630,53 +1836,50 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) { x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); + x8 = HIGHBD_WRAPLOW(s8 + s10, bd); + x9 = HIGHBD_WRAPLOW(s9 + s11, bd); + x10 = HIGHBD_WRAPLOW(s8 - s10, bd); + x11 = HIGHBD_WRAPLOW(s9 - s11, bd); + x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); + x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - // stage 3 - s2 = cospi_16_64 * (x2 + x3); + // stage 4 + s2 = (-cospi_16_64) * (x2 + x3); s3 = cospi_16_64 * (x2 - x3); s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (x6 - x7); + s7 = cospi_16_64 * (-x6 + x7); + s10 = cospi_16_64 * (x10 + x11); + s11 = cospi_16_64 * 
(-x10 + x11); + s14 = (-cospi_16_64) * (x14 + x15); + s15 = cospi_16_64 * (x14 - x15); x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); + x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); + x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); + x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); + x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x4, bd); - output[2] = HIGHBD_WRAPLOW(x6, bd); - output[3] = HIGHBD_WRAPLOW(-x2, bd); - output[4] = HIGHBD_WRAPLOW(x3, bd); - output[5] = HIGHBD_WRAPLOW(-x7, bd); - output[6] = HIGHBD_WRAPLOW(x5, bd); - output[7] = HIGHBD_WRAPLOW(-x1, bd); -} - -void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8, - int stride, int bd) { - int i, j; - tran_low_t out[8 * 8] = { 0 }; - tran_low_t *outptr = out; - tran_low_t temp_in[8], temp_out[8]; - uint16_t *dest = CONVERT_TO_SHORTPTR(dest8); - - // First transform rows - // Only first 4 row has non-zero coefs - for (i = 0; i < 4; ++i) { - vpx_highbd_idct8_c(input, outptr, bd); - input += 8; - outptr += 8; - } - - // Then transform columns - for (i = 0; i < 8; ++i) { - for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i]; - vpx_highbd_idct8_c(temp_in, temp_out, bd); - for (j = 0; j < 8; ++j) { - dest[j * stride + i] = highbd_clip_pixel_add( - dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd); - } - } + output[1] = HIGHBD_WRAPLOW(-x8, bd); + output[2] = HIGHBD_WRAPLOW(x12, bd); + output[3] = HIGHBD_WRAPLOW(-x4, bd); + output[4] = HIGHBD_WRAPLOW(x6, bd); + output[5] = HIGHBD_WRAPLOW(x14, bd); + output[6] = HIGHBD_WRAPLOW(x10, bd); + output[7] = HIGHBD_WRAPLOW(x2, bd); + output[8] = HIGHBD_WRAPLOW(x3, bd); + output[9] = HIGHBD_WRAPLOW(x11, bd); + output[10] = HIGHBD_WRAPLOW(x15, bd); + output[11] = HIGHBD_WRAPLOW(x7, bd); + output[12] = HIGHBD_WRAPLOW(x5, bd); + output[13] = HIGHBD_WRAPLOW(-x13, bd); + output[14] = HIGHBD_WRAPLOW(x9, bd); + output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) { @@ -1879,181 +2082,33 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8, } } -void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) { - tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8; - tran_high_t s9, s10, s11, s12, s13, s14, s15; - tran_low_t x0 = input[15]; - tran_low_t x1 = input[0]; - tran_low_t x2 = input[13]; - tran_low_t x3 = input[2]; - tran_low_t x4 = input[11]; - tran_low_t x5 = input[4]; - tran_low_t x6 = input[9]; - tran_low_t x7 = input[6]; - tran_low_t x8 = input[7]; - tran_low_t x9 = input[8]; - tran_low_t x10 = input[5]; - tran_low_t x11 = input[10]; - tran_low_t x12 = input[3]; - tran_low_t x13 = input[12]; - tran_low_t x14 = input[1]; - tran_low_t x15 = input[14]; - (void)bd; +void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8, + int stride, int bd) { + int i, j; + tran_low_t out[16 * 16] = { 0 }; + tran_low_t *outptr = out; + tran_low_t temp_in[16], temp_out[16]; + uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8); - if (detect_invalid_highbd_input(input, 16)) { -#if CONFIG_COEFFICIENT_RANGE_CHECKING - assert(0 && "invalid highbd txfm input"); -#endif // CONFIG_COEFFICIENT_RANGE_CHECKING - memset(output, 0, sizeof(*output) * 16); - return; + // First transform rows. 
Since all non-zero dct coefficients are in + // upper-left 8x8 area, we only need to calculate first 8 rows here. + for (i = 0; i < 8; ++i) { + vpx_highbd_idct16_c(input, outptr, bd); + input += 16; + outptr += 16; } - if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | - x13 | x14 | x15)) { - memset(output, 0, 16 * sizeof(*output)); - return; + // Then transform columns + for (i = 0; i < 16; ++i) { + uint16_t *destT = dest; + for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i]; + vpx_highbd_idct16_c(temp_in, temp_out, bd); + for (j = 0; j < 16; ++j) { + destT[i] = highbd_clip_pixel_add(destT[i], + ROUND_POWER_OF_TWO(temp_out[j], 6), bd); + destT += stride; + } } - - // stage 1 - s0 = x0 * cospi_1_64 + x1 * cospi_31_64; - s1 = x0 * cospi_31_64 - x1 * cospi_1_64; - s2 = x2 * cospi_5_64 + x3 * cospi_27_64; - s3 = x2 * cospi_27_64 - x3 * cospi_5_64; - s4 = x4 * cospi_9_64 + x5 * cospi_23_64; - s5 = x4 * cospi_23_64 - x5 * cospi_9_64; - s6 = x6 * cospi_13_64 + x7 * cospi_19_64; - s7 = x6 * cospi_19_64 - x7 * cospi_13_64; - s8 = x8 * cospi_17_64 + x9 * cospi_15_64; - s9 = x8 * cospi_15_64 - x9 * cospi_17_64; - s10 = x10 * cospi_21_64 + x11 * cospi_11_64; - s11 = x10 * cospi_11_64 - x11 * cospi_21_64; - s12 = x12 * cospi_25_64 + x13 * cospi_7_64; - s13 = x12 * cospi_7_64 - x13 * cospi_25_64; - s14 = x14 * cospi_29_64 + x15 * cospi_3_64; - s15 = x14 * cospi_3_64 - x15 * cospi_29_64; - - x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd); - x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd); - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd); - - // stage 2 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4; - s5 = x5; - s6 = x6; - s7 = x7; - s8 = x8 * cospi_4_64 + x9 * cospi_28_64; - s9 = x8 * cospi_28_64 - x9 * cospi_4_64; - s10 = x10 * cospi_20_64 + x11 * cospi_12_64; - s11 = x10 * cospi_12_64 - x11 * cospi_20_64; - s12 = -x12 * cospi_28_64 + x13 * cospi_4_64; - s13 = x12 * cospi_4_64 + x13 * cospi_28_64; - s14 = -x14 * cospi_12_64 + x15 * cospi_20_64; - s15 = x14 * cospi_20_64 + x15 * cospi_12_64; - - x0 = HIGHBD_WRAPLOW(s0 + s4, bd); - x1 = HIGHBD_WRAPLOW(s1 + s5, bd); - x2 = HIGHBD_WRAPLOW(s2 + s6, bd); - x3 = HIGHBD_WRAPLOW(s3 + s7, bd); - x4 = HIGHBD_WRAPLOW(s0 - s4, bd); - x5 = HIGHBD_WRAPLOW(s1 - s5, bd); - x6 = HIGHBD_WRAPLOW(s2 - s6, bd); - x7 = HIGHBD_WRAPLOW(s3 - s7, bd); - x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd); - x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd); - x14 = 
HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd); - - // stage 3 - s0 = x0; - s1 = x1; - s2 = x2; - s3 = x3; - s4 = x4 * cospi_8_64 + x5 * cospi_24_64; - s5 = x4 * cospi_24_64 - x5 * cospi_8_64; - s6 = -x6 * cospi_24_64 + x7 * cospi_8_64; - s7 = x6 * cospi_8_64 + x7 * cospi_24_64; - s8 = x8; - s9 = x9; - s10 = x10; - s11 = x11; - s12 = x12 * cospi_8_64 + x13 * cospi_24_64; - s13 = x12 * cospi_24_64 - x13 * cospi_8_64; - s14 = -x14 * cospi_24_64 + x15 * cospi_8_64; - s15 = x14 * cospi_8_64 + x15 * cospi_24_64; - - x0 = HIGHBD_WRAPLOW(s0 + s2, bd); - x1 = HIGHBD_WRAPLOW(s1 + s3, bd); - x2 = HIGHBD_WRAPLOW(s0 - s2, bd); - x3 = HIGHBD_WRAPLOW(s1 - s3, bd); - x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd); - x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd); - x8 = HIGHBD_WRAPLOW(s8 + s10, bd); - x9 = HIGHBD_WRAPLOW(s9 + s11, bd); - x10 = HIGHBD_WRAPLOW(s8 - s10, bd); - x11 = HIGHBD_WRAPLOW(s9 - s11, bd); - x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd); - x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd); - - // stage 4 - s2 = (-cospi_16_64) * (x2 + x3); - s3 = cospi_16_64 * (x2 - x3); - s6 = cospi_16_64 * (x6 + x7); - s7 = cospi_16_64 * (-x6 + x7); - s10 = cospi_16_64 * (x10 + x11); - s11 = cospi_16_64 * (-x10 + x11); - s14 = (-cospi_16_64) * (x14 + x15); - s15 = cospi_16_64 * (x14 - x15); - - x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd); - x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd); - x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd); - x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd); - x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd); - x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd); - x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd); - x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd); - - output[0] = HIGHBD_WRAPLOW(x0, bd); - output[1] = HIGHBD_WRAPLOW(-x8, bd); - output[2] = HIGHBD_WRAPLOW(x12, bd); - output[3] = HIGHBD_WRAPLOW(-x4, bd); - output[4] = HIGHBD_WRAPLOW(x6, bd); - output[5] = HIGHBD_WRAPLOW(x14, bd); - output[6] = HIGHBD_WRAPLOW(x10, bd); - output[7] = HIGHBD_WRAPLOW(x2, bd); - output[8] = HIGHBD_WRAPLOW(x3, bd); - output[9] = HIGHBD_WRAPLOW(x11, bd); - output[10] = HIGHBD_WRAPLOW(x15, bd); - output[11] = HIGHBD_WRAPLOW(x7, bd); - output[12] = HIGHBD_WRAPLOW(x5, bd); - output[13] = HIGHBD_WRAPLOW(-x13, bd); - output[14] = HIGHBD_WRAPLOW(x9, bd); - output[15] = HIGHBD_WRAPLOW(-x1, bd); } void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c index 52a24ed379a..48b841969b2 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c @@ -7,6 +7,7 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include <stdlib.h> #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/mips/macros_msa.h" @@ -54,3 +55,672 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { return sum_out; } + +void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8); +} + +void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10; + v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5; + v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15; + v8i16 res0, res1, res2, res3, res4, res5, res6, res7; + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src11, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, 
tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + + LD_SH2(src, 8, src0, src8); + src += src_stride; + LD_SH2(src, 8, src1, src9); + src += src_stride; + LD_SH2(src, 8, src2, src10); + src += src_stride; + LD_SH2(src, 8, src3, src11); + src += src_stride; + + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8); + + LD_SH2(src, 8, src4, src12); + src += src_stride; + LD_SH2(src, 8, src5, src13); + src += src_stride; + LD_SH2(src, 8, src6, src14); + src += src_stride; + LD_SH2(src, 8, src7, src15); + src += src_stride; + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3, + tmp4, tmp5, tmp1, tmp6, tmp2); + TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1, + src2, src3, src4, src5, src6, src7); + ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8); + + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8, + src9, src10, src11, src12, src13, src14, src15); + BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10, + tmp12, tmp14, tmp15, tmp13, tmp11, tmp9); + BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9, + src12, src13, src15, src14, src11, src10); + BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15, + tmp11, tmp12, tmp13, tmp9, tmp14, tmp10); + TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0, + res1, res2, res3, res4, res5, res6, res7); + ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8); + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + 
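
/* A scalar model of the combine stage that the BUTTERFLY_8/SRA_4V passes
 * here vectorize: the four 8x8 Hadamard sub-blocks stored at coeff,
 * coeff + 64, coeff + 128 and coeff + 192 are merged with one more 2x2
 * butterfly, halving first so the result stays in int16_t range. This
 * mirrors the C reference vpx_hadamard_16x16_c(); the helper name is
 * illustrative only, not part of the patch. */
static void hadamard_16x16_combine_model(int16_t *coeff) {
  int idx;
  for (idx = 0; idx < 64; ++idx) {
    const int16_t a0 = coeff[0];
    const int16_t a1 = coeff[64];
    const int16_t a2 = coeff[128];
    const int16_t a3 = coeff[192];
    const int16_t b0 = (a0 + a1) >> 1; /* halve to keep 16-bit headroom */
    const int16_t b1 = (a0 - a1) >> 1;
    const int16_t b2 = (a2 + a3) >> 1;
    const int16_t b3 = (a2 - a3) >> 1;
    coeff[0] = b0 + b2;
    coeff[64] = b1 + b3;
    coeff[128] = b0 - b2;
    coeff[192] = b1 - b3;
    ++coeff;
  }
}
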
BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); + dst += 16; + + LD_SH4(dst, 64, src0, src1, src2, src3); + LD_SH4(dst + 8, 64, src4, src5, src6, src7); + + BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4, + tmp6, tmp7, tmp5, tmp3, tmp1); + SRA_4V(tmp0, tmp1, tmp2, tmp3, 1); + SRA_4V(tmp4, tmp5, tmp6, tmp7, 1); + BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4, + src5, src7, src6, src3, src2); + + ST_SH4(src0, src1, src2, src3, dst, 64); + ST_SH4(src4, src5, src6, src7, dst + 8, 64); +} + +int vpx_satd_msa(const int16_t *data, int length) { + int i, satd; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + v8i16 src8, src9, src10, src11, src12, src13, src14, src15; + v8i16 zero = { 0 }; + v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h; + v4u32 tmp0_w = { 0 }; + + if (16 == length) { + LD_SH2(data, 8, src0, src1); + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + satd = HADD_UW_U32(tmp0_w); + } else if (64 == length) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + satd = HADD_UW_U32(tmp0_w); + } else if (256 == length) { + for (i = 0; i < 2; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = 
(v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else if (1024 == length) { + for (i = 0; i < 8; ++i) { + LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7); + data += 8 * 8; + LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15); + data += 8 * 8; + + tmp0_h = (v8u16)__msa_asub_s_h(src0, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src1, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src2, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src3, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src4, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src5, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src6, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src7, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + + tmp0_h = (v8u16)__msa_asub_s_h(src8, zero); + tmp1_h = (v8u16)__msa_asub_s_h(src9, zero); + tmp2_h = (v8u16)__msa_asub_s_h(src10, zero); + tmp3_h = (v8u16)__msa_asub_s_h(src11, zero); + tmp4_h = (v8u16)__msa_asub_s_h(src12, zero); + tmp5_h = (v8u16)__msa_asub_s_h(src13, zero); + tmp6_h = (v8u16)__msa_asub_s_h(src14, zero); + tmp7_h = (v8u16)__msa_asub_s_h(src15, zero); + + tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h); + tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h); + tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h); + tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h); + tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h); + tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h); + tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h); + tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h); + } + + satd = HADD_UW_U32(tmp0_w); + } else { + satd = 0; + + for (i = 0; i < length; ++i) { + satd += abs(data[i]); + } + } + + return satd; +} + +void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref, + const int ref_stride, const int height) { + int i; + v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7; + v8i16 hbuf_r = { 0 }; + v8i16 hbuf_l = { 0 }; + v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l; + v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l; + + if (16 == height) { + for (i = 2; i--;) { + LD_UB8(ref, 
ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 3); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (32 == height) { + for (i = 2; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 4); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else if (64 == height) { + for (i = 4; i--;) { + LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + LD_UB8(ref, ref_stride, ref0, ref1, ref2, 
ref3, ref4, ref5, ref6, ref7); + ref += 8 * ref_stride; + UNPCK_UB_SH(ref0, ref0_r, ref0_l); + UNPCK_UB_SH(ref1, ref1_r, ref1_l); + UNPCK_UB_SH(ref2, ref2_r, ref2_l); + UNPCK_UB_SH(ref3, ref3_r, ref3_l); + UNPCK_UB_SH(ref4, ref4_r, ref4_l); + UNPCK_UB_SH(ref5, ref5_r, ref5_l); + UNPCK_UB_SH(ref6, ref6_r, ref6_l); + UNPCK_UB_SH(ref7, ref7_r, ref7_l); + ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l, + hbuf_r, hbuf_l, hbuf_r, hbuf_l); + } + + SRA_2V(hbuf_r, hbuf_l, 5); + ST_SH2(hbuf_r, hbuf_l, hbuf, 8); + } else { + const int norm_factor = height >> 1; + int cnt; + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] = 0; + } + + for (i = 0; i < height; ++i) { + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] += ref[cnt]; + } + + ref += ref_stride; + } + + for (cnt = 0; cnt < 16; cnt++) { + hbuf[cnt] /= norm_factor; + } + } +} + +int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { + int16_t sum; + v16u8 ref0, ref1, ref2, ref3; + v8u16 ref0_h; + + if (16 == width) { + ref0 = LD_UB(ref); + ref0_h = __msa_hadd_u_h(ref0, ref0); + sum = HADD_UH_U32(ref0_h); + } else if (32 == width) { + LD_UB2(ref, 16, ref0, ref1); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + sum = HADD_UH_U32(ref0_h); + } else if (64 == width) { + LD_UB4(ref, 16, ref0, ref1, ref2, ref3); + ref0_h = __msa_hadd_u_h(ref0, ref0); + ref0_h += __msa_hadd_u_h(ref1, ref1); + ref0_h += __msa_hadd_u_h(ref2, ref2); + ref0_h += __msa_hadd_u_h(ref3, ref3); + sum = HADD_UH_U32(ref0_h); + } else { + int idx; + + sum = 0; + for (idx = 0; idx < width; ++idx) { + sum += ref[idx]; + } + } + + return sum; +} + +int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { + int sse, mean, var; + v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2; + v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m; + v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m; + v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m; + v4i32 res_l7_m, mean_v; + v2i64 sse_v; + + if (2 == bwl) { + LD_SH2(src, 8, src0, src1); + LD_SH2(ref, 8, ref0, ref1); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (3 == bwl) { + LD_SH4(src, 8, src0, src1, src2, src3); + LD_SH4(ref, 8, ref0, ref1, ref2, ref3); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, 
res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else if (4 == bwl) { + LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7); + LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); + + ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); + ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m); + ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m); + ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); + sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v = res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m); + ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m); + ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m); + ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m); + HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); + HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m); + HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m); + HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m); + DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v); + DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v); + DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v); + DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v); + mean_v += res_l0_m + res_l1_m; + mean_v += res_l2_m + res_l3_m; + mean_v += res_l4_m + res_l5_m; + mean_v += res_l6_m + res_l7_m; + + sse_v += __msa_splati_d(sse_v, 1); + sse = __msa_copy_s_w((v4i32)sse_v, 0); + + mean = HADD_SW_S32(mean_v); + } else { + int i; + const int width = 4 << bwl; + + sse = 0; + mean = 0; + + for (i = 0; i < width; ++i) { + const int diff = ref[i] - src[i]; + + mean += diff; + sse += diff * diff; + } + } + + var = sse - ((mean * mean) >> (bwl + 2)); + + return var; +} + +void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp, + int *min, int *max) { + v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7; + v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1; + + LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7); + LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7); + PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3); + PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3); + + diff0 = __msa_asub_u_b(s0, d0); + diff1 = __msa_asub_u_b(s1, d1); + diff2 = __msa_asub_u_b(s2, d2); + diff3 = __msa_asub_u_b(s3, d3); + + min0 = __msa_min_u_b(diff0, diff1); + min1 = __msa_min_u_b(diff2, diff3); + min0 = __msa_min_u_b(min0, min1); + + max0 = __msa_max_u_b(diff0, diff1); + max1 = __msa_max_u_b(diff2, diff3); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8); + max0 = __msa_max_u_b(max0, max1); + + min1 = 
(v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2); + max0 = __msa_max_u_b(max0, max1); + + min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1); + min0 = __msa_min_u_b(min0, min1); + max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1); + max0 = __msa_max_u_b(max0, max1); + + *min = min0[0]; + *max = max0[0]; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c index e33ea740a9e..aafa272fbdf 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c @@ -9,6 +9,7 @@ */ #include <stdlib.h> + #include "./macros_msa.h" extern const int16_t vpx_rv[]; @@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, uint8_t *p_dst_st = dst_ptr; uint8_t *f_orig = f; uint16_t col; + uint64_t out0, out1, out2, out3; v16u8 above2, above1, below2, below1; v16u8 src, ref, ref_temp; v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6; @@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr, f += 16; } + if (0 != (cols / 16)) { + ref = LD_UB(f); + LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1); + src = LD_UB(p_src); + LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0); + above2 = LD_UB(p_src + 3 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1); + above1 = LD_UB(p_src + 4 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2); + src = LD_UB(p_src + 5 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3); + below1 = LD_UB(p_src + 6 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4); + below2 = LD_UB(p_src + 7 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5); + above2 = LD_UB(p_src + 8 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6); + above1 = LD_UB(p_src + 9 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7); + src = LD_UB(p_src + 10 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8); + below1 = LD_UB(p_src + 11 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9); + below2 = LD_UB(p_src + 12 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10); + above2 = LD_UB(p_src + 13 * src_stride); + VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11); + above1 = LD_UB(p_src + 14 * src_stride); + VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12); + src = LD_UB(p_src + 15 * src_stride); + VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13); + below1 = LD_UB(p_src + 16 * src_stride); + VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14); + below2 = LD_UB(p_src + 17 * src_stride); + VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15); + out0 = __msa_copy_u_d((v2i64)inter0, 0); + out1 = __msa_copy_u_d((v2i64)inter1, 0); + out2 = 
__msa_copy_u_d((v2i64)inter2, 0); + out3 = __msa_copy_u_d((v2i64)inter3, 0); + SD4(out0, out1, out2, out3, p_dst, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter4, 0); + out1 = __msa_copy_u_d((v2i64)inter5, 0); + out2 = __msa_copy_u_d((v2i64)inter6, 0); + out3 = __msa_copy_u_d((v2i64)inter7, 0); + SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter8, 0); + out1 = __msa_copy_u_d((v2i64)inter9, 0); + out2 = __msa_copy_u_d((v2i64)inter10, 0); + out3 = __msa_copy_u_d((v2i64)inter11, 0); + SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride); + + out0 = __msa_copy_u_d((v2i64)inter12, 0); + out1 = __msa_copy_u_d((v2i64)inter13, 0); + out2 = __msa_copy_u_d((v2i64)inter14, 0); + out3 = __msa_copy_u_d((v2i64)inter15, 0); + SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride); + } + f = f_orig; p_dst = dst_ptr - 2; LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5, diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h index 002e574aa8f..27b38865a42 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h @@ -1049,6 +1049,7 @@ } #define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__) #define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__) +#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__) /* Description : Interleave even byte elements from vectors Arguments : Inputs - in0, in1, in2, in3 @@ -1559,6 +1560,12 @@ Details : Each element of vector 'in0' is right shifted by 'shift' and the result is written in-place. 'shift' is a GP variable. */ +#define SRA_2V(in0, in1, shift) \ + { \ + in0 = in0 >> shift; \ + in1 = in1 >> shift; \ + } + #define SRA_4V(in0, in1, in2, in3, shift) \ { \ in0 = in0 >> shift; \ diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c new file mode 100644 index 00000000000..d4563dc410b --- /dev/null +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "./vpx_dsp_rtcd.h" +#include "./macros_msa.h" + +uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride, + int size) { + int row, col; + uint64_t ss_res = 0; + v4i32 mul0, mul1; + v2i64 res0 = { 0 }; + + if (4 == size) { + uint64_t src0, src1, src2, src3; + v8i16 diff0 = { 0 }; + v8i16 diff1 = { 0 }; + + LD4(src, src_stride, src0, src1, src2, src3); + INSERT_D2_SH(src0, src1, diff0); + INSERT_D2_SH(src2, src3, diff1); + DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (8 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 = __msa_hadd_s_d(mul0, mul0); + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (16 == size) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + src += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else if (0 == (size % 16)) { + v8i16 src0, src1, src2, src3, src4, src5, src6, src7; + + for (row = 0; row < (size >> 4); row++) { + for (col = 0; col < size; col += 16) { + const int16_t *src_ptr = src + col; + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + src_ptr += 8 * src_stride; + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6, + src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + 
DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5, + src6, src7); + DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1); + DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); + DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1); + DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1); + mul0 += mul1; + res0 += __msa_hadd_s_d(mul0, mul0); + } + + src += 16 * src_stride; + } + + res0 += __msa_splati_d(res0, 1); + ss_res = (uint64_t)__msa_copy_s_d(res0, 0); + } else { + int16_t val; + + for (row = 0; row < size; row++) { + for (col = 0; col < size; col++) { + val = src[col]; + ss_res += val * val; + } + + src += src_stride; + } + } + + return ss_res; +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h index 5656ddbab4d..f1cc0eaa105 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h @@ -48,7 +48,7 @@ typedef const vpx_tree_index vpx_tree[]; static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) { assert(den != 0); { - const int p = (int)(((int64_t)num * 256 + (den >> 1)) / den); + const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den); // (p > 255) ? 255 : (p < 1) ? 1 : p; const int clipped_prob = p | ((255 - p) >> 23) | (p == 0); return (vpx_prob)clipped_prob; diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk index bb20ea27421..ca6e5ca9a83 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk @@ -13,6 +13,13 @@ DSP_SRCS-yes += vpx_dsp_common.h DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h +DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h +DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h +# This file is included in libs.mk. Including it here would cause it to be +# compiled into an object. Even as an empty file, this would create an +# executable section on the stack. 
+#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM) + # bit reader DSP_SRCS-yes += prob.h DSP_SRCS-yes += prob.c @@ -195,9 +202,7 @@ DSP_SRCS-yes += inv_txfm.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm -ifeq ($(ARCH_X86_64),yes) -DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm -endif # ARCH_X86_64 +DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM) @@ -217,26 +222,23 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c else # CONFIG_VP9_HIGHBITDEPTH DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c +DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c endif # !CONFIG_VP9_HIGHBITDEPTH ifeq ($(HAVE_NEON_ASM),yes) DSP_SRCS-yes += arm/idct_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM) DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM) -DSP_SRCS-yes += arm/idct16x16_neon.c else DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c +endif # HAVE_NEON_ASM +DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c -endif # HAVE_NEON_ASM -DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c @@ -249,7 +251,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes) DSP_SRCS-yes += quantize.c DSP_SRCS-yes += quantize.h -DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c @@ -276,6 +277,7 @@ DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c +DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl index ee1b2927938..a17bda582e4 100644 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -536,10 +536,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_fdct4x4_1 sse2/; add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8 sse2/; + specialize qw/vpx_fdct8x8 neon sse2/; add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride"; - specialize qw/vpx_fdct8x8_1 sse2/; + specialize qw/vpx_fdct8x8_1 neon sse2/; add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride"; specialize qw/vpx_fdct16x16 sse2/; @@ -624,13 +624,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_idct8x8_1_add neon/; add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; 
+ specialize qw/vpx_highbd_idct16x16_1_add neon/; add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct32x32_1_add sse2/; + specialize qw/vpx_highbd_idct32x32_1_add neon sse2/; add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; @@ -650,6 +651,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; @@ -670,6 +673,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; } else { add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride"; @@ -679,10 +684,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct4x4_1_add neon sse2/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/; add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/; add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct8x8_1_add neon sse2/; @@ -690,6 +695,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_256_add neon sse2/; + add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_38_add neon sse2/; + $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_10_add neon sse2/; @@ -697,15 +706,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct16x16_1_add neon sse2/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/; add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; # Need to add 135 eob idct32x32 implementations. 
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add neon sse2/; @@ -720,10 +729,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_idct8x8_12_add neon sse2/; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct16x16_256_add sse2/; + specialize qw/vpx_highbd_idct16x16_256_add neon sse2/; + + add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; + specialize qw/vpx_highbd_idct16x16_38_add neon sse2/; + $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2; add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct16x16_10_add sse2/; + specialize qw/vpx_highbd_idct16x16_10_add neon sse2/; } # CONFIG_EMULATE_HARDWARE } else { # Force C versions if CONFIG_EMULATE_HARDWARE is 1 @@ -742,6 +755,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; @@ -766,10 +781,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/; add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/; add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; + specialize qw/vpx_idct8x8_12_add sse2 ssse3 neon dspr2 msa/; add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/; @@ -777,20 +792,26 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/; + add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride"; + specialize qw/vpx_idct16x16_38_add sse2 neon dspr2 msa/; + $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2; + $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2; + $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa; + add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/; add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/; add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int 
stride"; - specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_135_add sse2 ssse3 neon dspr2 msa/; $vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2; $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2; $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa; add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride"; - specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64"; + specialize qw/vpx_idct32x32_34_add sse2 ssse3 neon dspr2 msa/; add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride"; specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/; @@ -883,25 +904,37 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { specialize qw/vpx_avg_4x4 sse2 neon msa/; add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max"; - specialize qw/vpx_minmax_8x8 sse2 neon/; + specialize qw/vpx_minmax_8x8 sse2 neon msa/; - add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; - add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; - specialize qw/vpx_hadamard_16x16 sse2 neon/; + if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64"; + + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff"; + specialize qw/vpx_hadamard_16x16 sse2 neon/; + + add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length"; + specialize qw/vpx_satd sse2 neon/; + } else { + add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64"; + + add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff"; + specialize qw/vpx_hadamard_16x16 sse2 neon msa/; - add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; - specialize qw/vpx_satd sse2 neon/; + add_proto qw/int vpx_satd/, "const int16_t *coeff, int length"; + specialize qw/vpx_satd sse2 neon msa/; + } add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height"; - specialize qw/vpx_int_pro_row sse2 neon/; + specialize qw/vpx_int_pro_row sse2 neon msa/; add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width"; - specialize qw/vpx_int_pro_col sse2 neon/; + specialize qw/vpx_int_pro_col sse2 neon msa/; add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl"; - specialize qw/vpx_vector_var neon sse2/; + specialize qw/vpx_vector_var neon sse2 msa/; } # CONFIG_VP9_ENCODER add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred"; @@ -1039,7 +1072,7 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vpx_sad4x4x4d msa sse2/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 sse2/; +specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; # # Structured Similarity (SSIM) diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c 
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index b0a104bad06..4e89e07e580 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -11,6 +11,8 @@
 #include <emmintrin.h>

 #include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 #include "vpx_ports/mem.h"

 void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -213,7 +215,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
 }

 void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
-                           int16_t *coeff) {
+                           tran_low_t *coeff) {
   __m128i src[8];
   src[0] = _mm_load_si128((const __m128i *)src_diff);
   src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
   hadamard_col8_sse2(src, 0);
   hadamard_col8_sse2(src, 1);

-  _mm_store_si128((__m128i *)coeff, src[0]);
+  store_tran_low(src[0], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[1]);
+  store_tran_low(src[1], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[2]);
+  store_tran_low(src[2], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[3]);
+  store_tran_low(src[3], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[4]);
+  store_tran_low(src[4], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[5]);
+  store_tran_low(src[5], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[6]);
+  store_tran_low(src[6], coeff);
   coeff += 8;
-  _mm_store_si128((__m128i *)coeff, src[7]);
+  store_tran_low(src[7], coeff);
 }

 void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
-                             int16_t *coeff) {
+                             tran_low_t *coeff) {
   int idx;
   for (idx = 0; idx < 4; ++idx) {
     int16_t const *src_ptr =
@@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
   }

   for (idx = 0; idx < 64; idx += 8) {
-    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
-    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
-    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
-    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+    __m128i coeff0 = load_tran_low(coeff);
+    __m128i coeff1 = load_tran_low(coeff + 64);
+    __m128i coeff2 = load_tran_low(coeff + 128);
+    __m128i coeff3 = load_tran_low(coeff + 192);

     __m128i b0 = _mm_add_epi16(coeff0, coeff1);
     __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -271,25 +273,25 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
     coeff0 = _mm_add_epi16(b0, b2);
     coeff1 = _mm_add_epi16(b1, b3);
-    _mm_store_si128((__m128i *)coeff, coeff0);
-    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+    store_tran_low(coeff0, coeff);
+    store_tran_low(coeff1, coeff + 64);

     coeff2 = _mm_sub_epi16(b0, b2);
     coeff3 = _mm_sub_epi16(b1, b3);
-    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
-    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+    store_tran_low(coeff2, coeff + 128);
+    store_tran_low(coeff3, coeff + 192);

     coeff += 8;
   }
 }

-int vpx_satd_sse2(const int16_t *coeff, int length) {
+int vpx_satd_sse2(const tran_low_t *coeff, int length) {
   int i;
   const __m128i zero = _mm_setzero_si128();
   __m128i accum = zero;

   for (i = 0; i < length; i += 8) {
-    const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+    const __m128i src_line = load_tran_low(coeff);
     const __m128i inv = _mm_sub_epi16(zero, src_line);
     const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
     const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
index 26412e8e432..22e0a086cc2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,42 +8,50 @@
 ; be found in the AUTHORS file in the root of the source tree.
 ;

-%define private_prefix vpx
-
 %include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"

 SECTION .text

 %if ARCH_X86_64
 ; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1 m%4, m%2, m%3
-  punpckl%1 m%2, m%3
-  SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X wd, %1, %2, %9
-  INTERLEAVE_2X wd, %3, %4, %9
-  INTERLEAVE_2X wd, %5, %6, %9
-  INTERLEAVE_2X wd, %7, %8, %9
-
-  INTERLEAVE_2X dq, %1, %3, %9
-  INTERLEAVE_2X dq, %2, %4, %9
-  INTERLEAVE_2X dq, %5, %7, %9
-  INTERLEAVE_2X dq, %6, %8, %9
-
-  INTERLEAVE_2X qdq, %1, %5, %9
-  INTERLEAVE_2X qdq, %3, %7, %9
-  INTERLEAVE_2X qdq, %2, %6, %9
-  INTERLEAVE_2X qdq, %4, %8, %9
-
-  SWAP %2, %5
-  SWAP %4, %7
+%macro TRANSPOSE8X8 10
+  ; stage 1
+  punpcklwd  m%9, m%1, m%2
+  punpcklwd  m%10, m%3, m%4
+  punpckhwd  m%1, m%2
+  punpckhwd  m%3, m%4
+
+  punpcklwd  m%2, m%5, m%6
+  punpcklwd  m%4, m%7, m%8
+  punpckhwd  m%5, m%6
+  punpckhwd  m%7, m%8
+
+  ; stage 2
+  punpckldq  m%6, m%9, m%10
+  punpckldq  m%8, m%1, m%3
+  punpckhdq  m%9, m%10
+  punpckhdq  m%1, m%3
+
+  punpckldq  m%10, m%2, m%4
+  punpckldq  m%3, m%5, m%7
+  punpckhdq  m%2, m%4
+  punpckhdq  m%5, m%7
+
+  ; stage 3
+  punpckhqdq m%4, m%9, m%2   ; out3
+  punpcklqdq m%9, m%2        ; out2
+  punpcklqdq m%7, m%1, m%5   ; out6
+  punpckhqdq m%1, m%5        ; out7
+
+  punpckhqdq m%2, m%6, m%10  ; out1
+  punpcklqdq m%6, m%10       ; out0
+  punpcklqdq m%5, m%8, m%3   ; out4
+  punpckhqdq m%8, m%3        ; out5
+
+  SWAP %6, %1
+  SWAP %3, %9
+  SWAP %8, %6
 %endmacro

 %macro HMD8_1D 0
@@ -87,8 +95,9 @@ SECTION .text
   SWAP 7, 9
 %endmacro

+
 INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+cglobal hadamard_8x8, 3, 5, 11, input, stride, output
   lea r3, [2 * strideq]
   lea r4, [4 * strideq]
@@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output
   mova m7, [inputq + r3]

   HMD8_1D
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
   HMD8_1D

-  mova [outputq +   0], m0
-  mova [outputq +  16], m1
-  mova [outputq +  32], m2
-  mova [outputq +  48], m3
-  mova [outputq +  64], m4
-  mova [outputq +  80], m5
-  mova [outputq +  96], m6
-  mova [outputq + 112], m7
+  STORE_TRAN_LOW 0, outputq,  0, 8, 9
+  STORE_TRAN_LOW 1, outputq,  8, 8, 9
+  STORE_TRAN_LOW 2, outputq, 16, 8, 9
+  STORE_TRAN_LOW 3, outputq, 24, 8, 9
+  STORE_TRAN_LOW 4, outputq, 32, 8, 9
+  STORE_TRAN_LOW 5, outputq, 40, 8, 9
+  STORE_TRAN_LOW 6, outputq, 48, 8, 9
+  STORE_TRAN_LOW 7, outputq, 56, 8, 9

   RET
 %endif
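The store_tran_low()/load_tran_low() calls introduced above only change behavior when CONFIG_VP9_HIGHBITDEPTH widens tran_low_t to 32 bits: stores sign-extend 16-bit results, loads pack 32-bit coefficients back down with saturation. A scalar model of the two helpers (a sketch for exposition, not the project's code, which operates on whole __m128i registers):

  #include <stdint.h>

  typedef int32_t tran_low_t; /* high-bit-depth configuration */

  /* store_tran_low: sign-extend each 16-bit lane to 32 bits. */
  static void store_tran_low_scalar(const int16_t *src, tran_low_t *dst, int n) {
    int i;
    for (i = 0; i < n; ++i) dst[i] = src[i]; /* implicit sign extension */
  }

  /* load_tran_low: saturating pack of 32-bit lanes down to 16 bits,
   * matching _mm_packs_epi32 / packssdw. */
  static void load_tran_low_scalar(const tran_low_t *src, int16_t *dst, int n) {
    int i;
    for (i = 0; i < n; ++i) {
      const tran_low_t v = src[i];
      dst[i] = (int16_t)(v > 32767 ? 32767 : (v < -32768 ? -32768 : v));
    }
  }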
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 00000000000..b9116f04981
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 16 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+  return _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+#else
+  return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}
+
+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 00000000000..aacf71f7ac6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,90 @@
+;
+;  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Increment register by sizeof() tran_low_t * 8.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+  add %1, 32
+%else
+  add %1, 16
+%endif
+%endmacro
+
+; Increment %1 by sizeof() tran_low_t * %2.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+  lea %1, [%1 + %2 * 4]
+%else
+  lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load %2 + %3 into m%1.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits.
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova     m%1, [%2 + (%3) * 4]
+  packssdw m%1, [%2 + (%3) * 4 + 16]
+%else
+  mova     m%1, [%2 + (%3) * 2]
+%endif
+%endmacro
+
+; Store m%1 to %2 + %3.
+; %3 is the offset in elements, not bytes.
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign
+; extend the values first.
+; Uses m%4-m%6 as scratch registers for high bit depth.
+%macro STORE_TRAN_LOW 5-6
+%if CONFIG_VP9_HIGHBITDEPTH
+  pxor      m%4, m%4
+  mova      m%5, m%1
+  %if %0 == 6
+  mova      m%6, m%1
+  %endif
+  pcmpgtw   m%4, m%1
+  punpcklwd m%5, m%4
+  %if %0 == 5
+  punpckhwd m%1, m%4
+  %else
+  punpckhwd m%6, m%4
+  %endif
+  mova [%2 + (%3) * 4 +  0], m%5
+  %if %0 == 5
+  mova [%2 + (%3) * 4 + 16], m%1
+  %else
+  mova [%2 + (%3) * 4 + 16], m%6
+  %endif
+%else
+  mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+  mova [%2 + (%3) * 4 +  0], m%1
+  mova [%2 + (%3) * 4 + 16], m%1
+%else
+  mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
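STORE_TRAN_LOW's pcmpgtw/punpcklwd/punpckhwd sequence is the standard SSE2 idiom for widening signed words: comparing zero against the value yields an all-ones mask exactly where the value is negative, and interleaving the value words with their sign mask produces correctly sign-extended dwords. The same idiom rendered in intrinsics (a sketch, not code from this change):

  #include <emmintrin.h>

  /* Widen eight signed 16-bit lanes of v into two vectors of 32-bit lanes. */
  static void sign_extend_epi16(__m128i v, __m128i *lo, __m128i *hi) {
    const __m128i sign = _mm_cmpgt_epi16(_mm_setzero_si128(), v); /* pcmpgtw */
    *lo = _mm_unpacklo_epi16(v, sign); /* punpcklwd: low four lanes */
    *hi = _mm_unpackhi_epi16(v, sign); /* punpckhwd: high four lanes */
  }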
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
index 54a6d81fcbc..5d1d7795723 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,5 +1,5 @@
 /*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
@@ -7,8 +7,8 @@
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
-#ifndef VPX_DSP_X86_FDCT_H_
-#define VPX_DSP_X86_FDCT_H_
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_

 #include <xmmintrin.h>

@@ -16,13 +16,12 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"

-// Load 8 16 bit values. If the source is 32 bits then cast down.
-// This does not saturate values. It only truncates.
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
 static INLINE __m128i load_tran_low(const tran_low_t *a) {
 #if CONFIG_VP9_HIGHBITDEPTH
-  return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
-                        (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
-                        (int16_t)a[6], (int16_t)a[7]);
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
 #else
   return _mm_load_si128((const __m128i *)a);
 #endif
@@ -54,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
   _mm_store_si128((__m128i *)(a), zero);
 #endif
 }
-#endif  // VPX_DSP_X86_FDCT_H_
+#endif  // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
index ebca50930a0..bd8fd12480e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -78,8 +78,8 @@
 %endmacro

 %macro UPDATE_FLIMIT 0
-  movdqa xmm2, XMMWORD PTR [rbx]
-  movdqa [rsp], xmm2
+  movdqu xmm2, XMMWORD PTR [rbx]
+  movdqu [rsp], xmm2
   add rbx, 16
 %endmacro
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 78a1dbb24f8..b433874f28d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -10,10 +10,6 @@

 %include "third_party/x86inc/x86inc.asm"

-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
 SECTION_RODATA

 pw_11585x2: times 8 dw 23170
@@ -32,107 +28,11 @@ TRANSFORM_COEFFS 9102, 13623
 SECTION .text

 %if ARCH_X86_64
-%macro SUM_SUB 3
-  psubw  m%3, m%1, m%2
-  paddw  m%1, m%2
-  SWAP    %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
-  pmaddwd  m%1, m%3, %5
-  pmaddwd  m%2, m%3, %6
-  paddd    m%1,  %4
-  paddd    m%2,  %4
-  psrad    m%1, 14
-  psrad    m%2, 14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
-  punpckhwd m%6, m%2, m%1
-  MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4]
-  punpcklwd m%2, m%1
-  MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4]
-  packssdw m%1, m%7
-  packssdw m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
-  punpckh%1 m%4, m%2, m%3
-  punpckl%1 m%2, m%3
-  SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
-  INTERLEAVE_2X wd, %1, %2, %9
-  INTERLEAVE_2X wd, %3, %4, %9
-  INTERLEAVE_2X wd, %5, %6, %9
-  INTERLEAVE_2X wd, %7, %8, %9
-
-  INTERLEAVE_2X dq, %1, %3, %9
-  INTERLEAVE_2X dq, %2, %4, %9
-  INTERLEAVE_2X dq, %5, %7, %9
-  INTERLEAVE_2X dq, %6, %8, %9
-
-  INTERLEAVE_2X qdq, %1, %5, %9
-  INTERLEAVE_2X qdq, %3, %7, %9
-  INTERLEAVE_2X qdq, %2, %6, %9
-  INTERLEAVE_2X qdq, %4, %8, %9
-
-  SWAP %2, %5
-  SWAP %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
-  SUM_SUB 0, 7, 9
-  SUM_SUB 1, 6, 9
-  SUM_SUB 2, 5, 9
-  SUM_SUB 3, 4, 9
-
-  SUM_SUB 0, 3, 9
-  SUM_SUB 1, 2, 9
-  SUM_SUB 6, 5, 9
-%if %1 == 0
-  SUM_SUB 0, 1, 9
-%endif
-
-  BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10
-
-  pmulhrsw m6, m12
-  pmulhrsw m5, m12
-%if %1 == 0
-  pmulhrsw m0, m12
-  pmulhrsw m1, m12
-%else
-  BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10
-  SWAP 0, 1
-%endif
-
-  SUM_SUB 4, 5, 9
-  SUM_SUB 7, 6, 9
-  BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10
-  BUTTERFLY_4X 5, 6, 13623,  9102, m8, 9, 10
-  SWAP 1, 4
-  SWAP 3, 6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
-  psraw  m%3, m%1, 15
-  psraw  m%4, m%2, 15
-  psubw  m%1, m%3
-  psubw  m%2, m%4
-  psraw  m%1, 1
-  psraw  m%2, 1
-%endmacro
-
 INIT_XMM ssse3
 cglobal fdct8x8, 3, 5, 13, input, output, stride

   mova m8, [pd_8192]
   mova m12, [pw_11585x2]
-  pxor m11, m11

   lea r3, [2 * strideq]
   lea r4, [4 * strideq]
@@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
   psllw m7, 2

   ; column transform
-  FDCT8_1D  0
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  FDCT8_1D  1
-  TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
-  DIVIDE_ROUND_2X   0, 1, 9, 10
-  DIVIDE_ROUND_2X   2, 3, 9, 10
-  DIVIDE_ROUND_2X   4, 5, 9, 10
-  DIVIDE_ROUND_2X   6, 7, 9, 10
-
-  mova [outputq +   0], m0
-  mova [outputq +  16], m1
-  mova [outputq +  32], m2
-  mova [outputq +  48], m3
-  mova [outputq +  64], m4
-  mova [outputq +  80], m5
-  mova [outputq +  96], m6
-  mova [outputq + 112], m7
+  ; stage 1
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  paddw m9, m1, m6
+  psubw m1, m6
+
+  paddw m7, m2, m5
+  psubw m2, m5
+
+  paddw m6, m3, m4
+  psubw m3, m4
+
+  ; stage 2
+  paddw m5, m9, m7
+  psubw m9, m7
+
+  paddw m4, m10, m6
+  psubw m10, m6
+
+  paddw m7, m1, m2
+  psubw m1, m2
+
+  ; stage 3
+  paddw m6, m4, m5
+  psubw m4, m5
+
+  pmulhrsw m1, m12
+  pmulhrsw m7, m12
+
+  ; sin(pi / 8), cos(pi / 8)
+  punpcklwd m2, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m5, m2, [pw_15137_6270]
+  pmaddwd m2, [pw_6270_m15137]
+  pmaddwd m9, m10, [pw_15137_6270]
+  pmaddwd m10, [pw_6270_m15137]
+  paddd m5, m8
+  paddd m2, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m2, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m5, m9
+  packssdw m2, m10
+
+  pmulhrsw m6, m12
+  pmulhrsw m4, m12
+
+  paddw m9, m3, m1
+  psubw m3, m1
+
+  paddw m10, m0, m7
+  psubw m0, m7
+
+  ; stage 4
+  ; sin(pi / 16), cos(pi / 16)
+  punpcklwd m1, m10, m9
+  punpckhwd m10, m9
+  pmaddwd m7, m1, [pw_16069_3196]
+  pmaddwd m1, [pw_3196_m16069]
+  pmaddwd m9, m10, [pw_16069_3196]
+  pmaddwd m10, [pw_3196_m16069]
+  paddd m7, m8
+  paddd m1, m8
+  paddd m9, m8
+  paddd m10, m8
+  psrad m7, 14
+  psrad m1, 14
+  psrad m9, 14
+  psrad m10, 14
+  packssdw m7, m9
+  packssdw m1, m10
+
+  ; sin(3 * pi / 16), cos(3 * pi / 16)
+  punpcklwd m11, m0, m3
+  punpckhwd m0, m3
+  pmaddwd m9, m11, [pw_9102_13623]
+  pmaddwd m11, [pw_13623_m9102]
+  pmaddwd m3, m0, [pw_9102_13623]
+  pmaddwd m0, [pw_13623_m9102]
+  paddd m9, m8
+  paddd m11, m8
+  paddd m3, m8
+  paddd m0, m8
+  psrad m9, 14
+  psrad m11, 14
+  psrad m3, 14
+  psrad m0, 14
+  packssdw m9, m3
+  packssdw m11, m0
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m6, m7
+  punpcklwd m3, m5, m11
+  punpckhwd m6, m7
+  punpckhwd m5, m11
+  punpcklwd m7, m4, m9
+  punpcklwd m10, m2, m1
+  punpckhwd m4, m9
+  punpckhwd m2, m1
+
+  ; stage 2
+  punpckldq m9, m0, m3
+  punpckldq m1, m6, m5
+  punpckhdq m0, m3
+  punpckhdq m6, m5
+  punpckldq m3, m7, m10
+  punpckldq m5, m4, m2
+  punpckhdq m7, m10
+  punpckhdq m4, m2
+
+  ; stage 3
+  punpcklqdq m10, m9, m3
+  punpckhqdq m9, m3
+  punpcklqdq m2, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m3, m1, m5
+  punpckhqdq m1, m5
+  punpcklqdq m7, m6, m4
+  punpckhqdq m6, m4
+
+  ; row transform
+  ; stage 1
+  paddw m5, m10, m6
+  psubw m10, m6
+
+  paddw m4, m9, m7
+  psubw m9, m7
+
+  paddw m6, m2, m1
+  psubw m2, m1
+
+  paddw m7, m0, m3
+  psubw m0, m3
+
+  ;stage 2
+  paddw m1, m5, m7
+  psubw m5, m7
+
+  paddw m3, m4, m6
+  psubw m4, m6
+
+  paddw m7, m9, m2
+  psubw m9, m2
+
+  ; stage 3
+  punpcklwd m6, m1, m3
+  punpckhwd m1, m3
+  pmaddwd m2, m6, [pw_11585_11585]
+  pmaddwd m6, [pw_11585_m11585]
+  pmaddwd m3, m1, [pw_11585_11585]
+  pmaddwd m1, [pw_11585_m11585]
+  paddd m2, m8
+  paddd m6, m8
+  paddd m3, m8
+  paddd m1, m8
+  psrad m2, 14
+  psrad m6, 14
+  psrad m3, 14
+  psrad m1, 14
+  packssdw m2, m3
+  packssdw m6, m1
+
+  pmulhrsw m7, m12
+  pmulhrsw m9, m12
+
+  punpcklwd m3, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m1, m3, [pw_15137_6270]
+  pmaddwd m3, [pw_6270_m15137]
+  pmaddwd m4, m5, [pw_15137_6270]
+  pmaddwd m5, [pw_6270_m15137]
+  paddd m1, m8
+  paddd m3, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m1, 14
+  psrad m3, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m1, m4
+  packssdw m3, m5
+
+  paddw m4, m0, m9
+  psubw m0, m9
+
+  paddw m5, m10, m7
+  psubw m10, m7
+
+  ; stage 4
+  punpcklwd m9, m5, m4
+  punpckhwd m5, m4
+  pmaddwd m7, m9, [pw_16069_3196]
+  pmaddwd m9, [pw_3196_m16069]
+  pmaddwd m4, m5, [pw_16069_3196]
+  pmaddwd m5, [pw_3196_m16069]
+  paddd m7, m8
+  paddd m9, m8
+  paddd m4, m8
+  paddd m5, m8
+  psrad m7, 14
+  psrad m9, 14
+  psrad m4, 14
+  psrad m5, 14
+  packssdw m7, m4
+  packssdw m9, m5
+
+  punpcklwd m4, m10, m0
+  punpckhwd m10, m0
+  pmaddwd m5, m4, [pw_9102_13623]
+  pmaddwd m4, [pw_13623_m9102]
+  pmaddwd m0, m10, [pw_9102_13623]
+  pmaddwd m10, [pw_13623_m9102]
+  paddd m5, m8
+  paddd m4, m8
+  paddd m0, m8
+  paddd m10, m8
+  psrad m5, 14
+  psrad m4, 14
+  psrad m0, 14
+  psrad m10, 14
+  packssdw m5, m0
+  packssdw m4, m10
+
+  ; transpose
+  ; stage 1
+  punpcklwd m0, m2, m7
+  punpcklwd m10, m1, m4
+  punpckhwd m2, m7
+  punpckhwd m1, m4
+  punpcklwd m7, m6, m5
+  punpcklwd m4, m3, m9
+  punpckhwd m6, m5
+  punpckhwd m3, m9
+
+  ; stage 2
+  punpckldq m5, m0, m10
+  punpckldq m9, m2, m1
+  punpckhdq m0, m10
+  punpckhdq m2, m1
+  punpckldq m10, m7, m4
+  punpckldq m1, m6, m3
+  punpckhdq m7, m4
+  punpckhdq m6, m3
+
+  ; stage 3
+  punpcklqdq m4, m5, m10
+  punpckhqdq m5, m10
+  punpcklqdq m3, m0, m7
+  punpckhqdq m0, m7
+  punpcklqdq m10, m9, m1
+  punpckhqdq m9, m1
+  punpcklqdq m7, m2, m6
+  punpckhqdq m2, m6
+
+  psraw m1, m4, 15
+  psraw m6, m5, 15
+  psraw m8, m3, 15
+  psraw m11, m0, 15
+
+  psubw m4, m1
+  psubw m5, m6
+  psubw m3, m8
+  psubw m0, m11
+
+  psraw m4, 1
+  psraw m5, 1
+  psraw m3, 1
+  psraw m0, 1
+
+  psraw m1, m10, 15
+  psraw m6, m9, 15
+  psraw m8, m7, 15
+  psraw m11, m2, 15
+
+  psubw m10, m1
+  psubw m9, m6
+  psubw m7, m8
+  psubw m2, m11
+
+  psraw m10, 1
+  psraw m9, 1
+  psraw m7, 1
+  psraw m2, 1
+
+  mova [outputq +   0], m4
+  mova [outputq +  16], m5
+  mova [outputq +  32], m3
+  mova [outputq +  48], m0
+  mova [outputq +  64], m10
+  mova [outputq +  80], m9
+  mova [outputq +  96], m7
+  mova [outputq + 112], m2

   RET
 %endif
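The psraw-by-15 / psubw / psraw-by-1 run that now closes fdct8x8 is the final divide-by-two scaling of the 8x8 forward DCT, rounding negative values toward zero; it mirrors the vhsubq_s16-based ending of the NEON version of this transform. In scalar terms (a sketch, per 16-bit lane):

  #include <stdint.h>

  /* (x - (x >> 15)) >> 1: the arithmetic shift yields -1 for negative x,
   * so subtracting it adds 1 to negatives before halving, i.e. the
   * division by two rounds toward zero. */
  static int16_t fdct8x8_final_shift(int16_t x) {
    const int16_t sign = (int16_t)(x >> 15); /* -1 if x < 0, else 0 */
    return (int16_t)((x - sign) >> 1);
  }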
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 487a474a675..33909ba8159 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -263,100 +263,6 @@ void iadst4_sse2(__m128i *in) {
   in[1] = _mm_packs_epi32(u[2], u[3]);
 }

-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7) \
-  { \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
-    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
-    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
-    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
-    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
-    \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
-    \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
-    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
-    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
-    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
-  }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
-  { \
-    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
-    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
-    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
-    \
-    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
-    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
-    \
-    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
-    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
-    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
-    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
-  }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
-  { \
-    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
-    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
-    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
-    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
-  }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
-                               res0, res1, res2, res3) \
-  { \
-    tmp0 = _mm_madd_epi16(lo_0, cst0); \
-    tmp1 = _mm_madd_epi16(hi_0, cst0); \
-    tmp2 = _mm_madd_epi16(lo_0, cst1); \
-    tmp3 = _mm_madd_epi16(hi_0, cst1); \
-    tmp4 = _mm_madd_epi16(lo_1, cst2); \
-    tmp5 = _mm_madd_epi16(hi_1, cst2); \
-    tmp6 = _mm_madd_epi16(lo_1, cst3); \
-    tmp7 = _mm_madd_epi16(hi_1, cst3); \
-    \
-    tmp0 = _mm_add_epi32(tmp0, rounding); \
-    tmp1 = _mm_add_epi32(tmp1, rounding); \
-    tmp2 = _mm_add_epi32(tmp2, rounding); \
-    tmp3 = _mm_add_epi32(tmp3, rounding); \
-    tmp4 = _mm_add_epi32(tmp4, rounding); \
-    tmp5 = _mm_add_epi32(tmp5, rounding); \
-    tmp6 = _mm_add_epi32(tmp6, rounding); \
-    tmp7 = _mm_add_epi32(tmp7, rounding); \
-    \
-    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
-    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
-    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
-    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
-    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
-    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
-    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
-    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
-    \
-    res0 = _mm_packs_epi32(tmp0, tmp1); \
-    res1 = _mm_packs_epi32(tmp2, tmp3); \
-    res2 = _mm_packs_epi32(tmp4, tmp5); \
-    res3 = _mm_packs_epi32(tmp6, tmp7); \
-  }
-
 #define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
   { \
     tmp0 = _mm_madd_epi16(lo_0, cst0); \
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index d762a04abcd..d5683ab1cf0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -46,6 +46,36 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
   res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
   res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
 }
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7) \
+  { \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+    const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+    const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+    const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+    const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+    \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+    \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+    out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+    out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+    out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+    out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+  }

 #define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
   { \
@@ -186,6 +216,69 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
   RECON_AND_STORE(dest + 15 * stride, in[15]);
 }

+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
+  { \
+    const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+    const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+    const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+    \
+    const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+    const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+    const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+    \
+    out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+    out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+    out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+    out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+  }
+
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+  { \
+    const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+    const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+    out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+    out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+  }
+
+// Define Macro for multiplying elements by constants and adding them together.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
+                               res0, res1, res2, res3) \
+  { \
+    tmp0 = _mm_madd_epi16(lo_0, cst0); \
+    tmp1 = _mm_madd_epi16(hi_0, cst0); \
+    tmp2 = _mm_madd_epi16(lo_0, cst1); \
+    tmp3 = _mm_madd_epi16(hi_0, cst1); \
+    tmp4 = _mm_madd_epi16(lo_1, cst2); \
+    tmp5 = _mm_madd_epi16(hi_1, cst2); \
+    tmp6 = _mm_madd_epi16(lo_1, cst3); \
+    tmp7 = _mm_madd_epi16(hi_1, cst3); \
+    \
+    tmp0 = _mm_add_epi32(tmp0, rounding); \
+    tmp1 = _mm_add_epi32(tmp1, rounding); \
+    tmp2 = _mm_add_epi32(tmp2, rounding); \
+    tmp3 = _mm_add_epi32(tmp3, rounding); \
+    tmp4 = _mm_add_epi32(tmp4, rounding); \
+    tmp5 = _mm_add_epi32(tmp5, rounding); \
+    tmp6 = _mm_add_epi32(tmp6, rounding); \
+    tmp7 = _mm_add_epi32(tmp7, rounding); \
+    \
+    tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+    tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+    tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+    tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+    tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+    tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+    tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+    tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+    \
+    res0 = _mm_packs_epi32(tmp0, tmp1); \
+    res1 = _mm_packs_epi32(tmp2, tmp3); \
+    res2 = _mm_packs_epi32(tmp4, tmp5); \
+    res3 = _mm_packs_epi32(tmp6, tmp7); \
+  }
+
 void idct4_sse2(__m128i *in);
 void idct8_sse2(__m128i *in);
 void idct16_sse2(__m128i *in0, __m128i *in1);
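MULTIPLICATION_AND_ADD, now shared through inv_txfm_sse2.h, is the SSE2 form of the idct rotation butterfly: pmaddwd against interleaved cosine pairs followed by the usual dct_const_round_shift. Element-wise it computes roughly the following (scalar sketch; DCT_CONST_BITS/DCT_CONST_ROUNDING are the real constants from vpx_dsp/txfm_common.h, but the sign convention of the pair constants varies per call site):

  #include <stdint.h>

  #define DCT_CONST_BITS 14
  #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

  /* One rotation butterfly over a coefficient pair (a, b). */
  static void butterfly(int16_t a, int16_t b, int16_t c0, int16_t c1,
                        int16_t *out0, int16_t *out1) {
    const int32_t t0 = a * c0 + b * c1; /* one pmaddwd lane */
    const int32_t t1 = a * c1 - b * c0; /* companion pmaddwd lane */
    *out0 = (int16_t)((t0 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
    *out1 = (int16_t)((t1 + DCT_CONST_ROUNDING) >> DCT_CONST_BITS);
  }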
+ */ + +#include <tmmintrin.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/inv_txfm_sse2.h" +#include "vpx_dsp/x86/txfm_common_sse2.h" + +void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + int i; + + // Load input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + in4 = load_input_data(input + 8 * 4); + in5 = load_input_data(input + 8 * 5); + in6 = load_input_data(input + 8 * 6); + in7 = load_input_data(input + 8 * 7); + + // 2-D + for (i = 0; i < 2; i++) { + // 8x8 Transpose is copied from vpx_fdct8x8_sse2() + TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, + in4, in5, in6, in7); + + // 4-stage 1D idct8x8 + { + /* Stage1 */ + { + const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); + const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); + const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); + const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); + + { + tmp0 = _mm_madd_epi16(lo_17, stg1_0); + tmp1 = _mm_madd_epi16(hi_17, stg1_0); + tmp2 = _mm_madd_epi16(lo_17, stg1_1); + tmp3 = _mm_madd_epi16(hi_17, stg1_1); + tmp4 = _mm_madd_epi16(lo_35, stg1_2); + tmp5 = _mm_madd_epi16(hi_35, stg1_2); + tmp6 = _mm_madd_epi16(lo_35, stg1_3); + tmp7 = _mm_madd_epi16(hi_35, stg1_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + tmp6 = _mm_add_epi32(tmp6, rounding); + tmp7 = _mm_add_epi32(tmp7, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + tmp4 = _mm_srai_epi32(tmp4, 14); + tmp5 = _mm_srai_epi32(tmp5, 14); + tmp6 = _mm_srai_epi32(tmp6, 14); + tmp7 = _mm_srai_epi32(tmp7, 14); + + stp1_4 = _mm_packs_epi32(tmp0, tmp1); + stp1_7 = _mm_packs_epi32(tmp2, tmp3); + stp1_5 = _mm_packs_epi32(tmp4, tmp5); + stp1_6 = _mm_packs_epi32(tmp6, tmp7); + } + } + + /* Stage2 */ + { + const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); + const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); + + { + tmp0 = _mm_unpacklo_epi16(in0, in4); + tmp1 = _mm_unpackhi_epi16(in0, in4); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, stk2_0); + tmp4 = _mm_madd_epi16(tmp0, stk2_1); + tmp5 = _mm_madd_epi16(tmp1, stk2_1); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = 
_mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp2_0 = _mm_packs_epi32(tmp2, tmp3); + stp2_1 = _mm_packs_epi32(tmp4, tmp5); + + tmp0 = _mm_madd_epi16(lo_26, stg2_2); + tmp1 = _mm_madd_epi16(hi_26, stg2_2); + tmp2 = _mm_madd_epi16(lo_26, stg2_3); + tmp3 = _mm_madd_epi16(hi_26, stg2_3); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, 14); + tmp1 = _mm_srai_epi32(tmp1, 14); + tmp2 = _mm_srai_epi32(tmp2, 14); + tmp3 = _mm_srai_epi32(tmp3, 14); + + stp2_2 = _mm_packs_epi32(tmp0, tmp1); + stp2_3 = _mm_packs_epi32(tmp2, tmp3); + } + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + } + + /* Stage3 */ + { + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp4 = _mm_madd_epi16(tmp0, stk2_0); + tmp5 = _mm_madd_epi16(tmp1, stk2_0); + + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp4 = _mm_add_epi32(tmp4, rounding); + tmp5 = _mm_add_epi32(tmp5, rounding); + + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); + tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + stp1_6 = _mm_packs_epi32(tmp4, tmp5); + } + + /* Stage4 */ + in0 = _mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + } + } + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i final_rounding = _mm_set1_epi16(1 << 4); + const __m128i 
stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + + __m128i in0, in1, in2, in3, in4, in5, in6, in7; + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7; + __m128i tmp0, tmp1, tmp2, tmp3; + + // Rows. Load 4-row input data. + in0 = load_input_data(input); + in1 = load_input_data(input + 8 * 1); + in2 = load_input_data(input + 8 * 2); + in3 = load_input_data(input + 8 * 3); + + // 8x4 Transpose + TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1); + + // Stage1 + tmp0 = _mm_mulhrs_epi16(in0, stg1_0); + tmp1 = _mm_mulhrs_epi16(in0, stg1_1); + tmp2 = _mm_mulhrs_epi16(in1, stg1_2); + tmp3 = _mm_mulhrs_epi16(in1, stg1_3); + + stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1); + stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3); + + // Stage2 + tmp0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0); + + tmp1 = _mm_mulhrs_epi16(in1, stg2_2); + tmp2 = _mm_mulhrs_epi16(in1, stg2_3); + stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1); + + tmp0 = _mm_add_epi16(stp1_4, stp1_5); + tmp1 = _mm_sub_epi16(stp1_4, stp1_5); + + stp2_4 = tmp0; + stp2_5 = _mm_unpacklo_epi64(tmp1, zero); + stp2_6 = _mm_unpackhi_epi64(tmp1, zero); + + tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6); + tmp1 = _mm_madd_epi16(tmp0, stg3_0); + tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0 + + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp1, tmp2); + + // Stage3 + tmp2 = _mm_add_epi16(stp2_0, stp2_2); + tmp3 = _mm_sub_epi16(stp2_0, stp2_2); + + stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2); + stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2); + + // Stage4 + tmp0 = _mm_add_epi16(stp1_3, stp2_4); + tmp1 = _mm_add_epi16(stp1_2, stp1_5); + tmp2 = _mm_sub_epi16(stp1_3, stp2_4); + tmp3 = _mm_sub_epi16(stp1_2, stp1_5); + + TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3) + + /* Stage1 */ + stp1_4 = _mm_mulhrs_epi16(in1, stg1_0); + stp1_7 = _mm_mulhrs_epi16(in1, stg1_1); + stp1_5 = _mm_mulhrs_epi16(in3, stg1_2); + stp1_6 = _mm_mulhrs_epi16(in3, stg1_3); + + /* Stage2 */ + stp2_0 = _mm_mulhrs_epi16(in0, stg2_0); + stp2_1 = _mm_mulhrs_epi16(in0, stg2_0); + + stp2_2 = _mm_mulhrs_epi16(in2, stg2_2); + stp2_3 = _mm_mulhrs_epi16(in2, stg2_3); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + /* Stage3 */ + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5); + tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5); + + tmp2 = _mm_madd_epi16(tmp0, stk2_0); + tmp3 = _mm_madd_epi16(tmp1, 
stk2_0); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + tmp2 = _mm_madd_epi16(tmp0, stk2_1); + tmp3 = _mm_madd_epi16(tmp1, stk2_1); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + stp1_5 = _mm_packs_epi32(tmp2, tmp3); + + /* Stage4 */ + in0 = _mm_add_epi16(stp1_0, stp2_7); + in1 = _mm_add_epi16(stp1_1, stp1_6); + in2 = _mm_add_epi16(stp1_2, stp1_5); + in3 = _mm_add_epi16(stp1_3, stp2_4); + in4 = _mm_sub_epi16(stp1_3, stp2_4); + in5 = _mm_sub_epi16(stp1_2, stp1_5); + in6 = _mm_sub_epi16(stp1_1, stp1_6); + in7 = _mm_sub_epi16(stp1_0, stp2_7); + + // Final rounding and shift + in0 = _mm_adds_epi16(in0, final_rounding); + in1 = _mm_adds_epi16(in1, final_rounding); + in2 = _mm_adds_epi16(in2, final_rounding); + in3 = _mm_adds_epi16(in3, final_rounding); + in4 = _mm_adds_epi16(in4, final_rounding); + in5 = _mm_adds_epi16(in5, final_rounding); + in6 = _mm_adds_epi16(in6, final_rounding); + in7 = _mm_adds_epi16(in7, final_rounding); + + in0 = _mm_srai_epi16(in0, 5); + in1 = _mm_srai_epi16(in1, 5); + in2 = _mm_srai_epi16(in2, 5); + in3 = _mm_srai_epi16(in3, 5); + in4 = _mm_srai_epi16(in4, 5); + in5 = _mm_srai_epi16(in5, 5); + in6 = _mm_srai_epi16(in6, 5); + in7 = _mm_srai_epi16(in7, 5); + + RECON_AND_STORE(dest + 0 * stride, in0); + RECON_AND_STORE(dest + 1 * stride, in1); + RECON_AND_STORE(dest + 2 * stride, in2); + RECON_AND_STORE(dest + 3 * stride, in3); + RECON_AND_STORE(dest + 4 * stride, in4); + RECON_AND_STORE(dest + 5 * stride, in5); + RECON_AND_STORE(dest + 6 * stride, in6); + RECON_AND_STORE(dest + 7 * stride, in7); +} + +static INLINE void idct32_34(const __m128i *in, __m128i *stp1) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + // idct constants for each stage + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i 
stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage1 */ + + stp1[16] = _mm_mulhrs_epi16(in[1], stk1_0); + stp1[31] = _mm_mulhrs_epi16(in[1], stk1_1); + + stp1[19] = _mm_mulhrs_epi16(in[7], stk1_6); + stp1[28] = _mm_mulhrs_epi16(in[7], stk1_7); + + stp1[20] = _mm_mulhrs_epi16(in[5], stk1_8); + stp1[27] = _mm_mulhrs_epi16(in[5], stk1_9); + + stp1[23] = _mm_mulhrs_epi16(in[3], stk1_14); + stp1[24] = _mm_mulhrs_epi16(in[3], stk1_15); + + /* Stage2 */ + + stp2_8 = _mm_mulhrs_epi16(in[2], stk2_0); + stp2_15 = _mm_mulhrs_epi16(in[2], stk2_1); + + stp2_11 = _mm_mulhrs_epi16(in[6], stk2_6); + stp2_12 = _mm_mulhrs_epi16(in[6], stk2_7); + + /* Stage3 */ + { + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1[16], stp1[31]); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1[16], stp1[31]); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1[19], stp1[28]); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1[19], stp1[28]); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1[20], stp1[27]); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1[20], stp1[27]); + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1[23], stp1[24]); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1[23], stp1[24]); + + stp1[4] = _mm_mulhrs_epi16(in[4], stk3_0); + stp1[7] = _mm_mulhrs_epi16(in[4], stk3_1); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1[17], stp1[30], stp1[18], + stp1[29]) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1[21], stp1[26], + stp1[22], stp1[25]) + } + + /* Stage4 */ + { + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12); + + stp1[0] = _mm_mulhrs_epi16(in[0], stk4_0); + stp1[1] = _mm_mulhrs_epi16(in[0], stk4_0); // stk4_1 = stk4_0 + stp1[2] = stp1[0]; + stp1[3] = stp1[1]; + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5, + stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13) + + stp2_16 = _mm_add_epi16(stp1[16], stp1[19]); + stp2_17 = _mm_add_epi16(stp1[17], stp1[18]); + stp2_18 = _mm_sub_epi16(stp1[17], stp1[18]); + stp2_19 = _mm_sub_epi16(stp1[16], stp1[19]); + stp2_20 = _mm_sub_epi16(stp1[23], stp1[20]); + stp2_21 = _mm_sub_epi16(stp1[22], stp1[21]); + stp2_22 = _mm_add_epi16(stp1[22], stp1[21]); + stp2_23 = _mm_add_epi16(stp1[23], stp1[20]); + + stp2_24 = _mm_add_epi16(stp1[24], stp1[27]); + stp2_25 = _mm_add_epi16(stp1[25], stp1[26]); + stp2_26 = _mm_sub_epi16(stp1[25], stp1[26]); + stp2_27 = _mm_sub_epi16(stp1[24], stp1[27]); + stp2_28 = _mm_sub_epi16(stp1[31], stp1[28]); + stp2_29 = _mm_sub_epi16(stp1[30], stp1[29]); + stp2_30 = _mm_add_epi16(stp1[29], 
stp1[30]); + stp2_31 = _mm_add_epi16(stp1[28], stp1[31]); + } + + /* Stage5 */ + { +// Note: +// #define AVOID_OVERFLOW = 0, code would be faster. But it can't pass +// SingleExtreme test. The MaxSupportedCoeff/MinSupportedCoeff must drop +// to 23198 and -23197, respectively. +#define AVOID_OVERFLOW (1) + +#if AVOID_OVERFLOW + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp1[7], stp1[4]); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp1[7], stp1[4]); +#endif + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + +#if AVOID_OVERFLOW + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1[5] = _mm_packs_epi32(tmp0, tmp1); + stp1[6] = _mm_packs_epi32(tmp2, tmp3); +#else + tmp0 = _mm_sub_epi16(stp1[7], stp1[4]); + tmp1 = _mm_adds_epi16(stp1[7], stp1[4]); + stp1[5] = _mm_mulhrs_epi16(tmp0, stk4_0); + stp1[6] = _mm_mulhrs_epi16(tmp1, stk4_0); +#endif + + stp1[8] = _mm_add_epi16(stp2_8, stp2_11); + stp1[9] = _mm_add_epi16(stp2_9, stp2_10); + stp1[10] = _mm_sub_epi16(stp2_9, stp2_10); + stp1[11] = _mm_sub_epi16(stp2_8, stp2_11); + stp1[12] = _mm_sub_epi16(stp2_15, stp2_12); + stp1[13] = _mm_sub_epi16(stp2_14, stp2_13); + stp1[14] = _mm_add_epi16(stp2_14, stp2_13); + stp1[15] = _mm_add_epi16(stp2_15, stp2_12); + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1[18], stp1[29], stp1[19], + stp1[28]) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1[20], stp1[27], stp1[21], + stp1[26]) + + stp1[16] = stp2_16; + stp1[17] = stp2_17; + stp1[22] = stp2_22; + stp1[23] = stp2_23; + stp1[24] = stp2_24; + stp1[25] = stp2_25; + stp1[30] = stp2_30; + stp1[31] = stp2_31; + } + + /* Stage6 */ + { +#if AVOID_OVERFLOW + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1[10], stp1[13]); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1[10], stp1[13]); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1[11], stp1[12]); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1[11], stp1[12]); +#endif + + stp2_0 = _mm_add_epi16(stp1[0], stp1[7]); + stp2_1 = _mm_add_epi16(stp1[1], stp1[6]); + stp2_2 = _mm_add_epi16(stp1[2], stp1[5]); + stp2_3 = _mm_add_epi16(stp1[3], stp1[4]); + stp2_4 = _mm_sub_epi16(stp1[3], stp1[4]); + stp2_5 = _mm_sub_epi16(stp1[2], stp1[5]); + stp2_6 = _mm_sub_epi16(stp1[1], stp1[6]); + stp2_7 = _mm_sub_epi16(stp1[0], stp1[7]); + + stp2_8 = stp1[8]; + stp2_9 = stp1[9]; + stp2_14 = stp1[14]; + stp2_15 = stp1[15]; + +#if AVOID_OVERFLOW + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, + stp2_12) +#else + 
tmp0 = _mm_add_epi16(stp1[10], stp1[13]); + tmp1 = _mm_sub_epi16(stp1[13], stp1[10]); + tmp2 = _mm_add_epi16(stp1[11], stp1[12]); + tmp3 = _mm_sub_epi16(stp1[12], stp1[11]); + + stp2_10 = _mm_mulhrs_epi16(tmp1, stk4_0); + stp2_13 = _mm_mulhrs_epi16(tmp0, stk4_0); + stp2_11 = _mm_mulhrs_epi16(tmp3, stk4_0); + stp2_12 = _mm_mulhrs_epi16(tmp2, stk4_0); + +#endif + + stp2_16 = _mm_add_epi16(stp1[16], stp1[23]); + stp2_17 = _mm_add_epi16(stp1[17], stp1[22]); + stp2_18 = _mm_add_epi16(stp1[18], stp1[21]); + stp2_19 = _mm_add_epi16(stp1[19], stp1[20]); + stp2_20 = _mm_sub_epi16(stp1[19], stp1[20]); + stp2_21 = _mm_sub_epi16(stp1[18], stp1[21]); + stp2_22 = _mm_sub_epi16(stp1[17], stp1[22]); + stp2_23 = _mm_sub_epi16(stp1[16], stp1[23]); + + stp2_24 = _mm_sub_epi16(stp1[31], stp1[24]); + stp2_25 = _mm_sub_epi16(stp1[30], stp1[25]); + stp2_26 = _mm_sub_epi16(stp1[29], stp1[26]); + stp2_27 = _mm_sub_epi16(stp1[28], stp1[27]); + stp2_28 = _mm_add_epi16(stp1[27], stp1[28]); + stp2_29 = _mm_add_epi16(stp1[26], stp1[29]); + stp2_30 = _mm_add_epi16(stp1[25], stp1[30]); + stp2_31 = _mm_add_epi16(stp1[24], stp1[31]); + } + + /* Stage7 */ + { +#if AVOID_OVERFLOW + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); +#endif + stp1[0] = _mm_add_epi16(stp2_0, stp2_15); + stp1[1] = _mm_add_epi16(stp2_1, stp2_14); + stp1[2] = _mm_add_epi16(stp2_2, stp2_13); + stp1[3] = _mm_add_epi16(stp2_3, stp2_12); + stp1[4] = _mm_add_epi16(stp2_4, stp2_11); + stp1[5] = _mm_add_epi16(stp2_5, stp2_10); + stp1[6] = _mm_add_epi16(stp2_6, stp2_9); + stp1[7] = _mm_add_epi16(stp2_7, stp2_8); + stp1[8] = _mm_sub_epi16(stp2_7, stp2_8); + stp1[9] = _mm_sub_epi16(stp2_6, stp2_9); + stp1[10] = _mm_sub_epi16(stp2_5, stp2_10); + stp1[11] = _mm_sub_epi16(stp2_4, stp2_11); + stp1[12] = _mm_sub_epi16(stp2_3, stp2_12); + stp1[13] = _mm_sub_epi16(stp2_2, stp2_13); + stp1[14] = _mm_sub_epi16(stp2_1, stp2_14); + stp1[15] = _mm_sub_epi16(stp2_0, stp2_15); + + stp1[16] = stp2_16; + stp1[17] = stp2_17; + stp1[18] = stp2_18; + stp1[19] = stp2_19; + +#if AVOID_OVERFLOW + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1[20], stp1[27], stp1[21], + stp1[26]) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1[22], stp1[25], stp1[23], + stp1[24]) +#else + tmp0 = _mm_add_epi16(stp2_20, stp2_27); + tmp1 = _mm_sub_epi16(stp2_27, stp2_20); + tmp2 = _mm_add_epi16(stp2_21, stp2_26); + tmp3 = _mm_sub_epi16(stp2_26, stp2_21); + + stp1[20] = _mm_mulhrs_epi16(tmp1, stk4_0); + stp1[27] = _mm_mulhrs_epi16(tmp0, stk4_0); + stp1[21] = _mm_mulhrs_epi16(tmp3, stk4_0); + stp1[26] = _mm_mulhrs_epi16(tmp2, stk4_0); + + tmp0 = _mm_add_epi16(stp2_22, stp2_25); + tmp1 = _mm_sub_epi16(stp2_25, stp2_22); + tmp2 = _mm_add_epi16(stp2_23, stp2_24); + tmp3 = _mm_sub_epi16(stp2_24, stp2_23); + + stp1[22] = _mm_mulhrs_epi16(tmp1, stk4_0); + stp1[25] = _mm_mulhrs_epi16(tmp0, stk4_0); + stp1[23] = _mm_mulhrs_epi16(tmp3, stk4_0); + stp1[24] = _mm_mulhrs_epi16(tmp2, stk4_0); +#endif + + stp1[28] = stp2_28; + stp1[29] = 
stp2_29; + stp1[30] = stp2_30; + stp1[31] = stp2_31; + } +#undef AVOID_OVERFLOW +} + +// Only upper-left 8x8 has non-zero coeff +void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + const __m128i zero = _mm_setzero_si128(); + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + __m128i in[32], col[32]; + __m128i stp1[32]; + int i; + + // Load input data. Only need to load the top left 8x8 block. + in[0] = load_input_data(input); + in[1] = load_input_data(input + 32); + in[2] = load_input_data(input + 64); + in[3] = load_input_data(input + 96); + in[4] = load_input_data(input + 128); + in[5] = load_input_data(input + 160); + in[6] = load_input_data(input + 192); + in[7] = load_input_data(input + 224); + + array_transpose_8x8(in, in); + idct32_34(in, stp1); + + // 1_D: Store 32 intermediate results for each 8x32 block. + col[0] = _mm_add_epi16(stp1[0], stp1[31]); + col[1] = _mm_add_epi16(stp1[1], stp1[30]); + col[2] = _mm_add_epi16(stp1[2], stp1[29]); + col[3] = _mm_add_epi16(stp1[3], stp1[28]); + col[4] = _mm_add_epi16(stp1[4], stp1[27]); + col[5] = _mm_add_epi16(stp1[5], stp1[26]); + col[6] = _mm_add_epi16(stp1[6], stp1[25]); + col[7] = _mm_add_epi16(stp1[7], stp1[24]); + col[8] = _mm_add_epi16(stp1[8], stp1[23]); + col[9] = _mm_add_epi16(stp1[9], stp1[22]); + col[10] = _mm_add_epi16(stp1[10], stp1[21]); + col[11] = _mm_add_epi16(stp1[11], stp1[20]); + col[12] = _mm_add_epi16(stp1[12], stp1[19]); + col[13] = _mm_add_epi16(stp1[13], stp1[18]); + col[14] = _mm_add_epi16(stp1[14], stp1[17]); + col[15] = _mm_add_epi16(stp1[15], stp1[16]); + col[16] = _mm_sub_epi16(stp1[15], stp1[16]); + col[17] = _mm_sub_epi16(stp1[14], stp1[17]); + col[18] = _mm_sub_epi16(stp1[13], stp1[18]); + col[19] = _mm_sub_epi16(stp1[12], stp1[19]); + col[20] = _mm_sub_epi16(stp1[11], stp1[20]); + col[21] = _mm_sub_epi16(stp1[10], stp1[21]); + col[22] = _mm_sub_epi16(stp1[9], stp1[22]); + col[23] = _mm_sub_epi16(stp1[8], stp1[23]); + col[24] = _mm_sub_epi16(stp1[7], stp1[24]); + col[25] = _mm_sub_epi16(stp1[6], stp1[25]); + col[26] = _mm_sub_epi16(stp1[5], stp1[26]); + col[27] = _mm_sub_epi16(stp1[4], stp1[27]); + col[28] = _mm_sub_epi16(stp1[3], stp1[28]); + col[29] = _mm_sub_epi16(stp1[2], stp1[29]); + col[30] = _mm_sub_epi16(stp1[1], stp1[30]); + col[31] = _mm_sub_epi16(stp1[0], stp1[31]); + for (i = 0; i < 4; i++) { + int j; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + i * 8, in); + idct32_34(in, stp1); + + // 2_D: Calculate the results and store them to destination. 
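
The AVOID_OVERFLOW note in Stage5 above is the key numerical tradeoff of this rewrite. Below is a minimal scalar sketch of the two butterfly strategies, assuming the usual Q14 fixed-point convention from vpx_dsp/txfm_common.h (DCT_CONST_BITS = 14, cospi_16_64 = 11585); butterfly_exact and butterfly_mulhrs are hypothetical names for illustration, not functions in the patch.

#include <stdint.h>

#define DCT_CONST_BITS 14

/* Exact path (AVOID_OVERFLOW == 1): widen to 32 bits before multiplying,
   as the _mm_madd_epi16/_mm_srai_epi32 sequence above does. */
static int16_t butterfly_exact(int16_t a, int16_t b, int16_t c) {
  const int32_t t = ((int32_t)a - b) * c; /* full-precision product */
  return (int16_t)((t + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
}

/* Fast path (AVOID_OVERFLOW == 0): the 16-bit a - b may already wrap for
   extreme coefficients, which is why the SingleExtreme test fails.
   _mm_mulhrs_epi16(x, 2 * c) computes (x * 2 * c + (1 << 14)) >> 15,
   i.e. the same round(x * c >> 14) whenever x itself did not overflow. */
static int16_t butterfly_mulhrs(int16_t a, int16_t b, int16_t c) {
  const int16_t x = (int16_t)(a - b); /* 16-bit difference, may wrap */
  return (int16_t)(((int32_t)x * (2 * c) + (1 << 14)) >> 15);
}

The sum half of each butterfly is analogous with a + b; skipping the 32-bit widening is what makes the #else path faster, and what would shrink the supported coefficient range to [-23197, 23198].
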
+ in[0] = _mm_add_epi16(stp1[0], stp1[31]); + in[1] = _mm_add_epi16(stp1[1], stp1[30]); + in[2] = _mm_add_epi16(stp1[2], stp1[29]); + in[3] = _mm_add_epi16(stp1[3], stp1[28]); + in[4] = _mm_add_epi16(stp1[4], stp1[27]); + in[5] = _mm_add_epi16(stp1[5], stp1[26]); + in[6] = _mm_add_epi16(stp1[6], stp1[25]); + in[7] = _mm_add_epi16(stp1[7], stp1[24]); + in[8] = _mm_add_epi16(stp1[8], stp1[23]); + in[9] = _mm_add_epi16(stp1[9], stp1[22]); + in[10] = _mm_add_epi16(stp1[10], stp1[21]); + in[11] = _mm_add_epi16(stp1[11], stp1[20]); + in[12] = _mm_add_epi16(stp1[12], stp1[19]); + in[13] = _mm_add_epi16(stp1[13], stp1[18]); + in[14] = _mm_add_epi16(stp1[14], stp1[17]); + in[15] = _mm_add_epi16(stp1[15], stp1[16]); + in[16] = _mm_sub_epi16(stp1[15], stp1[16]); + in[17] = _mm_sub_epi16(stp1[14], stp1[17]); + in[18] = _mm_sub_epi16(stp1[13], stp1[18]); + in[19] = _mm_sub_epi16(stp1[12], stp1[19]); + in[20] = _mm_sub_epi16(stp1[11], stp1[20]); + in[21] = _mm_sub_epi16(stp1[10], stp1[21]); + in[22] = _mm_sub_epi16(stp1[9], stp1[22]); + in[23] = _mm_sub_epi16(stp1[8], stp1[23]); + in[24] = _mm_sub_epi16(stp1[7], stp1[24]); + in[25] = _mm_sub_epi16(stp1[6], stp1[25]); + in[26] = _mm_sub_epi16(stp1[5], stp1[26]); + in[27] = _mm_sub_epi16(stp1[4], stp1[27]); + in[28] = _mm_sub_epi16(stp1[3], stp1[28]); + in[29] = _mm_sub_epi16(stp1[2], stp1[29]); + in[30] = _mm_sub_epi16(stp1[1], stp1[30]); + in[31] = _mm_sub_epi16(stp1[0], stp1[31]); + + for (j = 0; j < 32; ++j) { + // Final rounding and shift + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j] = _mm_srai_epi16(in[j], 6); + RECON_AND_STORE(dest + j * stride, in[j]); + } + + dest += 8; + } +} + +// in0[16] represents the left 8x16 block +// in1[16] represents the right 8x16 block +static void load_buffer_16x16(const tran_low_t *input, __m128i *in0, + __m128i *in1) { + int i; + for (i = 0; i < 16; i++) { + in0[i] = load_input_data(input); + in1[i] = load_input_data(input + 8); + input += 32; + } +} + +static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0, + __m128i *out1) { + array_transpose_8x8(in0, out0); + array_transpose_8x8(&in0[8], out1); + array_transpose_8x8(in1, &out0[8]); + array_transpose_8x8(&in1[8], &out1[8]); +} + +// For each 8x16 block __m128i in[16], output __m128i col[32] +static void idct32_8x16_135(__m128i *in) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64); + const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64); + const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64); + const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64); + + const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64); + const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64); + const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64); + const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64); + + const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64); + const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64); + const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64); + const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64); + + const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64); + const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64); + const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64); + const __m128i 
stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64); + + const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64); + const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64); + const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64); + const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64); + + const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64); + const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64); + const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64); + const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64); + + const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64); + const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64); + const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64); + const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64); + + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64); + const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64); + const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64); + + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage1 */ + stp1_16 = _mm_mulhrs_epi16(in[1], stk1_0); + stp1_31 = _mm_mulhrs_epi16(in[1], stk1_1); + stp1_17 = _mm_mulhrs_epi16(in[15], stk1_2); + stp1_30 = _mm_mulhrs_epi16(in[15], stk1_3); + + stp1_18 = _mm_mulhrs_epi16(in[9], stk1_4); + stp1_29 = _mm_mulhrs_epi16(in[9], stk1_5); + stp1_19 = _mm_mulhrs_epi16(in[7], stk1_6); + stp1_28 = _mm_mulhrs_epi16(in[7], stk1_7); + + stp1_20 = _mm_mulhrs_epi16(in[5], stk1_8); + stp1_27 = _mm_mulhrs_epi16(in[5], stk1_9); + stp1_21 = _mm_mulhrs_epi16(in[11], stk1_10); + stp1_26 = _mm_mulhrs_epi16(in[11], stk1_11); + + stp1_22 = _mm_mulhrs_epi16(in[13], stk1_12); + stp1_25 = _mm_mulhrs_epi16(in[13], stk1_13); + stp1_23 = _mm_mulhrs_epi16(in[3], stk1_14); + stp1_24 = _mm_mulhrs_epi16(in[3], stk1_15); + + /* Stage2 */ + stp2_8 = _mm_mulhrs_epi16(in[2], stk2_0); + stp2_15 = _mm_mulhrs_epi16(in[2], stk2_1); + stp2_9 = _mm_mulhrs_epi16(in[14], 
stk2_2); + stp2_14 = _mm_mulhrs_epi16(in[14], stk2_3); + + stp2_10 = _mm_mulhrs_epi16(in[10], stk2_4); + stp2_13 = _mm_mulhrs_epi16(in[10], stk2_5); + stp2_11 = _mm_mulhrs_epi16(in[6], stk2_6); + stp2_12 = _mm_mulhrs_epi16(in[6], stk2_7); + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + + /* Stage3 */ + { + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + stp1_4 = _mm_mulhrs_epi16(in[4], stk3_0); + stp1_7 = _mm_mulhrs_epi16(in[4], stk3_1); + stp1_5 = _mm_mulhrs_epi16(in[12], stk3_2); + stp1_6 = _mm_mulhrs_epi16(in[12], stk3_3); + + stp2_0 = _mm_mulhrs_epi16(in[0], stk4_0); + stp2_1 = _mm_mulhrs_epi16(in[0], stk4_0); // stk4_1 = stk4_0 + stp2_2 = _mm_mulhrs_epi16(in[8], stk4_2); + stp2_3 = _mm_mulhrs_epi16(in[8], stk4_3); + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, + stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, + stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + /* Stage4 */ + { + const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5, + stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = 
_mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + /* Stage5 */ + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, + stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, + stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + /* Stage6 */ + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + 
MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, + stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + /* Stage7 */ + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, + stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, + stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + in[0] = _mm_add_epi16(stp1_0, stp1_31); + in[1] = _mm_add_epi16(stp1_1, stp1_30); + in[2] = _mm_add_epi16(stp1_2, stp1_29); + in[3] = _mm_add_epi16(stp1_3, stp1_28); + in[4] = _mm_add_epi16(stp1_4, stp1_27); + in[5] = _mm_add_epi16(stp1_5, stp1_26); + in[6] = _mm_add_epi16(stp1_6, stp1_25); + in[7] = _mm_add_epi16(stp1_7, stp1_24); + in[8] = _mm_add_epi16(stp1_8, stp1_23); + in[9] = _mm_add_epi16(stp1_9, stp1_22); + in[10] = _mm_add_epi16(stp1_10, stp1_21); + in[11] = _mm_add_epi16(stp1_11, stp1_20); + in[12] = _mm_add_epi16(stp1_12, stp1_19); + in[13] = _mm_add_epi16(stp1_13, stp1_18); + in[14] = _mm_add_epi16(stp1_14, stp1_17); + in[15] = _mm_add_epi16(stp1_15, stp1_16); + in[16] = _mm_sub_epi16(stp1_15, stp1_16); + in[17] = _mm_sub_epi16(stp1_14, stp1_17); + in[18] = _mm_sub_epi16(stp1_13, stp1_18); + in[19] = _mm_sub_epi16(stp1_12, stp1_19); + in[20] = _mm_sub_epi16(stp1_11, stp1_20); + in[21] = _mm_sub_epi16(stp1_10, stp1_21); + in[22] = _mm_sub_epi16(stp1_9, stp1_22); + in[23] = 
_mm_sub_epi16(stp1_8, stp1_23); + in[24] = _mm_sub_epi16(stp1_7, stp1_24); + in[25] = _mm_sub_epi16(stp1_6, stp1_25); + in[26] = _mm_sub_epi16(stp1_5, stp1_26); + in[27] = _mm_sub_epi16(stp1_4, stp1_27); + in[28] = _mm_sub_epi16(stp1_3, stp1_28); + in[29] = _mm_sub_epi16(stp1_2, stp1_29); + in[30] = _mm_sub_epi16(stp1_1, stp1_30); + in[31] = _mm_sub_epi16(stp1_0, stp1_31); +} + +static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) { + const __m128i final_rounding = _mm_set1_epi16(1 << 5); + const __m128i zero = _mm_setzero_si128(); + int j = 0; + while (j < 32) { + in[j] = _mm_adds_epi16(in[j], final_rounding); + in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding); + + in[j] = _mm_srai_epi16(in[j], 6); + in[j + 1] = _mm_srai_epi16(in[j + 1], 6); + + RECON_AND_STORE(dst, in[j]); + dst += stride; + RECON_AND_STORE(dst, in[j + 1]); + dst += stride; + j += 2; + } +} + +static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest, + int stride) { + store_buffer_8x32(in0, dest, stride); + store_buffer_8x32(in1, dest + 8, stride); +} + +static INLINE void idct32_135(__m128i *col0, __m128i *col1) { + idct32_8x16_135(col0); + idct32_8x16_135(col1); +} + +// Only upper-left 16x16 has non-zero coeff +void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col0[32], col1[32], col2[32], col3[32]; + + // Load input data. Only need to load the top left 16x16 block. + load_buffer_16x16(input, col2, col3); + + // columns + array_transpose_16x16_2(col2, col3, col0, col1); + idct32_135(col0, col1); + + // rows + array_transpose_16x16_2(col0, col1, col2, col3); + idct32_135(col2, col3); + recon_and_store(col2, col3, dest, stride); + + array_transpose_16x16_2(&col0[16], &col1[16], col2, col3); + idct32_135(col2, col3); + recon_and_store(col2, col3, dest + 16, stride); +} + +// For each 8x32 block __m128i in[32], output __m128i in[32] +static void idct32_8x32(const __m128i *in, __m128i *out) { + const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING); + // idct constants for each stage + const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64); + const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64); + const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64); + const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64); + const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64); + const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64); + const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64); + const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64); + const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64); + const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64); + const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64); + const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64); + const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64); + const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64); + const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64); + const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64); + + const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64); + const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64); + const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64); + const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64); + const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64); + const __m128i stg2_5 = pair_set_epi16(cospi_10_64, 
cospi_22_64); + const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64); + const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64); + + const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64); + const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64); + const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64); + const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64); + const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64); + const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64); + + const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64); + const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64); + const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64); + const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + + const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64); + __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7, + stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15, + stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23, + stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31; + __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7, + stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15, + stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23, + stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31; + __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; + + /* Stage1 */ + { + const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]); + const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]); + const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]); + const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]); + + const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]); + const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]); + const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]); + const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]); + + const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]); + const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]); + const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]); + const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]); + + const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]); + const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]); + const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]); + const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]); + + MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, stg1_1, + stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, stp1_30) + MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5, + stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28) + MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, stg1_9, + stg1_10, stg1_11, stp1_20, stp1_27, stp1_21, stp1_26) + MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12, + stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, stp1_23, 
+ stp1_24) + } + + /* Stage2 */ + { + const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]); + const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]); + const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]); + const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]); + + const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]); + const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]); + const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]); + const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]); + + MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, stg2_1, + stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14) + MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, stg2_5, + stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_17); + stp2_17 = _mm_sub_epi16(stp1_16, stp1_17); + stp2_18 = _mm_sub_epi16(stp1_19, stp1_18); + stp2_19 = _mm_add_epi16(stp1_19, stp1_18); + + stp2_20 = _mm_add_epi16(stp1_20, stp1_21); + stp2_21 = _mm_sub_epi16(stp1_20, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_23, stp1_22); + stp2_23 = _mm_add_epi16(stp1_23, stp1_22); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_25); + stp2_25 = _mm_sub_epi16(stp1_24, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_27, stp1_26); + stp2_27 = _mm_add_epi16(stp1_27, stp1_26); + + stp2_28 = _mm_add_epi16(stp1_28, stp1_29); + stp2_29 = _mm_sub_epi16(stp1_28, stp1_29); + stp2_30 = _mm_sub_epi16(stp1_31, stp1_30); + stp2_31 = _mm_add_epi16(stp1_31, stp1_30); + } + + /* Stage3 */ + { + const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]); + const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]); + const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]); + const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]); + + const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30); + const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + + MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, stg3_1, + stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6) + + stp1_8 = _mm_add_epi16(stp2_8, stp2_9); + stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); + stp1_11 = _mm_add_epi16(stp2_11, stp2_10); + stp1_12 = _mm_add_epi16(stp2_12, stp2_13); + stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); + stp1_15 = _mm_add_epi16(stp2_15, stp2_14); + + MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4, + stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18, + stp1_29) + MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8, + stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22, + stp1_25) + + stp1_16 = stp2_16; + stp1_31 = stp2_31; + stp1_19 = stp2_19; + stp1_20 = stp2_20; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_27 = stp2_27; + stp1_28 = stp2_28; + } + + /* Stage4 */ + { + const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]); + const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]); + const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]); + const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]); + + const __m128i lo_9_14 
= _mm_unpacklo_epi16(stp1_9, stp1_14); + const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + + MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1, + stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3) + + stp2_4 = _mm_add_epi16(stp1_4, stp1_5); + stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); + stp2_7 = _mm_add_epi16(stp1_7, stp1_6); + + MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5, + stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13) + + stp2_8 = stp1_8; + stp2_15 = stp1_15; + stp2_11 = stp1_11; + stp2_12 = stp1_12; + + stp2_16 = _mm_add_epi16(stp1_16, stp1_19); + stp2_17 = _mm_add_epi16(stp1_17, stp1_18); + stp2_18 = _mm_sub_epi16(stp1_17, stp1_18); + stp2_19 = _mm_sub_epi16(stp1_16, stp1_19); + stp2_20 = _mm_sub_epi16(stp1_23, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_22, stp1_21); + stp2_22 = _mm_add_epi16(stp1_22, stp1_21); + stp2_23 = _mm_add_epi16(stp1_23, stp1_20); + + stp2_24 = _mm_add_epi16(stp1_24, stp1_27); + stp2_25 = _mm_add_epi16(stp1_25, stp1_26); + stp2_26 = _mm_sub_epi16(stp1_25, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_24, stp1_27); + stp2_28 = _mm_sub_epi16(stp1_31, stp1_28); + stp2_29 = _mm_sub_epi16(stp1_30, stp1_29); + stp2_30 = _mm_add_epi16(stp1_29, stp1_30); + stp2_31 = _mm_add_epi16(stp1_28, stp1_31); + } + + /* Stage5 */ + { + const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); + const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); + const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29); + const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29); + + const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28); + const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28); + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_3); + stp1_1 = _mm_add_epi16(stp2_1, stp2_2); + stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); + stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); + + tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); + tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); + tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); + tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); + + tmp0 = _mm_add_epi32(tmp0, rounding); + tmp1 = _mm_add_epi32(tmp1, rounding); + tmp2 = _mm_add_epi32(tmp2, rounding); + tmp3 = _mm_add_epi32(tmp3, rounding); + + tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); + tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); + tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); + tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); + + stp1_5 = _mm_packs_epi32(tmp0, tmp1); + stp1_6 = _mm_packs_epi32(tmp2, tmp3); + + stp1_4 = stp2_4; + stp1_7 = stp2_7; + + stp1_8 = _mm_add_epi16(stp2_8, stp2_11); + stp1_9 = _mm_add_epi16(stp2_9, stp2_10); + stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_8, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_15, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); + stp1_14 = _mm_add_epi16(stp2_14, stp2_13); + stp1_15 = _mm_add_epi16(stp2_15, stp2_12); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + + MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4, + stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19, + stp1_28) + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, 
hi_21_26, stg4_6, + stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21, + stp1_26) + + stp1_22 = stp2_22; + stp1_23 = stp2_23; + stp1_24 = stp2_24; + stp1_25 = stp2_25; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + /* Stage6 */ + { + const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); + const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); + const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); + const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); + + stp2_0 = _mm_add_epi16(stp1_0, stp1_7); + stp2_1 = _mm_add_epi16(stp1_1, stp1_6); + stp2_2 = _mm_add_epi16(stp1_2, stp1_5); + stp2_3 = _mm_add_epi16(stp1_3, stp1_4); + stp2_4 = _mm_sub_epi16(stp1_3, stp1_4); + stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); + stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); + stp2_7 = _mm_sub_epi16(stp1_0, stp1_7); + + stp2_8 = stp1_8; + stp2_9 = stp1_9; + stp2_14 = stp1_14; + stp2_15 = stp1_15; + + MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0, + stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11, + stp2_12) + + stp2_16 = _mm_add_epi16(stp1_16, stp1_23); + stp2_17 = _mm_add_epi16(stp1_17, stp1_22); + stp2_18 = _mm_add_epi16(stp1_18, stp1_21); + stp2_19 = _mm_add_epi16(stp1_19, stp1_20); + stp2_20 = _mm_sub_epi16(stp1_19, stp1_20); + stp2_21 = _mm_sub_epi16(stp1_18, stp1_21); + stp2_22 = _mm_sub_epi16(stp1_17, stp1_22); + stp2_23 = _mm_sub_epi16(stp1_16, stp1_23); + + stp2_24 = _mm_sub_epi16(stp1_31, stp1_24); + stp2_25 = _mm_sub_epi16(stp1_30, stp1_25); + stp2_26 = _mm_sub_epi16(stp1_29, stp1_26); + stp2_27 = _mm_sub_epi16(stp1_28, stp1_27); + stp2_28 = _mm_add_epi16(stp1_27, stp1_28); + stp2_29 = _mm_add_epi16(stp1_26, stp1_29); + stp2_30 = _mm_add_epi16(stp1_25, stp1_30); + stp2_31 = _mm_add_epi16(stp1_24, stp1_31); + } + + /* Stage7 */ + { + const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27); + const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27); + const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26); + const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26); + + const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25); + const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25); + const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24); + const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24); + + stp1_0 = _mm_add_epi16(stp2_0, stp2_15); + stp1_1 = _mm_add_epi16(stp2_1, stp2_14); + stp1_2 = _mm_add_epi16(stp2_2, stp2_13); + stp1_3 = _mm_add_epi16(stp2_3, stp2_12); + stp1_4 = _mm_add_epi16(stp2_4, stp2_11); + stp1_5 = _mm_add_epi16(stp2_5, stp2_10); + stp1_6 = _mm_add_epi16(stp2_6, stp2_9); + stp1_7 = _mm_add_epi16(stp2_7, stp2_8); + stp1_8 = _mm_sub_epi16(stp2_7, stp2_8); + stp1_9 = _mm_sub_epi16(stp2_6, stp2_9); + stp1_10 = _mm_sub_epi16(stp2_5, stp2_10); + stp1_11 = _mm_sub_epi16(stp2_4, stp2_11); + stp1_12 = _mm_sub_epi16(stp2_3, stp2_12); + stp1_13 = _mm_sub_epi16(stp2_2, stp2_13); + stp1_14 = _mm_sub_epi16(stp2_1, stp2_14); + stp1_15 = _mm_sub_epi16(stp2_0, stp2_15); + + stp1_16 = stp2_16; + stp1_17 = stp2_17; + stp1_18 = stp2_18; + stp1_19 = stp2_19; + + MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21, + stp1_26) + MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0, + stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23, + stp1_24) + + stp1_28 = stp2_28; + stp1_29 = stp2_29; + stp1_30 = stp2_30; + stp1_31 = stp2_31; + } + + out[0] = _mm_add_epi16(stp1_0, stp1_31); + out[1] = _mm_add_epi16(stp1_1, 
stp1_30); + out[2] = _mm_add_epi16(stp1_2, stp1_29); + out[3] = _mm_add_epi16(stp1_3, stp1_28); + out[4] = _mm_add_epi16(stp1_4, stp1_27); + out[5] = _mm_add_epi16(stp1_5, stp1_26); + out[6] = _mm_add_epi16(stp1_6, stp1_25); + out[7] = _mm_add_epi16(stp1_7, stp1_24); + out[8] = _mm_add_epi16(stp1_8, stp1_23); + out[9] = _mm_add_epi16(stp1_9, stp1_22); + out[10] = _mm_add_epi16(stp1_10, stp1_21); + out[11] = _mm_add_epi16(stp1_11, stp1_20); + out[12] = _mm_add_epi16(stp1_12, stp1_19); + out[13] = _mm_add_epi16(stp1_13, stp1_18); + out[14] = _mm_add_epi16(stp1_14, stp1_17); + out[15] = _mm_add_epi16(stp1_15, stp1_16); + out[16] = _mm_sub_epi16(stp1_15, stp1_16); + out[17] = _mm_sub_epi16(stp1_14, stp1_17); + out[18] = _mm_sub_epi16(stp1_13, stp1_18); + out[19] = _mm_sub_epi16(stp1_12, stp1_19); + out[20] = _mm_sub_epi16(stp1_11, stp1_20); + out[21] = _mm_sub_epi16(stp1_10, stp1_21); + out[22] = _mm_sub_epi16(stp1_9, stp1_22); + out[23] = _mm_sub_epi16(stp1_8, stp1_23); + out[24] = _mm_sub_epi16(stp1_7, stp1_24); + out[25] = _mm_sub_epi16(stp1_6, stp1_25); + out[26] = _mm_sub_epi16(stp1_5, stp1_26); + out[27] = _mm_sub_epi16(stp1_4, stp1_27); + out[28] = _mm_sub_epi16(stp1_3, stp1_28); + out[29] = _mm_sub_epi16(stp1_2, stp1_29); + out[30] = _mm_sub_epi16(stp1_1, stp1_30); + out[31] = _mm_sub_epi16(stp1_0, stp1_31); +} + +static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { + int i; + for (i = 0; i < 8; ++i) { + in[i] = load_input_data(input); + in[i + 8] = load_input_data(input + 8); + in[i + 16] = load_input_data(input + 16); + in[i + 24] = load_input_data(input + 24); + input += 32; + } +} + +void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest, + int stride) { + __m128i col[128], in[32]; + int i, j; + + // rows + for (i = 0; i < 4; ++i) { + load_buffer_8x32(input, in); + input += 32 << 3; + + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(in, in); + array_transpose_8x8(in + 8, in + 8); + array_transpose_8x8(in + 16, in + 16); + array_transpose_8x8(in + 24, in + 24); + + idct32_8x32(in, col + (i << 5)); + } + + // columns + for (i = 0; i < 4; ++i) { + j = i << 3; + // Transpose 32x8 block to 8x32 block + array_transpose_8x8(col + j, in); + array_transpose_8x8(col + j + 32, in + 8); + array_transpose_8x8(col + j + 64, in + 16); + array_transpose_8x8(col + j + 96, in + 24); + + idct32_8x32(in, in); + store_buffer_8x32(in, dest, stride); + dest += 8; + } +} diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm deleted file mode 100644 index dee64e3ad36..00000000000 --- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ /dev/null @@ -1,1793 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the inverse transformation. Part -; of the functions are originally derived from the ffmpeg project. -; Note that the current version applies to x86 64-bit only. 
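
A note on the constant tables that the new intrinsics above and this assembly share: each stk*/pw_*x2 value is a Q14 DCT cosine stored pre-doubled, so a single pmulhrsw (or _mm_mulhrs_epi16) performs the multiply, round, and shift in one instruction. The small sketch below, under the assumption that the constants follow the usual definition cospi_N_64 = round(16384 * cos(N * pi / 64)) from vpx_dsp/txfm_common.h, reproduces the table entries.

#include <math.h>
#include <stdio.h>

int main(void) {
  int n;
  for (n = 1; n < 32; ++n) {
    const double pi = 3.14159265358979323846;
    /* Q14 cosine, then the doubled value stored in the pw_*x2 tables. */
    const int cospi = (int)floor(16384.0 * cos(n * pi / 64.0) + 0.5);
    printf("cospi_%d_64 = %5d, doubled: %5d\n", n, cospi, 2 * cospi);
  }
  return 0;
}

For example, n = 1 gives 16364 (pw_16364x2), n = 31 gives 804 (pw___804x2), and n = 16 gives 11585, whose doubled value 23170 is the pw_11585x2 entry above.
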
- -SECTION_RODATA - -pw_11585x2: times 8 dw 23170 - -pw_m2404x2: times 8 dw -2404*2 -pw_m4756x2: times 8 dw -4756*2 -pw_m5520x2: times 8 dw -5520*2 -pw_m8423x2: times 8 dw -8423*2 -pw_m9102x2: times 8 dw -9102*2 -pw_m10394x2: times 8 dw -10394*2 -pw_m11003x2: times 8 dw -11003*2 - -pw_16364x2: times 8 dw 16364*2 -pw_16305x2: times 8 dw 16305*2 -pw_16207x2: times 8 dw 16207*2 -pw_16069x2: times 8 dw 16069*2 -pw_15893x2: times 8 dw 15893*2 -pw_15679x2: times 8 dw 15679*2 -pw_15426x2: times 8 dw 15426*2 -pw_15137x2: times 8 dw 15137*2 -pw_14811x2: times 8 dw 14811*2 -pw_14449x2: times 8 dw 14449*2 -pw_14053x2: times 8 dw 14053*2 -pw_13623x2: times 8 dw 13623*2 -pw_13160x2: times 8 dw 13160*2 -pw_12665x2: times 8 dw 12665*2 -pw_12140x2: times 8 dw 12140*2 -pw__9760x2: times 8 dw 9760*2 -pw__7723x2: times 8 dw 7723*2 -pw__7005x2: times 8 dw 7005*2 -pw__6270x2: times 8 dw 6270*2 -pw__3981x2: times 8 dw 3981*2 -pw__3196x2: times 8 dw 3196*2 -pw__1606x2: times 8 dw 1606*2 -pw___804x2: times 8 dw 804*2 - -pd_8192: times 4 dd 8192 -pw_32: times 8 dw 32 -pw_16: times 8 dw 16 - -%macro TRANSFORM_COEFFS 2 -pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2 -pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1 -pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2 -%endmacro - -TRANSFORM_COEFFS 6270, 15137 -TRANSFORM_COEFFS 3196, 16069 -TRANSFORM_COEFFS 13623, 9102 - -; constants for 32x32_34 -TRANSFORM_COEFFS 804, 16364 -TRANSFORM_COEFFS 15426, 5520 -TRANSFORM_COEFFS 3981, 15893 -TRANSFORM_COEFFS 16207, 2404 -TRANSFORM_COEFFS 1606, 16305 -TRANSFORM_COEFFS 15679, 4756 -TRANSFORM_COEFFS 11585, 11585 - -; constants for 32x32_1024 -TRANSFORM_COEFFS 12140, 11003 -TRANSFORM_COEFFS 7005, 14811 -TRANSFORM_COEFFS 14053, 8423 -TRANSFORM_COEFFS 9760, 13160 -TRANSFORM_COEFFS 12665, 10394 -TRANSFORM_COEFFS 7723, 14449 - -%macro PAIR_PP_COEFFS 2 -dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MP_COEFFS 2 -dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2 -%endmacro - -%macro PAIR_MM_COEFFS 2 -dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2 -%endmacro - -PAIR_PP_COEFFS 30274, 12540 -PAIR_PP_COEFFS 6392, 32138 -PAIR_MP_COEFFS 18204, 27246 - -PAIR_PP_COEFFS 12540, 12540 -PAIR_PP_COEFFS 30274, 30274 -PAIR_PP_COEFFS 6392, 6392 -PAIR_PP_COEFFS 32138, 32138 -PAIR_MM_COEFFS 18204, 18204 -PAIR_PP_COEFFS 27246, 27246 - -SECTION .text - -%if ARCH_X86_64 -%macro SUM_SUB 3 - psubw m%3, m%1, m%2 - paddw m%1, m%2 - SWAP %2, %3 -%endmacro - -; butterfly operation -%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2 - pmaddwd m%1, m%3, %5 - pmaddwd m%2, m%3, %6 - paddd m%1, %4 - paddd m%2, %4 - psrad m%1, 14 - psrad m%2, 14 -%endmacro - -%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2 - punpckhwd m%6, m%2, m%1 - MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4] - punpcklwd m%2, m%1 - MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4] - packssdw m%1, m%7 - packssdw m%2, m%6 -%endmacro - -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 
- INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro IDCT8_1D 0 - SUM_SUB 0, 4, 9 - BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10 - pmulhrsw m0, m12 - pmulhrsw m4, m12 - BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10 - BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10 - - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - SUM_SUB 0, 6, 9 - SUM_SUB 4, 2, 9 - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 4, 3, 9 - SUM_SUB 2, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 4 -%endmacro - -; This macro handles 8 pixels per line -%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero - paddw m%1, m11 - paddw m%2, m11 - psraw m%1, 5 - psraw m%2, 5 - - movh m%3, [outputq] - movh m%4, [outputq + strideq] - punpcklbw m%3, m%5 - punpcklbw m%4, m%5 - paddw m%3, m%1 - paddw m%4, m%2 - packuswb m%3, m%5 - packuswb m%4, m%5 - movh [outputq], m%3 - movh [outputq + strideq], m%4 -%endmacro - -INIT_XMM ssse3 -; full inverse 8x8 2D-DCT transform -cglobal idct8x8_64_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] - mova m4, [inputq + 128] - packssdw m4, [inputq + 144] - mova m5, [inputq + 160] - packssdw m5, [inputq + 176] - mova m6, [inputq + 192] - packssdw m6, [inputq + 208] - mova m7, [inputq + 224] - packssdw m7, [inputq + 240] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] - mova m4, [inputq + 64] - mova m5, [inputq + 80] - mova m6, [inputq + 96] - mova m7, [inputq + 112] -%endif - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - IDCT8_1D - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero -cglobal idct8x8_12_add, 3, 5, 13, input, output, stride - mova m8, [pd_8192] - mova m11, [pw_16] - mova m12, [pw_11585x2] - - lea r3, [2 * strideq] - -%if CONFIG_VP9_HIGHBITDEPTH - mova m0, [inputq + 0] - packssdw m0, [inputq + 16] - mova m1, [inputq + 32] - packssdw m1, [inputq + 48] - mova m2, [inputq + 64] - packssdw m2, [inputq + 80] - mova m3, [inputq + 96] - packssdw m3, [inputq + 112] -%else - mova m0, [inputq + 0] - mova m1, [inputq + 16] - mova m2, [inputq + 32] - mova m3, [inputq + 48] -%endif - - punpcklwd m0, m1 - punpcklwd m2, m3 - punpckhdq m9, m0, m2 - punpckldq m0, m2 - SWAP 2, 9 - - ; m0 -> [0], [0] - ; m1 -> [1], [1] - ; m2 -> [2], [2] - ; m3 -> [3], [3] - punpckhqdq m10, m0, m0 - punpcklqdq m0, m0 - punpckhqdq m9, m2, m2 - punpcklqdq m2, m2 - SWAP 1, 10 - SWAP 3, 9 - - pmulhrsw m0, m12 - pmulhrsw m2, [dpw_30274_12540] - pmulhrsw m1, [dpw_6392_32138] - pmulhrsw m3, [dpw_m18204_27246] - - SUM_SUB 0, 2, 9 - SUM_SUB 1, 3, 9 - - punpcklqdq m9, m3, m3 - punpckhqdq m5, m3, m9 - - SUM_SUB 3, 5, 9 - punpckhqdq m5, m3 - pmulhrsw m5, m12 - - punpckhqdq m9, m1, m5 - punpcklqdq m1, m5 - SWAP 5, 9 - - SUM_SUB 0, 5, 9 - SUM_SUB 2, 1, 9 
- - punpckhqdq m3, m0, m0 - punpckhqdq m4, m1, m1 - punpckhqdq m6, m5, m5 - punpckhqdq m7, m2, m2 - - punpcklwd m0, m3 - punpcklwd m7, m2 - punpcklwd m1, m4 - punpcklwd m6, m5 - - punpckhdq m4, m0, m7 - punpckldq m0, m7 - punpckhdq m10, m1, m6 - punpckldq m5, m1, m6 - - punpckhqdq m1, m0, m5 - punpcklqdq m0, m5 - punpckhqdq m3, m4, m10 - punpcklqdq m2, m4, m10 - - - pmulhrsw m0, m12 - pmulhrsw m6, m2, [dpw_30274_30274] - pmulhrsw m4, m2, [dpw_12540_12540] - - pmulhrsw m7, m1, [dpw_32138_32138] - pmulhrsw m1, [dpw_6392_6392] - pmulhrsw m5, m3, [dpw_m18204_m18204] - pmulhrsw m3, [dpw_27246_27246] - - mova m2, m0 - SUM_SUB 0, 6, 9 - SUM_SUB 2, 4, 9 - SUM_SUB 1, 5, 9 - SUM_SUB 7, 3, 9 - - SUM_SUB 3, 5, 9 - pmulhrsw m3, m12 - pmulhrsw m5, m12 - - SUM_SUB 0, 7, 9 - SUM_SUB 2, 3, 9 - SUM_SUB 4, 5, 9 - SUM_SUB 6, 1, 9 - - SWAP 3, 6 - SWAP 1, 2 - SWAP 2, 4 - - - pxor m12, m12 - ADD_STORE_8P_2X 0, 1, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 2, 3, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 4, 5, 9, 10, 12 - lea outputq, [outputq + r3] - ADD_STORE_8P_2X 6, 7, 9, 10, 12 - - RET - -%define idx0 16 * 0 -%define idx1 16 * 1 -%define idx2 16 * 2 -%define idx3 16 * 3 -%define idx4 16 * 4 -%define idx5 16 * 5 -%define idx6 16 * 6 -%define idx7 16 * 7 -%define idx8 16 * 0 -%define idx9 16 * 1 -%define idx10 16 * 2 -%define idx11 16 * 3 -%define idx12 16 * 4 -%define idx13 16 * 5 -%define idx14 16 * 6 -%define idx15 16 * 7 -%define idx16 16 * 0 -%define idx17 16 * 1 -%define idx18 16 * 2 -%define idx19 16 * 3 -%define idx20 16 * 4 -%define idx21 16 * 5 -%define idx22 16 * 6 -%define idx23 16 * 7 -%define idx24 16 * 0 -%define idx25 16 * 1 -%define idx26 16 * 2 -%define idx27 16 * 3 -%define idx28 16 * 4 -%define idx29 16 * 5 -%define idx30 16 * 6 -%define idx31 16 * 7 - -; FROM idct32x32_add_neon.asm -; -; Instead of doing the transforms stage by stage, it is done by loading -; some input values and doing as many stages as possible to minimize the -; storing/loading of intermediate results. To fit within registers, the -; final coefficients are cut into four blocks: -; BLOCK A: 16-19,28-31 -; BLOCK B: 20-23,24-27 -; BLOCK C: 8-11,12-15 -; BLOCK D: 0-3,4-7 -; Blocks A and C are straight calculation through the various stages. In -; block B, further calculations are performed using the results from -; block A. In block D, further calculations are performed using the results -; from block C and then the final calculations are done using results from -; block A and B which have been combined at the end of block B. 
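
To make the block ordering concrete: all four blocks ultimately feed one symmetric combining stage, visible as the SUM_SUB pairs in the "final stage" sections of the macro below. A rough scalar sketch of that stage follows (simplified: the assembly interleaves these butterflies with register spills, and idct32_final_stage is a hypothetical name).

#include <stdint.h>

/* out[i] = stp[i] + stp[31 - i] and out[31 - i] = stp[i] - stp[31 - i]:
   the symmetric butterflies the final-stage SUM_SUB pairs compute. */
static void idct32_final_stage(const int16_t stp[32], int16_t out[32]) {
  int i;
  for (i = 0; i < 16; ++i) {
    out[i] = (int16_t)(stp[i] + stp[31 - i]);
    out[31 - i] = (int16_t)(stp[i] - stp[31 - i]);
  }
}
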
-; - -%macro IDCT32X32_34 4 - ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m11, m1 - pmulhrsw m1, [pw___804x2] ; stp1_16 - mova [r4 + 0], m0 - pmulhrsw m11, [pw_16364x2] ; stp2_31 - mova [r4 + 16 * 2], m2 - mova m12, m7 - pmulhrsw m7, [pw_15426x2] ; stp1_28 - mova [r4 + 16 * 4], m4 - pmulhrsw m12, [pw_m5520x2] ; stp2_19 - mova [r4 + 16 * 6], m6 - - ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m2, m1 ; stp1_16 - mova m0, m11 ; stp1_31 - mova m4, m7 ; stp1_28 - mova m15, m12 ; stp1_19 - - ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30 - BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18 - - ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 12, 9 ; stp2_16, stp2_19 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_18 - SUM_SUB 11, 7, 9 ; stp2_31, stp2_28 - SUM_SUB 2, 4, 9 ; stp2_30, stp2_29 - - ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29 - BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28 - - ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m6, m5 - pmulhrsw m5, [pw__3981x2] ; stp1_20 - mova [stp + %4 + idx28], m12 - mova [stp + %4 + idx29], m15 - pmulhrsw m6, [pw_15893x2] ; stp2_27 - mova [stp + %4 + idx30], m2 - mova m2, m3 - pmulhrsw m3, [pw_m2404x2] ; stp1_23 - mova [stp + %4 + idx31], m11 - pmulhrsw m2, [pw_16207x2] ; stp2_24 - - ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - mova m13, m5 ; stp1_20 - mova m14, m6 ; stp1_27 - mova m15, m3 ; stp1_23 - mova m11, m2 ; stp1_24 - - ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26 - BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22 - - ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 3, 5, 9 ; stp2_23, stp2_20 - SUM_SUB 15, 14, 9 ; stp2_22, stp2_21 - SUM_SUB 2, 6, 9 ; stp2_24, stp2_27 - SUM_SUB 11, 13, 9 ; stp2_25, stp2_26 - - ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20 - BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21 - - ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SUM_SUB 1, 3, 9 ; stp2_16, stp2_23 - SUM_SUB 0, 15, 9 ; stp2_17, stp2_22 - SUM_SUB 4, 14, 9 ; stp2_18, stp2_21 - SUM_SUB 7, 5, 9 ; stp2_19, stp2_20 - mova [stp + %3 + idx16], m1 - mova [stp + %3 + idx17], m0 - mova [stp + %3 + idx18], m4 - mova [stp + %3 + idx19], m7 - - mova m4, [stp + %4 + idx28] - mova m7, [stp + %4 + idx29] - mova m10, [stp + %4 + idx30] - mova m12, [stp + %4 + idx31] - SUM_SUB 4, 6, 9 ; stp2_28, stp2_27 - SUM_SUB 7, 13, 9 ; stp2_29, stp2_26 - SUM_SUB 10, 11, 9 ; stp2_30, stp2_25 - SUM_SUB 12, 2, 9 ; stp2_31, stp2_24 - mova [stp + %4 + idx28], m4 - mova [stp + %4 + idx29], m7 - mova [stp + %4 + idx30], m10 - mova [stp + %4 + idx31], m12 - - ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -%if 0 ; overflow occurs in SUM_SUB when using test streams - mova m10, [pw_11585x2] - SUM_SUB 6, 5, 9 - pmulhrsw m6, m10 ; stp1_27 - pmulhrsw m5, m10 ; stp1_20 - SUM_SUB 13, 14, 9 - pmulhrsw m13, m10 ; stp1_26 - pmulhrsw m14, m10 ; stp1_21 - SUM_SUB 11, 15, 9 - pmulhrsw 
m11, m10 ; stp1_25
- pmulhrsw m15, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_24
- pmulhrsw m3, m10 ; stp1_23
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 11, 15
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
-
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m11
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m6, [rsp + transposed_in + 16 * 6]
-
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- pmulhrsw m1, [pw_16305x2] ; stp2_15
- mova [stp + %3 + idx22], m15
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- mova [stp + %3 + idx23], m3
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m3, m0 ; stp1_8
- mova m2, m1 ; stp1_15
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- mova m4, m7 ; stp1_11
- mova m5, m6 ; stp1_12
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
-
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m10, [pw_11585x2]
- pmulhrsw m0, m10 ; stp1_1
-
- mova m14, m11 ; stp1_4
- mova m13, m12 ; stp1_7
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m7, m0 ; stp1_0 = stp1_1
- mova m4, m0 ; stp1_1
- mova m2, m7 ; stp1_0
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
- SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m15, [stp + %4 + idx30]
- mova m10, [stp + %4 + idx31]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m7
- mova [stp + %4 + idx30], m15
- mova [stp + %4 + idx31], m10
- mova m7, [stp + %4 + idx28]
- mova m0, [stp + %4 + idx29]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m4
- mova [stp + %4 + idx28], m7
- mova [stp + %4 + idx29], m0
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m4, [stp + %3 + idx19]
- SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m4
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m0, [stp + %4 + idx27]
- mova m1, [stp + %4 + idx26]
- mova m2, [stp + %4 + idx25]
- mova m3, [stp + %4 + idx24]
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- mova [stp + %4 + idx27], m0
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx24], m3
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-%macro RECON_AND_STORE 1
- mova m11, [pw_32]
- lea stp, [rsp + %1]
- mov r6, 32
- pxor m8, m8
-%%recon_and_store:
- mova m0, [stp + 16 * 32 * 0]
- mova m1, [stp + 16 * 32 * 1]
- mova m2, [stp + 16 * 32 * 2]
- mova m3, [stp + 16 * 32 * 3]
- add stp, 16
-
- paddw m0, m11
- paddw m1, m11
- paddw m2, m11
- paddw m3, m11
- psraw m0, 6
- psraw m1, 6
- psraw m2, 6
- psraw m3, 6
- movh m4, [outputq + 0]
- movh m5, [outputq + 8]
- movh m6, [outputq + 16]
- movh m7, [outputq + 24]
- punpcklbw m4, m8
- punpcklbw m5, m8
- punpcklbw m6, m8
- punpcklbw m7, m8
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
- packuswb m0, m1
- packuswb m2, m3
- mova [outputq + 0], m0
- mova [outputq + 16], m2
- lea outputq, [outputq + strideq]
- dec r6
- jnz %%recon_and_store
-%endmacro
-
-%define i32x32_size 16*32*5
-%define pass_two_start 16*32*0
-%define transposed_in 16*32*4
-%define pass_one_start 16*32*0
-%define stp r8
-
-INIT_XMM ssse3
-cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- lea stp, [rsp + pass_one_start]
-
-idct32x32_34:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
-
-idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_34_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
-
-idct32x32_34_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- IDCT32X32_34 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_34_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_135 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- pmulhrsw m11, [pw_16364x2] ; stp2_31
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, m3
- pmulhrsw m3, [pw__7005x2] ; stp1_18
- pmulhrsw m4, [pw_14811x2] ; stp2_29
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, m0
- pmulhrsw m0, [pw_12140x2] ; stp1_30
- pmulhrsw m2, [pw_m11003x2] ; stp2_17
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, m2
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- pmulhrsw m6, [pw_15893x2] ; stp2_27
-
- mova m14, [rsp + transposed_in + 16 * 11]
- mova m13, m14
- pmulhrsw m13, [pw_m8423x2] ; stp1_21
- pmulhrsw m14, [pw_14053x2] ; stp2_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, m0
- pmulhrsw m0, [pw__9760x2] ; stp1_22
- pmulhrsw m1, [pw_13160x2] ; stp2_25
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- pmulhrsw m1, [pw_16305x2] ; stp2_15
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, m4
- pmulhrsw m4, [pw__7723x2] ; stp1_10
- pmulhrsw m5, [pw_14449x2] ; stp2_13
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, m2
- pmulhrsw m3, [pw_m10394x2] ; stp1_9
- pmulhrsw m2, [pw_12665x2] ; stp2_14
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, m13
- pmulhrsw m13, [pw_13623x2] ; stp1_6
- pmulhrsw m14, [pw_m9102x2] ; stp1_5
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m2, [rsp + transposed_in + 16 * 8]
- pmulhrsw m0, [pw_11585x2] ; stp1_1
- mova m3, m2
- pmulhrsw m2, [pw__6270x2] ; stp1_2
- pmulhrsw m3, [pw_15137x2] ; stp1_3
-
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m1, m0 ; stp1_0 = stp1_1
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 2
- lea stp, [rsp + pass_one_start]
-
-idct32x32_135:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 2
-
-idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose
-
- IDCT32X32_135 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_135
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_135_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 2
-
-idct32x32_135_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose_2
-
- IDCT32X32_135 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_135_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_1024 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, [rsp + transposed_in + 16 * 31]
- BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, [rsp + transposed_in + 16 * 17]
- BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, [rsp + transposed_in + 16 * 25]
- BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, [rsp + transposed_in + 16 * 23]
- BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, [rsp + transposed_in + 16 * 27]
- BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
-
- mova m13, [rsp + transposed_in + 16 * 21]
- mova m14, [rsp + transposed_in + 16 * 11]
- BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, [rsp + transposed_in + 16 * 19]
- BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
-
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, [rsp + transposed_in + 16 * 29]
- BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, [rsp + transposed_in + 16 * 30]
- BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, [rsp + transposed_in + 16 * 18]
- BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, [rsp + transposed_in + 16 * 22]
- BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, [rsp + transposed_in + 16 * 26]
- BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, [rsp + transposed_in + 16 * 28]
- BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, [rsp + transposed_in + 16 * 20]
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 0, 1, 9
- pmulhrsw m0, m10 ; stp1_1
- pmulhrsw m1, m10 ; stp1_0
-%else
- BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
- SWAP 0, 1
-%endif
- mova m2, [rsp + transposed_in + 16 * 8]
- mova m3, [rsp + transposed_in + 16 * 24]
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
-
- mova m10, [pw_11585x2]
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
-
-idct32x32_1024:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 4
-
-idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose
-
- IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_1024
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 4
-
-idct32x32_1024_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose_2
-
- IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_1024_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
index fbbcd76bd7b..bcf1a6ef989 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -82,15 +83,8 @@ SECTION .text
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 8
psraw m0, 2
psraw m1, 2
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
index 0580a7bd7b6..32721beb3a6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -13,7 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
index ca215391739..ec2cafb94cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -200,7 +200,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pxor m11, m11
mova m11, m14
mova m6, m14
pcmpgtw m5, m14