summaryrefslogtreecommitdiff
path: root/chromium/third_party/libvpx/source/libvpx/vpx_dsp
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/libvpx/source/libvpx/vpx_dsp')
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c7
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c27
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c38
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c1512
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c89
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c20
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c164
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm196
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm1176
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c674
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c160
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c6
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm86
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm507
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h315
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h77
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c33
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c4
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c1157
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c670
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c63
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h7
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c129
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h2
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk24
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl85
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c42
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm89
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h30
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm90
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h (renamed from chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h)17
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm4
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm416
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c94
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h93
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c1741
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm1793
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm12
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c2
-rw-r--r--chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm1
40 files changed, 6334 insertions, 5318 deletions
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
index 001517d33ee..cca9a932423 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/avg_neon.c
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
const uint32x4_t a = vpaddlq_u16(v_16x8);
@@ -64,13 +65,13 @@ unsigned int vpx_avg_8x8_neon(const uint8_t *s, int p) {
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
-int vpx_satd_neon(const int16_t *coeff, int length) {
+int vpx_satd_neon(const tran_low_t *coeff, int length) {
const int16x4_t zero = vdup_n_s16(0);
int32x4_t accum = vdupq_n_s32(0);
do {
- const int16x8_t src0 = vld1q_s16(coeff);
- const int16x8_t src8 = vld1q_s16(coeff + 8);
+ const int16x8_t src0 = load_tran_low_to_s16q(coeff);
+ const int16x8_t src8 = load_tran_low_to_s16q(coeff + 8);
accum = vabal_s16(accum, vget_low_s16(src0), zero);
accum = vabal_s16(accum, vget_high_s16(src0), zero);
accum = vabal_s16(accum, vget_low_s16(src8), zero);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index e9503f13d70..96f6de1be95 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -12,8 +12,11 @@
#include "./vpx_config.h"
#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
-void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
int i;
// stage 1
int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
@@ -191,18 +194,18 @@ void vpx_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
input_6 = vhsubq_s16(input_6, sign_in6);
input_7 = vhsubq_s16(input_7, sign_in7);
// store results
- vst1q_s16(&final_output[0 * 8], input_0);
- vst1q_s16(&final_output[1 * 8], input_1);
- vst1q_s16(&final_output[2 * 8], input_2);
- vst1q_s16(&final_output[3 * 8], input_3);
- vst1q_s16(&final_output[4 * 8], input_4);
- vst1q_s16(&final_output[5 * 8], input_5);
- vst1q_s16(&final_output[6 * 8], input_6);
- vst1q_s16(&final_output[7 * 8], input_7);
+ store_s16q_to_tran_low(final_output + 0 * 8, input_0);
+ store_s16q_to_tran_low(final_output + 1 * 8, input_1);
+ store_s16q_to_tran_low(final_output + 2 * 8, input_2);
+ store_s16q_to_tran_low(final_output + 3 * 8, input_3);
+ store_s16q_to_tran_low(final_output + 4 * 8, input_4);
+ store_s16q_to_tran_low(final_output + 5 * 8, input_5);
+ store_s16q_to_tran_low(final_output + 6 * 8, input_6);
+ store_s16q_to_tran_low(final_output + 7 * 8, input_7);
}
}
-void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
int r;
int16x8_t sum = vld1q_s16(&input[0]);
for (r = 1; r < 8; ++r) {
@@ -214,7 +217,11 @@ void vpx_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
const int64x2_t b = vpaddlq_s32(a);
const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
vreinterpret_s32_s64(vget_high_s64(b)));
+#if CONFIG_VP9_HIGHBITDEPTH
+ output[0] = vget_lane_s32(c, 0);
+#else
output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+#endif
output[1] = 0;
}
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
index 977323497a8..ebeafed31fd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/hadamard_neon.c
@@ -11,6 +11,8 @@
#include <arm_neon.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
@@ -45,7 +47,7 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
}
void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int16x8_t a0 = vld1q_s16(src_diff);
int16x8_t a1 = vld1q_s16(src_diff + src_stride);
int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
@@ -63,18 +65,18 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride,
// Skip the second transpose because it is not required.
- vst1q_s16(coeff + 0, a0);
- vst1q_s16(coeff + 8, a1);
- vst1q_s16(coeff + 16, a2);
- vst1q_s16(coeff + 24, a3);
- vst1q_s16(coeff + 32, a4);
- vst1q_s16(coeff + 40, a5);
- vst1q_s16(coeff + 48, a6);
- vst1q_s16(coeff + 56, a7);
+ store_s16q_to_tran_low(coeff + 0, a0);
+ store_s16q_to_tran_low(coeff + 8, a1);
+ store_s16q_to_tran_low(coeff + 16, a2);
+ store_s16q_to_tran_low(coeff + 24, a3);
+ store_s16q_to_tran_low(coeff + 32, a4);
+ store_s16q_to_tran_low(coeff + 40, a5);
+ store_s16q_to_tran_low(coeff + 48, a6);
+ store_s16q_to_tran_low(coeff + 56, a7);
}
void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int i;
/* Rearrange 16x16 to 8x32 and remove stride.
@@ -88,10 +90,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
vpx_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
for (i = 0; i < 64; i += 8) {
- const int16x8_t a0 = vld1q_s16(coeff + 0);
- const int16x8_t a1 = vld1q_s16(coeff + 64);
- const int16x8_t a2 = vld1q_s16(coeff + 128);
- const int16x8_t a3 = vld1q_s16(coeff + 192);
+ const int16x8_t a0 = load_tran_low_to_s16q(coeff + 0);
+ const int16x8_t a1 = load_tran_low_to_s16q(coeff + 64);
+ const int16x8_t a2 = load_tran_low_to_s16q(coeff + 128);
+ const int16x8_t a3 = load_tran_low_to_s16q(coeff + 192);
const int16x8_t b0 = vhaddq_s16(a0, a1);
const int16x8_t b1 = vhsubq_s16(a0, a1);
@@ -103,10 +105,10 @@ void vpx_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
const int16x8_t c2 = vsubq_s16(b0, b2);
const int16x8_t c3 = vsubq_s16(b1, b3);
- vst1q_s16(coeff + 0, c0);
- vst1q_s16(coeff + 64, c1);
- vst1q_s16(coeff + 128, c2);
- vst1q_s16(coeff + 192, c3);
+ store_s16q_to_tran_low(coeff + 0, c0);
+ store_s16q_to_tran_low(coeff + 64, c1);
+ store_s16q_to_tran_low(coeff + 128, c2);
+ store_s16q_to_tran_low(coeff + 192, c3);
coeff += 8;
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
new file mode 100644
index 00000000000..d361c8263a8
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct16x16_add_neon.c
@@ -0,0 +1,1512 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+
+// Rounds and narrows four 64-bit accumulator pairs back to 32 bits.
+//
+// t:  array whose elements t[0]..t[3] are read; each element carries two
+//     int64x2_t halves (val[0], val[1]).
+// d0: receives the narrowed t[0] in val[0] and the narrowed t[1] in val[1].
+// d1: receives the narrowed t[2] in val[0] and the narrowed t[3] in val[1].
+//
+// Every half goes through vrshrn_n_s64(), i.e. a rounding right shift by
+// DCT_CONST_BITS followed by narrowing to 32 bits.
+static INLINE void highbd_idct16x16_add_wrap_low_8x2(const int64x2x2_t *const t,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int32x2x2_t t32[4];
+
+ // Narrow each 2-lane 64-bit half to a 2-lane 32-bit half.
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ t32[2].val[0] = vrshrn_n_s64(t[2].val[0], DCT_CONST_BITS);
+ t32[2].val[1] = vrshrn_n_s64(t[2].val[1], DCT_CONST_BITS);
+ t32[3].val[0] = vrshrn_n_s64(t[3].val[0], DCT_CONST_BITS);
+ t32[3].val[1] = vrshrn_n_s64(t[3].val[1], DCT_CONST_BITS);
+ // Rejoin the low/high halves into full 4-lane vectors.
+ d0->val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d0->val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ d1->val[0] = vcombine_s32(t32[2].val[0], t32[2].val[1]);
+ d1->val[1] = vcombine_s32(t32[3].val[0], t32[3].val[1]);
+}
+
+// Rounds and narrows two 64-bit accumulator pairs back to 32 bits.
+//
+// t:  array whose elements t[0] and t[1] are read; each carries two
+//     int64x2_t halves.
+// d0: receives the narrowed t[0] as one int32x4_t.
+// d1: receives the narrowed t[1] as one int32x4_t.
+//
+// Each half is shifted right by DCT_CONST_BITS with rounding and narrowed
+// to 32 bits by vrshrn_n_s64().
+static INLINE void highbd_idct16x16_add_wrap_low_4x2(const int64x2x2_t *const t,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int32x2x2_t t32[2];
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ // Rejoin the low/high halves into full 4-lane vectors.
+ *d0 = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ *d1 = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+}
+
+// Rounds and narrows two 64-bit accumulator pairs and returns them as a
+// single int32x4x2_t.
+//
+// t: array whose elements t[0] and t[1] are read; each carries two
+//    int64x2_t halves.
+// Returns a value whose val[0] is the narrowed t[0] and whose val[1] is the
+// narrowed t[1]. Narrowing is vrshrn_n_s64() with a shift of DCT_CONST_BITS.
+static INLINE int32x4x2_t
+highbd_idct16x16_add_wrap_low_8x1(const int64x2x2_t *const t) {
+ int32x2x2_t t32[2];
+ int32x4x2_t d;
+
+ t32[0].val[0] = vrshrn_n_s64(t[0].val[0], DCT_CONST_BITS);
+ t32[0].val[1] = vrshrn_n_s64(t[0].val[1], DCT_CONST_BITS);
+ t32[1].val[0] = vrshrn_n_s64(t[1].val[0], DCT_CONST_BITS);
+ t32[1].val[1] = vrshrn_n_s64(t[1].val[1], DCT_CONST_BITS);
+ // Rejoin the low/high halves into full 4-lane vectors.
+ d.val[0] = vcombine_s32(t32[0].val[0], t32[0].val[1]);
+ d.val[1] = vcombine_s32(t32[1].val[0], t32[1].val[1]);
+ return d;
+}
+
+// Rounds and narrows one 64-bit accumulator pair (passed by value) to a
+// single int32x4_t: each int64x2_t half is shifted right by DCT_CONST_BITS
+// with rounding via vrshrn_n_s64(), then the two halves are combined with
+// t.val[0] in the low lanes and t.val[1] in the high lanes.
+static INLINE int32x4_t highbd_idct16x16_add_wrap_low_4x1(const int64x2x2_t t) {
+ int32x2x2_t t32;
+
+ t32.val[0] = vrshrn_n_s64(t.val[0], DCT_CONST_BITS);
+ t32.val[1] = vrshrn_n_s64(t.val[1], DCT_CONST_BITS);
+ return vcombine_s32(t32.val[0], t32.val[1]);
+}
+
+// Rotation of two 8-wide rows using lanes 0 and 1 of cospi_2_30_10_22.
+//
+// With c[k] denoting lane k of cospi_2_30_10_22, the 64-bit products formed
+// for every element are
+//   d0 = s0 * c[1] - s1 * c[0]
+//   d1 = s1 * c[1] + s0 * c[0]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[0] = cospi_2 and c[1] =
+// cospi_30, but the vector is built by the caller, which is not shown here
+// -- confirm.
+static INLINE void highbd_idct_cospi_2_30(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[1], t[2..3] = s1 * c[1] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 1);
+ // t[0..1] -= s1 * c[0].
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ // t[2..3] += s0 * c[0].
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_2_30_10_22), 0);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Rotation of two 8-wide rows using lanes 0 and 3 of cospi_4_12_20N_28.
+//
+// With c[k] denoting lane k of cospi_4_12_20N_28, the 64-bit products
+// formed for every element are
+//   d0 = s0 * c[3] - s1 * c[0]
+//   d1 = s1 * c[3] + s0 * c[0]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[0] = cospi_4 and c[3] =
+// cospi_28; the vector is built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_4_28(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[3], t[2..3] = s1 * c[3] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 1);
+ // t[0..1] -= s1 * c[0].
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ // t[2..3] += s0 * c[0].
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 0);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Rotation of two 8-wide rows using lanes 0 and 1 of cospi_6_26_14_18N.
+//
+// With c[k] denoting lane k of cospi_6_26_14_18N, the 64-bit products
+// formed for every element are
+//   d0 = s0 * c[0] + s1 * c[1]
+//   d1 = s1 * c[0] - s0 * c[1]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[0] = cospi_6 and c[1] =
+// cospi_26; the vector is built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_6_26(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[0], t[2..3] = s1 * c[0] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 0);
+ // t[0..1] += s1 * c[1].
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ // t[2..3] -= s0 * c[1].
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_6_26_14_18N), 1);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Rotation of two 8-wide rows using lanes 2 and 3 of cospi_2_30_10_22.
+//
+// With c[k] denoting lane k of cospi_2_30_10_22, the 64-bit products formed
+// for every element are
+//   d0 = s0 * c[3] - s1 * c[2]
+//   d1 = s1 * c[3] + s0 * c[2]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[2] = cospi_10 and c[3] =
+// cospi_22; the vector is built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_10_22(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_2_30_10_22,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[3], t[2..3] = s1 * c[3] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 1);
+ // t[0..1] -= s1 * c[2].
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ // t[2..3] += s0 * c[2].
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_2_30_10_22), 0);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Rotation of two 8-wide rows using lanes 1 and 2 of cospi_4_12_20N_28.
+//
+// With c[k] denoting lane k of cospi_4_12_20N_28, the 64-bit products
+// formed for every element are
+//   d0 = s0 * c[1] + s1 * c[2]
+//   d1 = s1 * c[1] - s0 * c[2]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[1] = cospi_12 and c[2] is a
+// negated cospi_20 (the "N" suffix); the vector is built by the caller, not
+// shown here -- confirm.
+static INLINE void highbd_idct_cospi_12_20(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_4_12_20N_28,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[1], t[2..3] = s1 * c[1] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_4_12_20N_28), 1);
+ // t[0..1] += s1 * c[2].
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ // t[2..3] -= s0 * c[2].
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_4_12_20N_28), 0);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Rotation of two 8-wide rows using lanes 2 and 3 of cospi_6_26_14_18N.
+//
+// With c[k] denoting lane k of cospi_6_26_14_18N, the 64-bit products
+// formed for every element are
+//   d0 = s0 * c[2] + s1 * c[3]
+//   d1 = s1 * c[2] - s0 * c[3]
+// and both results are then rounded and narrowed to 32 bits by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c[2] = cospi_14 and c[3] is a
+// negated cospi_18 (the "N" suffix); the vector is built by the caller, not
+// shown here -- confirm.
+static INLINE void highbd_idct_cospi_14_18(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_6_26_14_18N,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ // t[0..1] = s0 * c[2], t[2..3] = s1 * c[2] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 0);
+ // t[0..1] += s1 * c[3].
+ t[0].val[0] = vmlal_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[0].val[1] = vmlal_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ // t[2..3] -= s0 * c[3].
+ t[2].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[2].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[3].val[0] = vmlsl_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ t[3].val[1] = vmlsl_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_6_26_14_18N), 1);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// 64-bit multiply-accumulate core shared by the 8-wide ("q") cospi 8/24
+// helpers. No rounding or narrowing is done here; the caller finishes that.
+//
+// With c[k] denoting lane k of cospi_0_8_16_24, this writes
+//   t[0], t[1] = s0 * c[3] - s1 * c[1]   (for s0/s1 val[0] and val[1])
+//   t[2], t[3] = s1 * c[3] + s0 * c[1]
+// so t must have room for at least four elements.
+//
+// NOTE(review): the parameter name suggests c[1] = cospi_8 and c[3] =
+// cospi_24; the vector is built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_8_24_q_kernel(
+ const int32x4x2_t s0, const int32x4x2_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {
+ // t[0..1] = s0 * c[3], t[2..3] = s1 * c[3] (widening multiplies).
+ t[0].val[0] = vmull_lane_s32(vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmull_lane_s32(vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmull_lane_s32(vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmull_lane_s32(vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 1);
+ // t[0..1] -= s1 * c[1].
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] = vmlsl_lane_s32(t[1].val[0], vget_low_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlsl_lane_s32(t[1].val[1], vget_high_s32(s1.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ // t[2..3] += s0 * c[1].
+ t[2].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[2].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0.val[0]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[0] = vmlal_lane_s32(t[3].val[0], vget_low_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[3].val[1] = vmlal_lane_s32(t[3].val[1], vget_high_s32(s0.val[1]),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+// 64-bit multiply-accumulate core shared by the 4-wide ("d") cospi 8/24
+// helpers. No rounding or narrowing is done here; the caller finishes that.
+//
+// With c[k] denoting lane k of cospi_0_8_16_24, this writes
+//   t[0] = s0 * c[3] - s1 * c[1]
+//   t[1] = s1 * c[3] + s0 * c[1]
+// so t must have room for at least two elements.
+//
+// NOTE(review): the parameter name suggests c[1] = cospi_8 and c[3] =
+// cospi_24; the vector is built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_8_24_d_kernel(
+ const int32x4_t s0, const int32x4_t s1, const int32x4_t cospi_0_8_16_24,
+ int64x2x2_t *const t) {
+ // t[0] = s0 * c[3], t[1] = s1 * c[3] (widening multiplies).
+ t[0].val[0] =
+ vmull_lane_s32(vget_low_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] =
+ vmull_lane_s32(vget_high_s32(s0), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 1);
+ // t[0] -= s1 * c[1].
+ t[0].val[0] = vmlsl_lane_s32(t[0].val[0], vget_low_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[0].val[1] = vmlsl_lane_s32(t[0].val[1], vget_high_s32(s1),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ // t[1] += s0 * c[1].
+ t[1].val[0] = vmlal_lane_s32(t[1].val[0], vget_low_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+ t[1].val[1] = vmlal_lane_s32(t[1].val[1], vget_high_s32(s0),
+ vget_low_s32(cospi_0_8_16_24), 1);
+}
+
+// 8-wide cospi 8/24 rotation: runs highbd_idct_cospi_8_24_q_kernel() to
+// form the 64-bit products, then rounds and narrows them to 32 bits with
+// highbd_idct16x16_add_wrap_low_8x2(). d0 receives the kernel's t[0..1]
+// and d1 receives its t[2..3].
+static INLINE void highbd_idct_cospi_8_24_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// 4-wide cospi 8/24 rotation: runs highbd_idct_cospi_8_24_d_kernel() to
+// form the 64-bit products, then rounds and narrows them to 32 bits with
+// highbd_idct16x16_add_wrap_low_4x2(). d0 receives the kernel's t[0] and
+// d1 receives its t[1].
+static INLINE void highbd_idct_cospi_8_24_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ highbd_idct16x16_add_wrap_low_4x2(t, d0, d1);
+}
+
+// Same as the 8-wide cospi 8/24 rotation, except that the second output is
+// negated: the kernel's t[2] and t[3] are replaced by (0 - t[k]) in 64-bit
+// precision before everything is rounded and narrowed by
+// highbd_idct16x16_add_wrap_low_8x2(). d0 is unaffected by the negation.
+static INLINE void highbd_idct_cospi_8_24_neg_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[4];
+
+ highbd_idct_cospi_8_24_q_kernel(s0, s1, cospi_0_8_16_24, t);
+ // Negate the terms that feed d1.
+ t[2].val[0] = vsubq_s64(vdupq_n_s64(0), t[2].val[0]);
+ t[2].val[1] = vsubq_s64(vdupq_n_s64(0), t[2].val[1]);
+ t[3].val[0] = vsubq_s64(vdupq_n_s64(0), t[3].val[0]);
+ t[3].val[1] = vsubq_s64(vdupq_n_s64(0), t[3].val[1]);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// Same as the 4-wide cospi 8/24 rotation, except that the second output is
+// negated: the kernel's t[1] is replaced by (0 - t[1]) in 64-bit precision
+// before everything is rounded and narrowed by
+// highbd_idct16x16_add_wrap_low_4x2(). d0 is unaffected by the negation.
+static INLINE void highbd_idct_cospi_8_24_neg_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[2];
+
+ highbd_idct_cospi_8_24_d_kernel(s0, s1, cospi_0_8_16_24, t);
+ // Negate the term that feeds d1.
+ t[1].val[0] = vsubq_s64(vdupq_n_s64(0), t[1].val[0]);
+ t[1].val[1] = vsubq_s64(vdupq_n_s64(0), t[1].val[1]);
+ highbd_idct16x16_add_wrap_low_4x2(t, d0, d1);
+}
+
+// 8-wide butterfly scaled by lane 2 of cospi_0_8_16_24.
+//
+// With c2 denoting lane 2 of cospi_0_8_16_24, the 64-bit values formed for
+// every element are
+//   d0 = s1 * c2 - s0 * c2
+//   d1 = s1 * c2 + s0 * c2
+// The s1 * c2 product is computed once (into t[4], t[5]) and reused for
+// both outputs. Only t[0..3] are passed on to be rounded and narrowed by
+// highbd_idct16x16_add_wrap_low_8x2().
+//
+// NOTE(review): the parameter name suggests c2 = cospi_16; the vector is
+// built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_16_16_q(const int32x4x2_t s0,
+ const int32x4x2_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4x2_t *const d0,
+ int32x4x2_t *const d1) {
+ int64x2x2_t t[6];
+
+ // Shared product: t[4..5] = s1 * c2.
+ t[4].val[0] = vmull_lane_s32(vget_low_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[4].val[1] = vmull_lane_s32(vget_high_s32(s1.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[0] = vmull_lane_s32(vget_low_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[5].val[1] = vmull_lane_s32(vget_high_s32(s1.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ // t[0..1] = t[4..5] - s0 * c2.
+ t[0].val[0] = vmlsl_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[0] = vmlsl_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlsl_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ // t[2..3] = t[4..5] + s0 * c2.
+ t[2].val[0] = vmlal_lane_s32(t[4].val[0], vget_low_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] = vmlal_lane_s32(t[4].val[1], vget_high_s32(s0.val[0]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[0] = vmlal_lane_s32(t[5].val[0], vget_low_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[3].val[1] = vmlal_lane_s32(t[5].val[1], vget_high_s32(s0.val[1]),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ highbd_idct16x16_add_wrap_low_8x2(t, d0, d1);
+}
+
+// 4-wide butterfly scaled by lane 2 of cospi_0_8_16_24.
+//
+// With c2 denoting lane 2 of cospi_0_8_16_24, the 64-bit values formed for
+// every element are
+//   d0 = s1 * c2 - s0 * c2
+//   d1 = s1 * c2 + s0 * c2
+// The s1 * c2 product is computed once (into t[2]) and reused for both
+// outputs. Only t[0..1] are passed on to be rounded and narrowed by
+// highbd_idct16x16_add_wrap_low_4x2().
+//
+// NOTE(review): the parameter name suggests c2 = cospi_16; the vector is
+// built by the caller, not shown here -- confirm.
+static INLINE void highbd_idct_cospi_16_16_d(const int32x4_t s0,
+ const int32x4_t s1,
+ const int32x4_t cospi_0_8_16_24,
+ int32x4_t *const d0,
+ int32x4_t *const d1) {
+ int64x2x2_t t[3];
+
+ // Shared product: t[2] = s1 * c2.
+ t[2].val[0] =
+ vmull_lane_s32(vget_low_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ t[2].val[1] =
+ vmull_lane_s32(vget_high_s32(s1), vget_high_s32(cospi_0_8_16_24), 0);
+ // t[0] = t[2] - s0 * c2.
+ t[0].val[0] = vmlsl_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[0].val[1] = vmlsl_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ // t[1] = t[2] + s0 * c2.
+ t[1].val[0] = vmlal_lane_s32(t[2].val[0], vget_low_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ t[1].val[1] = vmlal_lane_s32(t[2].val[1], vget_high_s32(s0),
+ vget_high_s32(cospi_0_8_16_24), 0);
+ highbd_idct16x16_add_wrap_low_4x2(t, d0, d1);
+}
+
+// Mirror butterfly over 16 rows, applied to both val[] halves of each row.
+//
+// For i = 0..7:
+//   out[i]      = step2[i] + step2[15 - i]
+//   out[15 - i] = step2[i] - step2[15 - i]
+// step2 and out must each hold 16 int32x4x2_t elements. The arithmetic is
+// plain 32-bit vaddq_s32/vsubq_s32 (no saturation).
+static INLINE void highbd_idct16x16_add_stage7_dual(
+ const int32x4x2_t *const step2, int32x4x2_t *const out) {
+ // Sums: out[0..7].
+ out[0].val[0] = vaddq_s32(step2[0].val[0], step2[15].val[0]);
+ out[0].val[1] = vaddq_s32(step2[0].val[1], step2[15].val[1]);
+ out[1].val[0] = vaddq_s32(step2[1].val[0], step2[14].val[0]);
+ out[1].val[1] = vaddq_s32(step2[1].val[1], step2[14].val[1]);
+ out[2].val[0] = vaddq_s32(step2[2].val[0], step2[13].val[0]);
+ out[2].val[1] = vaddq_s32(step2[2].val[1], step2[13].val[1]);
+ out[3].val[0] = vaddq_s32(step2[3].val[0], step2[12].val[0]);
+ out[3].val[1] = vaddq_s32(step2[3].val[1], step2[12].val[1]);
+ out[4].val[0] = vaddq_s32(step2[4].val[0], step2[11].val[0]);
+ out[4].val[1] = vaddq_s32(step2[4].val[1], step2[11].val[1]);
+ out[5].val[0] = vaddq_s32(step2[5].val[0], step2[10].val[0]);
+ out[5].val[1] = vaddq_s32(step2[5].val[1], step2[10].val[1]);
+ out[6].val[0] = vaddq_s32(step2[6].val[0], step2[9].val[0]);
+ out[6].val[1] = vaddq_s32(step2[6].val[1], step2[9].val[1]);
+ out[7].val[0] = vaddq_s32(step2[7].val[0], step2[8].val[0]);
+ out[7].val[1] = vaddq_s32(step2[7].val[1], step2[8].val[1]);
+ // Differences: out[8..15], mirrored around the centre.
+ out[8].val[0] = vsubq_s32(step2[7].val[0], step2[8].val[0]);
+ out[8].val[1] = vsubq_s32(step2[7].val[1], step2[8].val[1]);
+ out[9].val[0] = vsubq_s32(step2[6].val[0], step2[9].val[0]);
+ out[9].val[1] = vsubq_s32(step2[6].val[1], step2[9].val[1]);
+ out[10].val[0] = vsubq_s32(step2[5].val[0], step2[10].val[0]);
+ out[10].val[1] = vsubq_s32(step2[5].val[1], step2[10].val[1]);
+ out[11].val[0] = vsubq_s32(step2[4].val[0], step2[11].val[0]);
+ out[11].val[1] = vsubq_s32(step2[4].val[1], step2[11].val[1]);
+ out[12].val[0] = vsubq_s32(step2[3].val[0], step2[12].val[0]);
+ out[12].val[1] = vsubq_s32(step2[3].val[1], step2[12].val[1]);
+ out[13].val[0] = vsubq_s32(step2[2].val[0], step2[13].val[0]);
+ out[13].val[1] = vsubq_s32(step2[2].val[1], step2[13].val[1]);
+ out[14].val[0] = vsubq_s32(step2[1].val[0], step2[14].val[0]);
+ out[14].val[1] = vsubq_s32(step2[1].val[1], step2[14].val[1]);
+ out[15].val[0] = vsubq_s32(step2[0].val[0], step2[15].val[0]);
+ out[15].val[1] = vsubq_s32(step2[0].val[1], step2[15].val[1]);
+}
+
+// Stage-7 butterfly for the four-lane (int32x4_t) path:
+//   out[i] = step2[i] + step2[15 - i]   for i in [0, 8)
+//   out[i] = step2[15 - i] - step2[i]   for i in [8, 16)
+static INLINE void highbd_idct16x16_add_stage7(const int32x4_t *const step2,
+                                               int32x4_t *const out) {
+  int i;
+
+  for (i = 0; i < 8; ++i) {
+    out[i] = vaddq_s32(step2[i], step2[15 - i]);
+  }
+  for (i = 8; i < 16; ++i) {
+    out[i] = vsubq_s32(step2[15 - i], step2[i]);
+  }
+}
+
+// Writes the 16 eight-lane results of pass 1 into output. Entry i goes to
+// output[16 * i] .. output[16 * i + 7]; the other eight values of each
+// 16-element row are left untouched.
+static INLINE void highbd_idct16x16_store_pass1(const int32x4x2_t *const out,
+                                                int32_t *output) {
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    int32_t *const row = output + 16 * i;
+    vst1q_s32(row + 0, out[i].val[0]);
+    vst1q_s32(row + 4, out[i].val[1]);
+  }
+}
+
+// Narrows each of the 16 eight-lane results to 16 bits with a rounding right
+// shift by 6, then passes every row to highbd_idct16x16_add8x1() together
+// with max = (1 << bd) - 1. dest is passed by address to that helper.
+static INLINE void highbd_idct16x16_add_store(const int32x4x2_t *const out,
+                                              uint16_t *dest, const int stride,
+                                              const int bd) {
+  const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+  int16x8_t rows[16];
+  int i;
+
+  // Narrow all rows first ...
+  for (i = 0; i < 16; ++i) {
+    const int16x4_t lo = vrshrn_n_s32(out[i].val[0], 6);
+    const int16x4_t hi = vrshrn_n_s32(out[i].val[1], 6);
+    rows[i] = vcombine_s16(lo, hi);
+  }
+
+  // ... then add them to dest, one row per call.
+  for (i = 0; i < 16; ++i) {
+    highbd_idct16x16_add8x1(rows[i], max, &dest, stride);
+  }
+}
+/*
+ * One 1-D pass of the high-bitdepth 16x16 inverse DCT, computed for eight
+ * lines at once (each int32x4x2_t carries eight 32-bit lanes).
+ *
+ * input:  128 contiguous int32_t values, read as sixteen 8-value chunks.
+ * output: when non-NULL, the stage-7 results are written there by
+ *         highbd_idct16x16_store_pass1() and dest/stride/bd are not used;
+ *         when NULL, highbd_idct16x16_add_store() rounds the results and
+ *         adds them into dest.
+ * dest, stride, bd: forwarded to highbd_idct16x16_add_store() only.
+ */
+static void highbd_idct16x16_256_add_half1d(const int32_t *input,
+ int32_t *output, uint16_t *dest,
+ const int stride, const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[16], step1[16], step2[16], out[16];
+
+ // Load input (16x8): even 8-value chunks fill in[0..7], odd ones in[8..15]
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[8].val[0] = vld1q_s32(input);
+ in[8].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[9].val[0] = vld1q_s32(input);
+ in[9].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[10].val[0] = vld1q_s32(input);
+ in[10].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[11].val[0] = vld1q_s32(input);
+ in[11].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[12].val[0] = vld1q_s32(input);
+ in[12].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[13].val[0] = vld1q_s32(input);
+ in[13].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[14].val[0] = vld1q_s32(input);
+ in[14].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+ input += 8;
+ in[15].val[0] = vld1q_s32(input);
+ in[15].val[1] = vld1q_s32(input + 4);
+
+ // Transpose each group of eight vectors (in[0..7], in[8..15]) separately
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+ transpose_s32_8x8(&in[8], &in[9], &in[10], &in[11], &in[12], &in[13], &in[14],
+ &in[15]);
+
+ // stage 1: permute the transposed inputs (copies only, no arithmetic)
+ step1[0] = in[0 / 2];
+ step1[1] = in[16 / 2];
+ step1[2] = in[8 / 2];
+ step1[3] = in[24 / 2];
+ step1[4] = in[4 / 2];
+ step1[5] = in[20 / 2];
+ step1[6] = in[12 / 2];
+ step1[7] = in[28 / 2];
+ step1[8] = in[2 / 2];
+ step1[9] = in[18 / 2];
+ step1[10] = in[10 / 2];
+ step1[11] = in[26 / 2];
+ step1[12] = in[6 / 2];
+ step1[13] = in[22 / 2];
+ step1[14] = in[14 / 2];
+ step1[15] = in[30 / 2];
+
+ // stage 2: 0..7 are copied; 8..15 come from the highbd_idct_cospi_* helpers
+ step2[0] = step1[0];
+ step2[1] = step1[1];
+ step2[2] = step1[2];
+ step2[3] = step1[3];
+ step2[4] = step1[4];
+ step2[5] = step1[5];
+ step2[6] = step1[6];
+ step2[7] = step1[7];
+ highbd_idct_cospi_2_30(step1[8], step1[15], cospi_2_30_10_22, &step2[8],
+ &step2[15]);
+ highbd_idct_cospi_14_18(step1[9], step1[14], cospi_6_26_14_18N, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_10_22(step1[10], step1[13], cospi_2_30_10_22, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_6_26(step1[11], step1[12], cospi_6_26_14_18N, &step2[11],
+ &step2[12]);
+
+ // stage 3: 0..3 copied; 4..7 via cospi helpers; 8..15 are add/sub pairs
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[2];
+ step1[3] = step2[3];
+ highbd_idct_cospi_4_28(step2[4], step2[7], cospi_4_12_20N_28, &step1[4],
+ &step1[7]);
+ highbd_idct_cospi_12_20(step2[5], step2[6], cospi_4_12_20N_28, &step1[5],
+ &step1[6]);
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[9].val[0] = vsubq_s32(step2[8].val[0], step2[9].val[0]);
+ step1[9].val[1] = vsubq_s32(step2[8].val[1], step2[9].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[11].val[0] = vaddq_s32(step2[11].val[0], step2[10].val[0]);
+ step1[11].val[1] = vaddq_s32(step2[11].val[1], step2[10].val[1]);
+ step1[12].val[0] = vaddq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[12].val[1] = vaddq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[12].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[12].val[1], step2[13].val[1]);
+ step1[14].val[0] = vsubq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[14].val[1] = vsubq_s32(step2[15].val[1], step2[14].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[14].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[14].val[1]);
+
+ // stage 4: cospi helpers on 0..3, 9/14, 13/10; add/sub on 4..7; rest copied
+ highbd_idct_cospi_16_16_q(step1[1], step1[0], cospi_0_8_16_24, &step2[1],
+ &step2[0]);
+ highbd_idct_cospi_8_24_q(step1[2], step1[3], cospi_0_8_16_24, &step2[2],
+ &step2[3]);
+ step2[4].val[0] = vaddq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[4].val[1] = vaddq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[4].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[4].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[7].val[0] = vaddq_s32(step1[7].val[0], step1[6].val[0]);
+ step2[7].val[1] = vaddq_s32(step1[7].val[1], step1[6].val[1]);
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5: add/sub on 0..3 and 8..15; cospi_16_16 on 5/6; 4 and 7 copied
+ step1[0].val[0] = vaddq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[0].val[1] = vaddq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[1].val[0] = vaddq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[1].val[1] = vaddq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[2].val[0] = vsubq_s32(step2[1].val[0], step2[2].val[0]);
+ step1[2].val[1] = vsubq_s32(step2[1].val[1], step2[2].val[1]);
+ step1[3].val[0] = vsubq_s32(step2[0].val[0], step2[3].val[0]);
+ step1[3].val[1] = vsubq_s32(step2[0].val[1], step2[3].val[1]);
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8].val[0] = vaddq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[8].val[1] = vaddq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[9].val[0] = vaddq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[9].val[1] = vaddq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[10].val[0] = vsubq_s32(step2[9].val[0], step2[10].val[0]);
+ step1[10].val[1] = vsubq_s32(step2[9].val[1], step2[10].val[1]);
+ step1[11].val[0] = vsubq_s32(step2[8].val[0], step2[11].val[0]);
+ step1[11].val[1] = vsubq_s32(step2[8].val[1], step2[11].val[1]);
+ step1[12].val[0] = vsubq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[12].val[1] = vsubq_s32(step2[15].val[1], step2[12].val[1]);
+ step1[13].val[0] = vsubq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[13].val[1] = vsubq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[14].val[0] = vaddq_s32(step2[14].val[0], step2[13].val[0]);
+ step1[14].val[1] = vaddq_s32(step2[14].val[1], step2[13].val[1]);
+ step1[15].val[0] = vaddq_s32(step2[15].val[0], step2[12].val[0]);
+ step1[15].val[1] = vaddq_s32(step2[15].val[1], step2[12].val[1]);
+
+ // stage 6: add/sub on 0..7; cospi_16_16 on 10/13, 11/12; 8,9,14,15 copied
+ step2[0].val[0] = vaddq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[0].val[1] = vaddq_s32(step1[0].val[1], step1[7].val[1]);
+ step2[1].val[0] = vaddq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[1].val[1] = vaddq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[2].val[0] = vaddq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[2].val[1] = vaddq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[3].val[0] = vaddq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[3].val[1] = vaddq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[4].val[0] = vsubq_s32(step1[3].val[0], step1[4].val[0]);
+ step2[4].val[1] = vsubq_s32(step1[3].val[1], step1[4].val[1]);
+ step2[5].val[0] = vsubq_s32(step1[2].val[0], step1[5].val[0]);
+ step2[5].val[1] = vsubq_s32(step1[2].val[1], step1[5].val[1]);
+ step2[6].val[0] = vsubq_s32(step1[1].val[0], step1[6].val[0]);
+ step2[6].val[1] = vsubq_s32(step1[1].val[1], step1[6].val[1]);
+ step2[7].val[0] = vsubq_s32(step1[0].val[0], step1[7].val[0]);
+ step2[7].val[1] = vsubq_s32(step1[0].val[1], step1[7].val[1]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: out[i] = step2[i] +/- step2[15 - i]
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) { // non-NULL output: keep the 32-bit results for the next pass
+ highbd_idct16x16_store_pass1(out, output);
+ } else { // NULL output: round and add the results into dest
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
+// Multiplies all eight lanes of s by lane 0 of coef, widening to 64 bits, and
+// returns the result of highbd_idct16x16_add_wrap_low_8x1() on the products.
+static INLINE int32x4x2_t highbd_idct_cospi_lane0_dual(const int32x4x2_t s,
+                                                       const int32x2_t coef) {
+  const int32x4_t front = s.val[0];
+  const int32x4_t back = s.val[1];
+  int64x2x2_t prod[2];
+
+  prod[0].val[0] = vmull_lane_s32(vget_low_s32(front), coef, 0);
+  prod[0].val[1] = vmull_lane_s32(vget_high_s32(front), coef, 0);
+  prod[1].val[0] = vmull_lane_s32(vget_low_s32(back), coef, 0);
+  prod[1].val[1] = vmull_lane_s32(vget_high_s32(back), coef, 0);
+  return highbd_idct16x16_add_wrap_low_8x1(prod);
+}
+
+// Four-lane variant: widening multiply of s by lane 0 of coef, then
+// highbd_idct16x16_add_wrap_low_4x1() on the 64-bit products.
+static INLINE int32x4_t highbd_idct_cospi_lane0(const int32x4_t s,
+                                                const int32x2_t coef) {
+  int64x2x2_t prod;
+
+  prod.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 0);
+  prod.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 0);
+  return highbd_idct16x16_add_wrap_low_4x1(prod);
+}
+
+// Multiplies all eight lanes of s by lane 1 of coef, widening to 64 bits, and
+// returns the result of highbd_idct16x16_add_wrap_low_8x1() on the products.
+static INLINE int32x4x2_t highbd_idct_cospi_lane1_dual(const int32x4x2_t s,
+                                                       const int32x2_t coef) {
+  const int32x4_t front = s.val[0];
+  const int32x4_t back = s.val[1];
+  int64x2x2_t prod[2];
+
+  prod[0].val[0] = vmull_lane_s32(vget_low_s32(front), coef, 1);
+  prod[0].val[1] = vmull_lane_s32(vget_high_s32(front), coef, 1);
+  prod[1].val[0] = vmull_lane_s32(vget_low_s32(back), coef, 1);
+  prod[1].val[1] = vmull_lane_s32(vget_high_s32(back), coef, 1);
+  return highbd_idct16x16_add_wrap_low_8x1(prod);
+}
+
+// Four-lane variant: widening multiply of s by lane 1 of coef, then
+// highbd_idct16x16_add_wrap_low_4x1() on the 64-bit products.
+static INLINE int32x4_t highbd_idct_cospi_lane1(const int32x4_t s,
+                                                const int32x2_t coef) {
+  int64x2x2_t prod;
+
+  prod.val[0] = vmull_lane_s32(vget_low_s32(s), coef, 1);
+  prod.val[1] = vmull_lane_s32(vget_high_s32(s), coef, 1);
+  return highbd_idct16x16_add_wrap_low_4x1(prod);
+}
+
+// Lane-wise s0 + s1 over both halves of an eight-lane pair.
+static INLINE int32x4x2_t highbd_idct_add_dual(const int32x4x2_t s0,
+                                               const int32x4x2_t s1) {
+  int32x4x2_t sum;
+
+  sum.val[0] = vaddq_s32(s0.val[0], s1.val[0]);
+  sum.val[1] = vaddq_s32(s0.val[1], s1.val[1]);
+  return sum;
+}
+
+// Lane-wise s0 - s1 over both halves of an eight-lane pair.
+static INLINE int32x4x2_t highbd_idct_sub_dual(const int32x4x2_t s0,
+                                               const int32x4x2_t s1) {
+  int32x4x2_t diff;
+
+  diff.val[0] = vsubq_s32(s0.val[0], s1.val[0]);
+  diff.val[1] = vsubq_s32(s0.val[1], s1.val[1]);
+  return diff;
+}
+/*
+ * Reduced variant of the 1-D high-bitdepth 16x16 inverse DCT pass. Only an
+ * 8x8 block of the input is read (eight rows of eight values, row stride
+ * 16), and only the even-numbered step1[] entries are populated in stage 1,
+ * so stages 2-4 use single-input multiplies where possible.
+ *
+ * input:  start of the 8x8 block to read.
+ * output: when non-NULL, results are written there by
+ *         highbd_idct16x16_store_pass1() and dest/stride/bd are not used;
+ *         when NULL, highbd_idct16x16_add_store() rounds the results and
+ *         adds them into dest.
+ * dest, stride, bd: forwarded to highbd_idct16x16_add_store() only.
+ */
+static void highbd_idct16x16_38_add_half1d(const int32_t *input,
+ int32_t *output, uint16_t *dest,
+ const int stride, const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[8], step1[16], step2[16], out[16];
+
+ // Load input (8x8): eight values per row, rows are 16 values apart
+ in[0].val[0] = vld1q_s32(input);
+ in[0].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[1].val[0] = vld1q_s32(input);
+ in[1].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[2].val[0] = vld1q_s32(input);
+ in[2].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[3].val[0] = vld1q_s32(input);
+ in[3].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[4].val[0] = vld1q_s32(input);
+ in[4].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[5].val[0] = vld1q_s32(input);
+ in[5].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[6].val[0] = vld1q_s32(input);
+ in[6].val[1] = vld1q_s32(input + 4);
+ input += 16;
+ in[7].val[0] = vld1q_s32(input);
+ in[7].val[1] = vld1q_s32(input + 4);
+
+ // Transpose
+ transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1: only the even-numbered step1[] entries are set
+ step1[0] = in[0 / 2];
+ step1[2] = in[8 / 2];
+ step1[4] = in[4 / 2];
+ step1[6] = in[12 / 2];
+ step1[8] = in[2 / 2];
+ step1[10] = in[10 / 2];
+ step1[12] = in[6 / 2];
+ step1[14] = in[14 / 2]; // 0 in pass 1
+
+ // stage 2: step2[8..15] are each a single product of one step1[] entry
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+ step2[8] =
+ highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[9] =
+ highbd_idct_cospi_lane1_dual(step1[14], vget_high_s32(cospi_6_26_14_18N));
+ step2[10] =
+ highbd_idct_cospi_lane1_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[13] =
+ highbd_idct_cospi_lane0_dual(step1[10], vget_high_s32(cospi_2_30_10_22));
+ step2[14] =
+ highbd_idct_cospi_lane0_dual(step1[14], vget_high_s32(cospi_6_26_14_18N));
+ step2[15] =
+ highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3: 4..7 are single products of step2[4]/step2[6]; 8..15 add/sub
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] =
+ highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[5] =
+ highbd_idct_cospi_lane0_dual(step2[6], vget_high_s32(cospi_4_12_20N_28));
+ step1[6] =
+ highbd_idct_cospi_lane1_dual(step2[6], vget_low_s32(cospi_4_12_20N_28));
+ step1[7] =
+ highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = highbd_idct_add_dual(step2[8], step2[9]);
+ step1[9] = highbd_idct_sub_dual(step2[8], step2[9]);
+ step1[10] = highbd_idct_sub_dual(step2[11], step2[10]);
+ step1[11] = highbd_idct_add_dual(step2[11], step2[10]);
+ step1[12] = highbd_idct_add_dual(step2[12], step2[13]);
+ step1[13] = highbd_idct_sub_dual(step2[12], step2[13]);
+ step1[14] = highbd_idct_sub_dual(step2[15], step2[14]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[14]);
+
+ // stage 4: step2[0] and step2[1] share one product of step1[0]
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[2] =
+ highbd_idct_cospi_lane1_dual(step1[2], vget_high_s32(cospi_0_8_16_24));
+ step2[3] =
+ highbd_idct_cospi_lane1_dual(step1[2], vget_low_s32(cospi_0_8_16_24));
+ step2[4] = highbd_idct_add_dual(step1[4], step1[5]);
+ step2[5] = highbd_idct_sub_dual(step1[4], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[7], step1[6]);
+ step2[7] = highbd_idct_add_dual(step1[7], step1[6]);
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5: add/sub on 0..3 and 8..15; cospi_16_16 on 5/6; 4 and 7 copied
+ step1[0] = highbd_idct_add_dual(step2[0], step2[3]);
+ step1[1] = highbd_idct_add_dual(step2[1], step2[2]);
+ step1[2] = highbd_idct_sub_dual(step2[1], step2[2]);
+ step1[3] = highbd_idct_sub_dual(step2[0], step2[3]);
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+ step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+ step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+ step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+ step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+ step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+ step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+ // stage 6: add/sub on 0..7; cospi_16_16 on 10/13, 11/12; 8,9,14,15 copied
+ step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+ step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+ step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+ step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+ step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+ step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+ step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: out[i] = step2[i] +/- step2[15 - i]
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) { // non-NULL output: keep the 32-bit results for the next pass
+ highbd_idct16x16_store_pass1(out, output);
+ } else { // NULL output: round and add the results into dest
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+/*
+ * Pass-1 kernel of the reduced "10" variant. Reads a 4x4 block of the input
+ * (four rows of four values, row stride 16), runs the seven stages on
+ * four-lane int32x4_t vectors and stores out[0..15] as 64 contiguous
+ * int32_t values in output.
+ *
+ * input is passed straight to vld1q_s32(), so tran_low_t is assumed to be a
+ * 32-bit integer type here (TODO confirm against its definition).
+ *
+ * NOTE(review): unlike highbd_idct16x16_256_add_half1d() and
+ * highbd_idct16x16_38_add_half1d() this function is not declared static;
+ * confirm whether external linkage is intended.
+ */
+void highbd_idct16x16_10_add_half1d_pass1(const tran_low_t *input,
+ int32_t *output) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x4): four values per row, rows are 16 values apart
+ in[0] = vld1q_s32(input);
+ input += 16;
+ in[1] = vld1q_s32(input);
+ input += 16;
+ in[2] = vld1q_s32(input);
+ input += 16;
+ in[3] = vld1q_s32(input);
+
+ // Transpose
+ transpose_s32_4x4(&in[0], &in[1], &in[2], &in[3]);
+
+ // stage 1: only step1[0], [4], [8] and [12] are set
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2: single products of step1[8] and step1[12]
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] = highbd_idct_cospi_lane1(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[15] = highbd_idct_cospi_lane0(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3: 4 and 7 are products of step2[4]; 8..15 are duplicated copies
+ step1[0] = step2[0];
+ step1[4] =
+ highbd_idct_cospi_lane1(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[7] = highbd_idct_cospi_lane0(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4: step2[0] and step2[1] share one product of step1[0]
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_d(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_d(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5: 0..4 and 7 copied; cospi_16_16 on 5/6; add/sub on 8..15
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_d(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s32(step2[8], step2[11]);
+ step1[9] = vaddq_s32(step2[9], step2[10]);
+ step1[10] = vsubq_s32(step2[9], step2[10]);
+ step1[11] = vsubq_s32(step2[8], step2[11]);
+ step1[12] = vsubq_s32(step2[15], step2[12]);
+ step1[13] = vsubq_s32(step2[14], step2[13]);
+ step1[14] = vaddq_s32(step2[14], step2[13]);
+ step1[15] = vaddq_s32(step2[15], step2[12]);
+
+ // stage 6: add/sub on 0..7; cospi_16_16 on 10/13, 11/12; 8,9,14,15 copied
+ step2[0] = vaddq_s32(step1[0], step1[7]);
+ step2[1] = vaddq_s32(step1[1], step1[6]);
+ step2[2] = vaddq_s32(step1[2], step1[5]);
+ step2[3] = vaddq_s32(step1[3], step1[4]);
+ step2[4] = vsubq_s32(step1[3], step1[4]);
+ step2[5] = vsubq_s32(step1[2], step1[5]);
+ step2[6] = vsubq_s32(step1[1], step1[6]);
+ step2[7] = vsubq_s32(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_d(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_d(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: out[i] = step2[i] +/- step2[15 - i]
+ highbd_idct16x16_add_stage7(step2, out);
+
+ // pass 1: save the result into output (16 vectors, 4 values each, packed)
+ vst1q_s32(output, out[0]);
+ output += 4;
+ vst1q_s32(output, out[1]);
+ output += 4;
+ vst1q_s32(output, out[2]);
+ output += 4;
+ vst1q_s32(output, out[3]);
+ output += 4;
+ vst1q_s32(output, out[4]);
+ output += 4;
+ vst1q_s32(output, out[5]);
+ output += 4;
+ vst1q_s32(output, out[6]);
+ output += 4;
+ vst1q_s32(output, out[7]);
+ output += 4;
+ vst1q_s32(output, out[8]);
+ output += 4;
+ vst1q_s32(output, out[9]);
+ output += 4;
+ vst1q_s32(output, out[10]);
+ output += 4;
+ vst1q_s32(output, out[11]);
+ output += 4;
+ vst1q_s32(output, out[12]);
+ output += 4;
+ vst1q_s32(output, out[13]);
+ output += 4;
+ vst1q_s32(output, out[14]);
+ output += 4;
+ vst1q_s32(output, out[15]);
+}
+/*
+ * Pass-2 kernel of the reduced "10" variant. Reads 32 contiguous int32_t
+ * values (eight four-lane vectors), transposes them with
+ * transpose_s32_4x8() and runs the seven stages on eight-lane pairs.
+ *
+ * output: when non-NULL, results are written there by
+ *         highbd_idct16x16_store_pass1() and dest/stride/bd are not used;
+ *         when NULL, highbd_idct16x16_add_store() rounds the results and
+ *         adds them into dest.
+ * dest, stride, bd: forwarded to highbd_idct16x16_add_store() only.
+ *
+ * NOTE(review): unlike highbd_idct16x16_256_add_half1d() and
+ * highbd_idct16x16_38_add_half1d() this function is not declared static;
+ * confirm whether external linkage is intended.
+ */
+void highbd_idct16x16_10_add_half1d_pass2(const int32_t *input,
+ int32_t *const output,
+ uint16_t *const dest,
+ const int stride, const int bd) {
+ const int32x4_t cospi_0_8_16_24 = vld1q_s32(kCospi32 + 0);
+ const int32x4_t cospi_4_12_20N_28 = vld1q_s32(kCospi32 + 4);
+ const int32x4_t cospi_2_30_10_22 = vld1q_s32(kCospi32 + 8);
+ const int32x4_t cospi_6_26_14_18N = vld1q_s32(kCospi32 + 12);
+ int32x4x2_t in[4], step1[16], step2[16], out[16];
+
+ // Load input (4x8): eight consecutive four-value vectors
+ in[0].val[0] = vld1q_s32(input);
+ input += 4;
+ in[0].val[1] = vld1q_s32(input);
+ input += 4;
+ in[1].val[0] = vld1q_s32(input);
+ input += 4;
+ in[1].val[1] = vld1q_s32(input);
+ input += 4;
+ in[2].val[0] = vld1q_s32(input);
+ input += 4;
+ in[2].val[1] = vld1q_s32(input);
+ input += 4;
+ in[3].val[0] = vld1q_s32(input);
+ input += 4;
+ in[3].val[1] = vld1q_s32(input);
+
+ // Transpose
+ transpose_s32_4x8(&in[0].val[0], &in[0].val[1], &in[1].val[0], &in[1].val[1],
+ &in[2].val[0], &in[2].val[1], &in[3].val[0], &in[3].val[1]);
+
+ // stage 1: only step1[0], [4], [8] and [12] are set
+ step1[0] = in[0 / 2];
+ step1[4] = in[4 / 2];
+ step1[8] = in[2 / 2];
+ step1[12] = in[6 / 2];
+
+ // stage 2: single products of step1[8] and step1[12]
+ step2[0] = step1[0];
+ step2[4] = step1[4];
+ step2[8] =
+ highbd_idct_cospi_lane1_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+ step2[11] =
+ highbd_idct_cospi_lane1_dual(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[12] =
+ highbd_idct_cospi_lane0_dual(step1[12], vget_low_s32(cospi_6_26_14_18N));
+ step2[15] =
+ highbd_idct_cospi_lane0_dual(step1[8], vget_low_s32(cospi_2_30_10_22));
+
+ // stage 3: 4 and 7 are products of step2[4]; 8..15 are duplicated copies
+ step1[0] = step2[0];
+ step1[4] =
+ highbd_idct_cospi_lane1_dual(step2[4], vget_high_s32(cospi_4_12_20N_28));
+ step1[7] =
+ highbd_idct_cospi_lane0_dual(step2[4], vget_low_s32(cospi_4_12_20N_28));
+ step1[8] = step2[8];
+ step1[9] = step2[8];
+ step1[10] = step2[11];
+ step1[11] = step2[11];
+ step1[12] = step2[12];
+ step1[13] = step2[12];
+ step1[14] = step2[15];
+ step1[15] = step2[15];
+
+ // stage 4: step2[0] and step2[1] share one product of step1[0]
+ step2[0] = step2[1] =
+ highbd_idct_cospi_lane0_dual(step1[0], vget_high_s32(cospi_0_8_16_24));
+ step2[4] = step1[4];
+ step2[5] = step1[4];
+ step2[6] = step1[7];
+ step2[7] = step1[7];
+ step2[8] = step1[8];
+ highbd_idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ highbd_idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24,
+ &step2[13], &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5: 0..4 and 7 copied; cospi_16_16 on 5/6; add/sub on 8..15
+ step1[0] = step2[0];
+ step1[1] = step2[1];
+ step1[2] = step2[1];
+ step1[3] = step2[0];
+ step1[4] = step2[4];
+ highbd_idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5],
+ &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = highbd_idct_add_dual(step2[8], step2[11]);
+ step1[9] = highbd_idct_add_dual(step2[9], step2[10]);
+ step1[10] = highbd_idct_sub_dual(step2[9], step2[10]);
+ step1[11] = highbd_idct_sub_dual(step2[8], step2[11]);
+ step1[12] = highbd_idct_sub_dual(step2[15], step2[12]);
+ step1[13] = highbd_idct_sub_dual(step2[14], step2[13]);
+ step1[14] = highbd_idct_add_dual(step2[14], step2[13]);
+ step1[15] = highbd_idct_add_dual(step2[15], step2[12]);
+
+ // stage 6: add/sub on 0..7; cospi_16_16 on 10/13, 11/12; 8,9,14,15 copied
+ step2[0] = highbd_idct_add_dual(step1[0], step1[7]);
+ step2[1] = highbd_idct_add_dual(step1[1], step1[6]);
+ step2[2] = highbd_idct_add_dual(step1[2], step1[5]);
+ step2[3] = highbd_idct_add_dual(step1[3], step1[4]);
+ step2[4] = highbd_idct_sub_dual(step1[3], step1[4]);
+ step2[5] = highbd_idct_sub_dual(step1[2], step1[5]);
+ step2[6] = highbd_idct_sub_dual(step1[1], step1[6]);
+ step2[7] = highbd_idct_sub_dual(step1[0], step1[7]);
+ highbd_idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ highbd_idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7: out[i] = step2[i] +/- step2[15 - i]
+ highbd_idct16x16_add_stage7_dual(step2, out);
+
+ if (output) { // non-NULL output: store the 32-bit results
+ highbd_idct16x16_store_pass1(out, output);
+ } else { // NULL output: round and add the results into dest
+ highbd_idct16x16_add_store(out, dest, stride, bd);
+ }
+}
+
+// 16x16 inverse DCT plus reconstruction for high-bitdepth buffers.
+// Runs two half passes over the rows into a temporary buffer, then two half
+// passes over the columns that add straight into the destination.
+// bd == 8 uses an int16_t temporary and the idct16x16_* kernels; any other
+// bit depth uses an int32_t temporary and the highbd_* kernels.
+void vpx_highbd_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest8,
+                                       int stride, int bd) {
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dest8);
+  const tran_low_t *const lower_rows = input + 8 * 16;
+
+  if (bd == 8) {
+    int16_t pass1_out[16 * 16];
+
+    // pass 1: upper 8 rows, then lower 8 rows
+    idct16x16_256_add_half1d(input, pass1_out, dst, stride, 1);
+    idct16x16_256_add_half1d(lower_rows, pass1_out + 8, dst, stride, 1);
+
+    // pass 2: left 8 columns, then right 8 columns
+    idct16x16_256_add_half1d(pass1_out, NULL, dst, stride, 1);
+    idct16x16_256_add_half1d(pass1_out + 8 * 16, NULL, dst + 8, stride, 1);
+  } else {
+    int32_t pass1_out[16 * 16];
+
+    // pass 1: upper 8 rows, then lower 8 rows
+    highbd_idct16x16_256_add_half1d(input, pass1_out, dst, stride, bd);
+    highbd_idct16x16_256_add_half1d(lower_rows, pass1_out + 8, dst, stride,
+                                    bd);
+
+    // pass 2: left 8 columns, then right 8 columns
+    highbd_idct16x16_256_add_half1d(pass1_out, NULL, dst, stride, bd);
+    highbd_idct16x16_256_add_half1d(pass1_out + 8 * 16, NULL, dst + 8, stride,
+                                    bd);
+  }
+}
+/* Reduced variant of the 16x16 inverse DCT + add: pass 1 is run once, on the
+ * upper 8 rows of input only, before the two pass 2 calls.
+ *
+ * input:  coefficient block (tran_low_t); only the pointer itself is handed
+ *         to pass 1, no second half is processed.
+ * dest8:  destination; converted with CONVERT_TO_SHORTPTR to a uint16_t
+ *         pointer before it is handed to the helpers.
+ * stride: forwarded unchanged to the half1d helpers together with dest.
+ * bd:     bit depth. bd == 8 takes the idct16x16_38_add_half1d path with an
+ *         int16_t intermediate buffer; every other value takes the
+ *         highbd_idct16x16_38_add_half1d path with an int32_t buffer.
+ *
+ * Pass 2 reads the intermediate buffer at offsets 0 and 16 * 8, passes NULL
+ * as the output argument and targets dest / dest + 8.
+ * NOTE(review): the _38 suffix presumably bounds the number of non-zero
+ * coefficients the caller guarantees, NULL presumably selects "add into
+ * dest", and the trailing 1 in the bd == 8 calls presumably flags a 16-bit
+ * destination - confirm against the helper definitions and the callers.
+ */
+void vpx_highbd_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ if (bd == 8) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1: single call, writes into row_idct_output
+ // Parallel idct on the upper 8 rows
+ idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 1);
+
+ // pass 2: both calls read row_idct_output and pass NULL as output
+ // Parallel idct to get the left 8 columns
+ idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 1);
+
+ // Parallel idct to get the right 8 columns
+ idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride,
+ 1);
+ } else {
+ int32_t row_idct_output[16 * 16];
+
+ // pass 1: single call, writes into row_idct_output
+ // Parallel idct on the upper 8 rows
+ highbd_idct16x16_38_add_half1d(input, row_idct_output, dest, stride, bd);
+
+ // pass 2: both calls read row_idct_output and pass NULL as output
+ // Parallel idct to get the left 8 columns
+ highbd_idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, bd);
+
+ // Parallel idct to get the right 8 columns
+ highbd_idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8,
+ stride, bd);
+ }
+}
+/* Smallest partial variant of the 16x16 inverse DCT + add. It uses dedicated
+ * pass1 / pass2 helpers and an intermediate buffer of only 4 * 16 elements.
+ *
+ * input:  coefficient block (tran_low_t), handed to the pass1 helper.
+ * dest8:  destination; converted with CONVERT_TO_SHORTPTR to a uint16_t
+ *         pointer before it is handed to the pass2 helper.
+ * stride: forwarded unchanged to the pass2 helper together with dest.
+ * bd:     bit depth. bd == 8 takes the idct16x16_10_add_half1d_pass1/2 path
+ *         with an int16_t buffer; every other value takes the
+ *         highbd_idct16x16_10_add_half1d_pass1/2 path with an int32_t buffer.
+ *
+ * The pass1 helper receives only input and the buffer. Pass 2 reads the
+ * buffer at offsets 0 and 4 * 8, passes NULL as the output argument and
+ * targets dest / dest + 8.
+ * NOTE(review): the _10 suffix presumably bounds the number of non-zero
+ * coefficients the caller guarantees, NULL presumably selects "add into
+ * dest", and the trailing 1 in the bd == 8 calls presumably flags a 16-bit
+ * destination - confirm against the helper definitions and the callers.
+ */
+void vpx_highbd_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ if (bd == 8) {
+ int16_t row_idct_output[4 * 16];
+
+ // pass 1: single call, writes into row_idct_output
+ // Parallel idct on the upper 8 rows
+ idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2: both calls read row_idct_output and pass NULL as output
+ // Parallel idct to get the left 8 columns
+ idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 1);
+
+ // Parallel idct to get the right 8 columns
+ idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
+ stride, 1);
+ } else {
+ int32_t row_idct_output[4 * 16];
+
+ // pass 1: single call, writes into row_idct_output
+ // Parallel idct on the upper 8 rows
+ highbd_idct16x16_10_add_half1d_pass1(input, row_idct_output);
+
+ // pass 2: both calls read row_idct_output and pass NULL as output
+ // Parallel idct to get the left 8 columns
+ highbd_idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride,
+ bd);
+
+ // Parallel idct to get the right 8 columns
+ highbd_idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL,
+ dest + 8, stride, bd);
+ }
+}
+/* Adds res to one row of 16 uint16_t pixels and clamps the sums from above.
+ *
+ * dest:   in/out row pointer; 16 pixels are read from and written back to
+ *         *dest, then *dest is advanced by stride elements.
+ * stride: distance to the next row, in uint16_t elements.
+ * res:    value added to every lane (signed 16-bit add).
+ * max:    upper bound applied with vminq_s16.
+ *
+ * There is no lower clamp here; the caller in this file only uses this
+ * kernel when the added value is >= 0.
+ */
+static INLINE void highbd_idct16x16_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest + 0);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ vst1q_u16(*dest + 0, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ *dest += stride;
+}
+/* Adds res to one row of 16 uint16_t pixels and clamps the sums from below.
+ *
+ * dest:   in/out row pointer; 16 pixels are read from and written back to
+ *         *dest, then *dest is advanced by stride elements.
+ * stride: distance to the next row, in uint16_t elements.
+ * res:    value added to every lane (signed 16-bit add).
+ *
+ * vqshluq_n_s16(x, 0) is a signed-to-unsigned saturating shift by zero, so
+ * negative sums are stored as 0. There is no upper clamp here; the caller in
+ * this file only uses this kernel when the added value is negative.
+ */
+static INLINE void highbd_idct16x16_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest + 0);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ vst1q_u16(*dest + 0, c0);
+ vst1q_u16(*dest + 8, c1);
+ *dest += stride;
+}
+/* 16x16 inverse DCT + add for the case where only input[0] is used.
+ *
+ * input:  coefficient block; only input[0] is read.
+ * dest8:  destination; converted with CONVERT_TO_SHORTPTR to a uint16_t
+ *         pointer.
+ * stride: row distance of dest, in uint16_t elements.
+ * bd:     bit depth; used by HIGHBD_WRAPLOW and for the clamp (1 << bd) - 1.
+ *
+ * input[0] is scaled by cospi_16_64 twice (each product rounded with
+ * dct_const_round_shift and wrapped with HIGHBD_WRAPLOW), then rounded with
+ * ROUND_POWER_OF_TWO(.., 6). The resulting value a1 is broadcast and added to
+ * all 16 rows: 4 loop iterations of 4 kernel calls each. The sign of a1
+ * picks the kernel, so only the clamp that can actually trigger is applied.
+ */
+void vpx_highbd_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct16x16_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 4; ++i) {
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct16x16_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
new file mode 100644
index 00000000000..d74331f8031
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct32x32_add_neon.c
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/inv_txfm.h"
+/* Adds res to one row of 32 uint16_t pixels and clamps the sums from above.
+ *
+ * dest:   in/out row pointer; 32 pixels (four vectors of 8) are read from
+ *         and written back to *dest, then *dest is advanced by stride
+ *         elements.
+ * stride: distance to the next row, in uint16_t elements.
+ * res:    value added to every lane (signed 16-bit add).
+ * max:    upper bound applied with vminq_s16.
+ *
+ * There is no lower clamp here; the caller in this file only uses this
+ * kernel when the added value is >= 0.
+ */
+static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+ const int16x8_t c0 = vminq_s16(b0, max);
+ const int16x8_t c1 = vminq_s16(b1, max);
+ const int16x8_t c2 = vminq_s16(b2, max);
+ const int16x8_t c3 = vminq_s16(b3, max);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c0));
+ vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1));
+ vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2));
+ vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3));
+ *dest += stride;
+}
+/* Adds res to one row of 32 uint16_t pixels and clamps the sums from below.
+ *
+ * dest:   in/out row pointer; 32 pixels (four vectors of 8) are read from
+ *         and written back to *dest, then *dest is advanced by stride
+ *         elements.
+ * stride: distance to the next row, in uint16_t elements.
+ * res:    value added to every lane (signed 16-bit add).
+ *
+ * vqshluq_n_s16(x, 0) is a signed-to-unsigned saturating shift by zero, so
+ * negative sums are stored as 0. There is no upper clamp here; the caller in
+ * this file only uses this kernel when the added value is negative.
+ */
+static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a0 = vld1q_u16(*dest);
+ const uint16x8_t a1 = vld1q_u16(*dest + 8);
+ const uint16x8_t a2 = vld1q_u16(*dest + 16);
+ const uint16x8_t a3 = vld1q_u16(*dest + 24);
+ const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
+ const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
+ const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
+ const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
+ const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
+ const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
+ const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
+ const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
+ vst1q_u16(*dest, c0);
+ vst1q_u16(*dest + 8, c1);
+ vst1q_u16(*dest + 16, c2);
+ vst1q_u16(*dest + 24, c3);
+ *dest += stride;
+}
+/* 32x32 inverse DCT + add for the case where only input[0] is used.
+ *
+ * input:  coefficient block; only input[0] is read.
+ * dest8:  destination; converted with CONVERT_TO_SHORTPTR to a uint16_t
+ *         pointer.
+ * stride: row distance of dest, in uint16_t elements.
+ * bd:     bit depth; used by HIGHBD_WRAPLOW and for the clamp (1 << bd) - 1.
+ *
+ * input[0] is scaled by cospi_16_64 twice (each product rounded with
+ * dct_const_round_shift and wrapped with HIGHBD_WRAPLOW), then rounded with
+ * ROUND_POWER_OF_TWO(.., 6). The resulting value a1 is broadcast and added to
+ * all 32 rows: 8 loop iterations of 4 kernel calls each. The sign of a1
+ * picks the kernel, so only the clamp that can actually trigger is applied.
+ */
+void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ const tran_low_t out0 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
+ const tran_low_t out1 =
+ HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
+ const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
+ const int16x8_t dc = vdupq_n_s16(a1);
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+ int i;
+
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
+ }
+ } else {
+ for (i = 0; i < 8; ++i) {
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
+ }
+ }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
index 26fa3e216bb..128f72b9c96 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct4x4_add_neon.c
@@ -82,10 +82,10 @@ static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
- b0 = vrshrq_n_s32(b0, 14);
- b1 = vrshrq_n_s32(b1, 14);
- b2 = vrshrq_n_s32(b2, 14);
- b3 = vrshrq_n_s32(b3, 14);
+ b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
+ b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
+ b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
+ b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
*a0 = vaddq_s32(b0, b3);
*a1 = vaddq_s32(b1, b2);
*a2 = vsubq_s32(b1, b2);
@@ -119,10 +119,14 @@ static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
c5 = vsubq_s64(c5, c9);
c6 = vaddq_s64(c6, c10);
c7 = vaddq_s64(c7, c11);
- b0 = vcombine_s32(vrshrn_n_s64(c0, 14), vrshrn_n_s64(c1, 14));
- b1 = vcombine_s32(vrshrn_n_s64(c2, 14), vrshrn_n_s64(c3, 14));
- b2 = vcombine_s32(vrshrn_n_s64(c4, 14), vrshrn_n_s64(c5, 14));
- b3 = vcombine_s32(vrshrn_n_s64(c6, 14), vrshrn_n_s64(c7, 14));
+ b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
+ vrshrn_n_s64(c1, DCT_CONST_BITS));
+ b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
+ vrshrn_n_s64(c3, DCT_CONST_BITS));
+ b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
+ vrshrn_n_s64(c5, DCT_CONST_BITS));
+ b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
+ vrshrn_n_s64(c7, DCT_CONST_BITS));
*a0 = vaddq_s32(b0, b3);
*a1 = vaddq_s32(b1, b2);
*a2 = vsubq_s32(b1, b2);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
index c1c0f645d18..f53f4c7fcad 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/highbd_idct8x8_add_neon.c
@@ -15,21 +15,29 @@
#include "vpx_dsp/arm/transpose_neon.h"
#include "vpx_dsp/inv_txfm.h"
-static INLINE void highbd_idct8x8_1_add_kernel(uint16_t **dest,
- const int stride,
- const int16x8_t res,
- const int16x8_t max) {
+static INLINE void highbd_idct8x8_1_add_pos_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res,
+ const int16x8_t max) {
const uint16x8_t a = vld1q_u16(*dest);
const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
const int16x8_t c = vminq_s16(b, max);
- const uint16x8_t d = vqshluq_n_s16(c, 0);
- vst1q_u16(*dest, d);
+ vst1q_u16(*dest, vreinterpretq_u16_s16(c));
+ *dest += stride;
+}
+
+static INLINE void highbd_idct8x8_1_add_neg_kernel(uint16_t **dest,
+ const int stride,
+ const int16x8_t res) {
+ const uint16x8_t a = vld1q_u16(*dest);
+ const int16x8_t b = vaddq_s16(res, vreinterpretq_s16_u16(a));
+ const uint16x8_t c = vqshluq_n_s16(b, 0);
+ vst1q_u16(*dest, c);
*dest += stride;
}
void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
- const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
const tran_low_t out0 =
HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
const tran_low_t out1 =
@@ -38,14 +46,26 @@ void vpx_highbd_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest8,
const int16x8_t dc = vdupq_n_s16(a1);
uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
- highbd_idct8x8_1_add_kernel(&dest, stride, dc, max);
+ if (a1 >= 0) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ highbd_idct8x8_1_add_pos_kernel(&dest, stride, dc, max);
+ } else {
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ highbd_idct8x8_1_add_neg_kernel(&dest, stride, dc);
+ }
}
static INLINE void idct8x8_12_half1d_bd10(
@@ -62,18 +82,18 @@ static INLINE void idct8x8_12_half1d_bd10(
step1[5] = vmulq_lane_s32(*io3, vget_high_s32(cospis1), 0);
step1[6] = vmulq_lane_s32(*io3, vget_low_s32(cospis1), 1);
step1[7] = vmulq_lane_s32(*io1, vget_low_s32(cospis1), 0);
- step1[4] = vrshrq_n_s32(step1[4], 14);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
- step1[7] = vrshrq_n_s32(step1[7], 14);
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
// stage 2
step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
step2[2] = vmulq_lane_s32(*io2, vget_high_s32(cospis0), 1);
step2[3] = vmulq_lane_s32(*io2, vget_low_s32(cospis0), 1);
- step2[1] = vrshrq_n_s32(step2[1], 14);
- step2[2] = vrshrq_n_s32(step2[2], 14);
- step2[3] = vrshrq_n_s32(step2[3], 14);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
step2[4] = vaddq_s32(step1[4], step1[5]);
step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -89,8 +109,8 @@ static INLINE void idct8x8_12_half1d_bd10(
step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
// stage 4
*io0 = vaddq_s32(step1[0], step2[7]);
@@ -134,14 +154,14 @@ static INLINE void idct8x8_12_half1d_bd12(
t64[5] = vmull_lane_s32(input_3h, vget_low_s32(cospis1), 1);
t64[6] = vmull_lane_s32(input_1l, vget_low_s32(cospis1), 0);
t64[7] = vmull_lane_s32(input_1h, vget_low_s32(cospis1), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step1[4] = vcombine_s32(t32[0], t32[1]);
step1[5] = vcombine_s32(t32[2], t32[3]);
step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -154,12 +174,12 @@ static INLINE void idct8x8_12_half1d_bd12(
t64[5] = vmull_lane_s32(step1h[1], vget_high_s32(cospis0), 1);
t64[6] = vmull_lane_s32(step1l[1], vget_low_s32(cospis0), 1);
t64[7] = vmull_lane_s32(step1h[1], vget_low_s32(cospis0), 1);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step2[1] = vcombine_s32(t32[2], t32[3]);
step2[2] = vcombine_s32(t32[4], t32[5]);
step2[3] = vcombine_s32(t32[6], t32[7]);
@@ -185,10 +205,10 @@ static INLINE void idct8x8_12_half1d_bd12(
vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
vget_high_s32(cospis0), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
step1[5] = vcombine_s32(t32[0], t32[1]);
step1[6] = vcombine_s32(t32[2], t32[3]);
@@ -357,10 +377,10 @@ static INLINE void idct8x8_64_half1d_bd10(
step1[6] = vmlsq_lane_s32(step1[6], *io5, vget_high_s32(cospis1), 0);
step1[7] = vmlaq_lane_s32(step1[7], *io7, vget_high_s32(cospis1), 1);
- step1[4] = vrshrq_n_s32(step1[4], 14);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
- step1[7] = vrshrq_n_s32(step1[7], 14);
+ step1[4] = vrshrq_n_s32(step1[4], DCT_CONST_BITS);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
+ step1[7] = vrshrq_n_s32(step1[7], DCT_CONST_BITS);
// stage 2
step2[1] = vmulq_lane_s32(*io0, vget_high_s32(cospis0), 0);
@@ -372,10 +392,10 @@ static INLINE void idct8x8_64_half1d_bd10(
step2[2] = vmlsq_lane_s32(step2[2], *io6, vget_low_s32(cospis0), 1);
step2[3] = vmlaq_lane_s32(step2[3], *io6, vget_high_s32(cospis0), 1);
- step2[0] = vrshrq_n_s32(step2[0], 14);
- step2[1] = vrshrq_n_s32(step2[1], 14);
- step2[2] = vrshrq_n_s32(step2[2], 14);
- step2[3] = vrshrq_n_s32(step2[3], 14);
+ step2[0] = vrshrq_n_s32(step2[0], DCT_CONST_BITS);
+ step2[1] = vrshrq_n_s32(step2[1], DCT_CONST_BITS);
+ step2[2] = vrshrq_n_s32(step2[2], DCT_CONST_BITS);
+ step2[3] = vrshrq_n_s32(step2[3], DCT_CONST_BITS);
step2[4] = vaddq_s32(step1[4], step1[5]);
step2[5] = vsubq_s32(step1[4], step1[5]);
@@ -391,8 +411,8 @@ static INLINE void idct8x8_64_half1d_bd10(
step1[6] = vmulq_lane_s32(step2[6], vget_high_s32(cospis0), 0);
step1[5] = vmlsq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
step1[6] = vmlaq_lane_s32(step1[6], step2[5], vget_high_s32(cospis0), 0);
- step1[5] = vrshrq_n_s32(step1[5], 14);
- step1[6] = vrshrq_n_s32(step1[6], 14);
+ step1[5] = vrshrq_n_s32(step1[5], DCT_CONST_BITS);
+ step1[6] = vrshrq_n_s32(step1[6], DCT_CONST_BITS);
// stage 4
*io0 = vaddq_s32(step1[0], step2[7]);
@@ -453,14 +473,14 @@ static INLINE void idct8x8_64_half1d_bd12(
t64[5] = vmlsl_lane_s32(t64[5], input_5h, vget_high_s32(cospis1), 0);
t64[6] = vmlal_lane_s32(t64[6], input_7l, vget_high_s32(cospis1), 1);
t64[7] = vmlal_lane_s32(t64[7], input_7h, vget_high_s32(cospis1), 1);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step1[4] = vcombine_s32(t32[0], t32[1]);
step1[5] = vcombine_s32(t32[2], t32[3]);
step1[6] = vcombine_s32(t32[4], t32[5]);
@@ -481,14 +501,14 @@ static INLINE void idct8x8_64_half1d_bd12(
t64[5] = vmlsl_lane_s32(t64[5], step1h[3], vget_low_s32(cospis0), 1);
t64[6] = vmlal_lane_s32(t64[6], step1l[3], vget_high_s32(cospis0), 1);
t64[7] = vmlal_lane_s32(t64[7], step1h[3], vget_high_s32(cospis0), 1);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
- t32[4] = vrshrn_n_s64(t64[4], 14);
- t32[5] = vrshrn_n_s64(t64[5], 14);
- t32[6] = vrshrn_n_s64(t64[6], 14);
- t32[7] = vrshrn_n_s64(t64[7], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
+ t32[4] = vrshrn_n_s64(t64[4], DCT_CONST_BITS);
+ t32[5] = vrshrn_n_s64(t64[5], DCT_CONST_BITS);
+ t32[6] = vrshrn_n_s64(t64[6], DCT_CONST_BITS);
+ t32[7] = vrshrn_n_s64(t64[7], DCT_CONST_BITS);
step2[0] = vcombine_s32(t32[0], t32[1]);
step2[1] = vcombine_s32(t32[2], t32[3]);
step2[2] = vcombine_s32(t32[4], t32[5]);
@@ -515,10 +535,10 @@ static INLINE void idct8x8_64_half1d_bd12(
vmlal_lane_s32(t64[2], vget_low_s32(step2[5]), vget_high_s32(cospis0), 0);
t64[3] = vmlal_lane_s32(t64[3], vget_high_s32(step2[5]),
vget_high_s32(cospis0), 0);
- t32[0] = vrshrn_n_s64(t64[0], 14);
- t32[1] = vrshrn_n_s64(t64[1], 14);
- t32[2] = vrshrn_n_s64(t64[2], 14);
- t32[3] = vrshrn_n_s64(t64[3], 14);
+ t32[0] = vrshrn_n_s64(t64[0], DCT_CONST_BITS);
+ t32[1] = vrshrn_n_s64(t64[1], DCT_CONST_BITS);
+ t32[2] = vrshrn_n_s64(t64[2], DCT_CONST_BITS);
+ t32[3] = vrshrn_n_s64(t64[3], DCT_CONST_BITS);
step1[5] = vcombine_s32(t32[0], t32[1]);
step1[6] = vcombine_s32(t32[2], t32[3]);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
deleted file mode 100644
index d648840df40..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_1_add_neon.asm
+++ /dev/null
@@ -1,196 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vpx_idct16x16_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vpx_idct16x16_1_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int stride)
-
-|vpx_idct16x16_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; cospi_16_64 = 11585
- movw r12, #0x2d41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 6)
- add r0, r0, #32 ; + (1 <<((6) - 1))
- asr r0, r0, #6 ; >> 6
-
- vdup.s16 q0, r0 ; duplicate a1
- mov r0, #8
- sub r2, #8
-
- ; load destination data row0 - row3
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row4 - row7
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row8 - row11
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- ; load destination data row12 - row15
- vld1.64 {d2}, [r1], r0
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r0
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r0
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r0
- vld1.64 {d17}, [r1], r2
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r0
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r0
- vst1.64 {d31}, [r12], r2
-
- bx lr
- ENDP ; |vpx_idct16x16_1_add_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
deleted file mode 100644
index ea6b099d3bb..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.asm
+++ /dev/null
@@ -1,1176 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- INCLUDE vpx_dsp/arm/idct_neon.asm.S
-
- EXPORT |vpx_idct16x16_256_add_neon_pass1|
- EXPORT |vpx_idct16x16_256_add_neon_pass2|
- IF CONFIG_VP9_HIGHBITDEPTH
- EXPORT |vpx_idct16x16_256_add_neon_pass1_tran_low|
- EXPORT |vpx_idct16x16_256_add_neon_pass2_tran_low|
- ENDIF
- EXPORT |vpx_idct16x16_10_add_neon_pass1|
- EXPORT |vpx_idct16x16_10_add_neon_pass2|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void |vpx_idct16x16_256_add_neon_pass1|(const int16_t *input, int16_t *output)
-;
-; r0 const int16_t *input
-; r1 int16_t *output
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass1| PROC
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q1,q2}, [r0]!
- vmov.s16 q15, q1
-
-idct16x16_256_add_neon_pass1
- ; cospi_28_64 = 3196
- movw r3, #0x0c7c
-
- ; cospi_4_64 = 16069
- movw r12, #0x3ec5
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r12 ; duplicate cospi_4_64
-
- ; preloading to avoid stall
- ; cospi_12_64 = 13623
- movw r3, #0x3537
-
- ; cospi_20_64 = 9102
- movw r12, #0x238e
-
- ; step2[4] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; step2[4] * cospi_4_64
- vmull.s16 q5, d18, d1
- vmull.s16 q6, d19, d1
-
- ; temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64
- vmlal.s16 q5, d30, d0
- vmlal.s16 q6, d31, d0
-
- vdup.16 d2, r3 ; duplicate cospi_12_64
- vdup.16 d3, r12 ; duplicate cospi_20_64
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d8, q2, #14 ; >> 14
- vrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d14, q5, #14 ; >> 14
- vrshrn.s32 d15, q6, #14 ; >> 14
-
- ; preloading to avoid stall
- ; cospi_16_64 = 11585
- movw r3, #0x2d41
-
- ; cospi_24_64 = 6270
- movw r12, #0x187e
-
- ; step2[5] * cospi_12_64
- vmull.s16 q2, d26, d2
- vmull.s16 q3, d27, d2
-
- ; step2[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q15, d27, d3
-
- ; temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q2, d22, d3
- vmlsl.s16 q3, d23, d3
-
- ; temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q15, d23, d2
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d10, q2, #14 ; >> 14
- vrshrn.s32 d11, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q9, #14 ; >> 14
- vrshrn.s32 d13, q15, #14 ; >> 14
-
- ; stage 4
- vdup.16 d30, r3 ; cospi_16_64
-
- ; step1[0] * cospi_16_64
- vmull.s16 q2, d16, d30
- vmull.s16 q11, d17, d30
-
- ; step1[1] * cospi_16_64
- vmull.s16 q0, d24, d30
- vmull.s16 q1, d25, d30
-
- ; cospi_8_64 = 15137
- movw r3, #0x3b21
-
- vdup.16 d30, r12 ; duplicate cospi_24_64
- vdup.16 d31, r3 ; duplicate cospi_8_64
-
- ; temp1 = (step1[0] + step1[1]) * cospi_16_64
- vadd.s32 q3, q2, q0
- vadd.s32 q12, q11, q1
-
- ; temp2 = (step1[0] - step1[1]) * cospi_16_64
- vsub.s32 q13, q2, q0
- vsub.s32 q1, q11, q1
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d16, q3, #14 ; >> 14
- vrshrn.s32 d17, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d18, q13, #14 ; >> 14
- vrshrn.s32 d19, q1, #14 ; >> 14
-
- ; step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
- ; step1[2] * cospi_8_64
- vmull.s16 q0, d20, d31
- vmull.s16 q1, d21, d31
-
- ; step1[2] * cospi_24_64
- vmull.s16 q12, d20, d30
- vmull.s16 q13, d21, d30
-
- ; temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q0, d28, d30
- vmlal.s16 q1, d29, d30
-
- ; temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q12, d28, d31
- vmlsl.s16 q13, d29, d31
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d22, q0, #14 ; >> 14
- vrshrn.s32 d23, q1, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d20, q12, #14 ; >> 14
- vrshrn.s32 d21, q13, #14 ; >> 14
-
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5];
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5];
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7];
- vadd.s16 q15, q6, q7 ; step2[7] = step1[6] + step1[7];
-
- ; cospi_16_64 = 11585
- movw r3, #0x2d41
-
- ; stage 5
- vadd.s16 q0, q8, q11 ; step1[0] = step2[0] + step2[3];
- vadd.s16 q1, q9, q10 ; step1[1] = step2[1] + step2[2];
- vsub.s16 q2, q9, q10 ; step1[2] = step2[1] - step2[2];
- vsub.s16 q3, q8, q11 ; step1[3] = step2[0] - step2[3];
-
- vdup.16 d16, r3; ; duplicate cospi_16_64
-
- ; step2[5] * cospi_16_64
- vmull.s16 q11, d26, d16
- vmull.s16 q12, d27, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; temp1 = (step2[6] - step2[5]) * cospi_16_64
- vsub.s32 q6, q9, q11
- vsub.s32 q13, q10, q12
-
- ; temp2 = (step2[5] + step2[6]) * cospi_16_64
- vadd.s32 q9, q9, q11
- vadd.s32 q10, q10, q12
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d10, q6, #14 ; >> 14
- vrshrn.s32 d11, q13, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q9, #14 ; >> 14
- vrshrn.s32 d13, q10, #14 ; >> 14
-
- ; stage 6
- vadd.s16 q8, q0, q15 ; step2[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; step2[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; step2[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; step2[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; step2[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; step2[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; step2[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q15 ; step2[7] = step1[0] - step1[7];
-
- ; store the data
- vst1.64 {q8-q9}, [r1]!
- vst1.64 {q10-q11}, [r1]!
- vst1.64 {q12-q13}, [r1]!
- vst1.64 {q14-q15}, [r1]
-
- bx lr
- ENDP ; |vpx_idct16x16_256_add_neon_pass1|
-
- IF CONFIG_VP9_HIGHBITDEPTH
-;void |vpx_idct16x16_256_add_neon_pass1_tran_low|(const tran_low_t *input,
-; int16_t *output)
-;
-; r0 const tran_low_t *input
-; r1 int16_t *output
-
-|vpx_idct16x16_256_add_neon_pass1_tran_low| PROC
- LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
- LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
- LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
- LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
- LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
- vmov.s16 q15, q1
-
- b idct16x16_256_add_neon_pass1
- ENDP ; |vpx_idct16x16_256_add_neon_pass1_tran_low|
- ENDIF ; CONFIG_VP9_HIGHBITDEPTH
-
-;void vpx_idct16x16_256_add_neon_pass2(const int16_t *src,
-; int16_t *output,
-; int16_t *pass1_output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int stride)
-;
-; r0 const int16_t *src
-; r1 int16_t *output
-; r2 int16_t *pass1_output
-; r3 int16_t skip_adding
-; r4 uint8_t *dest
-; r5 int stride
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_256_add_neon_pass2| PROC
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
- vld2.s16 {q8,q9}, [r0]!
- vld2.s16 {q9,q10}, [r0]!
- vld2.s16 {q10,q11}, [r0]!
- vld2.s16 {q11,q12}, [r0]!
- vld2.s16 {q12,q13}, [r0]!
- vld2.s16 {q13,q14}, [r0]!
- vld2.s16 {q14,q15}, [r0]!
- vld2.s16 {q0,q1}, [r0]!
- vmov.s16 q15, q0;
-
-idct16x16_256_add_neon_pass2
- push {r3-r9}
-
- ; cospi_30_64 = 1606
- movw r3, #0x0646
-
- ; cospi_2_64 = 16305
- movw r12, #0x3fb1
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 d12, r3 ; duplicate cospi_30_64
- vdup.16 d13, r12 ; duplicate cospi_2_64
-
- ; preloading to avoid stall
- ; cospi_14_64 = 12665
- movw r3, #0x3179
-
- ; cospi_18_64 = 10394
- movw r12, #0x289a
-
- ; step1[8] * cospi_30_64
- vmull.s16 q2, d16, d12
- vmull.s16 q3, d17, d12
-
- ; step1[8] * cospi_2_64
- vmull.s16 q1, d16, d13
- vmull.s16 q4, d17, d13
-
- ; temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64
- vmlsl.s16 q2, d30, d13
- vmlsl.s16 q3, d31, d13
-
- ; temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64
- vmlal.s16 q1, d30, d12
- vmlal.s16 q4, d31, d12
-
- vdup.16 d30, r3 ; duplicate cospi_14_64
- vdup.16 d31, r12 ; duplicate cospi_18_64
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d0, q2, #14 ; >> 14
- vrshrn.s32 d1, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d14, q1, #14 ; >> 14
- vrshrn.s32 d15, q4, #14 ; >> 14
-
- ; preloading to avoid stall
- ; cospi_22_64 = 7723
- movw r3, #0x1e2b
-
- ; cospi_10_64 = 14449
- movw r12, #0x3871
-
- ; step1[9] * cospi_14_64
- vmull.s16 q2, d24, d30
- vmull.s16 q3, d25, d30
-
- ; step1[9] * cospi_18_64
- vmull.s16 q4, d24, d31
- vmull.s16 q5, d25, d31
-
- ; temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64
- vmlsl.s16 q2, d22, d31
- vmlsl.s16 q3, d23, d31
-
- ; temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64
- vmlal.s16 q4, d22, d30
- vmlal.s16 q5, d23, d30
-
- vdup.16 d30, r3 ; duplicate cospi_22_64
- vdup.16 d31, r12 ; duplicate cospi_10_64
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d2, q2, #14 ; >> 14
- vrshrn.s32 d3, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q4, #14 ; >> 14
- vrshrn.s32 d13, q5, #14 ; >> 14
-
- ; step1[10] * cospi_22_64
- vmull.s16 q11, d20, d30
- vmull.s16 q12, d21, d30
-
- ; step1[10] * cospi_10_64
- vmull.s16 q4, d20, d31
- vmull.s16 q5, d21, d31
-
- ; temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64
- vmlsl.s16 q11, d26, d31
- vmlsl.s16 q12, d27, d31
-
- ; temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64
- vmlal.s16 q4, d26, d30
- vmlal.s16 q5, d27, d30
-
- ; preloading to avoid stall
- ; cospi_6_64 = 15679
- movw r3, #0x3d3f
-
- ; cospi_26_64 = 4756
- movw r12, #0x1294
-
- vdup.16 d30, r3 ; duplicate cospi_6_64
- vdup.16 d31, r12 ; duplicate cospi_26_64
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d4, q11, #14 ; >> 14
- vrshrn.s32 d5, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d11, q5, #14 ; >> 14
- vrshrn.s32 d10, q4, #14 ; >> 14
-
- ; step1[11] * cospi_6_64
- vmull.s16 q10, d28, d30
- vmull.s16 q11, d29, d30
-
- ; step1[11] * cospi_26_64
- vmull.s16 q12, d28, d31
- vmull.s16 q13, d29, d31
-
- ; temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64
- vmlsl.s16 q10, d18, d31
- vmlsl.s16 q11, d19, d31
-
- ; temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64
- vmlal.s16 q12, d18, d30
- vmlal.s16 q13, d19, d30
-
- vsub.s16 q9, q0, q1 ; step1[9]=step2[8]-step2[9]
- vadd.s16 q0, q0, q1 ; step1[8]=step2[8]+step2[9]
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d6, q10, #14 ; >> 14
- vrshrn.s32 d7, q11, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d8, q12, #14 ; >> 14
- vrshrn.s32 d9, q13, #14 ; >> 14
-
- ; stage 3
- vsub.s16 q10, q3, q2 ; step1[10]=-step2[10]+step2[11]
- vadd.s16 q11, q2, q3 ; step1[11]=step2[10]+step2[11]
- vadd.s16 q12, q4, q5 ; step1[12]=step2[12]+step2[13]
- vsub.s16 q13, q4, q5 ; step1[13]=step2[12]-step2[13]
- vsub.s16 q14, q7, q6 ; step1[14]=-step2[14]+tep2[15]
- vadd.s16 q7, q6, q7 ; step1[15]=step2[14]+step2[15]
-
- ; stage 4
- ; cospi_24_64 = 6270
- movw r3, #0x187e
-
- ; cospi_8_64 = 15137
- movw r12, #0x3b21
-
- ; -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vdup.16 d30, r12 ; duplicate cospi_8_64
- vdup.16 d31, r3 ; duplicate cospi_24_64
-
- ; step1[9] * cospi_24_64
- vmull.s16 q2, d18, d31
- vmull.s16 q3, d19, d31
-
- ; step1[14] * cospi_24_64
- vmull.s16 q4, d28, d31
- vmull.s16 q5, d29, d31
-
- ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
- vmlal.s16 q2, d28, d30
- vmlal.s16 q3, d29, d30
-
- ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vmlsl.s16 q4, d18, d30
- vmlsl.s16 q5, d19, d30
-
- rsb r12, #0
- vdup.16 d30, r12 ; duplicate -cospi_8_64
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q2, #14 ; >> 14
- vrshrn.s32 d13, q3, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d2, q4, #14 ; >> 14
- vrshrn.s32 d3, q5, #14 ; >> 14
-
- vmov.s16 q3, q11
- vmov.s16 q4, q12
-
- ; - step1[13] * cospi_8_64
- vmull.s16 q11, d26, d30
- vmull.s16 q12, d27, d30
-
- ; -step1[10] * cospi_8_64
- vmull.s16 q8, d20, d30
- vmull.s16 q9, d21, d30
-
- ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlsl.s16 q11, d20, d31
- vmlsl.s16 q12, d21, d31
-
- ; temp1 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlal.s16 q8, d26, d31
- vmlal.s16 q9, d27, d31
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d4, q11, #14 ; >> 14
- vrshrn.s32 d5, q12, #14 ; >> 14
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d10, q8, #14 ; >> 14
- vrshrn.s32 d11, q9, #14 ; >> 14
-
- ; stage 5
- vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
- vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
- vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
- vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
- vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
- vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
- vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
- vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
-
- ; stage 6.
- ; cospi_16_64 = 11585
- movw r12, #0x2d41
-
- vdup.16 d14, r12 ; duplicate cospi_16_64
-
- ; step1[13] * cospi_16_64
- vmull.s16 q3, d26, d14
- vmull.s16 q4, d27, d14
-
- ; step1[10] * cospi_16_64
- vmull.s16 q0, d20, d14
- vmull.s16 q1, d21, d14
-
- ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
- vsub.s32 q5, q3, q0
- vsub.s32 q6, q4, q1
-
- ; temp2 = (step1[10] + step1[13]) * cospi_16_64
- vadd.s32 q10, q3, q0
- vadd.s32 q4, q4, q1
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d4, q5, #14 ; >> 14
- vrshrn.s32 d5, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d10, q10, #14 ; >> 14
- vrshrn.s32 d11, q4, #14 ; >> 14
-
- ; step1[11] * cospi_16_64
- vmull.s16 q0, d22, d14
- vmull.s16 q1, d23, d14
-
- ; step1[12] * cospi_16_64
- vmull.s16 q13, d24, d14
- vmull.s16 q6, d25, d14
-
- ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
- vsub.s32 q10, q13, q0
- vsub.s32 q4, q6, q1
-
- ; temp2 = (step1[11] + step1[12]) * cospi_16_64
- vadd.s32 q13, q13, q0
- vadd.s32 q6, q6, q1
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d6, q10, #14 ; >> 14
- vrshrn.s32 d7, q4, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d8, q13, #14 ; >> 14
- vrshrn.s32 d9, q6, #14 ; >> 14
-
- mov r4, #16 ; pass1_output stride
- ldr r3, [sp] ; load skip_adding
- cmp r3, #0 ; check if need adding dest data
- beq skip_adding_dest
-
- ldr r7, [sp, #28] ; dest used to save element 0-7
- mov r9, r7 ; save dest pointer for later use
- ldr r8, [sp, #32] ; load stride
-
- ; stage 7
- ; load the data in pass1
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vrshr.s16 q12, q12, #6 ; ROUND_POWER_OF_TWO
- vrshr.s16 q13, q13, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q12, q12, d12 ; + dest[j * stride + i]
- vaddw.u8 q13, q13, d13 ; + dest[j * stride + i]
- vqmovun.s16 d12, q12 ; clip pixel
- vqmovun.s16 d13, q13 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
-
- ; store the data output 8,9,10,11,12,13,14,15
- vrshr.s16 q8, q8, #6 ; ROUND_POWER_OF_TWO
- vaddw.u8 q8, q8, d12 ; + dest[j * stride + i]
- vqmovun.s16 d12, q8 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q9, q9, #6
- vaddw.u8 q9, q9, d13 ; + dest[j * stride + i]
- vqmovun.s16 d13, q9 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q2, q2, #6
- vaddw.u8 q2, q2, d12 ; + dest[j * stride + i]
- vqmovun.s16 d12, q2 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q3, q3, #6
- vaddw.u8 q3, q3, d13 ; + dest[j * stride + i]
- vqmovun.s16 d13, q3 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q4, q4, #6
- vaddw.u8 q4, q4, d12 ; + dest[j * stride + i]
- vqmovun.s16 d12, q4 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q5, q5, #6
- vaddw.u8 q5, q5, d13 ; + dest[j * stride + i]
- vqmovun.s16 d13, q5 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- vld1.64 {d13}, [r7], r8 ; load destinatoin data
- vrshr.s16 q14, q14, #6
- vaddw.u8 q14, q14, d12 ; + dest[j * stride + i]
- vqmovun.s16 d12, q14 ; clip pixel
- vst1.64 {d12}, [r9], r8 ; store the data
- vld1.64 {d12}, [r7], r8 ; load destinatoin data
- vrshr.s16 q15, q15, #6
- vaddw.u8 q15, q15, d13 ; + dest[j * stride + i]
- vqmovun.s16 d13, q15 ; clip pixel
- vst1.64 {d13}, [r9], r8 ; store the data
- b end_idct16x16_pass2
-
-skip_adding_dest
- ; stage 7
- ; load the data in pass1
- mov r5, #24
- mov r3, #8
-
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vst1.64 {d24}, [r1], r3 ; store output[0]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[1]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vst1.64 {d24}, [r1], r3 ; store output[2]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[3]
- vst1.64 {d27}, [r1], r5
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vst1.64 {d24}, [r1], r3 ; store output[4]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[5]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
- vst1.64 {d24}, [r1], r3 ; store output[6]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[7]
- vst1.64 {d27}, [r1], r5
-
- ; store the data output 8,9,10,11,12,13,14,15
- vst1.64 {d16}, [r1], r3
- vst1.64 {d17}, [r1], r5
- vst1.64 {d18}, [r1], r3
- vst1.64 {d19}, [r1], r5
- vst1.64 {d4}, [r1], r3
- vst1.64 {d5}, [r1], r5
- vst1.64 {d6}, [r1], r3
- vst1.64 {d7}, [r1], r5
- vst1.64 {d8}, [r1], r3
- vst1.64 {d9}, [r1], r5
- vst1.64 {d10}, [r1], r3
- vst1.64 {d11}, [r1], r5
- vst1.64 {d28}, [r1], r3
- vst1.64 {d29}, [r1], r5
- vst1.64 {d30}, [r1], r3
- vst1.64 {d31}, [r1], r5
-end_idct16x16_pass2
- pop {r3-r9}
- bx lr
- ENDP ; |vpx_idct16x16_256_add_neon_pass2|
-
- IF CONFIG_VP9_HIGHBITDEPTH
-;void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
-; int16_t *output,
-; int16_t *pass1_output,
-; int16_t skip_adding,
-; uint8_t *dest,
-; int stride)
-;
-; r0 const tran_low_t *src
-; r1 int16_t *output
-; r2 int16_t *pass1_output
-; r3 int16_t skip_adding
-; r4 uint8_t *dest
-; r5 int stride
-
-|vpx_idct16x16_256_add_neon_pass2_tran_low| PROC
- LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
- LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
- LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
- LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
- LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
- vmov.s16 q15, q0
-
- b idct16x16_256_add_neon_pass2
- ENDP ; |vpx_idct16x16_256_add_neon_pass2_tran_low|
- ENDIF ; CONFIG_VP9_HIGHBITDEPTH
-
-;void |vpx_idct16x16_10_add_neon_pass1|(const tran_low_t *input,
-; int16_t *output)
-;
-; r0 const tran_low_t *input
-; r1 int16_t *output
-
-; idct16 stage1 - stage6 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass1| PROC
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 0, 2, 4, 6, 8, 10, 12, 14 into q8 - q15
- LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
- LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
- LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
- LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
- LOAD_TRAN_LOW_TO_S16X2 d2, d3, d4, d5, r0
- vmov.s16 q15, q1
-
- ; cospi_28_64*2 = 6392
- movw r3, #0x18f8
-
- ; cospi_4_64*2 = 32138
- movw r12, #0x7d8a
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 q0, r3 ; duplicate cospi_28_64*2
- vdup.16 q1, r12 ; duplicate cospi_4_64*2
-
- ; The following instructions use vqrdmulh to do the
- ; dct_const_round_shift(step2[4] * cospi_28_64). vvqrdmulh will multiply,
- ; double, and return the high 16 bits, effectively giving >> 15. Doubling
- ; the constant will change this to >> 14.
- ; dct_const_round_shift(step2[4] * cospi_28_64);
- vqrdmulh.s16 q4, q9, q0
-
- ; preloading to avoid stall
- ; cospi_16_64*2 = 23170
- movw r3, #0x5a82
-
- ; dct_const_round_shift(step2[4] * cospi_4_64);
- vqrdmulh.s16 q7, q9, q1
-
- ; stage 4
- vdup.16 q1, r3 ; cospi_16_64*2
-
- ; cospi_16_64 = 11585
- movw r3, #0x2d41
-
- vdup.16 d4, r3; ; duplicate cospi_16_64
-
- ; dct_const_round_shift(step1[0] * cospi_16_64)
- vqrdmulh.s16 q8, q8, q1
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d14, d4
- vmull.s16 q10, d15, d4
-
- ; step2[5] * cospi_16_64
- vmull.s16 q12, d9, d4
- vmull.s16 q11, d8, d4
-
- ; temp1 = (step2[6] - step2[5]) * cospi_16_64
- vsub.s32 q15, q10, q12
- vsub.s32 q6, q9, q11
-
- ; temp2 = (step2[5] + step2[6]) * cospi_16_64
- vadd.s32 q9, q9, q11
- vadd.s32 q10, q10, q12
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d11, q15, #14 ; >> 14
- vrshrn.s32 d10, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q9, #14 ; >> 14
- vrshrn.s32 d13, q10, #14 ; >> 14
-
- ; stage 6
- vadd.s16 q2, q8, q7 ; step2[0] = step1[0] + step1[7];
- vadd.s16 q10, q8, q5 ; step2[2] = step1[2] + step1[5];
- vadd.s16 q11, q8, q4 ; step2[3] = step1[3] + step1[4];
- vadd.s16 q9, q8, q6 ; step2[1] = step1[1] + step1[6];
- vsub.s16 q12, q8, q4 ; step2[4] = step1[3] - step1[4];
- vsub.s16 q13, q8, q5 ; step2[5] = step1[2] - step1[5];
- vsub.s16 q14, q8, q6 ; step2[6] = step1[1] - step1[6];
- vsub.s16 q15, q8, q7 ; step2[7] = step1[0] - step1[7];
-
- ; store the data
- vst1.64 {q2}, [r1]!
- vst1.64 {q9-q10}, [r1]!
- vst1.64 {q11-q12}, [r1]!
- vst1.64 {q13-q14}, [r1]!
- vst1.64 {q15}, [r1]
-
- bx lr
- ENDP ; |vpx_idct16x16_10_add_neon_pass1|
-
-;void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
-; int16_t *pass1_output)
-;
-; r0 const tran_low_t *src
-; r1 int16_t *output
-; r2 int16_t *pass1_output
-
-; idct16 stage1 - stage7 on all the elements loaded in q8-q15. The output
-; will be stored back into q8-q15 registers. This function will touch q0-q7
-; registers and use them as buffer during calculation.
-|vpx_idct16x16_10_add_neon_pass2| PROC
- push {r3-r9}
-
- ; TODO(hkuang): Find a better way to load the elements.
- ; load elements of 1, 3, 5, 7, 9, 11, 13, 15 into q8 - q15
- LOAD_TRAN_LOW_TO_S16X2 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16X2 d18, d19, d20, d21, r0
- LOAD_TRAN_LOW_TO_S16X2 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16X2 d22, d23, d24, d25, r0
- LOAD_TRAN_LOW_TO_S16X2 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16X2 d26, d27, d28, d29, r0
- LOAD_TRAN_LOW_TO_S16X2 d28, d29, d30, d31, r0
- LOAD_TRAN_LOW_TO_S16X2 d0, d1, d2, d3, r0
- vmov.s16 q15, q0;
-
- ; 2*cospi_30_64 = 3212
- movw r3, #0x0c8c
-
- ; 2*cospi_2_64 = 32610
- movw r12, #0x7f62
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; stage 3
- vdup.16 q6, r3 ; duplicate 2*cospi_30_64
-
- ; dct_const_round_shift(step1[8] * cospi_30_64)
- vqrdmulh.s16 q0, q8, q6
-
- vdup.16 q6, r12 ; duplicate 2*cospi_2_64
-
- ; dct_const_round_shift(step1[8] * cospi_2_64)
- vqrdmulh.s16 q7, q8, q6
-
- ; preloading to avoid stall
- ; 2*cospi_26_64 = 9512
- movw r12, #0x2528
- rsb r12, #0
- vdup.16 q15, r12 ; duplicate -2*cospi_26_64
-
- ; 2*cospi_6_64 = 31358
- movw r3, #0x7a7e
- vdup.16 q14, r3 ; duplicate 2*cospi_6_64
-
- ; dct_const_round_shift(- step1[12] * cospi_26_64)
- vqrdmulh.s16 q3, q9, q15
-
- ; dct_const_round_shift(step1[12] * cospi_6_64)
- vqrdmulh.s16 q4, q9, q14
-
- ; stage 4
- ; cospi_24_64 = 6270
- movw r3, #0x187e
- vdup.16 d31, r3 ; duplicate cospi_24_64
-
- ; cospi_8_64 = 15137
- movw r12, #0x3b21
- vdup.16 d30, r12 ; duplicate cospi_8_64
-
- ; step1[14] * cospi_24_64
- vmull.s16 q12, d14, d31
- vmull.s16 q5, d15, d31
-
- ; step1[9] * cospi_24_64
- vmull.s16 q2, d0, d31
- vmull.s16 q11, d1, d31
-
- ; temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64
- vmlsl.s16 q12, d0, d30
- vmlsl.s16 q5, d1, d30
-
- ; temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64
- vmlal.s16 q2, d14, d30
- vmlal.s16 q11, d15, d30
-
- rsb r12, #0
- vdup.16 d30, r12 ; duplicate -cospi_8_64
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d2, q12, #14 ; >> 14
- vrshrn.s32 d3, q5, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d12, q2, #14 ; >> 14
- vrshrn.s32 d13, q11, #14 ; >> 14
-
- ; - step1[13] * cospi_8_64
- vmull.s16 q10, d8, d30
- vmull.s16 q13, d9, d30
-
- ; -step1[10] * cospi_8_64
- vmull.s16 q8, d6, d30
- vmull.s16 q9, d7, d30
-
- ; temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64
- vmlsl.s16 q10, d6, d31
- vmlsl.s16 q13, d7, d31
-
- ; temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64
- vmlal.s16 q8, d8, d31
- vmlal.s16 q9, d9, d31
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d4, q10, #14 ; >> 14
- vrshrn.s32 d5, q13, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d10, q8, #14 ; >> 14
- vrshrn.s32 d11, q9, #14 ; >> 14
-
- ; stage 5
- vadd.s16 q8, q0, q3 ; step1[8] = step2[8]+step2[11];
- vadd.s16 q9, q1, q2 ; step1[9] = step2[9]+step2[10];
- vsub.s16 q10, q1, q2 ; step1[10] = step2[9]-step2[10];
- vsub.s16 q11, q0, q3 ; step1[11] = step2[8]-step2[11];
- vsub.s16 q12, q7, q4 ; step1[12] =-step2[12]+step2[15];
- vsub.s16 q13, q6, q5 ; step1[13] =-step2[13]+step2[14];
- vadd.s16 q14, q6, q5 ; step1[14] =step2[13]+step2[14];
- vadd.s16 q15, q7, q4 ; step1[15] =step2[12]+step2[15];
-
- ; stage 6.
- ; cospi_16_64 = 11585
- movw r12, #0x2d41
-
- vdup.16 d14, r12 ; duplicate cospi_16_64
-
- ; step1[13] * cospi_16_64
- vmull.s16 q3, d26, d14
- vmull.s16 q4, d27, d14
-
- ; step1[10] * cospi_16_64
- vmull.s16 q0, d20, d14
- vmull.s16 q1, d21, d14
-
- ; temp1 = (-step1[10] + step1[13]) * cospi_16_64
- vsub.s32 q5, q3, q0
- vsub.s32 q6, q4, q1
-
- ; temp2 = (step1[10] + step1[13]) * cospi_16_64
- vadd.s32 q0, q3, q0
- vadd.s32 q1, q4, q1
-
- ; dct_const_round_shift(temp1)
- vrshrn.s32 d4, q5, #14 ; >> 14
- vrshrn.s32 d5, q6, #14 ; >> 14
-
- ; dct_const_round_shift(temp2)
- vrshrn.s32 d10, q0, #14 ; >> 14
- vrshrn.s32 d11, q1, #14 ; >> 14
-
- ; step1[11] * cospi_16_64
- vmull.s16 q0, d22, d14
- vmull.s16 q1, d23, d14
-
- ; step1[12] * cospi_16_64
- vmull.s16 q13, d24, d14
- vmull.s16 q6, d25, d14
-
- ; temp1 = (-step1[11] + step1[12]) * cospi_16_64
- vsub.s32 q10, q13, q0
- vsub.s32 q4, q6, q1
-
- ; temp2 = (step1[11] + step1[12]) * cospi_16_64
- vadd.s32 q13, q13, q0
- vadd.s32 q6, q6, q1
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d6, q10, #14 ; >> 14
- vrshrn.s32 d7, q4, #14 ; >> 14
-
- ; dct_const_round_shift((step1[11] + step1[12]) * cospi_16_64);
- vrshrn.s32 d8, q13, #14 ; >> 14
- vrshrn.s32 d9, q6, #14 ; >> 14
-
- mov r4, #16 ; pass1_output stride
- ldr r3, [sp] ; load skip_adding
-
- ; stage 7
- ; load the data in pass1
- mov r5, #24
- mov r3, #8
-
- vld1.s16 {q0}, [r2], r4 ; load data step2[0]
- vld1.s16 {q1}, [r2], r4 ; load data step2[1]
- vadd.s16 q12, q0, q15 ; step2[0] + step2[15]
- vadd.s16 q13, q1, q14 ; step2[1] + step2[14]
- vld1.s16 {q10}, [r2], r4 ; load data step2[2]
- vld1.s16 {q11}, [r2], r4 ; load data step2[3]
- vst1.64 {d24}, [r1], r3 ; store output[0]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[1]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q5 ; step2[2] + step2[13]
- vadd.s16 q13, q11, q4 ; step2[3] + step2[12]
- vsub.s16 q14, q1, q14 ; step2[1] - step2[14]
- vsub.s16 q15, q0, q15 ; step2[0] - step2[15]
- vst1.64 {d24}, [r1], r3 ; store output[2]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[3]
- vst1.64 {d27}, [r1], r5
- vsub.s16 q4, q11, q4 ; step2[3] - step2[12]
- vsub.s16 q5, q10, q5 ; step2[2] - step2[13]
- vld1.s16 {q0}, [r2], r4 ; load data step2[4]
- vld1.s16 {q1}, [r2], r4 ; load data step2[5]
- vadd.s16 q12, q0, q3 ; step2[4] + step2[11]
- vadd.s16 q13, q1, q2 ; step2[5] + step2[10]
- vld1.s16 {q10}, [r2], r4 ; load data step2[6]
- vld1.s16 {q11}, [r2], r4 ; load data step2[7]
- vst1.64 {d24}, [r1], r3 ; store output[4]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[5]
- vst1.64 {d27}, [r1], r5
- vadd.s16 q12, q10, q9 ; step2[6] + step2[9]
- vadd.s16 q13, q11, q8 ; step2[7] + step2[8]
- vsub.s16 q2, q1, q2 ; step2[5] - step2[10]
- vsub.s16 q3, q0, q3 ; step2[4] - step2[11]
- vsub.s16 q8, q11, q8 ; step2[7] - step2[8]
- vsub.s16 q9, q10, q9 ; step2[6] - step2[9]
- vst1.64 {d24}, [r1], r3 ; store output[6]
- vst1.64 {d25}, [r1], r5
- vst1.64 {d26}, [r1], r3 ; store output[7]
- vst1.64 {d27}, [r1], r5
-
- ; store the data output 8,9,10,11,12,13,14,15
- vst1.64 {d16}, [r1], r3
- vst1.64 {d17}, [r1], r5
- vst1.64 {d18}, [r1], r3
- vst1.64 {d19}, [r1], r5
- vst1.64 {d4}, [r1], r3
- vst1.64 {d5}, [r1], r5
- vst1.64 {d6}, [r1], r3
- vst1.64 {d7}, [r1], r5
- vst1.64 {d8}, [r1], r3
- vst1.64 {d9}, [r1], r5
- vst1.64 {d10}, [r1], r3
- vst1.64 {d11}, [r1], r5
- vst1.64 {d28}, [r1], r3
- vst1.64 {d29}, [r1], r5
- vst1.64 {d30}, [r1], r3
- vst1.64 {d31}, [r1], r5
-end_idct10_16x16_pass2
- pop {r3-r9}
- bx lr
- ENDP ; |vpx_idct16x16_10_add_neon_pass2|
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
index 0c891919b76..4259cd8cadc 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_add_neon.c
@@ -14,106 +14,10 @@
#include "vpx_dsp/arm/idct_neon.h"
#include "vpx_dsp/txfm_common.h"
-#if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void idct16x16_256_add_load_tran_low_kernel(
- const tran_low_t **input, int16_t **out) {
- int16x8_t s;
-
- s = load_tran_low_to_s16q(*input);
- vst1q_s16(*out, s);
- *input += 8;
- *out += 8;
-}
-
-static INLINE void idct16x16_256_add_load_tran_low(const tran_low_t *input,
- int16_t *out) {
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
- idct16x16_256_add_load_tran_low_kernel(&input, &out);
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
static INLINE void wrap_low_4x2(const int32x4_t *const t32, int16x4_t *const d0,
int16x4_t *const d1) {
- *d0 = vrshrn_n_s32(t32[0], 14);
- *d1 = vrshrn_n_s32(t32[1], 14);
-}
-
-static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_2_30_10_22,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
- t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
- t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
- t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
- t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
-}
-
-static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_4_12_20N_28,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
- t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
- t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
- t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
- t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
-}
-
-static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_6_26_14_18N,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
- t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
- t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
- t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
- t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
+ *d0 = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ *d1 = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
}
static INLINE void idct_cospi_8_24_d_kernel(const int16x4_t s0,
@@ -146,54 +50,6 @@ static INLINE void idct_cospi_8_24_neg_d(const int16x4_t s0, const int16x4_t s1,
wrap_low_4x2(t32, d0, d1);
}
-static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_2_30_10_22,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
- t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
- t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
- t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
- t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
-}
-
-static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_4_12_20N_28,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
- t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
- t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
- t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
- t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
-}
-
-static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
- const int16x4_t cospi_6_26_14_18N,
- int16x8_t *const d0, int16x8_t *const d1) {
- int32x4_t t32[6];
-
- t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
- t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
- t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
- t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
- t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
- t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
- t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
- t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
- idct16x16_add_wrap_low_8x2(t32, d0, d1);
-}
-
static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
const int16x4_t cospi_0_8_16_24,
int16x4_t *const d0,
@@ -206,8 +62,68 @@ static INLINE void idct_cospi_16_16_d(const int16x4_t s0, const int16x4_t s1,
wrap_low_4x2(t32, d0, d1);
}
-static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
- uint8_t *dest, int stride) {
+static INLINE void idct16x16_add_store(const int16x8_t *const out,
+ uint8_t *dest, const int stride) {
+ // Add the result to dest
+ idct16x16_add8x1(out[0], &dest, stride);
+ idct16x16_add8x1(out[1], &dest, stride);
+ idct16x16_add8x1(out[2], &dest, stride);
+ idct16x16_add8x1(out[3], &dest, stride);
+ idct16x16_add8x1(out[4], &dest, stride);
+ idct16x16_add8x1(out[5], &dest, stride);
+ idct16x16_add8x1(out[6], &dest, stride);
+ idct16x16_add8x1(out[7], &dest, stride);
+ idct16x16_add8x1(out[8], &dest, stride);
+ idct16x16_add8x1(out[9], &dest, stride);
+ idct16x16_add8x1(out[10], &dest, stride);
+ idct16x16_add8x1(out[11], &dest, stride);
+ idct16x16_add8x1(out[12], &dest, stride);
+ idct16x16_add8x1(out[13], &dest, stride);
+ idct16x16_add8x1(out[14], &dest, stride);
+ idct16x16_add8x1(out[15], &dest, stride);
+}
+
+static INLINE void idct16x16_add_store_bd8(int16x8_t *const out, uint16_t *dest,
+ const int stride) {
+ // Add the result to dest
+ const int16x8_t max = vdupq_n_s16((1 << 8) - 1);
+ out[0] = vrshrq_n_s16(out[0], 6);
+ out[1] = vrshrq_n_s16(out[1], 6);
+ out[2] = vrshrq_n_s16(out[2], 6);
+ out[3] = vrshrq_n_s16(out[3], 6);
+ out[4] = vrshrq_n_s16(out[4], 6);
+ out[5] = vrshrq_n_s16(out[5], 6);
+ out[6] = vrshrq_n_s16(out[6], 6);
+ out[7] = vrshrq_n_s16(out[7], 6);
+ out[8] = vrshrq_n_s16(out[8], 6);
+ out[9] = vrshrq_n_s16(out[9], 6);
+ out[10] = vrshrq_n_s16(out[10], 6);
+ out[11] = vrshrq_n_s16(out[11], 6);
+ out[12] = vrshrq_n_s16(out[12], 6);
+ out[13] = vrshrq_n_s16(out[13], 6);
+ out[14] = vrshrq_n_s16(out[14], 6);
+ out[15] = vrshrq_n_s16(out[15], 6);
+ highbd_idct16x16_add8x1(out[0], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[1], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[2], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[3], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[4], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[5], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[6], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[7], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[8], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[9], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[10], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[11], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[12], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[13], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[14], max, &dest, stride);
+ highbd_idct16x16_add8x1(out[15], max, &dest, stride);
+}
+
+void idct16x16_256_add_half1d(const void *const input, int16_t *output,
+ void *const dest, const int stride,
+ const int highbd_flag) {
const int16x8_t cospis0 = vld1q_s16(kCospi);
const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
@@ -217,37 +133,73 @@ static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
int16x8_t in[16], step1[16], step2[16], out[16];
// Load input (16x8)
- in[0] = vld1q_s16(input);
- input += 8;
- in[8] = vld1q_s16(input);
- input += 8;
- in[1] = vld1q_s16(input);
- input += 8;
- in[9] = vld1q_s16(input);
- input += 8;
- in[2] = vld1q_s16(input);
- input += 8;
- in[10] = vld1q_s16(input);
- input += 8;
- in[3] = vld1q_s16(input);
- input += 8;
- in[11] = vld1q_s16(input);
- input += 8;
- in[4] = vld1q_s16(input);
- input += 8;
- in[12] = vld1q_s16(input);
- input += 8;
- in[5] = vld1q_s16(input);
- input += 8;
- in[13] = vld1q_s16(input);
- input += 8;
- in[6] = vld1q_s16(input);
- input += 8;
- in[14] = vld1q_s16(input);
- input += 8;
- in[7] = vld1q_s16(input);
- input += 8;
- in[15] = vld1q_s16(input);
+ if (output) {
+ const tran_low_t *inputT = (const tran_low_t *)input;
+ in[0] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[8] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[1] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[9] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[2] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[10] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[3] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[11] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[4] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[12] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[5] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[13] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[6] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[14] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[7] = load_tran_low_to_s16q(inputT);
+ inputT += 8;
+ in[15] = load_tran_low_to_s16q(inputT);
+ } else {
+ const int16_t *inputT = (const int16_t *)input;
+ in[0] = vld1q_s16(inputT);
+ inputT += 8;
+ in[8] = vld1q_s16(inputT);
+ inputT += 8;
+ in[1] = vld1q_s16(inputT);
+ inputT += 8;
+ in[9] = vld1q_s16(inputT);
+ inputT += 8;
+ in[2] = vld1q_s16(inputT);
+ inputT += 8;
+ in[10] = vld1q_s16(inputT);
+ inputT += 8;
+ in[3] = vld1q_s16(inputT);
+ inputT += 8;
+ in[11] = vld1q_s16(inputT);
+ inputT += 8;
+ in[4] = vld1q_s16(inputT);
+ inputT += 8;
+ in[12] = vld1q_s16(inputT);
+ inputT += 8;
+ in[5] = vld1q_s16(inputT);
+ inputT += 8;
+ in[13] = vld1q_s16(inputT);
+ inputT += 8;
+ in[6] = vld1q_s16(inputT);
+ inputT += 8;
+ in[14] = vld1q_s16(inputT);
+ inputT += 8;
+ in[7] = vld1q_s16(inputT);
+ inputT += 8;
+ in[15] = vld1q_s16(inputT);
+ }
// Transpose
transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
@@ -358,79 +310,181 @@ static void idct16x16_256_add_half1d(const int16_t *input, int16_t *output,
step2[15] = step1[15];
// stage 7
- out[0] = vaddq_s16(step2[0], step2[15]);
- out[1] = vaddq_s16(step2[1], step2[14]);
- out[2] = vaddq_s16(step2[2], step2[13]);
- out[3] = vaddq_s16(step2[3], step2[12]);
- out[4] = vaddq_s16(step2[4], step2[11]);
- out[5] = vaddq_s16(step2[5], step2[10]);
- out[6] = vaddq_s16(step2[6], step2[9]);
- out[7] = vaddq_s16(step2[7], step2[8]);
- out[8] = vsubq_s16(step2[7], step2[8]);
- out[9] = vsubq_s16(step2[6], step2[9]);
- out[10] = vsubq_s16(step2[5], step2[10]);
- out[11] = vsubq_s16(step2[4], step2[11]);
- out[12] = vsubq_s16(step2[3], step2[12]);
- out[13] = vsubq_s16(step2[2], step2[13]);
- out[14] = vsubq_s16(step2[1], step2[14]);
- out[15] = vsubq_s16(step2[0], step2[15]);
+ idct16x16_add_stage7(step2, out);
if (output) {
- // pass 1: save the result into output
- vst1q_s16(output, out[0]);
- output += 16;
- vst1q_s16(output, out[1]);
- output += 16;
- vst1q_s16(output, out[2]);
- output += 16;
- vst1q_s16(output, out[3]);
- output += 16;
- vst1q_s16(output, out[4]);
- output += 16;
- vst1q_s16(output, out[5]);
- output += 16;
- vst1q_s16(output, out[6]);
- output += 16;
- vst1q_s16(output, out[7]);
- output += 16;
- vst1q_s16(output, out[8]);
- output += 16;
- vst1q_s16(output, out[9]);
- output += 16;
- vst1q_s16(output, out[10]);
- output += 16;
- vst1q_s16(output, out[11]);
- output += 16;
- vst1q_s16(output, out[12]);
- output += 16;
- vst1q_s16(output, out[13]);
- output += 16;
- vst1q_s16(output, out[14]);
- output += 16;
- vst1q_s16(output, out[15]);
+ idct16x16_store_pass1(out, output);
} else {
- // pass 2: add the result to dest.
- idct16x16_add8x1(out[0], &dest, stride);
- idct16x16_add8x1(out[1], &dest, stride);
- idct16x16_add8x1(out[2], &dest, stride);
- idct16x16_add8x1(out[3], &dest, stride);
- idct16x16_add8x1(out[4], &dest, stride);
- idct16x16_add8x1(out[5], &dest, stride);
- idct16x16_add8x1(out[6], &dest, stride);
- idct16x16_add8x1(out[7], &dest, stride);
- idct16x16_add8x1(out[8], &dest, stride);
- idct16x16_add8x1(out[9], &dest, stride);
- idct16x16_add8x1(out[10], &dest, stride);
- idct16x16_add8x1(out[11], &dest, stride);
- idct16x16_add8x1(out[12], &dest, stride);
- idct16x16_add8x1(out[13], &dest, stride);
- idct16x16_add8x1(out[14], &dest, stride);
- idct16x16_add8x1(out[15], &dest, stride);
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
}
}
-static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
- int16_t *output) {
+void idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag) {
+ const int16x8_t cospis0 = vld1q_s16(kCospi);
+ const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
+ const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
+ const int16x8_t cospisd1 = vaddq_s16(cospis1, cospis1);
+ const int16x4_t cospi_0_8_16_24 = vget_low_s16(cospis0);
+ const int16x4_t cospid_0_8_16_24 = vget_low_s16(cospisd0);
+ const int16x4_t cospid_4_12_20N_28 = vget_high_s16(cospisd0);
+ const int16x4_t cospid_2_30_10_22 = vget_low_s16(cospisd1);
+ const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
+ int16x8_t in[8], step1[16], step2[16], out[16];
+
+ // Load input (8x8)
+ if (output) {
+ const tran_low_t *inputT = (const tran_low_t *)input;
+ in[0] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[1] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[2] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[3] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[4] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[5] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[6] = load_tran_low_to_s16q(inputT);
+ inputT += 16;
+ in[7] = load_tran_low_to_s16q(inputT);
+ } else {
+ const int16_t *inputT = (const int16_t *)input;
+ in[0] = vld1q_s16(inputT);
+ inputT += 16;
+ in[1] = vld1q_s16(inputT);
+ inputT += 16;
+ in[2] = vld1q_s16(inputT);
+ inputT += 16;
+ in[3] = vld1q_s16(inputT);
+ inputT += 16;
+ in[4] = vld1q_s16(inputT);
+ inputT += 16;
+ in[5] = vld1q_s16(inputT);
+ inputT += 16;
+ in[6] = vld1q_s16(inputT);
+ inputT += 16;
+ in[7] = vld1q_s16(inputT);
+ }
+
+ // Transpose
+ transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+ &in[7]);
+
+ // stage 1
+ step1[0] = in[0 / 2];
+ step1[2] = in[8 / 2];
+ step1[4] = in[4 / 2];
+ step1[6] = in[12 / 2];
+ step1[8] = in[2 / 2];
+ step1[10] = in[10 / 2];
+ step1[12] = in[6 / 2];
+ step1[14] = in[14 / 2]; // 0 in pass 1
+
+ // stage 2
+ step2[0] = step1[0];
+ step2[2] = step1[2];
+ step2[4] = step1[4];
+ step2[6] = step1[6];
+ step2[8] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 1);
+ step2[9] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 3);
+ step2[10] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 3);
+ step2[11] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 1);
+ step2[12] = vqrdmulhq_lane_s16(step1[12], cospid_6_26_14_18N, 0);
+ step2[13] = vqrdmulhq_lane_s16(step1[10], cospid_2_30_10_22, 2);
+ step2[14] = vqrdmulhq_lane_s16(step1[14], cospid_6_26_14_18N, 2);
+ step2[15] = vqrdmulhq_lane_s16(step1[8], cospid_2_30_10_22, 0);
+
+ // stage 3
+ step1[0] = step2[0];
+ step1[2] = step2[2];
+ step1[4] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 3);
+ step1[5] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 2);
+ step1[6] = vqrdmulhq_lane_s16(step2[6], cospid_4_12_20N_28, 1);
+ step1[7] = vqrdmulhq_lane_s16(step2[4], cospid_4_12_20N_28, 0);
+ step1[8] = vaddq_s16(step2[8], step2[9]);
+ step1[9] = vsubq_s16(step2[8], step2[9]);
+ step1[10] = vsubq_s16(step2[11], step2[10]);
+ step1[11] = vaddq_s16(step2[11], step2[10]);
+ step1[12] = vaddq_s16(step2[12], step2[13]);
+ step1[13] = vsubq_s16(step2[12], step2[13]);
+ step1[14] = vsubq_s16(step2[15], step2[14]);
+ step1[15] = vaddq_s16(step2[15], step2[14]);
+
+ // stage 4
+ step2[0] = step2[1] = vqrdmulhq_lane_s16(step1[0], cospid_0_8_16_24, 2);
+ step2[2] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 3);
+ step2[3] = vqrdmulhq_lane_s16(step1[2], cospid_0_8_16_24, 1);
+ step2[4] = vaddq_s16(step1[4], step1[5]);
+ step2[5] = vsubq_s16(step1[4], step1[5]);
+ step2[6] = vsubq_s16(step1[7], step1[6]);
+ step2[7] = vaddq_s16(step1[7], step1[6]);
+ step2[8] = step1[8];
+ idct_cospi_8_24_q(step1[14], step1[9], cospi_0_8_16_24, &step2[9],
+ &step2[14]);
+ idct_cospi_8_24_neg_q(step1[13], step1[10], cospi_0_8_16_24, &step2[13],
+ &step2[10]);
+ step2[11] = step1[11];
+ step2[12] = step1[12];
+ step2[15] = step1[15];
+
+ // stage 5
+ step1[0] = vaddq_s16(step2[0], step2[3]);
+ step1[1] = vaddq_s16(step2[1], step2[2]);
+ step1[2] = vsubq_s16(step2[1], step2[2]);
+ step1[3] = vsubq_s16(step2[0], step2[3]);
+ step1[4] = step2[4];
+ idct_cospi_16_16_q(step2[5], step2[6], cospi_0_8_16_24, &step1[5], &step1[6]);
+ step1[7] = step2[7];
+ step1[8] = vaddq_s16(step2[8], step2[11]);
+ step1[9] = vaddq_s16(step2[9], step2[10]);
+ step1[10] = vsubq_s16(step2[9], step2[10]);
+ step1[11] = vsubq_s16(step2[8], step2[11]);
+ step1[12] = vsubq_s16(step2[15], step2[12]);
+ step1[13] = vsubq_s16(step2[14], step2[13]);
+ step1[14] = vaddq_s16(step2[14], step2[13]);
+ step1[15] = vaddq_s16(step2[15], step2[12]);
+
+ // stage 6
+ step2[0] = vaddq_s16(step1[0], step1[7]);
+ step2[1] = vaddq_s16(step1[1], step1[6]);
+ step2[2] = vaddq_s16(step1[2], step1[5]);
+ step2[3] = vaddq_s16(step1[3], step1[4]);
+ step2[4] = vsubq_s16(step1[3], step1[4]);
+ step2[5] = vsubq_s16(step1[2], step1[5]);
+ step2[6] = vsubq_s16(step1[1], step1[6]);
+ step2[7] = vsubq_s16(step1[0], step1[7]);
+ idct_cospi_16_16_q(step1[10], step1[13], cospi_0_8_16_24, &step2[10],
+ &step2[13]);
+ idct_cospi_16_16_q(step1[11], step1[12], cospi_0_8_16_24, &step2[11],
+ &step2[12]);
+ step2[8] = step1[8];
+ step2[9] = step1[9];
+ step2[14] = step1[14];
+ step2[15] = step1[15];
+
+ // stage 7
+ idct16x16_add_stage7(step2, out);
+
+ if (output) {
+ idct16x16_store_pass1(out, output);
+ } else {
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
+ }
+}
+
+void idct16x16_10_add_half1d_pass1(const tran_low_t *input, int16_t *output) {
const int16x8_t cospis0 = vld1q_s16(kCospi);
const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
@@ -442,8 +496,7 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
const int16x4_t cospid_6_26_14_18N = vget_high_s16(cospisd1);
int16x4_t in[4], step1[16], step2[16], out[16];
-// Load input (4x4)
-#if CONFIG_VP9_HIGHBITDEPTH
+ // Load input (4x4)
in[0] = load_tran_low_to_s16d(input);
input += 16;
in[1] = load_tran_low_to_s16d(input);
@@ -451,15 +504,6 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
in[2] = load_tran_low_to_s16d(input);
input += 16;
in[3] = load_tran_low_to_s16d(input);
-#else
- in[0] = vld1_s16(input);
- input += 16;
- in[1] = vld1_s16(input);
- input += 16;
- in[2] = vld1_s16(input);
- input += 16;
- in[3] = vld1_s16(input);
-#endif // CONFIG_VP9_HIGHBITDEPTH
// Transpose
transpose_s16_4x4d(&in[0], &in[1], &in[2], &in[3]);
@@ -593,8 +637,9 @@ static void idct16x16_10_add_half1d_pass1(const tran_low_t *input,
vst1_s16(output, out[15]);
}
-static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
- uint8_t *dest, int stride) {
+void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag) {
const int16x8_t cospis0 = vld1q_s16(kCospi);
const int16x8_t cospis1 = vld1q_s16(kCospi + 8);
const int16x8_t cospisd0 = vaddq_s16(cospis0, cospis0);
@@ -706,74 +751,16 @@ static void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *output,
step2[15] = step1[15];
// stage 7
- out[0] = vaddq_s16(step2[0], step2[15]);
- out[1] = vaddq_s16(step2[1], step2[14]);
- out[2] = vaddq_s16(step2[2], step2[13]);
- out[3] = vaddq_s16(step2[3], step2[12]);
- out[4] = vaddq_s16(step2[4], step2[11]);
- out[5] = vaddq_s16(step2[5], step2[10]);
- out[6] = vaddq_s16(step2[6], step2[9]);
- out[7] = vaddq_s16(step2[7], step2[8]);
- out[8] = vsubq_s16(step2[7], step2[8]);
- out[9] = vsubq_s16(step2[6], step2[9]);
- out[10] = vsubq_s16(step2[5], step2[10]);
- out[11] = vsubq_s16(step2[4], step2[11]);
- out[12] = vsubq_s16(step2[3], step2[12]);
- out[13] = vsubq_s16(step2[2], step2[13]);
- out[14] = vsubq_s16(step2[1], step2[14]);
- out[15] = vsubq_s16(step2[0], step2[15]);
+ idct16x16_add_stage7(step2, out);
if (output) {
- // pass 1: save the result into output
- vst1q_s16(output, out[0]);
- output += 16;
- vst1q_s16(output, out[1]);
- output += 16;
- vst1q_s16(output, out[2]);
- output += 16;
- vst1q_s16(output, out[3]);
- output += 16;
- vst1q_s16(output, out[4]);
- output += 16;
- vst1q_s16(output, out[5]);
- output += 16;
- vst1q_s16(output, out[6]);
- output += 16;
- vst1q_s16(output, out[7]);
- output += 16;
- vst1q_s16(output, out[8]);
- output += 16;
- vst1q_s16(output, out[9]);
- output += 16;
- vst1q_s16(output, out[10]);
- output += 16;
- vst1q_s16(output, out[11]);
- output += 16;
- vst1q_s16(output, out[12]);
- output += 16;
- vst1q_s16(output, out[13]);
- output += 16;
- vst1q_s16(output, out[14]);
- output += 16;
- vst1q_s16(output, out[15]);
+ idct16x16_store_pass1(out, output);
} else {
- // pass 2: add the result to dest.
- idct16x16_add8x1(out[0], &dest, stride);
- idct16x16_add8x1(out[1], &dest, stride);
- idct16x16_add8x1(out[2], &dest, stride);
- idct16x16_add8x1(out[3], &dest, stride);
- idct16x16_add8x1(out[4], &dest, stride);
- idct16x16_add8x1(out[5], &dest, stride);
- idct16x16_add8x1(out[6], &dest, stride);
- idct16x16_add8x1(out[7], &dest, stride);
- idct16x16_add8x1(out[8], &dest, stride);
- idct16x16_add8x1(out[9], &dest, stride);
- idct16x16_add8x1(out[10], &dest, stride);
- idct16x16_add8x1(out[11], &dest, stride);
- idct16x16_add8x1(out[12], &dest, stride);
- idct16x16_add8x1(out[13], &dest, stride);
- idct16x16_add8x1(out[14], &dest, stride);
- idct16x16_add8x1(out[15], &dest, stride);
+ if (highbd_flag) {
+ idct16x16_add_store_bd8(out, dest, stride);
+ } else {
+ idct16x16_add_store(out, dest, stride);
+ }
}
}
@@ -781,27 +768,36 @@ void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
int stride) {
int16_t row_idct_output[16 * 16];
-#if CONFIG_VP9_HIGHBITDEPTH
- int16_t pass1_input[16 * 16];
- idct16x16_256_add_load_tran_low(input, pass1_input);
-#else
- const int16_t *pass1_input = input;
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
// pass 1
// Parallel idct on the upper 8 rows
- idct16x16_256_add_half1d(pass1_input, row_idct_output, dest, stride);
+ idct16x16_256_add_half1d(input, row_idct_output, dest, stride, 0);
// Parallel idct on the lower 8 rows
- idct16x16_256_add_half1d(pass1_input + 8 * 16, row_idct_output + 8, dest,
- stride);
+ idct16x16_256_add_half1d(input + 8 * 16, row_idct_output + 8, dest, stride,
+ 0);
+
+ // pass 2
+ // Parallel idct to get the left 8 columns
+ idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride, 0);
+
+ // Parallel idct to get the right 8 columns
+ idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0);
+}
+
+void vpx_idct16x16_38_add_neon(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16_t row_idct_output[16 * 16];
+
+ // pass 1
+ // Parallel idct on the upper 8 rows
+ idct16x16_38_add_half1d(input, row_idct_output, dest, stride, 0);
// pass 2
// Parallel idct to get the left 8 columns
- idct16x16_256_add_half1d(row_idct_output, NULL, dest, stride);
+ idct16x16_38_add_half1d(row_idct_output, NULL, dest, stride, 0);
// Parallel idct to get the right 8 columns
- idct16x16_256_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride);
+ idct16x16_38_add_half1d(row_idct_output + 16 * 8, NULL, dest + 8, stride, 0);
}
void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
@@ -814,9 +810,9 @@ void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
// pass 2
// Parallel idct to get the left 8 columns
- idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride);
+ idct16x16_10_add_half1d_pass2(row_idct_output, NULL, dest, stride, 0);
// Parallel idct to get the right 8 columns
- idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8,
- stride);
+ idct16x16_10_add_half1d_pass2(row_idct_output + 4 * 8, NULL, dest + 8, stride,
+ 0);
}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c
deleted file mode 100644
index 47366bcb7d6..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct16x16_neon.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-void vpx_idct16x16_256_add_neon_pass1(const int16_t *input, int16_t *output);
-void vpx_idct16x16_256_add_neon_pass2(const int16_t *src, int16_t *output,
- int16_t *pass1_output,
- int16_t skip_adding, uint8_t *dest,
- int stride);
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_idct16x16_256_add_neon_pass1_tran_low(const tran_low_t *input,
- int16_t *output);
-void vpx_idct16x16_256_add_neon_pass2_tran_low(const tran_low_t *src,
- int16_t *output,
- int16_t *pass1_output,
- int16_t skip_adding,
- uint8_t *dest, int stride);
-#else
-#define vpx_idct16x16_256_add_neon_pass1_tran_low \
- vpx_idct16x16_256_add_neon_pass1
-#define vpx_idct16x16_256_add_neon_pass2_tran_low \
- vpx_idct16x16_256_add_neon_pass2
-#endif
-
-void vpx_idct16x16_10_add_neon_pass1(const tran_low_t *input, int16_t *output);
-void vpx_idct16x16_10_add_neon_pass2(const tran_low_t *src, int16_t *output,
- int16_t *pass1_output);
-
-#if HAVE_NEON_ASM
-/* For ARM NEON, d8-d15 are callee-saved registers, and need to be saved. */
-extern void vpx_push_neon(int64_t *store);
-extern void vpx_pop_neon(int64_t *store);
-#endif // HAVE_NEON_ASM
-
-void vpx_idct16x16_256_add_neon(const tran_low_t *input, uint8_t *dest,
- int stride) {
-#if HAVE_NEON_ASM
- int64_t store_reg[8];
-#endif
- int16_t pass1_output[16 * 16] = { 0 };
- int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
- // save d8-d15 register values.
- vpx_push_neon(store_reg);
-#endif
-
- /* Parallel idct on the upper 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1_tran_low(input, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2_tran_low(input + 1, row_idct_output,
- pass1_output, 0, dest, stride);
-
- /* Parallel idct on the lower 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1_tran_low(input + 8 * 16, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_256_add_neon_pass2_tran_low(
- input + 8 * 16 + 1, row_idct_output + 8, pass1_output, 0, dest, stride);
-
- /* Parallel idct on the left 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, stride);
-
- /* Parallel idct on the right 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
- row_idct_output + 8, pass1_output, 1,
- dest + 8, stride);
-
-#if HAVE_NEON_ASM
- // restore d8-d15 register values.
- vpx_pop_neon(store_reg);
-#endif
-}
-
-void vpx_idct16x16_10_add_neon(const tran_low_t *input, uint8_t *dest,
- int stride) {
-#if HAVE_NEON_ASM
- int64_t store_reg[8];
-#endif
- int16_t pass1_output[16 * 16] = { 0 };
- int16_t row_idct_output[16 * 16] = { 0 };
-
-#if HAVE_NEON_ASM
- // save d8-d15 register values.
- vpx_push_neon(store_reg);
-#endif
-
- /* Parallel idct on the upper 8 rows */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_10_add_neon_pass1(input, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7
- // which will be saved into row_idct_output.
- vpx_idct16x16_10_add_neon_pass2(input + 1, row_idct_output, pass1_output);
-
- /* Skip Parallel idct on the lower 8 rows as they are all 0s */
-
- /* Parallel idct on the left 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output + 1, row_idct_output,
- pass1_output, 1, dest, stride);
-
- /* Parallel idct on the right 8 columns */
- // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
- // stage 6 result in pass1_output.
- vpx_idct16x16_256_add_neon_pass1(row_idct_output + 8 * 16, pass1_output);
-
- // Second pass processes odd elements 1, 3, 5, 7, 9, 11, 13, 15 and combines
- // with result in pass1(pass1_output) to calculate final result in stage 7.
- // Then add the result to the destination data.
- vpx_idct16x16_256_add_neon_pass2(row_idct_output + 8 * 16 + 1,
- row_idct_output + 8, pass1_output, 1,
- dest + 8, stride);
-
-#if HAVE_NEON_ASM
- // restore d8-d15 register values.
- vpx_pop_neon(store_reg);
-#endif
-}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
index de1bf978750..ae9457e18ee 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct32x32_add_neon.c
@@ -147,8 +147,10 @@ static INLINE void DO_BUTTERFLY(int16x8_t q14s16, int16x8_t q13s16,
q11s32 = vaddq_s32(q12s32, q11s32);
q10s32 = vaddq_s32(q10s32, q15s32);
- *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, 14), vrshrn_n_s32(q9s32, 14));
- *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, 14), vrshrn_n_s32(q10s32, 14));
+ *qAs16 = vcombine_s16(vrshrn_n_s32(q8s32, DCT_CONST_BITS),
+ vrshrn_n_s32(q9s32, DCT_CONST_BITS));
+ *qBs16 = vcombine_s16(vrshrn_n_s32(q11s32, DCT_CONST_BITS),
+ vrshrn_n_s32(q10s32, DCT_CONST_BITS));
}
static INLINE void load_s16x8q(const int16_t *in, int16x8_t *s0, int16x8_t *s1,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
deleted file mode 100644
index 29f678a0382..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_1_add_neon.asm
+++ /dev/null
@@ -1,86 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
- EXPORT |vpx_idct8x8_1_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vpx_idct8x8_1_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int stride)
-
-|vpx_idct8x8_1_add_neon| PROC
- ldrsh r0, [r0]
-
- ; cospi_16_64 = 11585
- movw r12, #0x2d41
-
- ; out = dct_const_round_shift(input[0] * cospi_16_64)
- mul r0, r0, r12 ; input[0] * cospi_16_64
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; out = dct_const_round_shift(out * cospi_16_64)
- mul r0, r0, r12 ; out * cospi_16_64
- mov r12, r1 ; save dest
- add r0, r0, #0x2000 ; +(1 << ((DCT_CONST_BITS) - 1))
- asr r0, r0, #14 ; >> DCT_CONST_BITS
-
- ; a1 = ROUND_POWER_OF_TWO(out, 5)
- add r0, r0, #16 ; + (1 <<((5) - 1))
- asr r0, r0, #5 ; >> 5
-
- vdup.s16 q0, r0 ; duplicate a1
-
- ; load destination data
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1], r2
- vld1.64 {d16}, [r1], r2
- vld1.64 {d17}, [r1]
-
- vaddw.u8 q9, q0, d2 ; dest[x] + a1
- vaddw.u8 q10, q0, d3 ; dest[x] + a1
- vaddw.u8 q11, q0, d4 ; dest[x] + a1
- vaddw.u8 q12, q0, d5 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- vaddw.u8 q9, q0, d6 ; dest[x] + a1
- vaddw.u8 q10, q0, d7 ; dest[x] + a1
- vaddw.u8 q11, q0, d16 ; dest[x] + a1
- vaddw.u8 q12, q0, d17 ; dest[x] + a1
- vqmovun.s16 d2, q9 ; clip_pixel
- vqmovun.s16 d3, q10 ; clip_pixel
- vqmovun.s16 d30, q11 ; clip_pixel
- vqmovun.s16 d31, q12 ; clip_pixel
- vst1.64 {d2}, [r12], r2
- vst1.64 {d3}, [r12], r2
- vst1.64 {d30}, [r12], r2
- vst1.64 {d31}, [r12], r2
-
- bx lr
- ENDP ; |vpx_idct8x8_1_add_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
deleted file mode 100644
index 2bfbcc5a52c..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct8x8_add_neon.asm
+++ /dev/null
@@ -1,507 +0,0 @@
-;
-; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
- EXPORT |vpx_idct8x8_64_add_neon|
- EXPORT |vpx_idct8x8_12_add_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
- INCLUDE vpx_dsp/arm/idct_neon.asm.S
-
- ; Parallel 1D IDCT on all the columns of a 8x8 16bit data matrix which are
- ; loaded in q8-q15. The output will be stored back into q8-q15 registers.
- ; This macro will touch q0-q7 registers and use them as buffer during
- ; calculation.
- MACRO
- IDCT8x8_1D
- ; stage 1
- vdup.16 d0, r3 ; duplicate cospi_28_64
- vdup.16 d1, r4 ; duplicate cospi_4_64
- vdup.16 d2, r5 ; duplicate cospi_12_64
- vdup.16 d3, r6 ; duplicate cospi_20_64
-
- ; input[1] * cospi_28_64
- vmull.s16 q2, d18, d0
- vmull.s16 q3, d19, d0
-
- ; input[5] * cospi_12_64
- vmull.s16 q5, d26, d2
- vmull.s16 q6, d27, d2
-
- ; input[1]*cospi_28_64-input[7]*cospi_4_64
- vmlsl.s16 q2, d30, d1
- vmlsl.s16 q3, d31, d1
-
- ; input[5] * cospi_12_64 - input[3] * cospi_20_64
- vmlsl.s16 q5, d22, d3
- vmlsl.s16 q6, d23, d3
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d8, q2, #14 ; >> 14
- vrshrn.s32 d9, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d10, q5, #14 ; >> 14
- vrshrn.s32 d11, q6, #14 ; >> 14
-
- ; input[1] * cospi_4_64
- vmull.s16 q2, d18, d1
- vmull.s16 q3, d19, d1
-
- ; input[5] * cospi_20_64
- vmull.s16 q9, d26, d3
- vmull.s16 q13, d27, d3
-
- ; input[1]*cospi_4_64+input[7]*cospi_28_64
- vmlal.s16 q2, d30, d0
- vmlal.s16 q3, d31, d0
-
- ; input[5] * cospi_20_64 + input[3] * cospi_12_64
- vmlal.s16 q9, d22, d2
- vmlal.s16 q13, d23, d2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d14, q2, #14 ; >> 14
- vrshrn.s32 d15, q3, #14 ; >> 14
-
- ; stage 2 & stage 3 - even half
- vdup.16 d0, r7 ; duplicate cospi_16_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d12, q9, #14 ; >> 14
- vrshrn.s32 d13, q13, #14 ; >> 14
-
- ; input[0] * cospi_16_64
- vmull.s16 q2, d16, d0
- vmull.s16 q3, d17, d0
-
- ; input[0] * cospi_16_64
- vmull.s16 q13, d16, d0
- vmull.s16 q15, d17, d0
-
- ; (input[0] + input[2]) * cospi_16_64
- vmlal.s16 q2, d24, d0
- vmlal.s16 q3, d25, d0
-
- ; (input[0] - input[2]) * cospi_16_64
- vmlsl.s16 q13, d24, d0
- vmlsl.s16 q15, d25, d0
-
- vdup.16 d0, r8 ; duplicate cospi_24_64
- vdup.16 d1, r9 ; duplicate cospi_8_64
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d18, q2, #14 ; >> 14
- vrshrn.s32 d19, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d22, q13, #14 ; >> 14
- vrshrn.s32 d23, q15, #14 ; >> 14
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- ; input[1] * cospi_24_64
- vmull.s16 q2, d20, d0
- vmull.s16 q3, d21, d0
-
- ; input[1] * cospi_8_64
- vmull.s16 q8, d20, d1
- vmull.s16 q12, d21, d1
-
- ; input[1] * cospi_24_64 - input[3] * cospi_8_64
- vmlsl.s16 q2, d28, d1
- vmlsl.s16 q3, d29, d1
-
- ; input[1] * cospi_8_64 + input[3] * cospi_24_64
- vmlal.s16 q8, d28, d0
- vmlal.s16 q12, d29, d0
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d26, q2, #14 ; >> 14
- vrshrn.s32 d27, q3, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d30, q8, #14 ; >> 14
- vrshrn.s32 d31, q12, #14 ; >> 14
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d10, q9, #14 ; >> 14
- vrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d12, q11, #14 ; >> 14
- vrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
- MEND
-
- ; Transpose a 8x8 16bit data matrix. Datas are loaded in q8-q15.
- MACRO
- TRANSPOSE8X8
- vswp d17, d24
- vswp d23, d30
- vswp d21, d28
- vswp d19, d26
- vtrn.32 q8, q10
- vtrn.32 q9, q11
- vtrn.32 q12, q14
- vtrn.32 q13, q15
- vtrn.16 q8, q9
- vtrn.16 q10, q11
- vtrn.16 q12, q13
- vtrn.16 q14, q15
- MEND
-
- AREA Block, CODE, READONLY ; name this block of code
-;void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int stride)
-
-|vpx_idct8x8_64_add_neon| PROC
- push {r4-r9}
- vpush {d8-d15}
- LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; cospi_28_64 = 3196
- movw r3, #0x0c7c
-
- ; cospi_4_64 = 16069
- movw r4, #0x3ec5
-
- ; cospi_12_64 = 13623
- movw r5, #0x3537
-
- ; cospi_20_64 = 9102
- movw r6, #0x238e
-
- ; cospi_16_64 = 11585
- movw r7, #0x2d41
-
- ; cospi_24_64 = 6270
- movw r8, #0x187e
-
- ; cospi_8_64 = 15137
- movw r9, #0x3b21
-
- ; First transform rows
- IDCT8x8_1D
-
- ; Transpose the matrix
- TRANSPOSE8X8
-
- ; Then transform columns
- IDCT8x8_1D
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
-
- vpop {d8-d15}
- pop {r4-r9}
- bx lr
- ENDP ; |vpx_idct8x8_64_add_neon|
-
-;void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int stride)
-;
-; r0 int16_t input
-; r1 uint8_t *dest
-; r2 int stride)
-
-|vpx_idct8x8_12_add_neon| PROC
- push {r4-r9}
- vpush {d8-d15}
- LOAD_TRAN_LOW_TO_S16 d16, d17, d18, d19, r0
- LOAD_TRAN_LOW_TO_S16 d20, d21, d22, d23, r0
- LOAD_TRAN_LOW_TO_S16 d24, d25, d26, d27, r0
- LOAD_TRAN_LOW_TO_S16 d28, d29, d30, d31, r0
-
- ; transpose the input data
- TRANSPOSE8X8
-
- ; cospi_28_64 = 3196
- movw r3, #0x0c7c
-
- ; cospi_4_64 = 16069
- movw r4, #0x3ec5
-
- ; cospi_12_64 = 13623
- movw r5, #0x3537
-
- ; cospi_20_64 = 9102
- movw r6, #0x238e
-
- ; cospi_16_64 = 11585
- movw r7, #0x2d41
-
- ; cospi_24_64 = 6270
- movw r8, #0x187e
-
- ; cospi_8_64 = 15137
- movw r9, #0x3b21
-
- ; First transform rows
- ; stage 1
- ; The following instructions use vqrdmulh to do the
- ; dct_const_round_shift(input[1] * cospi_28_64). vqrdmulh will do doubling
- ; multiply and shift the result by 16 bits instead of 14 bits. So we need
- ; to double the constants before multiplying to compensate this.
- mov r12, r3, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_28_64*2
- mov r12, r4, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_4_64*2
-
- ; dct_const_round_shift(input[1] * cospi_28_64)
- vqrdmulh.s16 q4, q9, q0
-
- mov r12, r6, lsl #1
- rsb r12, #0
- vdup.16 q0, r12 ; duplicate -cospi_20_64*2
-
- ; dct_const_round_shift(input[1] * cospi_4_64)
- vqrdmulh.s16 q7, q9, q1
-
- mov r12, r5, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_12_64*2
-
- ; dct_const_round_shift(- input[3] * cospi_20_64)
- vqrdmulh.s16 q5, q11, q0
-
- mov r12, r7, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_16_64*2
-
- ; dct_const_round_shift(input[3] * cospi_12_64)
- vqrdmulh.s16 q6, q11, q1
-
- ; stage 2 & stage 3 - even half
- mov r12, r8, lsl #1
- vdup.16 q1, r12 ; duplicate cospi_24_64*2
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vqrdmulh.s16 q9, q8, q0
-
- mov r12, r9, lsl #1
- vdup.16 q0, r12 ; duplicate cospi_8_64*2
-
- ; dct_const_round_shift(input[1] * cospi_24_64)
- vqrdmulh.s16 q13, q10, q1
-
- ; dct_const_round_shift(input[1] * cospi_8_64)
- vqrdmulh.s16 q15, q10, q0
-
- ; stage 3 -odd half
- vdup.16 d16, r7 ; duplicate cospi_16_64
-
- vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
- vadd.s16 q1, q9, q13 ; output[1] = step[1] + step[2]
- vsub.s16 q2, q9, q13 ; output[2] = step[1] - step[2]
- vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
-
- ; stage 2 - odd half
- vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
- vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
- vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
- vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
-
- ; step2[6] * cospi_16_64
- vmull.s16 q9, d28, d16
- vmull.s16 q10, d29, d16
-
- ; step2[6] * cospi_16_64
- vmull.s16 q11, d28, d16
- vmull.s16 q12, d29, d16
-
- ; (step2[6] - step2[5]) * cospi_16_64
- vmlsl.s16 q9, d26, d16
- vmlsl.s16 q10, d27, d16
-
- ; (step2[5] + step2[6]) * cospi_16_64
- vmlal.s16 q11, d26, d16
- vmlal.s16 q12, d27, d16
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d10, q9, #14 ; >> 14
- vrshrn.s32 d11, q10, #14 ; >> 14
-
- ; dct_const_round_shift(input_dc * cospi_16_64)
- vrshrn.s32 d12, q11, #14 ; >> 14
- vrshrn.s32 d13, q12, #14 ; >> 14
-
- ; stage 4
- vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
- vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
- vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
- vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
- vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
- vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
- vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
- vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
-
- ; Transpose the matrix
- TRANSPOSE8X8
-
- ; Then transform columns
- IDCT8x8_1D
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5)
- vrshr.s16 q8, q8, #5
- vrshr.s16 q9, q9, #5
- vrshr.s16 q10, q10, #5
- vrshr.s16 q11, q11, #5
- vrshr.s16 q12, q12, #5
- vrshr.s16 q13, q13, #5
- vrshr.s16 q14, q14, #5
- vrshr.s16 q15, q15, #5
-
- ; save dest pointer
- mov r0, r1
-
- ; load destination data
- vld1.64 {d0}, [r1], r2
- vld1.64 {d1}, [r1], r2
- vld1.64 {d2}, [r1], r2
- vld1.64 {d3}, [r1], r2
- vld1.64 {d4}, [r1], r2
- vld1.64 {d5}, [r1], r2
- vld1.64 {d6}, [r1], r2
- vld1.64 {d7}, [r1]
-
- ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * stride + i]
- vaddw.u8 q8, q8, d0
- vaddw.u8 q9, q9, d1
- vaddw.u8 q10, q10, d2
- vaddw.u8 q11, q11, d3
- vaddw.u8 q12, q12, d4
- vaddw.u8 q13, q13, d5
- vaddw.u8 q14, q14, d6
- vaddw.u8 q15, q15, d7
-
- ; clip_pixel
- vqmovun.s16 d0, q8
- vqmovun.s16 d1, q9
- vqmovun.s16 d2, q10
- vqmovun.s16 d3, q11
- vqmovun.s16 d4, q12
- vqmovun.s16 d5, q13
- vqmovun.s16 d6, q14
- vqmovun.s16 d7, q15
-
- ; store the data
- vst1.64 {d0}, [r0], r2
- vst1.64 {d1}, [r0], r2
- vst1.64 {d2}, [r0], r2
- vst1.64 {d3}, [r0], r2
- vst1.64 {d4}, [r0], r2
- vst1.64 {d5}, [r0], r2
- vst1.64 {d6}, [r0], r2
- vst1.64 {d7}, [r0], r2
-
- vpop {d8-d15}
- pop {r4-r9}
- bx lr
- ENDP ; |vpx_idct8x8_12_add_neon|
-
- END
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
index d9b85223c76..fe5b603e21b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/idct_neon.h
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/txfm_common.h"
#include "vpx_dsp/vpx_dsp_common.h"
DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
@@ -28,11 +29,15 @@ DECLARE_ALIGNED(16, static const int16_t, kCospi[16]) = {
12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};
-DECLARE_ALIGNED(16, static const int32_t, kCospi32[8]) = {
- 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
- 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
- 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
- -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */
+DECLARE_ALIGNED(16, static const int32_t, kCospi32[16]) = {
+ 16384 /* cospi_0_64 */, 15137 /* cospi_8_64 */,
+ 11585 /* cospi_16_64 */, 6270 /* cospi_24_64 */,
+ 16069 /* cospi_4_64 */, 13623 /* cospi_12_64 */,
+ -9102 /* -cospi_20_64 */, 3196 /* cospi_28_64 */,
+ 16305 /* cospi_2_64 */, 1606 /* cospi_30_64 */,
+ 14449 /* cospi_10_64 */, 7723 /* cospi_22_64 */,
+ 15679 /* cospi_6_64 */, -4756 /* -cospi_26_64 */,
+ 12665 /* cospi_14_64 */, -10394 /* -cospi_18_64 */
};
//------------------------------------------------------------------------------
@@ -76,23 +81,34 @@ static INLINE int16x4_t load_tran_low_to_s16d(const tran_low_t *buf) {
#endif
}
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+#else
+ vst1q_s16(buf, a);
+#endif
+}
+
//------------------------------------------------------------------------------
-// Multiply a by a_const. Saturate, shift and narrow by 14.
+// Multiply a by a_const. Saturate, shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t multiply_shift_and_narrow_s16(const int16x8_t a,
const int16_t a_const) {
- // Shift by 14 + rounding will be within 16 bits for well formed streams.
- // See WRAPLOW and dct_const_round_shift for details.
+ // Shift by DCT_CONST_BITS + rounding will be within 16 bits for well formed
+ // streams. See WRAPLOW and dct_const_round_shift for details.
// This instruction doubles the result and returns the high half, essentially
// resulting in a right shift by 15. By multiplying the constant first that
- // becomes a right shift by 14.
+ // becomes a right shift by DCT_CONST_BITS.
// The largest possible value used here is
// vpx_dsp/txfm_common.h:cospi_1_64 = 16364 (* 2 = 32728) a which falls *just*
// within the range of int16_t (+32767 / -32768) even when negated.
return vqrdmulhq_n_s16(a, a_const * 2);
}
-// Add a and b, then multiply by ab_const. Shift and narrow by 14.
+// Add a and b, then multiply by ab_const. Shift and narrow by DCT_CONST_BITS.
static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
// In both add_ and it's pair, sub_, the input for well-formed streams will be
@@ -106,21 +122,24 @@ static INLINE int16x8_t add_multiply_shift_and_narrow_s16(
int32x4_t temp_high = vaddl_s16(vget_high_s16(a), vget_high_s16(b));
temp_low = vmulq_n_s32(temp_low, ab_const);
temp_high = vmulq_n_s32(temp_high, ab_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
-// Subtract b from a, then multiply by ab_const. Shift and narrow by 14.
+// Subtract b from a, then multiply by ab_const. Shift and narrow by
+// DCT_CONST_BITS.
static INLINE int16x8_t sub_multiply_shift_and_narrow_s16(
const int16x8_t a, const int16x8_t b, const int16_t ab_const) {
int32x4_t temp_low = vsubl_s16(vget_low_s16(a), vget_low_s16(b));
int32x4_t temp_high = vsubl_s16(vget_high_s16(a), vget_high_s16(b));
temp_low = vmulq_n_s32(temp_low, ab_const);
temp_high = vmulq_n_s32(temp_high, ab_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
// Multiply a by a_const and b by b_const, then accumulate. Shift and narrow by
-// 14.
+// DCT_CONST_BITS.
static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
const int16x8_t a, const int16_t a_const, const int16x8_t b,
const int16_t b_const) {
@@ -128,7 +147,8 @@ static INLINE int16x8_t multiply_accumulate_shift_and_narrow_s16(
int32x4_t temp_high = vmull_n_s16(vget_high_s16(a), a_const);
temp_low = vmlal_n_s16(temp_low, vget_low_s16(b), b_const);
temp_high = vmlal_n_s16(temp_high, vget_high_s16(b), b_const);
- return vcombine_s16(vrshrn_n_s32(temp_low, 14), vrshrn_n_s32(temp_high, 14));
+ return vcombine_s16(vrshrn_n_s32(temp_low, DCT_CONST_BITS),
+ vrshrn_n_s32(temp_high, DCT_CONST_BITS));
}
// Shift the output down by 6 and add it to the destination buffer.
@@ -218,10 +238,10 @@ static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis,
c3 = vmull_lane_s16(b2, cospis, 1);
c2 = vmlsl_lane_s16(c2, b3, cospis, 1);
c3 = vmlal_lane_s16(c3, b3, cospis, 3);
- b0 = vrshrn_n_s32(c0, 14);
- b1 = vrshrn_n_s32(c1, 14);
- b2 = vrshrn_n_s32(c2, 14);
- b3 = vrshrn_n_s32(c3, 14);
+ b0 = vrshrn_n_s32(c0, DCT_CONST_BITS);
+ b1 = vrshrn_n_s32(c1, DCT_CONST_BITS);
+ b2 = vrshrn_n_s32(c2, DCT_CONST_BITS);
+ b3 = vrshrn_n_s32(c3, DCT_CONST_BITS);
d0 = vcombine_s16(b0, b1);
d1 = vcombine_s16(b3, b2);
*a0 = vaddq_s16(d0, d1);
@@ -263,8 +283,8 @@ static INLINE void idct8x8_12_pass1_bd8(
t32[1] = vmull_lane_s16(step2[6], cospis0, 2);
t32[0] = vmlsl_lane_s16(t32[1], step2[5], cospis0, 2);
t32[1] = vmlal_lane_s16(t32[1], step2[5], cospis0, 2);
- step1[5] = vrshrn_n_s32(t32[0], 14);
- step1[6] = vrshrn_n_s32(t32[1], 14);
+ step1[5] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ step1[6] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
// stage 4
*io0 = vadd_s16(step1[0], step2[7]);
@@ -322,10 +342,10 @@ static INLINE void idct8x8_12_pass2_bd8(
t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
step1[5] = vcombine_s16(t16[0], t16[1]);
step1[6] = vcombine_s16(t16[2], t16[3]);
@@ -390,14 +410,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
t32[5] = vmlsl_lane_s16(t32[5], input_5h, cospis1, 2);
t32[6] = vmlal_lane_s16(t32[6], input_7l, cospis1, 3);
t32[7] = vmlal_lane_s16(t32[7], input_7h, cospis1, 3);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
- t16[4] = vrshrn_n_s32(t32[4], 14);
- t16[5] = vrshrn_n_s32(t32[5], 14);
- t16[6] = vrshrn_n_s32(t32[6], 14);
- t16[7] = vrshrn_n_s32(t32[7], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+ t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+ t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+ t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+ t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
step1[4] = vcombine_s16(t16[0], t16[1]);
step1[5] = vcombine_s16(t16[2], t16[3]);
step1[6] = vcombine_s16(t16[4], t16[5]);
@@ -418,14 +438,14 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
t32[5] = vmlsl_lane_s16(t32[5], step1h[3], cospis0, 1);
t32[6] = vmlal_lane_s16(t32[6], step1l[3], cospis0, 3);
t32[7] = vmlal_lane_s16(t32[7], step1h[3], cospis0, 3);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
- t16[4] = vrshrn_n_s32(t32[4], 14);
- t16[5] = vrshrn_n_s32(t32[5], 14);
- t16[6] = vrshrn_n_s32(t32[6], 14);
- t16[7] = vrshrn_n_s32(t32[7], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
+ t16[4] = vrshrn_n_s32(t32[4], DCT_CONST_BITS);
+ t16[5] = vrshrn_n_s32(t32[5], DCT_CONST_BITS);
+ t16[6] = vrshrn_n_s32(t32[6], DCT_CONST_BITS);
+ t16[7] = vrshrn_n_s32(t32[7], DCT_CONST_BITS);
step2[0] = vcombine_s16(t16[0], t16[1]);
step2[1] = vcombine_s16(t16[2], t16[3]);
step2[2] = vcombine_s16(t16[4], t16[5]);
@@ -448,10 +468,10 @@ static INLINE void idct8x8_64_1d_bd8(const int16x4_t cospis0,
t32[1] = vmlsl_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(step2[5]), cospis0, 2);
t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(step2[5]), cospis0, 2);
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
step1[5] = vcombine_s16(t16[0], t16[1]);
step1[6] = vcombine_s16(t16[2], t16[3]);
@@ -471,10 +491,10 @@ static INLINE void idct16x16_add_wrap_low_8x2(const int32x4_t *const t32,
int16x8_t *const d1) {
int16x4_t t16[4];
- t16[0] = vrshrn_n_s32(t32[0], 14);
- t16[1] = vrshrn_n_s32(t32[1], 14);
- t16[2] = vrshrn_n_s32(t32[2], 14);
- t16[3] = vrshrn_n_s32(t32[3], 14);
+ t16[0] = vrshrn_n_s32(t32[0], DCT_CONST_BITS);
+ t16[1] = vrshrn_n_s32(t32[1], DCT_CONST_BITS);
+ t16[2] = vrshrn_n_s32(t32[2], DCT_CONST_BITS);
+ t16[3] = vrshrn_n_s32(t32[3], DCT_CONST_BITS);
*d0 = vcombine_s16(t16[0], t16[1]);
*d1 = vcombine_s16(t16[2], t16[3]);
}
@@ -529,6 +549,178 @@ static INLINE void idct_cospi_16_16_q(const int16x8_t s0, const int16x8_t s1,
idct16x16_add_wrap_low_8x2(t32, d0, d1);
}
+static INLINE void idct_cospi_2_30(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 1);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 0);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_4_28(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 0);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 0);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 0);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 0);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_6_26(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 0);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 0);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 0);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 0);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 1);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 1);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 1);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 1);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_10_22(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_2_30_10_22,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_2_30_10_22, 3);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_2_30_10_22, 3);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_2_30_10_22, 3);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_2_30_10_22, 3);
+ t32[0] = vmlsl_lane_s16(t32[0], vget_low_s16(s1), cospi_2_30_10_22, 2);
+ t32[1] = vmlsl_lane_s16(t32[1], vget_high_s16(s1), cospi_2_30_10_22, 2);
+ t32[2] = vmlal_lane_s16(t32[2], vget_low_s16(s0), cospi_2_30_10_22, 2);
+ t32[3] = vmlal_lane_s16(t32[3], vget_high_s16(s0), cospi_2_30_10_22, 2);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_12_20(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_4_12_20N_28,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_4_12_20N_28, 1);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_4_12_20N_28, 1);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_4_12_20N_28, 1);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_4_12_20N_28, 1);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_4_12_20N_28, 2);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_4_12_20N_28, 2);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_4_12_20N_28, 2);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_4_12_20N_28, 2);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct_cospi_14_18(const int16x8_t s0, const int16x8_t s1,
+ const int16x4_t cospi_6_26_14_18N,
+ int16x8_t *const d0, int16x8_t *const d1) {
+ int32x4_t t32[4];
+
+ t32[0] = vmull_lane_s16(vget_low_s16(s0), cospi_6_26_14_18N, 2);
+ t32[1] = vmull_lane_s16(vget_high_s16(s0), cospi_6_26_14_18N, 2);
+ t32[2] = vmull_lane_s16(vget_low_s16(s1), cospi_6_26_14_18N, 2);
+ t32[3] = vmull_lane_s16(vget_high_s16(s1), cospi_6_26_14_18N, 2);
+ t32[0] = vmlal_lane_s16(t32[0], vget_low_s16(s1), cospi_6_26_14_18N, 3);
+ t32[1] = vmlal_lane_s16(t32[1], vget_high_s16(s1), cospi_6_26_14_18N, 3);
+ t32[2] = vmlsl_lane_s16(t32[2], vget_low_s16(s0), cospi_6_26_14_18N, 3);
+ t32[3] = vmlsl_lane_s16(t32[3], vget_high_s16(s0), cospi_6_26_14_18N, 3);
+ idct16x16_add_wrap_low_8x2(t32, d0, d1);
+}
+
+static INLINE void idct16x16_add_stage7(const int16x8_t *const step2,
+ int16x8_t *const out) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ // Use saturating add/sub to avoid overflow in 2nd pass
+ out[0] = vqaddq_s16(step2[0], step2[15]);
+ out[1] = vqaddq_s16(step2[1], step2[14]);
+ out[2] = vqaddq_s16(step2[2], step2[13]);
+ out[3] = vqaddq_s16(step2[3], step2[12]);
+ out[4] = vqaddq_s16(step2[4], step2[11]);
+ out[5] = vqaddq_s16(step2[5], step2[10]);
+ out[6] = vqaddq_s16(step2[6], step2[9]);
+ out[7] = vqaddq_s16(step2[7], step2[8]);
+ out[8] = vqsubq_s16(step2[7], step2[8]);
+ out[9] = vqsubq_s16(step2[6], step2[9]);
+ out[10] = vqsubq_s16(step2[5], step2[10]);
+ out[11] = vqsubq_s16(step2[4], step2[11]);
+ out[12] = vqsubq_s16(step2[3], step2[12]);
+ out[13] = vqsubq_s16(step2[2], step2[13]);
+ out[14] = vqsubq_s16(step2[1], step2[14]);
+ out[15] = vqsubq_s16(step2[0], step2[15]);
+#else
+ out[0] = vaddq_s16(step2[0], step2[15]);
+ out[1] = vaddq_s16(step2[1], step2[14]);
+ out[2] = vaddq_s16(step2[2], step2[13]);
+ out[3] = vaddq_s16(step2[3], step2[12]);
+ out[4] = vaddq_s16(step2[4], step2[11]);
+ out[5] = vaddq_s16(step2[5], step2[10]);
+ out[6] = vaddq_s16(step2[6], step2[9]);
+ out[7] = vaddq_s16(step2[7], step2[8]);
+ out[8] = vsubq_s16(step2[7], step2[8]);
+ out[9] = vsubq_s16(step2[6], step2[9]);
+ out[10] = vsubq_s16(step2[5], step2[10]);
+ out[11] = vsubq_s16(step2[4], step2[11]);
+ out[12] = vsubq_s16(step2[3], step2[12]);
+ out[13] = vsubq_s16(step2[2], step2[13]);
+ out[14] = vsubq_s16(step2[1], step2[14]);
+ out[15] = vsubq_s16(step2[0], step2[15]);
+#endif
+}
+
+static INLINE void idct16x16_store_pass1(const int16x8_t *const out,
+ int16_t *output) {
+ // Save the result into output
+ vst1q_s16(output, out[0]);
+ output += 16;
+ vst1q_s16(output, out[1]);
+ output += 16;
+ vst1q_s16(output, out[2]);
+ output += 16;
+ vst1q_s16(output, out[3]);
+ output += 16;
+ vst1q_s16(output, out[4]);
+ output += 16;
+ vst1q_s16(output, out[5]);
+ output += 16;
+ vst1q_s16(output, out[6]);
+ output += 16;
+ vst1q_s16(output, out[7]);
+ output += 16;
+ vst1q_s16(output, out[8]);
+ output += 16;
+ vst1q_s16(output, out[9]);
+ output += 16;
+ vst1q_s16(output, out[10]);
+ output += 16;
+ vst1q_s16(output, out[11]);
+ output += 16;
+ vst1q_s16(output, out[12]);
+ output += 16;
+ vst1q_s16(output, out[13]);
+ output += 16;
+ vst1q_s16(output, out[14]);
+ output += 16;
+ vst1q_s16(output, out[15]);
+}
+
static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
const int stride) {
uint8x8_t d = vld1_u8(*dest);
@@ -541,4 +733,29 @@ static INLINE void idct16x16_add8x1(int16x8_t res, uint8_t **dest,
*dest += stride;
}
+static INLINE void highbd_idct16x16_add8x1(int16x8_t res, const int16x8_t max,
+ uint16_t **dest, const int stride) {
+ uint16x8_t d = vld1q_u16(*dest);  // load 8 uint16_t values from *dest
+
+ res = vqaddq_s16(res, vreinterpretq_s16_u16(d));  // saturating add: res += d
+ res = vminq_s16(res, max);  // lane-wise upper clamp to max
+ d = vqshluq_n_s16(res, 0);  // saturating signed -> unsigned (negative lanes become 0)
+ vst1q_u16(*dest, d);  // write the 8 results back to *dest
+ *dest += stride;  // advance the caller's pointer by stride elements
+}
+
+void idct16x16_256_add_half1d(const void *const input, int16_t *output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void idct16x16_38_add_half1d(const void *const input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
+void idct16x16_10_add_half1d_pass1(const tran_low_t *input, int16_t *output);
+
+void idct16x16_10_add_half1d_pass2(const int16_t *input, int16_t *const output,
+ void *const dest, const int stride,
+ const int highbd_flag);
+
#endif // VPX_DSP_ARM_IDCT_NEON_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
index 8366ce50b87..434c20ca21c 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -710,6 +710,83 @@ static INLINE void transpose_u16_8x8(uint16x8_t *a0, uint16x8_t *a1,
*a7 = d3.val[1];
}
+static INLINE void transpose_s32_8x8(int32x4x2_t *a0, int32x4x2_t *a1,
+ int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5,
+ int32x4x2_t *a6, int32x4x2_t *a7) {
+ // In-place 8x8 transpose (one int32x4x2_t per row). Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0: 00 10 02 12 01 11 03 13
+ // b1: 20 30 22 32 21 31 23 33
+ // b2: 40 50 42 52 41 51 43 53
+ // b3: 60 70 62 72 61 71 63 73
+ // b4: 04 14 06 16 05 15 07 17
+ // b5: 24 34 26 36 25 35 27 37
+ // b6: 44 54 46 56 45 55 47 57
+ // b7: 64 74 66 76 65 75 67 77
+
+ const int32x4x2_t b0 = vtrnq_s32(a0->val[0], a1->val[0]);
+ const int32x4x2_t b1 = vtrnq_s32(a2->val[0], a3->val[0]);
+ const int32x4x2_t b2 = vtrnq_s32(a4->val[0], a5->val[0]);
+ const int32x4x2_t b3 = vtrnq_s32(a6->val[0], a7->val[0]);
+ const int32x4x2_t b4 = vtrnq_s32(a0->val[1], a1->val[1]);
+ const int32x4x2_t b5 = vtrnq_s32(a2->val[1], a3->val[1]);
+ const int32x4x2_t b6 = vtrnq_s32(a4->val[1], a5->val[1]);
+ const int32x4x2_t b7 = vtrnq_s32(a6->val[1], a7->val[1]);
+
+ // Swap 64 bit elements resulting in:
+ // c0: 00 10 20 30 02 12 22 32
+ // c1: 01 11 21 31 03 13 23 33
+ // c2: 40 50 60 70 42 52 62 72
+ // c3: 41 51 61 71 43 53 63 73
+ // c4: 04 14 24 34 06 16 26 36
+ // c5: 05 15 25 35 07 17 27 37
+ // c6: 44 54 64 74 46 56 66 76
+ // c7: 45 55 65 75 47 57 67 77
+ const int32x4x2_t c0 = vpx_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = vpx_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+ const int32x4x2_t c2 = vpx_vtrnq_s64_to_s32(b2.val[0], b3.val[0]);
+ const int32x4x2_t c3 = vpx_vtrnq_s64_to_s32(b2.val[1], b3.val[1]);
+ const int32x4x2_t c4 = vpx_vtrnq_s64_to_s32(b4.val[0], b5.val[0]);
+ const int32x4x2_t c5 = vpx_vtrnq_s64_to_s32(b4.val[1], b5.val[1]);
+ const int32x4x2_t c6 = vpx_vtrnq_s64_to_s32(b6.val[0], b7.val[0]);
+ const int32x4x2_t c7 = vpx_vtrnq_s64_to_s32(b6.val[1], b7.val[1]);
+
+ // Swap 128 bit elements (written back through a0..a7) resulting in:
+ // a0: 00 10 20 30 40 50 60 70
+ // a1: 01 11 21 31 41 51 61 71
+ // a2: 02 12 22 32 42 52 62 72
+ // a3: 03 13 23 33 43 53 63 73
+ // a4: 04 14 24 34 44 54 64 74
+ // a5: 05 15 25 35 45 55 65 75
+ // a6: 06 16 26 36 46 56 66 76
+ // a7: 07 17 27 37 47 57 67 77
+ a0->val[0] = c0.val[0];
+ a0->val[1] = c2.val[0];
+ a1->val[0] = c1.val[0];
+ a1->val[1] = c3.val[0];
+ a2->val[0] = c0.val[1];
+ a2->val[1] = c2.val[1];
+ a3->val[0] = c1.val[1];
+ a3->val[1] = c3.val[1];
+ a4->val[0] = c4.val[0];
+ a4->val[1] = c6.val[0];
+ a5->val[0] = c5.val[0];
+ a5->val[1] = c7.val[0];
+ a6->val[0] = c4.val[1];
+ a6->val[1] = c6.val[1];
+ a7->val[0] = c5.val[1];
+ a7->val[1] = c7.val[1];
+}
+
static INLINE void transpose_u8_16x8(
const uint8x16_t i0, const uint8x16_t i1, const uint8x16_t i2,
const uint8x16_t i3, const uint8x16_t i4, const uint8x16_t i5,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c
index 4d9abb8de36..e4cd6cca78b 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/avg.c
@@ -67,9 +67,10 @@ static void hadamard_col8(const int16_t *src_diff, int src_stride,
// The order of the output coeff of the hadamard is not important. For
// optimization purposes the final transpose may be skipped.
void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int idx;
int16_t buffer[64];
+ int16_t buffer2[64];
int16_t *tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
@@ -80,17 +81,19 @@ void vpx_hadamard_8x8_c(const int16_t *src_diff, int src_stride,
tmp_buf = &buffer[0];
for (idx = 0; idx < 8; ++idx) {
- hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit
- // dynamic range [-2040, 2040]
- coeff += 8; // coeff: 15 bit
- // dynamic range [-16320, 16320]
+ hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
++tmp_buf;
}
+
+ for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
}
// In place 16x16 2D Hadamard transform
void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
// src_diff: 9 bit, dynamic range [-255, 255]
@@ -101,15 +104,15 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
// coeff: 15 bit, dynamic range [-16320, 16320]
for (idx = 0; idx < 64; ++idx) {
- int16_t a0 = coeff[0];
- int16_t a1 = coeff[64];
- int16_t a2 = coeff[128];
- int16_t a3 = coeff[192];
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
- int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
- int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
- int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
- int16_t b3 = (a2 - a3) >> 1;
+ tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ tran_low_t b3 = (a2 - a3) >> 1;
coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
coeff[64] = b1 + b3;
@@ -122,7 +125,7 @@ void vpx_hadamard_16x16_c(const int16_t *src_diff, int src_stride,
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
-int vpx_satd_c(const int16_t *coeff, int length) {
+int vpx_satd_c(const tran_low_t *coeff, int length) {
int i;
int satd = 0;
for (i = 0; i < length; ++i) satd += abs(coeff[i]);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
index 6c27484979a..a0db1e40c98 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/deblock.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include <stdlib.h>
#include "vpx/vpx_integer.h"
@@ -48,6 +49,9 @@ void vpx_post_proc_down_and_across_mb_row_c(unsigned char *src_ptr,
unsigned char v;
unsigned char d[4];
+ assert(size >= 8);
+ assert(cols >= 8);
+
for (row = 0; row < size; row++) {
/* post_proc_down for one row */
p_src = src_ptr;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
index 0f9aff1892a..f99ded57a85 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/inv_txfm.c
@@ -93,6 +93,42 @@ void vpx_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int stride) {
}
}
+void iadst4_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+
+ if (!(x0 | x1 | x2 | x3)) {  // all four inputs are zero: output is all zero
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = WRAPLOW(x0 - x2 + x3);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;  // s3 now holds sinpi_3_9 * x1; s2 is reused on the next line
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
+ output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
+ output[2] = WRAPLOW(dct_const_round_shift(s2));
+ output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
+}
+
void idct4_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step[4];
tran_high_t temp1, temp2;
@@ -155,6 +191,81 @@ void vpx_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
}
+void iadst8_c(const tran_low_t *input, tran_low_t *output) {
+ int s0, s1, s2, s3, s4, s5, s6, s7;  // NOTE(review): int here, tran_high_t in iadst4_c/iadst16_c - confirm range
+ tran_high_t x0 = input[7];  // inputs are read in permuted order
+ tran_high_t x1 = input[0];
+ tran_high_t x2 = input[5];
+ tran_high_t x3 = input[2];
+ tran_high_t x4 = input[3];
+ tran_high_t x5 = input[4];
+ tran_high_t x6 = input[1];
+ tran_high_t x7 = input[6];
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // all-zero input: output is all zero
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
+ s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
+ s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
+ s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
+ s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
+ s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
+ s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
+ s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+ x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+ x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+ x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+ // stage 2: x0..x3 pass through unscaled, x4..x7 are rotated
+ s0 = (int)x0;
+ s1 = (int)x1;
+ s2 = (int)x2;
+ s3 = (int)x3;
+ s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
+ s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
+ s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
+ s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+
+ x0 = WRAPLOW(s0 + s2);
+ x1 = WRAPLOW(s1 + s3);
+ x2 = WRAPLOW(s0 - s2);
+ x3 = WRAPLOW(s1 - s3);
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+ x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+ x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+ // stage 3: only x2, x3, x6, x7 are updated; x0, x1, x4, x5 carry over
+ s2 = (int)(cospi_16_64 * (x2 + x3));
+ s3 = (int)(cospi_16_64 * (x2 - x3));
+ s6 = (int)(cospi_16_64 * (x6 + x7));
+ s7 = (int)(cospi_16_64 * (x6 - x7));
+
+ x2 = WRAPLOW(dct_const_round_shift(s2));
+ x3 = WRAPLOW(dct_const_round_shift(s3));
+ x6 = WRAPLOW(dct_const_round_shift(s6));
+ x7 = WRAPLOW(dct_const_round_shift(s7));
+
+ output[0] = WRAPLOW(x0);
+ output[1] = WRAPLOW(-x4);
+ output[2] = WRAPLOW(x6);
+ output[3] = WRAPLOW(-x2);
+ output[4] = WRAPLOW(x3);
+ output[5] = WRAPLOW(-x7);
+ output[6] = WRAPLOW(x5);
+ output[7] = WRAPLOW(-x1);
+}
+
void idct8_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
@@ -234,6 +345,31 @@ void vpx_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
}
+void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+ int i, j;
+ tran_low_t out[8 * 8] = { 0 };  // rows 4..7 stay zero; only rows 0..3 are written below
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+
+ // First transform rows
+ // Only the first 4 rows have non-zero coefficients
+ for (i = 0; i < 4; ++i) {
+ idct8_c(input, outptr);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];  // gather column i
+ idct8_c(temp_in, temp_out);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 5));
+ }
+ }
+}
+
void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
int i, j;
tran_high_t a1;
@@ -247,86 +383,119 @@ void vpx_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
}
}
-void iadst4_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = WRAPLOW(x0 - x2 + x3);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
- output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
- output[2] = WRAPLOW(dct_const_round_shift(s2));
- output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
-}
-
-void iadst8_c(const tran_low_t *input, tran_low_t *output) {
- int s0, s1, s2, s3, s4, s5, s6, s7;
- tran_high_t x0 = input[7];
+void iadst16_c(const tran_low_t *input, tran_low_t *output) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+ tran_high_t x0 = input[15];
tran_high_t x1 = input[0];
- tran_high_t x2 = input[5];
+ tran_high_t x2 = input[13];
tran_high_t x3 = input[2];
- tran_high_t x4 = input[3];
+ tran_high_t x4 = input[11];
tran_high_t x5 = input[4];
- tran_high_t x6 = input[1];
+ tran_high_t x6 = input[9];
tran_high_t x7 = input[6];
+ tran_high_t x8 = input[7];
+ tran_high_t x9 = input[8];
+ tran_high_t x10 = input[5];
+ tran_high_t x11 = input[10];
+ tran_high_t x12 = input[3];
+ tran_high_t x13 = input[12];
+ tran_high_t x14 = input[1];
+ tran_high_t x15 = input[14];
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
return;
}
// stage 1
- s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
- s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
- s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
- s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
- s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
- s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
- s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
- s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
- x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
- x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
- x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+ x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+ x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+ x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+ x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+ x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+ x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+ x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+ x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+ x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
+ x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
+ x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+ x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+ x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
// stage 2
- s0 = (int)x0;
- s1 = (int)x1;
- s2 = (int)x2;
- s3 = (int)x3;
- s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
- s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
- s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
- s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = WRAPLOW(s0 + s4);
+ x1 = WRAPLOW(s1 + s5);
+ x2 = WRAPLOW(s2 + s6);
+ x3 = WRAPLOW(s3 + s7);
+ x4 = WRAPLOW(s0 - s4);
+ x5 = WRAPLOW(s1 - s5);
+ x6 = WRAPLOW(s2 - s6);
+ x7 = WRAPLOW(s3 - s7);
+ x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+ x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+ x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+ x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+ x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+ x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+ x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = WRAPLOW(s0 + s2);
x1 = WRAPLOW(s1 + s3);
@@ -336,51 +505,50 @@ void iadst8_c(const tran_low_t *input, tran_low_t *output) {
x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+ x8 = WRAPLOW(s8 + s10);
+ x9 = WRAPLOW(s9 + s11);
+ x10 = WRAPLOW(s8 - s10);
+ x11 = WRAPLOW(s9 - s11);
+ x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+ x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+ x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+ x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
- // stage 3
- s2 = (int)(cospi_16_64 * (x2 + x3));
- s3 = (int)(cospi_16_64 * (x2 - x3));
- s6 = (int)(cospi_16_64 * (x6 + x7));
- s7 = (int)(cospi_16_64 * (x6 - x7));
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
x2 = WRAPLOW(dct_const_round_shift(s2));
x3 = WRAPLOW(dct_const_round_shift(s3));
x6 = WRAPLOW(dct_const_round_shift(s6));
x7 = WRAPLOW(dct_const_round_shift(s7));
+ x10 = WRAPLOW(dct_const_round_shift(s10));
+ x11 = WRAPLOW(dct_const_round_shift(s11));
+ x14 = WRAPLOW(dct_const_round_shift(s14));
+ x15 = WRAPLOW(dct_const_round_shift(s15));
output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x4);
- output[2] = WRAPLOW(x6);
- output[3] = WRAPLOW(-x2);
- output[4] = WRAPLOW(x3);
- output[5] = WRAPLOW(-x7);
- output[6] = WRAPLOW(x5);
- output[7] = WRAPLOW(-x1);
-}
-
-void vpx_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- // Only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
+ output[1] = WRAPLOW(-x8);
+ output[2] = WRAPLOW(x12);
+ output[3] = WRAPLOW(-x4);
+ output[4] = WRAPLOW(x6);
+ output[5] = WRAPLOW(x14);
+ output[6] = WRAPLOW(x10);
+ output[7] = WRAPLOW(x2);
+ output[8] = WRAPLOW(x3);
+ output[9] = WRAPLOW(x11);
+ output[10] = WRAPLOW(x15);
+ output[11] = WRAPLOW(x7);
+ output[12] = WRAPLOW(x5);
+ output[13] = WRAPLOW(-x13);
+ output[14] = WRAPLOW(x9);
+ output[15] = WRAPLOW(-x1);
}
void idct16_c(const tran_low_t *input, tran_low_t *output) {
@@ -573,172 +741,30 @@ void vpx_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
}
}
-void iadst16_c(const tran_low_t *input, tran_low_t *output) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
- tran_high_t x0 = input[15];
- tran_high_t x1 = input[0];
- tran_high_t x2 = input[13];
- tran_high_t x3 = input[2];
- tran_high_t x4 = input[11];
- tran_high_t x5 = input[4];
- tran_high_t x6 = input[9];
- tran_high_t x7 = input[6];
- tran_high_t x8 = input[7];
- tran_high_t x9 = input[8];
- tran_high_t x10 = input[5];
- tran_high_t x11 = input[10];
- tran_high_t x12 = input[3];
- tran_high_t x13 = input[12];
- tran_high_t x14 = input[1];
- tran_high_t x15 = input[14];
+void vpx_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 8x8 area, we only need to calculate first 8 rows here.
+ for (i = 0; i < 8; ++i) {
+ idct16_c(input, outptr);
+ input += 16;
+ outptr += 16;
}
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
- x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
- x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
- x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
- x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
- x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
- x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
- x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
- x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
- x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
- x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
- x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = WRAPLOW(s0 + s4);
- x1 = WRAPLOW(s1 + s5);
- x2 = WRAPLOW(s2 + s6);
- x3 = WRAPLOW(s3 + s7);
- x4 = WRAPLOW(s0 - s4);
- x5 = WRAPLOW(s1 - s5);
- x6 = WRAPLOW(s2 - s6);
- x7 = WRAPLOW(s3 - s7);
- x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
- x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
- x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
- x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
- x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
- x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
- x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = WRAPLOW(s0 + s2);
- x1 = WRAPLOW(s1 + s3);
- x2 = WRAPLOW(s0 - s2);
- x3 = WRAPLOW(s1 - s3);
- x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
- x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
- x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
- x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
- x8 = WRAPLOW(s8 + s10);
- x9 = WRAPLOW(s9 + s11);
- x10 = WRAPLOW(s8 - s10);
- x11 = WRAPLOW(s9 - s11);
- x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
- x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
- x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
- x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = WRAPLOW(dct_const_round_shift(s2));
- x3 = WRAPLOW(dct_const_round_shift(s3));
- x6 = WRAPLOW(dct_const_round_shift(s6));
- x7 = WRAPLOW(dct_const_round_shift(s7));
- x10 = WRAPLOW(dct_const_round_shift(s10));
- x11 = WRAPLOW(dct_const_round_shift(s11));
- x14 = WRAPLOW(dct_const_round_shift(s14));
- x15 = WRAPLOW(dct_const_round_shift(s15));
-
- output[0] = WRAPLOW(x0);
- output[1] = WRAPLOW(-x8);
- output[2] = WRAPLOW(x12);
- output[3] = WRAPLOW(-x4);
- output[4] = WRAPLOW(x6);
- output[5] = WRAPLOW(x14);
- output[6] = WRAPLOW(x10);
- output[7] = WRAPLOW(x2);
- output[8] = WRAPLOW(x3);
- output[9] = WRAPLOW(x11);
- output[10] = WRAPLOW(x15);
- output[11] = WRAPLOW(x7);
- output[12] = WRAPLOW(x5);
- output[13] = WRAPLOW(-x13);
- output[14] = WRAPLOW(x9);
- output[15] = WRAPLOW(-x1);
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ idct16_c(temp_in, temp_out);
+ for (j = 0; j < 16; ++j) {
+ dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6));
+ }
+ }
}
void vpx_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
@@ -1351,6 +1377,51 @@ void vpx_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
}
}
+void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[0];
+ tran_low_t x1 = input[1];
+ tran_low_t x2 = input[2];
+ tran_low_t x3 = input[3];
+ (void)bd;  // NOTE(review): presumably for builds where HIGHBD_WRAPLOW ignores bd - confirm
+
+ if (detect_invalid_highbd_input(input, 4)) {  // rejected input: zero the output (assert if range checking)
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 4);
+ return;
+ }
+
+ if (!(x0 | x1 | x2 | x3)) {  // all four inputs are zero: output is all zero
+ memset(output, 0, 4 * sizeof(*output));
+ return;
+ }
+
+ s0 = sinpi_1_9 * x0;
+ s1 = sinpi_2_9 * x0;
+ s2 = sinpi_3_9 * x1;
+ s3 = sinpi_4_9 * x2;
+ s4 = sinpi_1_9 * x2;
+ s5 = sinpi_2_9 * x3;
+ s6 = sinpi_4_9 * x3;
+ s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
+
+ s0 = s0 + s3 + s5;
+ s1 = s1 - s4 - s6;
+ s3 = s2;  // s3 now holds sinpi_3_9 * x1; s2 is reused on the next line
+ s2 = sinpi_3_9 * s7;
+
+ // 1-D transform scaling factor is sqrt(2).
+ // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+ // + 1b (addition) = 29b.
+ // Hence the output bit depth is 15b.
+ output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
+ output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
+ output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
+}
+
void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step[4];
tran_high_t temp1, temp2;
@@ -1427,6 +1498,90 @@ void vpx_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
+void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+ tran_low_t x0 = input[7];  // inputs are read in permuted order
+ tran_low_t x1 = input[0];
+ tran_low_t x2 = input[5];
+ tran_low_t x3 = input[2];
+ tran_low_t x4 = input[3];
+ tran_low_t x5 = input[4];
+ tran_low_t x6 = input[1];
+ tran_low_t x7 = input[6];
+ (void)bd;  // NOTE(review): presumably for builds where HIGHBD_WRAPLOW ignores bd - confirm
+
+ if (detect_invalid_highbd_input(input, 8)) {  // rejected input: zero the output (assert if range checking)
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+ assert(0 && "invalid highbd txfm input");
+#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
+ memset(output, 0, sizeof(*output) * 8);
+ return;
+ }
+
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {  // all-zero input: output is all zero
+ memset(output, 0, 8 * sizeof(*output));
+ return;
+ }
+
+ // stage 1
+ s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+ s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+ s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+
+ // stage 2: x0..x3 pass through unscaled, x4..x7 are rotated
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+
+ x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
+ x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
+ x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+
+ // stage 3: only x2, x3, x6, x7 are updated; x0, x1, x4, x5 carry over
+ s2 = cospi_16_64 * (x2 + x3);
+ s3 = cospi_16_64 * (x2 - x3);
+ s6 = cospi_16_64 * (x6 + x7);
+ s7 = cospi_16_64 * (x6 - x7);
+
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+
+ output[0] = HIGHBD_WRAPLOW(x0, bd);
+ output[1] = HIGHBD_WRAPLOW(-x4, bd);
+ output[2] = HIGHBD_WRAPLOW(x6, bd);
+ output[3] = HIGHBD_WRAPLOW(-x2, bd);
+ output[4] = HIGHBD_WRAPLOW(x3, bd);
+ output[5] = HIGHBD_WRAPLOW(-x7, bd);
+ output[6] = HIGHBD_WRAPLOW(x5, bd);
+ output[7] = HIGHBD_WRAPLOW(-x1, bd);
+}
+
void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
tran_low_t step1[8], step2[8];
tran_high_t temp1, temp2;
@@ -1507,6 +1662,33 @@ void vpx_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
+// Inverse-transforms an 8x8 coefficient block and adds the result to the
+// 16-bit destination obtained from dest8 via CONVERT_TO_SHORTPTR. Only the
+// first 4 rows of `input` go through the row pass; the remaining rows of the
+// intermediate buffer stay zero.
+//   input  - coefficients, read 8 per row for 4 rows.
+//   dest8  - destination handle, converted to uint16_t* before use.
+//   stride - distance between destination rows, in uint16_t elements.
+//   bd     - bit depth, forwarded to the 1-D transform and the clipping add.
+void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ // Zero-initialized so that rows 4..7, which the row pass skips, read as zero
+ // during the column pass.
+ tran_low_t out[8 * 8] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[8], temp_out[8];
+ uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+ // First transform rows.
+ // Only the first 4 rows have non-zero coefficients.
+ for (i = 0; i < 4; ++i) {
+ vpx_highbd_idct8_c(input, outptr, bd);
+ input += 8;
+ outptr += 8;
+ }
+
+ // Then transform columns: gather column i of the intermediate buffer,
+ // run the 1-D transform on it, and add each result to the destination
+ // after ROUND_POWER_OF_TWO(..., 5) (presumably a rounding shift by 5 bits;
+ // the macro is defined outside this hunk).
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
+ vpx_highbd_idct8_c(temp_in, temp_out, bd);
+ for (j = 0; j < 8; ++j) {
+ dest[j * stride + i] = highbd_clip_pixel_add(
+ dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+ }
+ }
+}
+
void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
int i, j;
@@ -1523,104 +1705,128 @@ void vpx_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_low_t x0 = input[0];
- tran_low_t x1 = input[1];
- tran_low_t x2 = input[2];
- tran_low_t x3 = input[3];
- (void)bd;
-
- if (detect_invalid_highbd_input(input, 4)) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- assert(0 && "invalid highbd txfm input");
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- memset(output, 0, sizeof(*output) * 4);
- return;
- }
-
- if (!(x0 | x1 | x2 | x3)) {
- memset(output, 0, 4 * sizeof(*output));
- return;
- }
-
- s0 = sinpi_1_9 * x0;
- s1 = sinpi_2_9 * x0;
- s2 = sinpi_3_9 * x1;
- s3 = sinpi_4_9 * x2;
- s4 = sinpi_1_9 * x2;
- s5 = sinpi_2_9 * x3;
- s6 = sinpi_4_9 * x3;
- s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
-
- s0 = s0 + s3 + s5;
- s1 = s1 - s4 - s6;
- s3 = s2;
- s2 = sinpi_3_9 * s7;
-
- // 1-D transform scaling factor is sqrt(2).
- // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
- // + 1b (addition) = 29b.
- // Hence the output bit depth is 15b.
- output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
- output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
- output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
- output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
-}
-
-void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
- tran_low_t x0 = input[7];
+void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+ tran_high_t s9, s10, s11, s12, s13, s14, s15;
+ tran_low_t x0 = input[15];
tran_low_t x1 = input[0];
- tran_low_t x2 = input[5];
+ tran_low_t x2 = input[13];
tran_low_t x3 = input[2];
- tran_low_t x4 = input[3];
+ tran_low_t x4 = input[11];
tran_low_t x5 = input[4];
- tran_low_t x6 = input[1];
+ tran_low_t x6 = input[9];
tran_low_t x7 = input[6];
+ tran_low_t x8 = input[7];
+ tran_low_t x9 = input[8];
+ tran_low_t x10 = input[5];
+ tran_low_t x11 = input[10];
+ tran_low_t x12 = input[3];
+ tran_low_t x13 = input[12];
+ tran_low_t x14 = input[1];
+ tran_low_t x15 = input[14];
(void)bd;
- if (detect_invalid_highbd_input(input, 8)) {
+ if (detect_invalid_highbd_input(input, 16)) {
#if CONFIG_COEFFICIENT_RANGE_CHECKING
assert(0 && "invalid highbd txfm input");
#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- memset(output, 0, sizeof(*output) * 8);
+ memset(output, 0, sizeof(*output) * 16);
return;
}
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
- memset(output, 0, 8 * sizeof(*output));
+ if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
+ x13 | x14 | x15)) {
+ memset(output, 0, 16 * sizeof(*output));
return;
}
// stage 1
- s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
- s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
- s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
- s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
- s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
- s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
- s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
- s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
+ s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+ s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
+ s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+ s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
+ s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+ s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+ s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+ s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+ s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+ s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+ s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+ s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+ s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
+ s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+ s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
- x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
- x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
- x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
- x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
- x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
- x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
- x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
- x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
+ x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
+ x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
+ x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
+ x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
+ x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
+ x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
+ x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
+ x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
// stage 2
s0 = x0;
s1 = x1;
s2 = x2;
s3 = x3;
- s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
- s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
- s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
- s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ s4 = x4;
+ s5 = x5;
+ s6 = x6;
+ s7 = x7;
+ s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+ s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+ s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+ s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+ s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+ s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+ s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+ s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+ x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
+ x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
+ x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
+ x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
+ x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
+ x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
+ x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
+ x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
+ x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
+ x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
+
+ // stage 3
+ s0 = x0;
+ s1 = x1;
+ s2 = x2;
+ s3 = x3;
+ s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+ s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+ s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+ s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+ s8 = x8;
+ s9 = x9;
+ s10 = x10;
+ s11 = x11;
+ s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+ s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+ s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+ s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
@@ -1630,53 +1836,50 @@ void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
+ x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
+ x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
+ x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
+ x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
+ x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
+ x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
- // stage 3
- s2 = cospi_16_64 * (x2 + x3);
+ // stage 4
+ s2 = (-cospi_16_64) * (x2 + x3);
s3 = cospi_16_64 * (x2 - x3);
s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (x6 - x7);
+ s7 = cospi_16_64 * (-x6 + x7);
+ s10 = cospi_16_64 * (x10 + x11);
+ s11 = cospi_16_64 * (-x10 + x11);
+ s14 = (-cospi_16_64) * (x14 + x15);
+ s15 = cospi_16_64 * (x14 - x15);
x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
+ x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
+ x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
+ x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
+ x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x4, bd);
- output[2] = HIGHBD_WRAPLOW(x6, bd);
- output[3] = HIGHBD_WRAPLOW(-x2, bd);
- output[4] = HIGHBD_WRAPLOW(x3, bd);
- output[5] = HIGHBD_WRAPLOW(-x7, bd);
- output[6] = HIGHBD_WRAPLOW(x5, bd);
- output[7] = HIGHBD_WRAPLOW(-x1, bd);
-}
-
-void vpx_highbd_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest8,
- int stride, int bd) {
- int i, j;
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- tran_low_t temp_in[8], temp_out[8];
- uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
-
- // First transform rows
- // Only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- vpx_highbd_idct8_c(input, outptr, bd);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- vpx_highbd_idct8_c(temp_in, temp_out, bd);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = highbd_clip_pixel_add(
- dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
- }
- }
+ output[1] = HIGHBD_WRAPLOW(-x8, bd);
+ output[2] = HIGHBD_WRAPLOW(x12, bd);
+ output[3] = HIGHBD_WRAPLOW(-x4, bd);
+ output[4] = HIGHBD_WRAPLOW(x6, bd);
+ output[5] = HIGHBD_WRAPLOW(x14, bd);
+ output[6] = HIGHBD_WRAPLOW(x10, bd);
+ output[7] = HIGHBD_WRAPLOW(x2, bd);
+ output[8] = HIGHBD_WRAPLOW(x3, bd);
+ output[9] = HIGHBD_WRAPLOW(x11, bd);
+ output[10] = HIGHBD_WRAPLOW(x15, bd);
+ output[11] = HIGHBD_WRAPLOW(x7, bd);
+ output[12] = HIGHBD_WRAPLOW(x5, bd);
+ output[13] = HIGHBD_WRAPLOW(-x13, bd);
+ output[14] = HIGHBD_WRAPLOW(x9, bd);
+ output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
@@ -1879,181 +2082,33 @@ void vpx_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
}
}
-void vpx_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
- tran_high_t s9, s10, s11, s12, s13, s14, s15;
- tran_low_t x0 = input[15];
- tran_low_t x1 = input[0];
- tran_low_t x2 = input[13];
- tran_low_t x3 = input[2];
- tran_low_t x4 = input[11];
- tran_low_t x5 = input[4];
- tran_low_t x6 = input[9];
- tran_low_t x7 = input[6];
- tran_low_t x8 = input[7];
- tran_low_t x9 = input[8];
- tran_low_t x10 = input[5];
- tran_low_t x11 = input[10];
- tran_low_t x12 = input[3];
- tran_low_t x13 = input[12];
- tran_low_t x14 = input[1];
- tran_low_t x15 = input[14];
- (void)bd;
+void vpx_highbd_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest8,
+ int stride, int bd) {
+ int i, j;
+ tran_low_t out[16 * 16] = { 0 };
+ tran_low_t *outptr = out;
+ tran_low_t temp_in[16], temp_out[16];
+ uint16_t *const dest = CONVERT_TO_SHORTPTR(dest8);
- if (detect_invalid_highbd_input(input, 16)) {
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
- assert(0 && "invalid highbd txfm input");
-#endif // CONFIG_COEFFICIENT_RANGE_CHECKING
- memset(output, 0, sizeof(*output) * 16);
- return;
+ // First transform rows. Since all non-zero dct coefficients are in
+ // upper-left 8x8 area, we only need to calculate first 8 rows here.
+ for (i = 0; i < 8; ++i) {
+ vpx_highbd_idct16_c(input, outptr, bd);
+ input += 16;
+ outptr += 16;
}
- if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
- x13 | x14 | x15)) {
- memset(output, 0, 16 * sizeof(*output));
- return;
+ // Then transform columns
+ for (i = 0; i < 16; ++i) {
+ uint16_t *destT = dest;
+ for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
+ vpx_highbd_idct16_c(temp_in, temp_out, bd);
+ for (j = 0; j < 16; ++j) {
+ destT[i] = highbd_clip_pixel_add(destT[i],
+ ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+ destT += stride;
+ }
}
-
- // stage 1
- s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
- s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
- s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
- s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
- s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
- s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
- s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
- s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
- s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
- s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
- s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
- s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
- s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
- s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
- s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
- s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
-
- x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
- x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
- x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
- x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
- x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
- x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
- x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
- x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
- x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
- x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
- x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
- x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
- x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
- x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
- x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
- x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
-
- // stage 2
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4;
- s5 = x5;
- s6 = x6;
- s7 = x7;
- s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
- s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
- s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
- s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
- s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
- s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
- s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
- s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
- x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
- x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
- x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
- x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
- x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
- x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
- x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
- x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
- x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
- x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
- x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
- x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
- x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
- x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
-
- // stage 3
- s0 = x0;
- s1 = x1;
- s2 = x2;
- s3 = x3;
- s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
- s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
- s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
- s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
- s8 = x8;
- s9 = x9;
- s10 = x10;
- s11 = x11;
- s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
- s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
- s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
- s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
-
- x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
- x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
- x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
- x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
- x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
- x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
- x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
- x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
- x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
- x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
- x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
- x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
- x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
- x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
- x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
- x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
-
- // stage 4
- s2 = (-cospi_16_64) * (x2 + x3);
- s3 = cospi_16_64 * (x2 - x3);
- s6 = cospi_16_64 * (x6 + x7);
- s7 = cospi_16_64 * (-x6 + x7);
- s10 = cospi_16_64 * (x10 + x11);
- s11 = cospi_16_64 * (-x10 + x11);
- s14 = (-cospi_16_64) * (x14 + x15);
- s15 = cospi_16_64 * (x14 - x15);
-
- x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
- x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
- x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
- x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
- x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
- x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
- x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
- x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
-
- output[0] = HIGHBD_WRAPLOW(x0, bd);
- output[1] = HIGHBD_WRAPLOW(-x8, bd);
- output[2] = HIGHBD_WRAPLOW(x12, bd);
- output[3] = HIGHBD_WRAPLOW(-x4, bd);
- output[4] = HIGHBD_WRAPLOW(x6, bd);
- output[5] = HIGHBD_WRAPLOW(x14, bd);
- output[6] = HIGHBD_WRAPLOW(x10, bd);
- output[7] = HIGHBD_WRAPLOW(x2, bd);
- output[8] = HIGHBD_WRAPLOW(x3, bd);
- output[9] = HIGHBD_WRAPLOW(x11, bd);
- output[10] = HIGHBD_WRAPLOW(x15, bd);
- output[11] = HIGHBD_WRAPLOW(x7, bd);
- output[12] = HIGHBD_WRAPLOW(x5, bd);
- output[13] = HIGHBD_WRAPLOW(-x13, bd);
- output[14] = HIGHBD_WRAPLOW(x9, bd);
- output[15] = HIGHBD_WRAPLOW(-x1, bd);
}
void vpx_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c
index 52a24ed379a..48b841969b2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/avg_msa.c
@@ -7,6 +7,7 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <stdlib.h>
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/mips/macros_msa.h"
@@ -54,3 +55,672 @@ uint32_t vpx_avg_4x4_msa(const uint8_t *src, int32_t src_stride) {
return sum_out;
}
+
+// MSA implementation of the 8x8 Hadamard transform (per the function name).
+// Loads 8 vectors from src, applies three BUTTERFLY_8 stages followed by an
+// 8x8 transpose, repeats the same three stages and transpose, then stores the
+// 8 result vectors to dst with a store stride of 8.
+//   src        - input int16 samples.
+//   src_stride - stride passed to LD_SH8 (presumably in int16 elements).
+//   dst        - output buffer; 8 vectors are written.
+// NOTE(review): LD_SH8, BUTTERFLY_8, TRANSPOSE8x8_SH_SH and ST_SH8 are macros
+// defined outside this hunk; the permuted register order in each call is
+// presumably significant and should not be rearranged -- confirm against
+// macros_msa.h.
+void vpx_hadamard_8x8_msa(const int16_t *src, int src_stride, int16_t *dst) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ // First pass: three butterfly stages, then transpose.
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ // Second pass: the same three stages on the transposed data, then transpose
+ // again before storing.
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst, 8);
+}
+
+// MSA implementation of the 16x16 Hadamard transform (per the function name).
+// Structure visible in the code:
+//   1. For each group of 8 input rows, two vectors are loaded per row (at src
+//      and src + 8). Each set of 8 vectors goes through the same
+//      butterfly/transpose sequence used by the 8x8 routine, and the four
+//      results are stored at dst, dst + 64, dst + 2 * 64 and dst + 3 * 64.
+//   2. Four combining passes then reload vectors from those four 64-element
+//      regions, apply BUTTERFLY_8, SRA_4V(..., 1), BUTTERFLY_8, and store the
+//      results back in place, advancing dst by 16 between passes.
+//   src        - input int16 samples; 16 rows are consumed.
+//   src_stride - amount added to src after each row (int16 elements).
+//   dst        - output buffer, also used as scratch between the two steps.
+// NOTE(review): the LD_*/ST_*/BUTTERFLY_8/TRANSPOSE8x8_SH_SH/SRA_4V macros are
+// defined outside this hunk. The comments below assume the first 8 arguments
+// of BUTTERFLY_8 and TRANSPOSE8x8_SH_SH are inputs and the last 8 are
+// outputs -- confirm against macros_msa.h.
+void vpx_hadamard_16x16_msa(const int16_t *src, int src_stride, int16_t *dst) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+ v8i16 src11, src12, src13, src14, src15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v8i16 tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+
+ // Rows 0..7: src0..src7 take the first vector of each row, src8..src15 the
+ // second (src + 8).
+ LD_SH2(src, 8, src0, src8);
+ src += src_stride;
+ LD_SH2(src, 8, src1, src9);
+ src += src_stride;
+ LD_SH2(src, 8, src2, src10);
+ src += src_stride;
+ LD_SH2(src, 8, src3, src11);
+ src += src_stride;
+ LD_SH2(src, 8, src4, src12);
+ src += src_stride;
+ LD_SH2(src, 8, src5, src13);
+ src += src_stride;
+ LD_SH2(src, 8, src6, src14);
+ src += src_stride;
+ LD_SH2(src, 8, src7, src15);
+ src += src_stride;
+
+ // First butterfly stage for both halves.
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+ // Remaining stages for the first half (tmp0..tmp7); result goes to dst.
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ // NOTE(review): the fourth output is written to src11 rather than src3 and
+ // stored from there. src11 was already consumed by the first-stage butterfly
+ // above and is overwritten again by the next BUTTERFLY_8, so this looks
+ // harmless -- confirm it is intentional.
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src11, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src11, src4, src5, src6, src7, dst, 8);
+
+ // Remaining stages for the second half (tmp8..tmp15); result is kept in
+ // res0..res7 and stored to dst + 64 further down.
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+ src9, src10, src11, src12, src13, src14, src15);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+ res1, res2, res3, res4, res5, res6, res7);
+
+ // Rows 8..15 are loaded next; the store of res0..res7 is placed between the
+ // two groups of loads.
+ LD_SH2(src, 8, src0, src8);
+ src += src_stride;
+ LD_SH2(src, 8, src1, src9);
+ src += src_stride;
+ LD_SH2(src, 8, src2, src10);
+ src += src_stride;
+ LD_SH2(src, 8, src3, src11);
+ src += src_stride;
+
+ ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 64, 8);
+
+ LD_SH2(src, 8, src4, src12);
+ src += src_stride;
+ LD_SH2(src, 8, src5, src13);
+ src += src_stride;
+ LD_SH2(src, 8, src6, src14);
+ src += src_stride;
+ LD_SH2(src, 8, src7, src15);
+ src += src_stride;
+
+ // Same sequence as for rows 0..7; results go to dst + 2 * 64 and
+ // dst + 3 * 64.
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+ BUTTERFLY_8(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7, tmp3,
+ tmp4, tmp5, tmp1, tmp6, tmp2);
+ TRANSPOSE8x8_SH_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+ src2, src3, src4, src5, src6, src7);
+ ST_SH8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 2 * 64, 8);
+
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, src8,
+ src9, src10, src11, src12, src13, src14, src15);
+ BUTTERFLY_8(src8, src10, src12, src14, src15, src13, src11, src9, tmp8, tmp10,
+ tmp12, tmp14, tmp15, tmp13, tmp11, tmp9);
+ BUTTERFLY_8(tmp8, tmp9, tmp12, tmp13, tmp15, tmp14, tmp11, tmp10, src8, src9,
+ src12, src13, src15, src14, src11, src10);
+ BUTTERFLY_8(src8, src9, src10, src11, src15, src14, src13, src12, tmp8, tmp15,
+ tmp11, tmp12, tmp13, tmp9, tmp14, tmp10);
+ TRANSPOSE8x8_SH_SH(tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, res0,
+ res1, res2, res3, res4, res5, res6, res7);
+ ST_SH8(res0, res1, res2, res3, res4, res5, res6, res7, dst + 3 * 64, 8);
+
+ // Combining pass 1 of 4: load one vector at dst and one at dst + 8 from each
+ // of the four 64-element regions (load stride 64), butterfly, shift each
+ // intermediate right by 1 (SRA_4V; presumably an arithmetic shift),
+ // butterfly again, and store back to the same locations.
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ // Combining pass 2 of 4 (next 16 values of each region).
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ // Combining pass 3 of 4.
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+ dst += 16;
+
+ // Combining pass 4 of 4; dst is not advanced afterwards.
+ LD_SH4(dst, 64, src0, src1, src2, src3);
+ LD_SH4(dst + 8, 64, src4, src5, src6, src7);
+
+ BUTTERFLY_8(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2, tmp4,
+ tmp6, tmp7, tmp5, tmp3, tmp1);
+ SRA_4V(tmp0, tmp1, tmp2, tmp3, 1);
+ SRA_4V(tmp4, tmp5, tmp6, tmp7, 1);
+ BUTTERFLY_8(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1, src4,
+ src5, src7, src6, src3, src2);
+
+ ST_SH4(src0, src1, src2, src3, dst, 64);
+ ST_SH4(src4, src5, src6, src7, dst + 8, 64);
+}
+
+// Returns an accumulated sum over `length` int16 values starting at `data`.
+// Lengths 16, 64, 256 and 1024 use MSA vector paths; every other length falls
+// back to a scalar loop that sums abs(data[i]).
+// NOTE(review): the vector paths presumably compute the same sum of absolute
+// values: __msa_asub_s_h(x, zero) looks like |x - 0| per halfword,
+// __msa_hadd_u_w(a, a) looks like a pairwise widening add into 32-bit lanes,
+// and HADD_UW_U32 looks like a reduction of the 32-bit lanes to a scalar --
+// confirm against the MSA intrinsic documentation and macros_msa.h.
+int vpx_satd_msa(const int16_t *data, int length) {
+ int i, satd;
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 src8, src9, src10, src11, src12, src13, src14, src15;
+ v8i16 zero = { 0 };
+ v8u16 tmp0_h, tmp1_h, tmp2_h, tmp3_h, tmp4_h, tmp5_h, tmp6_h, tmp7_h;
+ // Accumulator. The 256 and 1024 paths only use += and so rely on this zero
+ // initializer; the 16 and 64 paths overwrite it with their first term.
+ v4u32 tmp0_w = { 0 };
+
+ if (16 == length) {
+ // Two vectors loaded 8 elements apart cover the 16 inputs.
+ LD_SH2(data, 8, src0, src1);
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (64 == length) {
+ // Eight vectors loaded 8 elements apart cover the 64 inputs.
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w = __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (256 == length) {
+ // Each iteration advances data by 2 * 64 elements, so 2 iterations cover
+ // the 256 inputs.
+ for (i = 0; i < 2; ++i) {
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ data += 8 * 8;
+ LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+ data += 8 * 8;
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+ }
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else if (1024 == length) {
+ // Same loop body as the 256 case; 8 iterations of 2 * 64 elements cover
+ // the 1024 inputs.
+ for (i = 0; i < 8; ++i) {
+ LD_SH8(data, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ data += 8 * 8;
+ LD_SH8(data, 8, src8, src9, src10, src11, src12, src13, src14, src15);
+ data += 8 * 8;
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src0, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src1, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src2, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src3, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src4, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src5, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src6, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src7, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+
+ tmp0_h = (v8u16)__msa_asub_s_h(src8, zero);
+ tmp1_h = (v8u16)__msa_asub_s_h(src9, zero);
+ tmp2_h = (v8u16)__msa_asub_s_h(src10, zero);
+ tmp3_h = (v8u16)__msa_asub_s_h(src11, zero);
+ tmp4_h = (v8u16)__msa_asub_s_h(src12, zero);
+ tmp5_h = (v8u16)__msa_asub_s_h(src13, zero);
+ tmp6_h = (v8u16)__msa_asub_s_h(src14, zero);
+ tmp7_h = (v8u16)__msa_asub_s_h(src15, zero);
+
+ tmp0_w += __msa_hadd_u_w(tmp0_h, tmp0_h);
+ tmp0_w += __msa_hadd_u_w(tmp1_h, tmp1_h);
+ tmp0_w += __msa_hadd_u_w(tmp2_h, tmp2_h);
+ tmp0_w += __msa_hadd_u_w(tmp3_h, tmp3_h);
+ tmp0_w += __msa_hadd_u_w(tmp4_h, tmp4_h);
+ tmp0_w += __msa_hadd_u_w(tmp5_h, tmp5_h);
+ tmp0_w += __msa_hadd_u_w(tmp6_h, tmp6_h);
+ tmp0_w += __msa_hadd_u_w(tmp7_h, tmp7_h);
+ }
+
+ satd = HADD_UW_U32(tmp0_w);
+ } else {
+ // Scalar fallback for any other length.
+ satd = 0;
+
+ for (i = 0; i < length; ++i) {
+ satd += abs(data[i]);
+ }
+ }
+
+ return satd;
+}
+
+void vpx_int_pro_row_msa(int16_t hbuf[16], const uint8_t *ref,
+ const int ref_stride, const int height) { /* Sums 'height' rows of 16 bytes starting at 'ref' column by column and stores the 16 sums, scaled by height >> 1, in hbuf[0..15]. */
+ int i;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v8i16 hbuf_r = { 0 };
+ v8i16 hbuf_l = { 0 };
+ v8i16 ref0_r, ref0_l, ref1_r, ref1_l, ref2_r, ref2_l, ref3_r, ref3_l;
+ v8i16 ref4_r, ref4_l, ref5_r, ref5_l, ref6_r, ref6_l, ref7_r, ref7_l;
+ /* Heights 16, 32 and 64 take unrolled MSA paths; any other height uses the scalar loop in the final else branch. */
+ if (16 == height) {
+ for (i = 2; i--;) { /* 2 passes x 8 rows = 16 rows */
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); /* presumably loads 8 rows, ref_stride apart (macro defined elsewhere) */
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l); /* presumably widens the 16 bytes into two 8 x int16 halves (macro defined elsewhere) */
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l, /* presumably adds each input pair into hbuf_r / hbuf_l (macro defined elsewhere) */
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 3); /* shift by 3: same scaling as norm_factor = 16 >> 1 in the scalar path */
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else if (32 == height) {
+ for (i = 2; i--;) { /* 2 passes x 16 rows = 32 rows */
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); /* second group of 8 rows in this pass */
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 4); /* shift by 4: same scaling as norm_factor = 32 >> 1 in the scalar path */
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else if (64 == height) {
+ for (i = 4; i--;) { /* 4 passes x 16 rows = 64 rows */
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ LD_UB8(ref, ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7); /* second group of 8 rows in this pass */
+ ref += 8 * ref_stride;
+ UNPCK_UB_SH(ref0, ref0_r, ref0_l);
+ UNPCK_UB_SH(ref1, ref1_r, ref1_l);
+ UNPCK_UB_SH(ref2, ref2_r, ref2_l);
+ UNPCK_UB_SH(ref3, ref3_r, ref3_l);
+ UNPCK_UB_SH(ref4, ref4_r, ref4_l);
+ UNPCK_UB_SH(ref5, ref5_r, ref5_l);
+ UNPCK_UB_SH(ref6, ref6_r, ref6_l);
+ UNPCK_UB_SH(ref7, ref7_r, ref7_l);
+ ADD4(hbuf_r, ref0_r, hbuf_l, ref0_l, hbuf_r, ref1_r, hbuf_l, ref1_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref2_r, hbuf_l, ref2_l, hbuf_r, ref3_r, hbuf_l, ref3_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref4_r, hbuf_l, ref4_l, hbuf_r, ref5_r, hbuf_l, ref5_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ ADD4(hbuf_r, ref6_r, hbuf_l, ref6_l, hbuf_r, ref7_r, hbuf_l, ref7_l,
+ hbuf_r, hbuf_l, hbuf_r, hbuf_l);
+ }
+
+ SRA_2V(hbuf_r, hbuf_l, 5); /* shift by 5: same scaling as norm_factor = 64 >> 1 in the scalar path */
+ ST_SH2(hbuf_r, hbuf_l, hbuf, 8);
+ } else {
+ const int norm_factor = height >> 1; /* divisor applied to every column sum */
+ int cnt;
+
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] = 0;
+ }
+
+ for (i = 0; i < height; ++i) { /* accumulate one 16-byte row per iteration */
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] += ref[cnt];
+ }
+
+ ref += ref_stride;
+ }
+
+ for (cnt = 0; cnt < 16; cnt++) {
+ hbuf[cnt] /= norm_factor;
+ }
+ }
+}
+
+int16_t vpx_int_pro_col_msa(const uint8_t *ref, const int width) { /* Returns the sum of the 'width' bytes starting at 'ref', held in an int16_t. */
+ int16_t sum;
+ v16u8 ref0, ref1, ref2, ref3;
+ v8u16 ref0_h;
+ /* Widths 16, 32 and 64 are summed with MSA vectors; any other width uses the scalar loop in the final else branch. */
+ if (16 == width) {
+ ref0 = LD_UB(ref);
+ ref0_h = __msa_hadd_u_h(ref0, ref0); /* adds adjacent unsigned bytes pairwise into 8 x uint16 lanes */
+ sum = HADD_UH_U32(ref0_h); /* presumably reduces the 8 lanes to one scalar (macro defined elsewhere) */
+ } else if (32 == width) {
+ LD_UB2(ref, 16, ref0, ref1);
+ ref0_h = __msa_hadd_u_h(ref0, ref0);
+ ref0_h += __msa_hadd_u_h(ref1, ref1); /* fold the second 16 bytes into the same lanes */
+ sum = HADD_UH_U32(ref0_h);
+ } else if (64 == width) {
+ LD_UB4(ref, 16, ref0, ref1, ref2, ref3);
+ ref0_h = __msa_hadd_u_h(ref0, ref0);
+ ref0_h += __msa_hadd_u_h(ref1, ref1);
+ ref0_h += __msa_hadd_u_h(ref2, ref2);
+ ref0_h += __msa_hadd_u_h(ref3, ref3);
+ sum = HADD_UH_U32(ref0_h);
+ } else {
+ int idx;
+
+ sum = 0;
+ for (idx = 0; idx < width; ++idx) { /* scalar reference path: plain byte accumulation */
+ sum += ref[idx];
+ }
+ }
+
+ return sum;
+}
+
+int vpx_vector_var_msa(const int16_t *ref, const int16_t *src, const int bwl) { /* Returns sse - ((mean * mean) >> (bwl + 2)), where sse and mean are the sum of squares and the sum of the 4 << bwl element-wise differences between 'ref' and 'src'. */
+ int sse, mean, var;
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7, ref0, ref1, ref2;
+ v8i16 ref3, ref4, ref5, ref6, ref7, src_l0_m, src_l1_m, src_l2_m, src_l3_m;
+ v8i16 src_l4_m, src_l5_m, src_l6_m, src_l7_m;
+ v4i32 res_l0_m, res_l1_m, res_l2_m, res_l3_m, res_l4_m, res_l5_m, res_l6_m;
+ v4i32 res_l7_m, mean_v;
+ v2i64 sse_v;
+ /* bwl 2, 3 and 4 (16, 32 and 64 elements) use MSA; any other bwl falls back to the scalar loop in the final else branch. */
+ if (2 == bwl) {
+ LD_SH2(src, 8, src0, src1);
+ LD_SH2(ref, 8, ref0, ref1);
+
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m); /* presumably interleaves src/ref halfwords so each pair can be subtracted (macro defined elsewhere) */
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m); /* presumably yields the per-pair differences as 32-bit lanes (macro defined elsewhere) */
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m); /* starts the 64-bit accumulation of squared differences */
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m; /* per-lane running sum of the differences */
+ mean_v += res_l2_m + res_l3_m;
+
+ sse_v += __msa_splati_d(sse_v, 1); /* fold 64-bit lane 1 into lane 0 */
+ sse = __msa_copy_s_w((v4i32)sse_v, 0); /* only word 0 of the vector is kept as the int result */
+
+ mean = HADD_SW_S32(mean_v);
+ } else if (3 == bwl) {
+ LD_SH4(src, 8, src0, src1, src2, src3);
+ LD_SH4(ref, 8, ref0, ref1, ref2, ref3);
+
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+
+ sse_v += __msa_splati_d(sse_v, 1);
+ sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+ mean = HADD_SW_S32(mean_v);
+ } else if (4 == bwl) {
+ LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_SH8(ref, 8, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ /* First half: vectors 0..3. */
+ ILVRL_H2_SH(src0, ref0, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src1, ref1, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src2, ref2, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src3, ref3, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ sse_v = __msa_dotp_s_d(res_l0_m, res_l0_m);
+ sse_v = __msa_dpadd_s_d(sse_v, res_l1_m, res_l1_m);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v = res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+ /* Second half: vectors 4..7, reusing the same temporaries and accumulators. */
+ ILVRL_H2_SH(src4, ref4, src_l0_m, src_l1_m);
+ ILVRL_H2_SH(src5, ref5, src_l2_m, src_l3_m);
+ ILVRL_H2_SH(src6, ref6, src_l4_m, src_l5_m);
+ ILVRL_H2_SH(src7, ref7, src_l6_m, src_l7_m);
+ HSUB_UH2_SW(src_l0_m, src_l1_m, res_l0_m, res_l1_m);
+ HSUB_UH2_SW(src_l2_m, src_l3_m, res_l2_m, res_l3_m);
+ HSUB_UH2_SW(src_l4_m, src_l5_m, res_l4_m, res_l5_m);
+ HSUB_UH2_SW(src_l6_m, src_l7_m, res_l6_m, res_l7_m);
+ DPADD_SD2_SD(res_l0_m, res_l1_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l2_m, res_l3_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l4_m, res_l5_m, sse_v, sse_v);
+ DPADD_SD2_SD(res_l6_m, res_l7_m, sse_v, sse_v);
+ mean_v += res_l0_m + res_l1_m;
+ mean_v += res_l2_m + res_l3_m;
+ mean_v += res_l4_m + res_l5_m;
+ mean_v += res_l6_m + res_l7_m;
+
+ sse_v += __msa_splati_d(sse_v, 1);
+ sse = __msa_copy_s_w((v4i32)sse_v, 0);
+
+ mean = HADD_SW_S32(mean_v);
+ } else {
+ int i;
+ const int width = 4 << bwl; /* element count for this bwl */
+
+ sse = 0;
+ mean = 0;
+
+ for (i = 0; i < width; ++i) { /* scalar reference path */
+ const int diff = ref[i] - src[i];
+
+ mean += diff;
+ sse += diff * diff;
+ }
+ }
+
+ var = sse - ((mean * mean) >> (bwl + 2)); /* the shift by bwl + 2 divides by the element count 4 << bwl */
+
+ return var;
+}
+
+void vpx_minmax_8x8_msa(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) { /* Stores in *min and *max the smallest and largest per-byte absolute difference between the rows read from 's' (stride p) and 'd' (stride dp). */
+ v16u8 s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7;
+ v16u8 diff0, diff1, diff2, diff3, min0, min1, max0, max1;
+ /* LD_UB8 and PCKEV_D4_UB are defined elsewhere; presumably they load 8 rows and pack the low 8 bytes of each row pair into one vector, giving 4 vectors per block. */
+ LD_UB8(s, p, s0, s1, s2, s3, s4, s5, s6, s7);
+ LD_UB8(d, dp, d0, d1, d2, d3, d4, d5, d6, d7);
+ PCKEV_D4_UB(s1, s0, s3, s2, s5, s4, s7, s6, s0, s1, s2, s3);
+ PCKEV_D4_UB(d1, d0, d3, d2, d5, d4, d7, d6, d0, d1, d2, d3);
+ /* Per-byte unsigned absolute differences. */
+ diff0 = __msa_asub_u_b(s0, d0);
+ diff1 = __msa_asub_u_b(s1, d1);
+ diff2 = __msa_asub_u_b(s2, d2);
+ diff3 = __msa_asub_u_b(s3, d3);
+ /* Lane-wise minimum across the four difference vectors. */
+ min0 = __msa_min_u_b(diff0, diff1);
+ min1 = __msa_min_u_b(diff2, diff3);
+ min0 = __msa_min_u_b(min0, min1);
+ /* Lane-wise maximum across the four difference vectors. */
+ max0 = __msa_max_u_b(diff0, diff1);
+ max1 = __msa_max_u_b(diff2, diff3);
+ max0 = __msa_max_u_b(max0, max1);
+ /* Horizontal reduction: slide by 8, 4, 2 and 1 bytes and combine, so lane 0 ends up covering all 16 lanes. */
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 8);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 8);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 4);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 4);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 2);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 2);
+ max0 = __msa_max_u_b(max0, max1);
+
+ min1 = (v16u8)__msa_sldi_b((v16i8)min1, (v16i8)min0, 1);
+ min0 = __msa_min_u_b(min0, min1);
+ max1 = (v16u8)__msa_sldi_b((v16i8)max1, (v16i8)max0, 1);
+ max0 = __msa_max_u_b(max0, max1);
+ /* Lane 0 now holds the fully reduced values. */
+ *min = min0[0];
+ *max = max0[0];
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
index e33ea740a9e..aafa272fbdf 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/deblock_msa.c
@@ -9,6 +9,7 @@
*/
#include <stdlib.h>
+
#include "./macros_msa.h"
extern const int16_t vpx_rv[];
@@ -295,6 +296,7 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
uint8_t *p_dst_st = dst_ptr;
uint8_t *f_orig = f;
uint16_t col;
+ uint64_t out0, out1, out2, out3;
v16u8 above2, above1, below2, below1;
v16u8 src, ref, ref_temp;
v16u8 inter0, inter1, inter2, inter3, inter4, inter5, inter6;
@@ -346,6 +348,67 @@ static void postproc_down_across_luma_msa(uint8_t *src_ptr, uint8_t *dst_ptr,
f += 16;
}
+ if (0 != (cols / 16)) {
+ ref = LD_UB(f);
+ LD_UB2(p_src - 2 * src_stride, src_stride, above2, above1);
+ src = LD_UB(p_src);
+ LD_UB2(p_src + 1 * src_stride, src_stride, below1, below2);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter0);
+ above2 = LD_UB(p_src + 3 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter1);
+ above1 = LD_UB(p_src + 4 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter2);
+ src = LD_UB(p_src + 5 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter3);
+ below1 = LD_UB(p_src + 6 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter4);
+ below2 = LD_UB(p_src + 7 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter5);
+ above2 = LD_UB(p_src + 8 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter6);
+ above1 = LD_UB(p_src + 9 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter7);
+ src = LD_UB(p_src + 10 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter8);
+ below1 = LD_UB(p_src + 11 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter9);
+ below2 = LD_UB(p_src + 12 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter10);
+ above2 = LD_UB(p_src + 13 * src_stride);
+ VPX_AVER_IF_RETAIN(above1, src, below1, below2, above2, ref, inter11);
+ above1 = LD_UB(p_src + 14 * src_stride);
+ VPX_AVER_IF_RETAIN(src, below1, below2, above2, above1, ref, inter12);
+ src = LD_UB(p_src + 15 * src_stride);
+ VPX_AVER_IF_RETAIN(below1, below2, above2, above1, src, ref, inter13);
+ below1 = LD_UB(p_src + 16 * src_stride);
+ VPX_AVER_IF_RETAIN(below2, above2, above1, src, below1, ref, inter14);
+ below2 = LD_UB(p_src + 17 * src_stride);
+ VPX_AVER_IF_RETAIN(above2, above1, src, below1, below2, ref, inter15);
+ out0 = __msa_copy_u_d((v2i64)inter0, 0);
+ out1 = __msa_copy_u_d((v2i64)inter1, 0);
+ out2 = __msa_copy_u_d((v2i64)inter2, 0);
+ out3 = __msa_copy_u_d((v2i64)inter3, 0);
+ SD4(out0, out1, out2, out3, p_dst, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter4, 0);
+ out1 = __msa_copy_u_d((v2i64)inter5, 0);
+ out2 = __msa_copy_u_d((v2i64)inter6, 0);
+ out3 = __msa_copy_u_d((v2i64)inter7, 0);
+ SD4(out0, out1, out2, out3, p_dst + 4 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter8, 0);
+ out1 = __msa_copy_u_d((v2i64)inter9, 0);
+ out2 = __msa_copy_u_d((v2i64)inter10, 0);
+ out3 = __msa_copy_u_d((v2i64)inter11, 0);
+ SD4(out0, out1, out2, out3, p_dst + 8 * dst_stride, dst_stride);
+
+ out0 = __msa_copy_u_d((v2i64)inter12, 0);
+ out1 = __msa_copy_u_d((v2i64)inter13, 0);
+ out2 = __msa_copy_u_d((v2i64)inter14, 0);
+ out3 = __msa_copy_u_d((v2i64)inter15, 0);
+ SD4(out0, out1, out2, out3, p_dst + 12 * dst_stride, dst_stride);
+ }
+
f = f_orig;
p_dst = dst_ptr - 2;
LD_UB8(p_dst, dst_stride, inter0, inter1, inter2, inter3, inter4, inter5,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
index 002e574aa8f..27b38865a42 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/macros_msa.h
@@ -1049,6 +1049,7 @@
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
/* Description : Interleave even byte elements from vectors
Arguments : Inputs - in0, in1, in2, in3
@@ -1559,6 +1560,12 @@
Details : Each element of vector 'in0' is right shifted by 'shift' and
the result is written in-place. 'shift' is a GP variable.
*/
+#define SRA_2V(in0, in1, shift) \
+ { /* Two-operand counterpart of SRA_4V: applies >> shift to in0 and in1 in place. NOTE(review): 'shift' is expanded without parentheses, so pass a simple expression. */ \
+ in0 = in0 >> shift; \
+ in1 = in1 >> shift; \
+ }
+
#define SRA_4V(in0, in1, in2, in3, shift) \
{ \
in0 = in0 >> shift; \
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c
new file mode 100644
index 00000000000..d4563dc410b
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/mips/sum_squares_msa.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "./macros_msa.h"
+
+uint64_t vpx_sum_squares_2d_i16_msa(const int16_t *src, int src_stride,
+ int size) { /* Returns the sum of squares of every element of the size x size block at 'src', whose rows are src_stride elements apart. */
+ int row, col;
+ uint64_t ss_res = 0;
+ v4i32 mul0, mul1;
+ v2i64 res0 = { 0 };
+ /* Sizes 4, 8, 16 and other multiples of 16 use MSA; every other size uses the scalar loop in the final else branch. */
+ if (4 == size) {
+ uint64_t src0, src1, src2, src3;
+ v8i16 diff0 = { 0 };
+ v8i16 diff1 = { 0 };
+
+ LD4(src, src_stride, src0, src1, src2, src3); /* presumably loads one 64-bit value (4 x int16) per row (macro defined elsewhere) */
+ INSERT_D2_SH(src0, src1, diff0);
+ INSERT_D2_SH(src2, src3, diff1);
+ DOTP_SH2_SW(diff0, diff1, diff0, diff1, mul0, mul1); /* presumably the dot product of each vector with itself, as 32-bit partial sums (macro defined elsewhere) */
+ mul0 += mul1;
+ res0 = __msa_hadd_s_d(mul0, mul0); /* add adjacent 32-bit lanes into 64-bit lanes */
+ res0 += __msa_splati_d(res0, 1); /* fold 64-bit lane 1 into lane 0 */
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (8 == size) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1); /* presumably accumulates further squared pairs into mul0 / mul1 (macro defined elsewhere) */
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1;
+ res0 = __msa_hadd_s_d(mul0, mul0);
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (16 == size) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); /* columns 0..7 of the first 8 rows */
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); /* columns 8..15 of the first 8 rows */
+ src += 8 * src_stride; /* advance to the last 8 rows */
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src + 8, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1; /* NOTE(review): a whole 16x16 tile is accumulated in 32-bit lanes before widening; confirm the input range keeps this from wrapping */
+ res0 += __msa_hadd_s_d(mul0, mul0);
+
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else if (0 == (size % 16)) {
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for (row = 0; row < (size >> 4); row++) { /* one band of 16 rows per iteration */
+ for (col = 0; col < size; col += 16) { /* one 16x16 tile per iteration, same sequence as the 16 == size branch */
+ const int16_t *src_ptr = src + col;
+ LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ DOTP_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+ src6, src7);
+ src_ptr += 8 * src_stride;
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6,
+ src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ LD_SH8(src_ptr + 8, src_stride, src0, src1, src2, src3, src4, src5,
+ src6, src7);
+ DPADD_SH2_SW(src0, src1, src0, src1, mul0, mul1);
+ DPADD_SH2_SW(src2, src3, src2, src3, mul0, mul1);
+ DPADD_SH2_SW(src4, src5, src4, src5, mul0, mul1);
+ DPADD_SH2_SW(src6, src7, src6, src7, mul0, mul1);
+ mul0 += mul1;
+ res0 += __msa_hadd_s_d(mul0, mul0); /* 32-bit partial sums are widened into the 64-bit accumulator once per tile */
+ }
+
+ src += 16 * src_stride;
+ }
+
+ res0 += __msa_splati_d(res0, 1);
+ ss_res = (uint64_t)__msa_copy_s_d(res0, 0);
+ } else {
+ int16_t val;
+
+ for (row = 0; row < size; row++) { /* scalar reference path */
+ for (col = 0; col < size; col++) {
+ val = src[col];
+ ss_res += val * val;
+ }
+
+ src += src_stride;
+ }
+ }
+
+ return ss_res;
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h
index 5656ddbab4d..f1cc0eaa105 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/prob.h
@@ -48,7 +48,7 @@ typedef const vpx_tree_index vpx_tree[];
static INLINE vpx_prob get_prob(unsigned int num, unsigned int den) {
assert(den != 0);
{
- const int p = (int)(((int64_t)num * 256 + (den >> 1)) / den);
+ const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
// (p > 255) ? 255 : (p < 1) ? 1 : p;
const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
return (vpx_prob)clipped_prob;
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
index bb20ea27421..ca6e5ca9a83 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,13 @@ DSP_SRCS-yes += vpx_dsp_common.h
DSP_SRCS-$(HAVE_MSA) += mips/macros_msa.h
+DSP_SRCS-$(HAVE_AVX2) += x86/bitdepth_conversion_avx2.h
+DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2.h
+# This file is included in libs.mk. Including it here would cause it to be
+# compiled into an object. Even as an empty file, this would create an
+# executable section on the stack.
+#DSP_SRCS-$(HAVE_SSE2) += x86/bitdepth_conversion_sse2$(ASM)
+
# bit reader
DSP_SRCS-yes += prob.h
DSP_SRCS-yes += prob.c
@@ -195,9 +202,7 @@ DSP_SRCS-yes += inv_txfm.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.h
DSP_SRCS-$(HAVE_SSE2) += x86/inv_txfm_sse2.c
DSP_SRCS-$(HAVE_SSE2) += x86/inv_wht_sse2.asm
-ifeq ($(ARCH_X86_64),yes)
-DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3_x86_64.asm
-endif # ARCH_X86_64
+DSP_SRCS-$(HAVE_SSSE3) += x86/inv_txfm_ssse3.c
DSP_SRCS-$(HAVE_NEON_ASM) += arm/save_reg_neon$(ASM)
@@ -217,26 +222,23 @@ DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
else # CONFIG_VP9_HIGHBITDEPTH
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct4x4_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct8x8_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct16x16_add_neon.c
+DSP_SRCS-$(HAVE_NEON) += arm/highbd_idct32x32_add_neon.c
endif # !CONFIG_VP9_HIGHBITDEPTH
ifeq ($(HAVE_NEON_ASM),yes)
DSP_SRCS-yes += arm/idct_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_1_add_neon$(ASM)
DSP_SRCS-yes += arm/idct4x4_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct8x8_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_1_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_add_neon$(ASM)
-DSP_SRCS-yes += arm/idct16x16_neon.c
else
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct4x4_add_neon.c
+endif # HAVE_NEON_ASM
+DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct8x8_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct16x16_add_neon.c
-endif # HAVE_NEON_ASM
-DSP_SRCS-$(HAVE_NEON) += arm/idct_neon.h
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_1_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_34_add_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/idct32x32_135_add_neon.c
@@ -249,7 +251,6 @@ ifeq ($(CONFIG_VP9_ENCODER),yes)
DSP_SRCS-yes += quantize.c
DSP_SRCS-yes += quantize.h
-DSP_SRCS-$(HAVE_SSE2) += x86/fdct.h
DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
@@ -276,6 +277,7 @@ DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
+DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c
DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ee1b2927938..a17bda582e4 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -536,10 +536,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_fdct4x4_1 sse2/;
add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct8x8 sse2/;
+ specialize qw/vpx_fdct8x8 neon sse2/;
add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/vpx_fdct8x8_1 sse2/;
+ specialize qw/vpx_fdct8x8_1 neon sse2/;
add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/vpx_fdct16x16 sse2/;
@@ -624,13 +624,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_idct8x8_1_add neon/;
add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct16x16_1_add neon/;
add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
add_proto qw/void vpx_highbd_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct32x32_1_add sse2/;
+ specialize qw/vpx_highbd_idct32x32_1_add neon sse2/;
add_proto qw/void vpx_highbd_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
@@ -650,6 +651,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
@@ -670,6 +673,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
} else {
add_proto qw/void vpx_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int stride";
@@ -679,10 +684,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct4x4_1_add neon sse2/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_64_add neon sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_64_add neon sse2 ssse3/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_12_add neon sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_12_add neon sse2 ssse3/;
add_proto qw/void vpx_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct8x8_1_add neon sse2/;
@@ -690,6 +695,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_256_add neon sse2/;
+ add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct16x16_38_add neon sse2/;
+ $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
+
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_10_add neon sse2/;
@@ -697,15 +706,15 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct16x16_1_add neon sse2/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1024_add neon sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_1024_add neon sse2 ssse3/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_135_add neon sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
# Need to add 135 eob idct32x32 implementations.
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_34_add neon sse2/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add neon sse2/;
@@ -720,10 +729,14 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_highbd_idct8x8_12_add neon sse2/;
add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct16x16_256_add sse2/;
+ specialize qw/vpx_highbd_idct16x16_256_add neon sse2/;
+
+ add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
+ specialize qw/vpx_highbd_idct16x16_38_add neon sse2/;
+ $vpx_highbd_idct16x16_38_add_sse2=vpx_highbd_idct16x16_256_add_sse2;
add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride, int bd";
- specialize qw/vpx_highbd_idct16x16_10_add sse2/;
+ specialize qw/vpx_highbd_idct16x16_10_add neon sse2/;
} # CONFIG_EMULATE_HARDWARE
} else {
# Force C versions if CONFIG_EMULATE_HARDWARE is 1
@@ -742,6 +755,8 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
@@ -766,10 +781,10 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
specialize qw/vpx_idct8x8_1_add sse2 neon dspr2 msa/;
add_proto qw/void vpx_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_64_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_64_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct8x8_12_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/vpx_idct8x8_12_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_1_add sse2 neon dspr2 msa/;
@@ -777,20 +792,26 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/void vpx_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_256_add sse2 neon dspr2 msa/;
+ add_proto qw/void vpx_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int stride";
+ specialize qw/vpx_idct16x16_38_add sse2 neon dspr2 msa/;
+ $vpx_idct16x16_38_add_sse2=vpx_idct16x16_256_add_sse2;
+ $vpx_idct16x16_38_add_dspr2=vpx_idct16x16_256_add_dspr2;
+ $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
+
add_proto qw/void vpx_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct16x16_10_add sse2 neon dspr2 msa/;
add_proto qw/void vpx_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_1024_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_1024_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_135_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_135_add sse2 ssse3 neon dspr2 msa/;
$vpx_idct32x32_135_add_sse2=vpx_idct32x32_1024_add_sse2;
$vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
$vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
add_proto qw/void vpx_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int stride";
- specialize qw/vpx_idct32x32_34_add sse2 neon dspr2 msa/, "$ssse3_x86_64";
+ specialize qw/vpx_idct32x32_34_add sse2 ssse3 neon dspr2 msa/;
add_proto qw/void vpx_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int stride";
specialize qw/vpx_idct32x32_1_add sse2 neon dspr2 msa/;
@@ -883,25 +904,37 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
specialize qw/vpx_avg_4x4 sse2 neon msa/;
add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
- specialize qw/vpx_minmax_8x8 sse2 neon/;
+ specialize qw/vpx_minmax_8x8 sse2 neon msa/;
- add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
- add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
- specialize qw/vpx_hadamard_16x16 sse2 neon/;
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, tran_low_t *coeff";
+ specialize qw/vpx_hadamard_16x16 sse2 neon/;
+
+ add_proto qw/int vpx_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/vpx_satd sse2 neon/;
+ } else {
+ add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_8x8 sse2 neon msa/, "$ssse3_x86_64";
+
+ add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
+ specialize qw/vpx_hadamard_16x16 sse2 neon msa/;
- add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
- specialize qw/vpx_satd sse2 neon/;
+ add_proto qw/int vpx_satd/, "const int16_t *coeff, int length";
+ specialize qw/vpx_satd sse2 neon msa/;
+ }
add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
- specialize qw/vpx_int_pro_row sse2 neon/;
+ specialize qw/vpx_int_pro_row sse2 neon msa/;
add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
- specialize qw/vpx_int_pro_col sse2 neon/;
+ specialize qw/vpx_int_pro_col sse2 neon msa/;
add_proto qw/int vpx_vector_var/, "const int16_t *ref, const int16_t *src, const int bwl";
- specialize qw/vpx_vector_var neon sse2/;
+ specialize qw/vpx_vector_var neon sse2 msa/;
} # CONFIG_VP9_ENCODER
add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
@@ -1039,7 +1072,7 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const
specialize qw/vpx_sad4x4x4d msa sse2/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 sse2/;
+specialize qw/vpx_sum_squares_2d_i16 sse2 msa/;
#
# Structured Similarity (SSIM)
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index b0a104bad06..4e89e07e580 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -11,6 +11,8 @@
#include <emmintrin.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
#include "vpx_ports/mem.h"
void vpx_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -213,7 +215,7 @@ static void hadamard_col8_sse2(__m128i *in, int iter) {
}
void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
__m128i src[8];
src[0] = _mm_load_si128((const __m128i *)src_diff);
src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
@@ -227,25 +229,25 @@ void vpx_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
hadamard_col8_sse2(src, 0);
hadamard_col8_sse2(src, 1);
- _mm_store_si128((__m128i *)coeff, src[0]);
+ store_tran_low(src[0], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[1]);
+ store_tran_low(src[1], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[2]);
+ store_tran_low(src[2], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[3]);
+ store_tran_low(src[3], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[4]);
+ store_tran_low(src[4], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[5]);
+ store_tran_low(src[5], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[6]);
+ store_tran_low(src[6], coeff);
coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[7]);
+ store_tran_low(src[7], coeff);
}
void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
- int16_t *coeff) {
+ tran_low_t *coeff) {
int idx;
for (idx = 0; idx < 4; ++idx) {
int16_t const *src_ptr =
@@ -254,10 +256,10 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
}
for (idx = 0; idx < 64; idx += 8) {
- __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
- __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
- __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
- __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+ __m128i coeff0 = load_tran_low(coeff);
+ __m128i coeff1 = load_tran_low(coeff + 64);
+ __m128i coeff2 = load_tran_low(coeff + 128);
+ __m128i coeff3 = load_tran_low(coeff + 192);
__m128i b0 = _mm_add_epi16(coeff0, coeff1);
__m128i b1 = _mm_sub_epi16(coeff0, coeff1);
@@ -271,25 +273,25 @@ void vpx_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
coeff0 = _mm_add_epi16(b0, b2);
coeff1 = _mm_add_epi16(b1, b3);
- _mm_store_si128((__m128i *)coeff, coeff0);
- _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+ store_tran_low(coeff0, coeff);
+ store_tran_low(coeff1, coeff + 64);
coeff2 = _mm_sub_epi16(b0, b2);
coeff3 = _mm_sub_epi16(b1, b3);
- _mm_store_si128((__m128i *)(coeff + 128), coeff2);
- _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+ store_tran_low(coeff2, coeff + 128);
+ store_tran_low(coeff3, coeff + 192);
coeff += 8;
}
}
-int vpx_satd_sse2(const int16_t *coeff, int length) {
+int vpx_satd_sse2(const tran_low_t *coeff, int length) {
int i;
const __m128i zero = _mm_setzero_si128();
__m128i accum = zero;
for (i = 0; i < length; i += 8) {
- const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i src_line = load_tran_low(coeff);
const __m128i inv = _mm_sub_epi16(zero, src_line);
const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line)
const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
index 26412e8e432..22e0a086cc2 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/avg_ssse3_x86_64.asm
@@ -8,42 +8,50 @@
; be found in the AUTHORS file in the root of the source tree.
;
-%define private_prefix vpx
-
%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the hadamard transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
%if ARCH_X86_64
; matrix transpose
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
+; Transpose the 8x8 block of 16 bit values held in m%1-m%8 (one row per
+; register). m%9 and m%10 are scratch registers and are clobbered.
+; Going by the "outN" labels in stage 3, the trailing SWAPs rename the
+; registers so that m%1-m%8 name out0-out7 in order when the macro ends.
+%macro TRANSPOSE8X8 10
+ ; stage 1: interleave 16 bit words of adjacent row pairs
+ punpcklwd m%9, m%1, m%2
+ punpcklwd m%10, m%3, m%4
+ punpckhwd m%1, m%2
+ punpckhwd m%3, m%4
+
+ punpcklwd m%2, m%5, m%6
+ punpcklwd m%4, m%7, m%8
+ punpckhwd m%5, m%6
+ punpckhwd m%7, m%8
+
+ ; stage 2: interleave 32 bit dwords of the stage 1 results
+ punpckldq m%6, m%9, m%10
+ punpckldq m%8, m%1, m%3
+ punpckhdq m%9, m%10
+ punpckhdq m%1, m%3
+
+ punpckldq m%10, m%2, m%4
+ punpckldq m%3, m%5, m%7
+ punpckhdq m%2, m%4
+ punpckhdq m%5, m%7
+
+ ; stage 3: interleave 64 bit qwords to form the transposed rows
+ punpckhqdq m%4, m%9, m%2 ; out3
+ punpcklqdq m%9, m%2 ; out2
+ punpcklqdq m%7, m%1, m%5 ; out6
+ punpckhqdq m%1, m%5 ; out7
+
+ punpckhqdq m%2, m%6, m%10 ; out1
+ punpcklqdq m%6, m%10 ; out0
+ punpcklqdq m%5, m%8, m%3 ; out4
+ punpckhqdq m%8, m%3 ; out5
+
+ ; Rename only (no data is moved): out0 -> m%1, out2 -> m%3, out7 -> m%8 and
+ ; out5 -> m%6. The other rows are already in the matching register.
+ SWAP %6, %1
+ SWAP %3, %9
+ SWAP %8, %6
%endmacro
%macro HMD8_1D 0
@@ -87,8 +95,9 @@ SECTION .text
SWAP 7, 9
%endmacro
+
INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+cglobal hadamard_8x8, 3, 5, 11, input, stride, output
lea r3, [2 * strideq]
lea r4, [4 * strideq]
@@ -105,17 +114,17 @@ cglobal hadamard_8x8, 3, 5, 10, input, stride, output
mova m7, [inputq + r3]
HMD8_1D
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10
HMD8_1D
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
+ STORE_TRAN_LOW 0, outputq, 0, 8, 9
+ STORE_TRAN_LOW 1, outputq, 8, 8, 9
+ STORE_TRAN_LOW 2, outputq, 16, 8, 9
+ STORE_TRAN_LOW 3, outputq, 24, 8, 9
+ STORE_TRAN_LOW 4, outputq, 32, 8, 9
+ STORE_TRAN_LOW 5, outputq, 40, 8, 9
+ STORE_TRAN_LOW 6, outputq, 48, 8, 9
+ STORE_TRAN_LOW 7, outputq, 56, 8, 9
RET
%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 00000000000..b9116f04981
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Load 16 values into 16 bit lanes without requiring alignment. 32 bit sources
+// are packed with signed saturation within each 128 bit lane (not in order).
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ return _mm256_packs_epi32(_mm256_loadu_si256((const __m256i *)a),
+ _mm256_loadu_si256((const __m256i *)(a + 8)));
+#else
+ return _mm256_loadu_si256((const __m256i *)a);
+#endif
+}
+
+#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_AVX2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
new file mode 100644
index 00000000000..aacf71f7ac6
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.asm
@@ -0,0 +1,90 @@
+;
+; Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+; TODO(johannkoenig): Add the necessary include guards to vpx_config.asm.
+; vpx_config.asm is not guarded so can not be included twice. Because this will
+; be used in conjunction with x86_abi_support.asm or x86inc.asm, it must be
+; included after those files.
+
+; Advance the pointer in %1 by 8 tran_low_t elements, i.e. by
+; sizeof(tran_low_t) * 8 bytes: 32 bytes in the high bit depth configuration
+; and 16 bytes otherwise.
+%macro INCREMENT_TRAN_LOW 1
+%if CONFIG_VP9_HIGHBITDEPTH
+ add %1, 32
+%else
+ add %1, 16
+%endif
+%endmacro
+
+; Advance the pointer in %1 by %2 tran_low_t elements, i.e. by
+; sizeof(tran_low_t) * %2 bytes. %2 is scaled by 4 in the high bit depth
+; configuration and by 2 otherwise.
+%macro INCREMENT_ELEMENTS_TRAN_LOW 2
+%if CONFIG_VP9_HIGHBITDEPTH
+ lea %1, [%1 + %2 * 4]
+%else
+ lea %1, [%1 + %2 * 2]
+%endif
+%endmacro
+
+; Load the tran_low_t values at %2 + %3 into m%1 as 16 bit values.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then load the value
+; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack
+; the values down to 16 bits (packssdw, i.e. with signed saturation).
+%macro LOAD_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ ; Two consecutive 16 byte groups of 32 bit values fill one register of
+ ; 16 bit values.
+ mova m%1, [%2 + (%3) * 4]
+ packssdw m%1, [%2 + (%3) * 4 + 16]
+%else
+ mova m%1, [%2 + (%3) * 2]
+%endif
+%endmacro
+
+; Store the 16 bit values in m%1 to %2 + %3 as tran_low_t.
+; %3 is the offset in elements, not bytes.
+; If tran_low_t is 16 bits (low bit depth configuration) then store the value
+; directly; m%1 and the scratch registers are left untouched. If tran_low_t is
+; 32 bits (high bit depth configuration) then sign extend the values first.
+; In the high bit depth configuration:
+; If 5 arguments are provided then m%1 is corrupted.
+; If 6 arguments are provided then m%1 is preserved.
+; m%4 and m%5 (and m%6 when it is provided) are used as scratch registers.
+%macro STORE_TRAN_LOW 5-6
+%if CONFIG_VP9_HIGHBITDEPTH
+ pxor m%4, m%4
+ mova m%5, m%1
+ %if %0 == 6
+ mova m%6, m%1
+ %endif
+ ; m%4 = (0 > m%1): all ones in each 16 bit lane holding a negative value.
+ ; Interleaving a value with this mask yields its 32 bit sign extension.
+ pcmpgtw m%4, m%1
+ punpcklwd m%5, m%4
+ %if %0 == 5
+ punpckhwd m%1, m%4
+ %else
+ punpckhwd m%6, m%4
+ %endif
+ mova [%2 + (%3) * 4 + 0], m%5
+ %if %0 == 5
+ mova [%2 + (%3) * 4 + 16], m%1
+ %else
+ mova [%2 + (%3) * 4 + 16], m%6
+ %endif
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
+
+; Store zeros (in m%1) to %2 + %3.
+; %3 is the offset in elements, not bytes.
+; The macro does not clear m%1 itself; it must already hold zeros.
+; The high bit depth configuration needs two stores to cover the same number
+; of elements as the single low bit depth store.
+%macro STORE_ZERO_TRAN_LOW 3
+%if CONFIG_VP9_HIGHBITDEPTH
+ mova [%2 + (%3) * 4 + 0], m%1
+ mova [%2 + (%3) * 4 + 16], m%1
+%else
+ mova [%2 + (%3) * 2], m%1
+%endif
+%endmacro
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
index 54a6d81fcbc..5d1d7795723 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fdct.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/bitdepth_conversion_sse2.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
@@ -7,8 +7,8 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#ifndef VPX_DSP_X86_FDCT_H_
-#define VPX_DSP_X86_FDCT_H_
+#ifndef VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
+#define VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
#include <xmmintrin.h>
@@ -16,13 +16,12 @@
#include "vpx/vpx_integer.h"
#include "vpx_dsp/vpx_dsp_common.h"
-// Load 8 16 bit values. If the source is 32 bits then cast down.
-// This does not saturate values. It only truncates.
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
static INLINE __m128i load_tran_low(const tran_low_t *a) {
#if CONFIG_VP9_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)a[0], (int16_t)a[1], (int16_t)a[2],
- (int16_t)a[3], (int16_t)a[4], (int16_t)a[5],
- (int16_t)a[6], (int16_t)a[7]);
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
#else
return _mm_load_si128((const __m128i *)a);
#endif
@@ -54,4 +53,4 @@ static INLINE void store_zero_tran_low(tran_low_t *a) {
_mm_store_si128((__m128i *)(a), zero);
#endif
}
-#endif // VPX_DSP_X86_FDCT_H_
+#endif // VPX_DSP_X86_BITDEPTH_CONVERSION_SSE2_H_
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
index ebca50930a0..bd8fd12480e 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/deblock_sse2.asm
@@ -78,8 +78,8 @@
%endmacro
%macro UPDATE_FLIMIT 0
- movdqa xmm2, XMMWORD PTR [rbx]
- movdqa [rsp], xmm2
+ movdqu xmm2, XMMWORD PTR [rbx]
+ movdqu [rsp], xmm2
add rbx, 16
%endmacro
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
index 78a1dbb24f8..b433874f28d 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -10,10 +10,6 @@
%include "third_party/x86inc/x86inc.asm"
-; This file provides SSSE3 version of the forward transformation. Part
-; of the macro definitions are originally derived from the ffmpeg project.
-; The current version applies to x86 64-bit only.
-
SECTION_RODATA
pw_11585x2: times 8 dw 23170
@@ -32,107 +28,11 @@ TRANSFORM_COEFFS 9102, 13623
SECTION .text
%if ARCH_X86_64
-%macro SUM_SUB 3
- psubw m%3, m%1, m%2
- paddw m%1, m%2
- SWAP %2, %3
-%endmacro
-
-; butterfly operation
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
- pmaddwd m%1, m%3, %5
- pmaddwd m%2, m%3, %6
- paddd m%1, %4
- paddd m%2, %4
- psrad m%1, 14
- psrad m%2, 14
-%endmacro
-
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_%4_%3], [pw_%3_m%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_%4_%3], [pw_%3_m%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-; matrix transpose
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-; 1D forward 8x8 DCT transform
-%macro FDCT8_1D 1
- SUM_SUB 0, 7, 9
- SUM_SUB 1, 6, 9
- SUM_SUB 2, 5, 9
- SUM_SUB 3, 4, 9
-
- SUM_SUB 0, 3, 9
- SUM_SUB 1, 2, 9
- SUM_SUB 6, 5, 9
-%if %1 == 0
- SUM_SUB 0, 1, 9
-%endif
-
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10
-
- pmulhrsw m6, m12
- pmulhrsw m5, m12
-%if %1 == 0
- pmulhrsw m0, m12
- pmulhrsw m1, m12
-%else
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10
- SWAP 0, 1
-%endif
-
- SUM_SUB 4, 5, 9
- SUM_SUB 7, 6, 9
- BUTTERFLY_4X 4, 7, 3196, 16069, m8, 9, 10
- BUTTERFLY_4X 5, 6, 13623, 9102, m8, 9, 10
- SWAP 1, 4
- SWAP 3, 6
-%endmacro
-
-%macro DIVIDE_ROUND_2X 4 ; dst1, dst2, tmp1, tmp2
- psraw m%3, m%1, 15
- psraw m%4, m%2, 15
- psubw m%1, m%3
- psubw m%2, m%4
- psraw m%1, 1
- psraw m%2, 1
-%endmacro
-
INIT_XMM ssse3
cglobal fdct8x8, 3, 5, 13, input, output, stride
mova m8, [pd_8192]
mova m12, [pw_11585x2]
- pxor m11, m11
lea r3, [2 * strideq]
lea r4, [4 * strideq]
@@ -159,25 +59,303 @@ cglobal fdct8x8, 3, 5, 13, input, output, stride
psllw m7, 2
; column transform
- FDCT8_1D 0
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- FDCT8_1D 1
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- DIVIDE_ROUND_2X 0, 1, 9, 10
- DIVIDE_ROUND_2X 2, 3, 9, 10
- DIVIDE_ROUND_2X 4, 5, 9, 10
- DIVIDE_ROUND_2X 6, 7, 9, 10
-
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [pw_15137_6270]
+ pmaddwd m2, [pw_6270_m15137]
+ pmaddwd m9, m10, [pw_15137_6270]
+ pmaddwd m10, [pw_6270_m15137]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [pw_16069_3196]
+ pmaddwd m1, [pw_3196_m16069]
+ pmaddwd m9, m10, [pw_16069_3196]
+ pmaddwd m10, [pw_3196_m16069]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [pw_9102_13623]
+ pmaddwd m11, [pw_13623_m9102]
+ pmaddwd m3, m0, [pw_9102_13623]
+ pmaddwd m0, [pw_13623_m9102]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ;stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [pw_11585_11585]
+ pmaddwd m6, [pw_11585_m11585]
+ pmaddwd m3, m1, [pw_11585_11585]
+ pmaddwd m1, [pw_11585_m11585]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [pw_15137_6270]
+ pmaddwd m3, [pw_6270_m15137]
+ pmaddwd m4, m5, [pw_15137_6270]
+ pmaddwd m5, [pw_6270_m15137]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [pw_16069_3196]
+ pmaddwd m9, [pw_3196_m16069]
+ pmaddwd m4, m5, [pw_16069_3196]
+ pmaddwd m5, [pw_3196_m16069]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [pw_9102_13623]
+ pmaddwd m4, [pw_13623_m9102]
+ pmaddwd m0, m10, [pw_9102_13623]
+ pmaddwd m10, [pw_13623_m9102]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ mova [outputq + 0], m4
+ mova [outputq + 16], m5
+ mova [outputq + 32], m3
+ mova [outputq + 48], m0
+ mova [outputq + 64], m10
+ mova [outputq + 80], m9
+ mova [outputq + 96], m7
+ mova [outputq + 112], m2
RET
%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
index 487a474a675..33909ba8159 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.c
@@ -263,100 +263,6 @@ void iadst4_sse2(__m128i *in) {
in[1] = _mm_packs_epi32(u[2], u[3]);
}
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
- const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
- const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
- const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
- const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
- out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
- out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
- out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
- }
-
-#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
- { \
- const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
- const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
- const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
- \
- out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
- out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
- out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
- out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
- }
-
-#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
- out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
- out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
- }
-
-// Define Macro for multiplying elements by constants and adding them together.
-#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
- res0, res1, res2, res3) \
- { \
- tmp0 = _mm_madd_epi16(lo_0, cst0); \
- tmp1 = _mm_madd_epi16(hi_0, cst0); \
- tmp2 = _mm_madd_epi16(lo_0, cst1); \
- tmp3 = _mm_madd_epi16(hi_0, cst1); \
- tmp4 = _mm_madd_epi16(lo_1, cst2); \
- tmp5 = _mm_madd_epi16(hi_1, cst2); \
- tmp6 = _mm_madd_epi16(lo_1, cst3); \
- tmp7 = _mm_madd_epi16(hi_1, cst3); \
- \
- tmp0 = _mm_add_epi32(tmp0, rounding); \
- tmp1 = _mm_add_epi32(tmp1, rounding); \
- tmp2 = _mm_add_epi32(tmp2, rounding); \
- tmp3 = _mm_add_epi32(tmp3, rounding); \
- tmp4 = _mm_add_epi32(tmp4, rounding); \
- tmp5 = _mm_add_epi32(tmp5, rounding); \
- tmp6 = _mm_add_epi32(tmp6, rounding); \
- tmp7 = _mm_add_epi32(tmp7, rounding); \
- \
- tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
- tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
- tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
- tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
- tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
- tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
- tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
- tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
- \
- res0 = _mm_packs_epi32(tmp0, tmp1); \
- res1 = _mm_packs_epi32(tmp2, tmp3); \
- res2 = _mm_packs_epi32(tmp4, tmp5); \
- res3 = _mm_packs_epi32(tmp6, tmp7); \
- }
-
#define MULTIPLICATION_AND_ADD_2(lo_0, hi_0, cst0, cst1, res0, res1) \
{ \
tmp0 = _mm_madd_epi16(lo_0, cst0); \
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
index d762a04abcd..d5683ab1cf0 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_sse2.h
@@ -46,6 +46,36 @@ static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
}
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+ out2, out3, out4, out5, out6, out7) \
+ { /* 8x8 transpose of 16-bit lanes: inN are rows, outN are columns. \
+ * All tr0_N / tr1_N temporaries are formed before any outN is assigned, \
+ * so the outputs may name the same variables as the inputs. */ \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1) \
{ \
@@ -186,6 +216,69 @@ static INLINE void write_buffer_8x16(uint8_t *dest, __m128i *in, int stride) {
RECON_AND_STORE(dest + 15 * stride, in[15]);
}
+#define TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, out0, out1, out2, out3) \
+ { /* Regroups four registers into out0..out3 with three rounds of \
+ * unpacks (16-, 32-, then 64-bit). \
+ * NOTE(review): the first round mixes hi/lo halves and swaps operand \
+ * order (tmp1,tmp0 / tmp3,tmp2), so this looks tailored to the packed \
+ * layout produced by the first pass of vpx_idct8x8_12_add_ssse3 rather \
+ * than a plain transpose -- confirm the layout before reusing it. */ \
+ const __m128i tr0_0 = _mm_unpackhi_epi16(tmp0, tmp1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(tmp1, tmp0); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(tmp2, tmp3); \
+ const __m128i tr0_5 = _mm_unpackhi_epi16(tmp3, tmp2); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ }
+/* Transposes the 4x4 block held in the low four 16-bit lanes of in0..in3:
+ * out0 = column 0 | column 1, out1 = column 2 | column 3 (four lanes each).
+ * The high four lanes of every input are ignored. */
+#define TRANSPOSE_8X8_10(in0, in1, in2, in3, out0, out1) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ out0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ out1 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ }
+
+/* Multiplies interleaved 16-bit pairs by constant pairs and sums each pair
+ * (_mm_madd_epi16), adds `rounding`, shifts right by DCT_CONST_BITS and packs
+ * the results back to 16 bits with signed saturation (_mm_packs_epi32):
+ *   res0 = (lo_0, hi_0) x cst0,  res1 = (lo_0, hi_0) x cst1,
+ *   res2 = (lo_1, hi_1) x cst2,  res3 = (lo_1, hi_1) x cst3.
+ * Not self-contained: tmp0..tmp7 and `rounding` must already be declared in
+ * the scope where the macro is expanded. */
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, cst0, cst1, cst2, cst3, \
+ res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
+
void idct4_sse2(__m128i *in);
void idct8_sse2(__m128i *in);
void idct16_sse2(__m128i *in0, __m128i *in1);
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
new file mode 100644
index 00000000000..cfa6a732ae7
--- /dev/null
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3.c
@@ -0,0 +1,1741 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <tmmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+/* Full 8x8 inverse transform: loads eight rows of eight coefficients from
+ * `input`, runs two passes of (8x8 transpose + 4-stage 1-D idct8), rounds
+ * each lane as (x + 16) >> 5 and passes rows 0..7 to RECON_AND_STORE at
+ * dest + k * stride.
+ * NOTE(review): `zero` is never named in this body; it is presumably
+ * referenced inside the RECON_AND_STORE macro -- confirm before removing.
+ * NOTE(review): RECON_AND_STORE presumably adds each row to the pixels
+ * already in dest and stores the clamped sum -- confirm at its definition. */
+void vpx_idct8x8_64_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data: eight rows, each 8 coefficients after the previous one.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+ in4 = load_input_data(input + 8 * 4);
+ in5 = load_input_data(input + 8 * 5);
+ in6 = load_input_data(input + 8 * 6);
+ in7 = load_input_data(input + 8 * 7);
+
+ // 2-D: each pass transposes in0..in7 in place, then runs the 1-D transform.
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vpx_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ {
+ /* Stage1: odd inputs (1,7) and (3,5) -> stp1_4..stp1_7 */
+ {
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7);
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7);
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5);
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5);
+
+ {
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp1 = _mm_madd_epi16(hi_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp3 = _mm_madd_epi16(hi_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp5 = _mm_madd_epi16(hi_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+ tmp7 = _mm_madd_epi16(hi_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+ /* NOTE(review): literal 14 here, DCT_CONST_BITS in the stk2_* paths
+ * below; presumably the same value -- confirm. */
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+ tmp4 = _mm_srai_epi32(tmp4, 14);
+ tmp5 = _mm_srai_epi32(tmp5, 14);
+ tmp6 = _mm_srai_epi32(tmp6, 14);
+ tmp7 = _mm_srai_epi32(tmp7, 14);
+
+ stp1_4 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_7 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_5 = _mm_packs_epi32(tmp4, tmp5);
+ stp1_6 = _mm_packs_epi32(tmp6, tmp7);
+ }
+ }
+
+ /* Stage2: even inputs (0,4), (2,6) -> stp2_0..stp2_3; odd add/sub */
+ {
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6);
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6);
+
+ {
+ tmp0 = _mm_unpacklo_epi16(in0, in4);
+ tmp1 = _mm_unpackhi_epi16(in0, in4);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_1);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp2, tmp3);
+ stp2_1 = _mm_packs_epi32(tmp4, tmp5);
+
+ tmp0 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp1 = _mm_madd_epi16(hi_26, stg2_2);
+ tmp2 = _mm_madd_epi16(lo_26, stg2_3);
+ tmp3 = _mm_madd_epi16(hi_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ // NOTE(review): literal 14 again; see the Stage1 note above.
+ tmp0 = _mm_srai_epi32(tmp0, 14);
+ tmp1 = _mm_srai_epi32(tmp1, 14);
+ tmp2 = _mm_srai_epi32(tmp2, 14);
+ tmp3 = _mm_srai_epi32(tmp3, 14);
+
+ stp2_2 = _mm_packs_epi32(tmp0, tmp1);
+ stp2_3 = _mm_packs_epi32(tmp2, tmp3);
+ }
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ }
+
+ /* Stage3: even butterfly; stp1_5/stp1_6 = (stp2_6 -/+ stp2_5) scaled by
+ * cospi_16_64 */
+ {
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp4 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp5 = _mm_madd_epi16(tmp1, stk2_0);
+
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+ stp1_6 = _mm_packs_epi32(tmp4, tmp5);
+ }
+
+ /* Stage4: final butterfly, written back into in0..in7 for the next pass */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+ }
+ }
+
+ // Final rounding and shift: (x + 16) >> 5, using a saturating add.
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+/* Reduced 8x8 inverse transform: only the first four input rows are loaded,
+ * and TRANSPOSE_8X8_10 reads only the low four 16-bit lanes of each, so only
+ * the upper-left 4x4 coefficients contribute to the result.
+ * NOTE(review): the "_12" suffix presumably names the number of leading
+ * non-zero coefficients the caller guarantees -- confirm.
+ * Constants built as 2 * cospi_x feed _mm_mulhrs_epi16, whose
+ * (a * b + 0x4000) >> 15 rounding then equals (a * cospi_x + 0x2000) >> 14.
+ * NOTE(review): `zero` is used below and presumably also inside
+ * RECON_AND_STORE -- confirm at the macro definition. */
+void vpx_idct8x8_12_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stg1_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stg1_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stg1_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+ const __m128i stg2_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stg2_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3;
+
+ // Rows. Load 4-row input data.
+ in0 = load_input_data(input);
+ in1 = load_input_data(input + 8 * 1);
+ in2 = load_input_data(input + 8 * 2);
+ in3 = load_input_data(input + 8 * 3);
+
+ // 8x4 Transpose: in0 = columns 0|1, in1 = columns 2|3 (low 4 lanes used).
+ TRANSPOSE_8X8_10(in0, in1, in2, in3, in0, in1);
+
+ // Stage1 (first pass: each register packs two 4-lane terms, low|high half)
+ tmp0 = _mm_mulhrs_epi16(in0, stg1_0);
+ tmp1 = _mm_mulhrs_epi16(in0, stg1_1);
+ tmp2 = _mm_mulhrs_epi16(in1, stg1_2);
+ tmp3 = _mm_mulhrs_epi16(in1, stg1_3);
+
+ stp1_4 = _mm_unpackhi_epi64(tmp0, tmp1);
+ stp1_5 = _mm_unpackhi_epi64(tmp2, tmp3);
+
+ // Stage2
+ tmp0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_0 = _mm_unpacklo_epi64(tmp0, tmp0);
+
+ tmp1 = _mm_mulhrs_epi16(in1, stg2_2);
+ tmp2 = _mm_mulhrs_epi16(in1, stg2_3);
+ stp2_2 = _mm_unpacklo_epi64(tmp2, tmp1);
+
+ tmp0 = _mm_add_epi16(stp1_4, stp1_5);
+ tmp1 = _mm_sub_epi16(stp1_4, stp1_5);
+
+ stp2_4 = tmp0;
+ stp2_5 = _mm_unpacklo_epi64(tmp1, zero);
+ stp2_6 = _mm_unpackhi_epi64(tmp1, zero);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+ tmp1 = _mm_madd_epi16(tmp0, stg3_0);
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0); // stg3_1 = stk2_0
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp1, tmp2);
+
+ // Stage3
+ tmp2 = _mm_add_epi16(stp2_0, stp2_2);
+ tmp3 = _mm_sub_epi16(stp2_0, stp2_2);
+
+ stp1_2 = _mm_unpackhi_epi64(tmp3, tmp2);
+ stp1_3 = _mm_unpacklo_epi64(tmp3, tmp2);
+
+ // Stage4
+ tmp0 = _mm_add_epi16(stp1_3, stp2_4);
+ tmp1 = _mm_add_epi16(stp1_2, stp1_5);
+ tmp2 = _mm_sub_epi16(stp1_3, stp2_4);
+ tmp3 = _mm_sub_epi16(stp1_2, stp1_5);
+
+ TRANSPOSE_4X8_10(tmp0, tmp1, tmp2, tmp3, in0, in1, in2, in3)
+
+ /* Stage1 (second pass, full 8-lane registers; in1 and in3 are the odd
+ * inputs, in0 and in2 the even ones) */
+ stp1_4 = _mm_mulhrs_epi16(in1, stg1_0);
+ stp1_7 = _mm_mulhrs_epi16(in1, stg1_1);
+ stp1_5 = _mm_mulhrs_epi16(in3, stg1_2);
+ stp1_6 = _mm_mulhrs_epi16(in3, stg1_3);
+
+ /* Stage2 */
+ stp2_0 = _mm_mulhrs_epi16(in0, stg2_0);
+ stp2_1 = _mm_mulhrs_epi16(in0, stg2_0);
+
+ stp2_2 = _mm_mulhrs_epi16(in2, stg2_2);
+ stp2_3 = _mm_mulhrs_epi16(in2, stg2_3);
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ /* Stage3 */
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ tmp1 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_0);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_0);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ tmp2 = _mm_madd_epi16(tmp0, stk2_1);
+ tmp3 = _mm_madd_epi16(tmp1, stk2_1);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ stp1_5 = _mm_packs_epi32(tmp2, tmp3);
+
+ /* Stage4 */
+ in0 = _mm_add_epi16(stp1_0, stp2_7);
+ in1 = _mm_add_epi16(stp1_1, stp1_6);
+ in2 = _mm_add_epi16(stp1_2, stp1_5);
+ in3 = _mm_add_epi16(stp1_3, stp2_4);
+ in4 = _mm_sub_epi16(stp1_3, stp2_4);
+ in5 = _mm_sub_epi16(stp1_2, stp1_5);
+ in6 = _mm_sub_epi16(stp1_1, stp1_6);
+ in7 = _mm_sub_epi16(stp1_0, stp2_7);
+
+ // Final rounding and shift: (x + 16) >> 5, using a saturating add.
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ RECON_AND_STORE(dest + 0 * stride, in0);
+ RECON_AND_STORE(dest + 1 * stride, in1);
+ RECON_AND_STORE(dest + 2 * stride, in2);
+ RECON_AND_STORE(dest + 3 * stride, in3);
+ RECON_AND_STORE(dest + 4 * stride, in4);
+ RECON_AND_STORE(dest + 5 * stride, in5);
+ RECON_AND_STORE(dest + 6 * stride, in6);
+ RECON_AND_STORE(dest + 7 * stride, in7);
+}
+/* Stages 1-7 of a 32-point 1-D inverse transform that reads only in[0]..in[7]
+ * and fills stp1[0]..stp1[31]. The last butterfly, stp1[k] +/- stp1[31 - k],
+ * is left to the caller (vpx_idct32x32_34_add_ssse3 performs it).
+ * Single-input terms use _mm_mulhrs_epi16 with pre-doubled constants
+ * (2 * cospi_x); two-input rotations go through MULTIPLICATION_AND_ADD, which
+ * relies on the local tmp0..tmp7 and `rounding` declared here. */
+static INLINE void idct32_34(const __m128i *in, __m128i *stp1) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // idct constants for each stage
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ /* Stage1: odd inputs in[1], in[7], in[5], in[3] */
+
+ stp1[16] = _mm_mulhrs_epi16(in[1], stk1_0);
+ stp1[31] = _mm_mulhrs_epi16(in[1], stk1_1);
+
+ stp1[19] = _mm_mulhrs_epi16(in[7], stk1_6);
+ stp1[28] = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ stp1[20] = _mm_mulhrs_epi16(in[5], stk1_8);
+ stp1[27] = _mm_mulhrs_epi16(in[5], stk1_9);
+
+ stp1[23] = _mm_mulhrs_epi16(in[3], stk1_14);
+ stp1[24] = _mm_mulhrs_epi16(in[3], stk1_15);
+
+ /* Stage2: in[2] and in[6] */
+
+ stp2_8 = _mm_mulhrs_epi16(in[2], stk2_0);
+ stp2_15 = _mm_mulhrs_epi16(in[2], stk2_1);
+
+ stp2_11 = _mm_mulhrs_epi16(in[6], stk2_6);
+ stp2_12 = _mm_mulhrs_epi16(in[6], stk2_7);
+
+ /* Stage3 */
+ {
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp1[16], stp1[31]);
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp1[16], stp1[31]);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp1[19], stp1[28]);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp1[19], stp1[28]);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp1[20], stp1[27]);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp1[20], stp1[27]);
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp1[23], stp1[24]);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp1[23], stp1[24]);
+
+ stp1[4] = _mm_mulhrs_epi16(in[4], stk3_0);
+ stp1[7] = _mm_mulhrs_epi16(in[4], stk3_1);
+
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+ stg3_5, stg3_6, stg3_4, stp1[17], stp1[30], stp1[18],
+ stp1[29])
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+ stg3_9, stg3_10, stg3_8, stp1[21], stp1[26],
+ stp1[22], stp1[25])
+ }
+
+ /* Stage4 */
+ {
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp2_8, stp2_15);
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp2_8, stp2_15);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp2_11, stp2_12);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp2_11, stp2_12);
+
+ stp1[0] = _mm_mulhrs_epi16(in[0], stk4_0);
+ stp1[1] = _mm_mulhrs_epi16(in[0], stk4_0); // stk4_1 = stk4_0
+ stp1[2] = stp1[0];
+ stp1[3] = stp1[1];
+
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5,
+ stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13)
+
+ stp2_16 = _mm_add_epi16(stp1[16], stp1[19]);
+ stp2_17 = _mm_add_epi16(stp1[17], stp1[18]);
+ stp2_18 = _mm_sub_epi16(stp1[17], stp1[18]);
+ stp2_19 = _mm_sub_epi16(stp1[16], stp1[19]);
+ stp2_20 = _mm_sub_epi16(stp1[23], stp1[20]);
+ stp2_21 = _mm_sub_epi16(stp1[22], stp1[21]);
+ stp2_22 = _mm_add_epi16(stp1[22], stp1[21]);
+ stp2_23 = _mm_add_epi16(stp1[23], stp1[20]);
+
+ stp2_24 = _mm_add_epi16(stp1[24], stp1[27]);
+ stp2_25 = _mm_add_epi16(stp1[25], stp1[26]);
+ stp2_26 = _mm_sub_epi16(stp1[25], stp1[26]);
+ stp2_27 = _mm_sub_epi16(stp1[24], stp1[27]);
+ stp2_28 = _mm_sub_epi16(stp1[31], stp1[28]);
+ stp2_29 = _mm_sub_epi16(stp1[30], stp1[29]);
+ stp2_30 = _mm_add_epi16(stp1[29], stp1[30]);
+ stp2_31 = _mm_add_epi16(stp1[28], stp1[31]);
+ }
+
+ /* Stage5 */
+ {
+// Note:
+// Setting AVOID_OVERFLOW to 0 selects the faster _mm_mulhrs_epi16 paths, but
+// those cannot pass the SingleExtreme test: MaxSupportedCoeff and
+// MinSupportedCoeff would have to drop to 23198 and -23197, respectively.
+#define AVOID_OVERFLOW (1)
+
+#if AVOID_OVERFLOW
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp1[7], stp1[4]);
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp1[7], stp1[4]);
+#endif
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+#if AVOID_OVERFLOW
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+ stp1[5] = _mm_packs_epi32(tmp0, tmp1);
+ stp1[6] = _mm_packs_epi32(tmp2, tmp3);
+#else
+ tmp0 = _mm_sub_epi16(stp1[7], stp1[4]);
+ tmp1 = _mm_adds_epi16(stp1[7], stp1[4]);
+ stp1[5] = _mm_mulhrs_epi16(tmp0, stk4_0);
+ stp1[6] = _mm_mulhrs_epi16(tmp1, stk4_0);
+#endif
+
+ stp1[8] = _mm_add_epi16(stp2_8, stp2_11);
+ stp1[9] = _mm_add_epi16(stp2_9, stp2_10);
+ stp1[10] = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1[11] = _mm_sub_epi16(stp2_8, stp2_11);
+ stp1[12] = _mm_sub_epi16(stp2_15, stp2_12);
+ stp1[13] = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1[14] = _mm_add_epi16(stp2_14, stp2_13);
+ stp1[15] = _mm_add_epi16(stp2_15, stp2_12);
+
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+ stg4_5, stg4_4, stg4_5, stp1[18], stp1[29], stp1[19],
+ stp1[28])
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+ stg4_4, stg4_6, stg4_4, stp1[20], stp1[27], stp1[21],
+ stp1[26])
+
+ stp1[16] = stp2_16;
+ stp1[17] = stp2_17;
+ stp1[22] = stp2_22;
+ stp1[23] = stp2_23;
+ stp1[24] = stp2_24;
+ stp1[25] = stp2_25;
+ stp1[30] = stp2_30;
+ stp1[31] = stp2_31;
+ }
+
+ /* Stage6 */
+ {
+#if AVOID_OVERFLOW
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1[10], stp1[13]);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1[10], stp1[13]);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1[11], stp1[12]);
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1[11], stp1[12]);
+#endif
+
+ stp2_0 = _mm_add_epi16(stp1[0], stp1[7]);
+ stp2_1 = _mm_add_epi16(stp1[1], stp1[6]);
+ stp2_2 = _mm_add_epi16(stp1[2], stp1[5]);
+ stp2_3 = _mm_add_epi16(stp1[3], stp1[4]);
+ stp2_4 = _mm_sub_epi16(stp1[3], stp1[4]);
+ stp2_5 = _mm_sub_epi16(stp1[2], stp1[5]);
+ stp2_6 = _mm_sub_epi16(stp1[1], stp1[6]);
+ stp2_7 = _mm_sub_epi16(stp1[0], stp1[7]);
+
+ stp2_8 = stp1[8];
+ stp2_9 = stp1[9];
+ stp2_14 = stp1[14];
+ stp2_15 = stp1[15];
+
+#if AVOID_OVERFLOW
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,
+ stp2_12)
+#else
+ tmp0 = _mm_add_epi16(stp1[10], stp1[13]);
+ tmp1 = _mm_sub_epi16(stp1[13], stp1[10]);
+ tmp2 = _mm_add_epi16(stp1[11], stp1[12]);
+ tmp3 = _mm_sub_epi16(stp1[12], stp1[11]);
+
+ stp2_10 = _mm_mulhrs_epi16(tmp1, stk4_0);
+ stp2_13 = _mm_mulhrs_epi16(tmp0, stk4_0);
+ stp2_11 = _mm_mulhrs_epi16(tmp3, stk4_0);
+ stp2_12 = _mm_mulhrs_epi16(tmp2, stk4_0);
+
+#endif
+
+ stp2_16 = _mm_add_epi16(stp1[16], stp1[23]);
+ stp2_17 = _mm_add_epi16(stp1[17], stp1[22]);
+ stp2_18 = _mm_add_epi16(stp1[18], stp1[21]);
+ stp2_19 = _mm_add_epi16(stp1[19], stp1[20]);
+ stp2_20 = _mm_sub_epi16(stp1[19], stp1[20]);
+ stp2_21 = _mm_sub_epi16(stp1[18], stp1[21]);
+ stp2_22 = _mm_sub_epi16(stp1[17], stp1[22]);
+ stp2_23 = _mm_sub_epi16(stp1[16], stp1[23]);
+
+ stp2_24 = _mm_sub_epi16(stp1[31], stp1[24]);
+ stp2_25 = _mm_sub_epi16(stp1[30], stp1[25]);
+ stp2_26 = _mm_sub_epi16(stp1[29], stp1[26]);
+ stp2_27 = _mm_sub_epi16(stp1[28], stp1[27]);
+ stp2_28 = _mm_add_epi16(stp1[27], stp1[28]);
+ stp2_29 = _mm_add_epi16(stp1[26], stp1[29]);
+ stp2_30 = _mm_add_epi16(stp1[25], stp1[30]);
+ stp2_31 = _mm_add_epi16(stp1[24], stp1[31]);
+ }
+
+ /* Stage7 */
+ {
+#if AVOID_OVERFLOW
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+#endif
+ stp1[0] = _mm_add_epi16(stp2_0, stp2_15);
+ stp1[1] = _mm_add_epi16(stp2_1, stp2_14);
+ stp1[2] = _mm_add_epi16(stp2_2, stp2_13);
+ stp1[3] = _mm_add_epi16(stp2_3, stp2_12);
+ stp1[4] = _mm_add_epi16(stp2_4, stp2_11);
+ stp1[5] = _mm_add_epi16(stp2_5, stp2_10);
+ stp1[6] = _mm_add_epi16(stp2_6, stp2_9);
+ stp1[7] = _mm_add_epi16(stp2_7, stp2_8);
+ stp1[8] = _mm_sub_epi16(stp2_7, stp2_8);
+ stp1[9] = _mm_sub_epi16(stp2_6, stp2_9);
+ stp1[10] = _mm_sub_epi16(stp2_5, stp2_10);
+ stp1[11] = _mm_sub_epi16(stp2_4, stp2_11);
+ stp1[12] = _mm_sub_epi16(stp2_3, stp2_12);
+ stp1[13] = _mm_sub_epi16(stp2_2, stp2_13);
+ stp1[14] = _mm_sub_epi16(stp2_1, stp2_14);
+ stp1[15] = _mm_sub_epi16(stp2_0, stp2_15);
+
+ stp1[16] = stp2_16;
+ stp1[17] = stp2_17;
+ stp1[18] = stp2_18;
+ stp1[19] = stp2_19;
+
+#if AVOID_OVERFLOW
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1[20], stp1[27], stp1[21],
+ stp1[26])
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1[22], stp1[25], stp1[23],
+ stp1[24])
+#else
+ tmp0 = _mm_add_epi16(stp2_20, stp2_27);
+ tmp1 = _mm_sub_epi16(stp2_27, stp2_20);
+ tmp2 = _mm_add_epi16(stp2_21, stp2_26);
+ tmp3 = _mm_sub_epi16(stp2_26, stp2_21);
+
+ stp1[20] = _mm_mulhrs_epi16(tmp1, stk4_0);
+ stp1[27] = _mm_mulhrs_epi16(tmp0, stk4_0);
+ stp1[21] = _mm_mulhrs_epi16(tmp3, stk4_0);
+ stp1[26] = _mm_mulhrs_epi16(tmp2, stk4_0);
+
+ tmp0 = _mm_add_epi16(stp2_22, stp2_25);
+ tmp1 = _mm_sub_epi16(stp2_25, stp2_22);
+ tmp2 = _mm_add_epi16(stp2_23, stp2_24);
+ tmp3 = _mm_sub_epi16(stp2_24, stp2_23);
+
+ stp1[22] = _mm_mulhrs_epi16(tmp1, stk4_0);
+ stp1[25] = _mm_mulhrs_epi16(tmp0, stk4_0);
+ stp1[23] = _mm_mulhrs_epi16(tmp3, stk4_0);
+ stp1[24] = _mm_mulhrs_epi16(tmp2, stk4_0);
+#endif
+
+ stp1[28] = stp2_28;
+ stp1[29] = stp2_29;
+ stp1[30] = stp2_30;
+ stp1[31] = stp2_31;
+ }
+#undef AVOID_OVERFLOW
+}
+
+/* Inverse 32x32 DCT with reconstruction for inputs in which only the
+ * upper-left 8x8 coefficients are non-zero.
+ *
+ * input: coefficient buffer; eight loads are issued at offsets 0, 32, ..., 224.
+ * dest: destination buffer handed to RECON_AND_STORE one row at a time.
+ * stride: distance in bytes between consecutive rows of dest.
+ *
+ * Two idct32_34() passes are run, each preceded by an 8x8 transpose. The
+ * second pass executes four times, and dest advances by 8 after each one.
+ *
+ * NOTE(review): `zero` is never referenced by name in this body; presumably
+ * the RECON_AND_STORE macro (defined outside this chunk) picks it up - confirm
+ * before treating it as unused.
+ */
+void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ __m128i in[32], col[32];
+ __m128i stp1[32];
+ int i;
+
+ // Load input data. Only need to load the top left 8x8 block.
+ in[0] = load_input_data(input);
+ in[1] = load_input_data(input + 32);
+ in[2] = load_input_data(input + 64);
+ in[3] = load_input_data(input + 96);
+ in[4] = load_input_data(input + 128);
+ in[5] = load_input_data(input + 160);
+ in[6] = load_input_data(input + 192);
+ in[7] = load_input_data(input + 224);
+
+ array_transpose_8x8(in, in);  // source and destination are the same array
+ idct32_34(in, stp1);  // first pass
+
+ /* 1_D: Store 32 intermediate results for each 8x32 block.
+ * Output butterfly: col[k] = stp1[k] + stp1[31 - k] for k < 16 and
+ * col[k] = stp1[31 - k] - stp1[k] for k >= 16. */
+ col[0] = _mm_add_epi16(stp1[0], stp1[31]);
+ col[1] = _mm_add_epi16(stp1[1], stp1[30]);
+ col[2] = _mm_add_epi16(stp1[2], stp1[29]);
+ col[3] = _mm_add_epi16(stp1[3], stp1[28]);
+ col[4] = _mm_add_epi16(stp1[4], stp1[27]);
+ col[5] = _mm_add_epi16(stp1[5], stp1[26]);
+ col[6] = _mm_add_epi16(stp1[6], stp1[25]);
+ col[7] = _mm_add_epi16(stp1[7], stp1[24]);
+ col[8] = _mm_add_epi16(stp1[8], stp1[23]);
+ col[9] = _mm_add_epi16(stp1[9], stp1[22]);
+ col[10] = _mm_add_epi16(stp1[10], stp1[21]);
+ col[11] = _mm_add_epi16(stp1[11], stp1[20]);
+ col[12] = _mm_add_epi16(stp1[12], stp1[19]);
+ col[13] = _mm_add_epi16(stp1[13], stp1[18]);
+ col[14] = _mm_add_epi16(stp1[14], stp1[17]);
+ col[15] = _mm_add_epi16(stp1[15], stp1[16]);
+ col[16] = _mm_sub_epi16(stp1[15], stp1[16]);
+ col[17] = _mm_sub_epi16(stp1[14], stp1[17]);
+ col[18] = _mm_sub_epi16(stp1[13], stp1[18]);
+ col[19] = _mm_sub_epi16(stp1[12], stp1[19]);
+ col[20] = _mm_sub_epi16(stp1[11], stp1[20]);
+ col[21] = _mm_sub_epi16(stp1[10], stp1[21]);
+ col[22] = _mm_sub_epi16(stp1[9], stp1[22]);
+ col[23] = _mm_sub_epi16(stp1[8], stp1[23]);
+ col[24] = _mm_sub_epi16(stp1[7], stp1[24]);
+ col[25] = _mm_sub_epi16(stp1[6], stp1[25]);
+ col[26] = _mm_sub_epi16(stp1[5], stp1[26]);
+ col[27] = _mm_sub_epi16(stp1[4], stp1[27]);
+ col[28] = _mm_sub_epi16(stp1[3], stp1[28]);
+ col[29] = _mm_sub_epi16(stp1[2], stp1[29]);
+ col[30] = _mm_sub_epi16(stp1[1], stp1[30]);
+ col[31] = _mm_sub_epi16(stp1[0], stp1[31]);
+ for (i = 0; i < 4; i++) {
+ int j;
+ /* Transpose 32x8 block to 8x32 block: the eight vectors starting at
+ * col[8 * i] are passed to array_transpose_8x8() with in[] as output. */
+ array_transpose_8x8(col + i * 8, in);
+ idct32_34(in, stp1);
+
+ /* 2_D: Calculate the results and store them to destination.
+ * Same output butterfly as in the first pass, written back into in[]. */
+ in[0] = _mm_add_epi16(stp1[0], stp1[31]);
+ in[1] = _mm_add_epi16(stp1[1], stp1[30]);
+ in[2] = _mm_add_epi16(stp1[2], stp1[29]);
+ in[3] = _mm_add_epi16(stp1[3], stp1[28]);
+ in[4] = _mm_add_epi16(stp1[4], stp1[27]);
+ in[5] = _mm_add_epi16(stp1[5], stp1[26]);
+ in[6] = _mm_add_epi16(stp1[6], stp1[25]);
+ in[7] = _mm_add_epi16(stp1[7], stp1[24]);
+ in[8] = _mm_add_epi16(stp1[8], stp1[23]);
+ in[9] = _mm_add_epi16(stp1[9], stp1[22]);
+ in[10] = _mm_add_epi16(stp1[10], stp1[21]);
+ in[11] = _mm_add_epi16(stp1[11], stp1[20]);
+ in[12] = _mm_add_epi16(stp1[12], stp1[19]);
+ in[13] = _mm_add_epi16(stp1[13], stp1[18]);
+ in[14] = _mm_add_epi16(stp1[14], stp1[17]);
+ in[15] = _mm_add_epi16(stp1[15], stp1[16]);
+ in[16] = _mm_sub_epi16(stp1[15], stp1[16]);
+ in[17] = _mm_sub_epi16(stp1[14], stp1[17]);
+ in[18] = _mm_sub_epi16(stp1[13], stp1[18]);
+ in[19] = _mm_sub_epi16(stp1[12], stp1[19]);
+ in[20] = _mm_sub_epi16(stp1[11], stp1[20]);
+ in[21] = _mm_sub_epi16(stp1[10], stp1[21]);
+ in[22] = _mm_sub_epi16(stp1[9], stp1[22]);
+ in[23] = _mm_sub_epi16(stp1[8], stp1[23]);
+ in[24] = _mm_sub_epi16(stp1[7], stp1[24]);
+ in[25] = _mm_sub_epi16(stp1[6], stp1[25]);
+ in[26] = _mm_sub_epi16(stp1[5], stp1[26]);
+ in[27] = _mm_sub_epi16(stp1[4], stp1[27]);
+ in[28] = _mm_sub_epi16(stp1[3], stp1[28]);
+ in[29] = _mm_sub_epi16(stp1[2], stp1[29]);
+ in[30] = _mm_sub_epi16(stp1[1], stp1[30]);
+ in[31] = _mm_sub_epi16(stp1[0], stp1[31]);
+
+ for (j = 0; j < 32; ++j) {
+ /* Final rounding and shift: saturating add of 1 << 5, then arithmetic
+ * shift right by 6. */
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j] = _mm_srai_epi16(in[j], 6);
+ RECON_AND_STORE(dest + j * stride, in[j]);
+ }
+
+ dest += 8;
+ }
+}
+
+// Loads 16 rows from a coefficient buffer whose row pitch is 32 elements.
+/* in0[16] represents the left 8x16 block
+ * in1[16] represents the right 8x16 block
+ * Row i is fetched with load_input_data() from `input` into in0[i] and from
+ * `input + 8` into in1[i]; `input` then advances by 32.
+ */
+static void load_buffer_16x16(const tran_low_t *input, __m128i *in0,
+ __m128i *in1) {
+ int i;
+ for (i = 0; i < 16; i++) {
+ in0[i] = load_input_data(input);
+ in1[i] = load_input_data(input + 8);
+ input += 32;  // step to the next row
+ }
+}
+/* Rearranges two 16-vector inputs into two 16-vector outputs with four calls
+ * to array_transpose_8x8():
+ * in0[0..7] -> out0[0..7], in0[8..15] -> out1[0..7],
+ * in1[0..7] -> out0[8..15], in1[8..15] -> out1[8..15].
+ */
+static void array_transpose_16x16_2(__m128i *in0, __m128i *in1, __m128i *out0,
+ __m128i *out1) {
+ array_transpose_8x8(in0, out0);
+ array_transpose_8x8(&in0[8], out1);
+ array_transpose_8x8(in1, &out0[8]);
+ array_transpose_8x8(&in1[8], &out1[8]);
+}
+
+/* 32-point inverse DCT pass over eight 16-bit lanes for the case where only
+ * the first 16 inputs are used.
+ *
+ * For each 8x16 block __m128i in[16], the output is written back as
+ * __m128i in[32]: in[0] .. in[15] are read as inputs (in[16] .. in[31] are
+ * never read) and in[0] .. in[31] are overwritten with the results, so the
+ * caller's array must hold 32 vectors.
+ *
+ * Terms that depend on a single input are computed by multiplying that input
+ * by a doubled cosine constant (the stk constants) with _mm_mulhrs_epi16.
+ * Terms that combine two inputs use MULTIPLICATION_AND_ADD with the stg
+ * constants, or the explicit _mm_madd_epi16 sequence in Stage5.
+ *
+ * NOTE(review): tmp4 .. tmp7 are never referenced by name in this body;
+ * presumably MULTIPLICATION_AND_ADD (defined outside this chunk) uses them -
+ * confirm before treating them as unused.
+ */
+static void idct32_8x16_135(__m128i *in) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const __m128i stk1_0 = pair_set_epi16(2 * cospi_31_64, 2 * cospi_31_64);
+ const __m128i stk1_1 = pair_set_epi16(2 * cospi_1_64, 2 * cospi_1_64);
+ const __m128i stk1_2 = pair_set_epi16(-2 * cospi_17_64, -2 * cospi_17_64);
+ const __m128i stk1_3 = pair_set_epi16(2 * cospi_15_64, 2 * cospi_15_64);
+
+ const __m128i stk1_4 = pair_set_epi16(2 * cospi_23_64, 2 * cospi_23_64);
+ const __m128i stk1_5 = pair_set_epi16(2 * cospi_9_64, 2 * cospi_9_64);
+ const __m128i stk1_6 = pair_set_epi16(-2 * cospi_25_64, -2 * cospi_25_64);
+ const __m128i stk1_7 = pair_set_epi16(2 * cospi_7_64, 2 * cospi_7_64);
+
+ const __m128i stk1_8 = pair_set_epi16(2 * cospi_27_64, 2 * cospi_27_64);
+ const __m128i stk1_9 = pair_set_epi16(2 * cospi_5_64, 2 * cospi_5_64);
+ const __m128i stk1_10 = pair_set_epi16(-2 * cospi_21_64, -2 * cospi_21_64);
+ const __m128i stk1_11 = pair_set_epi16(2 * cospi_11_64, 2 * cospi_11_64);
+
+ const __m128i stk1_12 = pair_set_epi16(2 * cospi_19_64, 2 * cospi_19_64);
+ const __m128i stk1_13 = pair_set_epi16(2 * cospi_13_64, 2 * cospi_13_64);
+ const __m128i stk1_14 = pair_set_epi16(-2 * cospi_29_64, -2 * cospi_29_64);
+ const __m128i stk1_15 = pair_set_epi16(2 * cospi_3_64, 2 * cospi_3_64);
+
+ const __m128i stk2_0 = pair_set_epi16(2 * cospi_30_64, 2 * cospi_30_64);
+ const __m128i stk2_1 = pair_set_epi16(2 * cospi_2_64, 2 * cospi_2_64);
+ const __m128i stk2_2 = pair_set_epi16(-2 * cospi_18_64, -2 * cospi_18_64);
+ const __m128i stk2_3 = pair_set_epi16(2 * cospi_14_64, 2 * cospi_14_64);
+
+ const __m128i stk2_4 = pair_set_epi16(2 * cospi_22_64, 2 * cospi_22_64);
+ const __m128i stk2_5 = pair_set_epi16(2 * cospi_10_64, 2 * cospi_10_64);
+ const __m128i stk2_6 = pair_set_epi16(-2 * cospi_26_64, -2 * cospi_26_64);
+ const __m128i stk2_7 = pair_set_epi16(2 * cospi_6_64, 2 * cospi_6_64);
+
+ const __m128i stk3_0 = pair_set_epi16(2 * cospi_28_64, 2 * cospi_28_64);
+ const __m128i stk3_1 = pair_set_epi16(2 * cospi_4_64, 2 * cospi_4_64);
+ const __m128i stk3_2 = pair_set_epi16(-2 * cospi_20_64, -2 * cospi_20_64);
+ const __m128i stk3_3 = pair_set_epi16(2 * cospi_12_64, 2 * cospi_12_64);
+
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stk4_0 = pair_set_epi16(2 * cospi_16_64, 2 * cospi_16_64);
+ const __m128i stk4_2 = pair_set_epi16(2 * cospi_24_64, 2 * cospi_24_64);
+ const __m128i stk4_3 = pair_set_epi16(2 * cospi_8_64, 2 * cospi_8_64);
+
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ /* Stage1: odd-indexed inputs in[1] .. in[15] produce stp1_16 .. stp1_31. */
+ stp1_16 = _mm_mulhrs_epi16(in[1], stk1_0);
+ stp1_31 = _mm_mulhrs_epi16(in[1], stk1_1);
+ stp1_17 = _mm_mulhrs_epi16(in[15], stk1_2);
+ stp1_30 = _mm_mulhrs_epi16(in[15], stk1_3);
+
+ stp1_18 = _mm_mulhrs_epi16(in[9], stk1_4);
+ stp1_29 = _mm_mulhrs_epi16(in[9], stk1_5);
+ stp1_19 = _mm_mulhrs_epi16(in[7], stk1_6);
+ stp1_28 = _mm_mulhrs_epi16(in[7], stk1_7);
+
+ stp1_20 = _mm_mulhrs_epi16(in[5], stk1_8);
+ stp1_27 = _mm_mulhrs_epi16(in[5], stk1_9);
+ stp1_21 = _mm_mulhrs_epi16(in[11], stk1_10);
+ stp1_26 = _mm_mulhrs_epi16(in[11], stk1_11);
+
+ stp1_22 = _mm_mulhrs_epi16(in[13], stk1_12);
+ stp1_25 = _mm_mulhrs_epi16(in[13], stk1_13);
+ stp1_23 = _mm_mulhrs_epi16(in[3], stk1_14);
+ stp1_24 = _mm_mulhrs_epi16(in[3], stk1_15);
+
+ /* Stage2: in[2], in[6], in[10] and in[14] produce stp2_8 .. stp2_15; add/sub
+ * butterflies on stp1_16 .. stp1_31 produce stp2_16 .. stp2_31. */
+ stp2_8 = _mm_mulhrs_epi16(in[2], stk2_0);
+ stp2_15 = _mm_mulhrs_epi16(in[2], stk2_1);
+ stp2_9 = _mm_mulhrs_epi16(in[14], stk2_2);
+ stp2_14 = _mm_mulhrs_epi16(in[14], stk2_3);
+
+ stp2_10 = _mm_mulhrs_epi16(in[10], stk2_4);
+ stp2_13 = _mm_mulhrs_epi16(in[10], stk2_5);
+ stp2_11 = _mm_mulhrs_epi16(in[6], stk2_6);
+ stp2_12 = _mm_mulhrs_epi16(in[6], stk2_7);
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+
+ /* Stage3: in[4] and in[12] produce stp1_4 .. stp1_7, in[0] and in[8] produce
+ * stp2_0 .. stp2_3, and stp2_8 .. stp2_15 are combined by add/sub
+ * butterflies. The (17, 30), (18, 29), (21, 26) and (22, 25) pairs go through
+ * MULTIPLICATION_AND_ADD; the other terms of 16 .. 31 are copied unchanged. */
+ {
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+
+ stp1_4 = _mm_mulhrs_epi16(in[4], stk3_0);
+ stp1_7 = _mm_mulhrs_epi16(in[4], stk3_1);
+ stp1_5 = _mm_mulhrs_epi16(in[12], stk3_2);
+ stp1_6 = _mm_mulhrs_epi16(in[12], stk3_3);
+
+ stp2_0 = _mm_mulhrs_epi16(in[0], stk4_0);
+ stp2_1 = _mm_mulhrs_epi16(in[0], stk4_0); // stk4_1 = stk4_0
+ stp2_2 = _mm_mulhrs_epi16(in[8], stk4_2);
+ stp2_3 = _mm_mulhrs_epi16(in[8], stk4_3);
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,
+ stp1_29)
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22,
+ stp1_25)
+
+ stp1_16 = stp2_16;
+ stp1_31 = stp2_31;
+ stp1_19 = stp2_19;
+ stp1_20 = stp2_20;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_27 = stp2_27;
+ stp1_28 = stp2_28;
+ }
+
+ /* Stage4: add/sub butterflies on 4 .. 7 and on 16 .. 31; the (9, 14) and
+ * (10, 13) pairs go through MULTIPLICATION_AND_ADD; 8, 11, 12 and 15 are
+ * copied unchanged. */
+ {
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5,
+ stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13)
+
+ stp2_8 = stp1_8;
+ stp2_15 = stp1_15;
+ stp2_11 = stp1_11;
+ stp2_12 = stp1_12;
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
+ }
+
+ /* Stage5: add/sub butterflies on 0 .. 3 and on 8 .. 15. The (6, 5) pair goes
+ * through an explicit multiply-add, rounding add, shift by DCT_CONST_BITS and
+ * pack back to 16 bits. The (18, 29), (19, 28), (20, 27) and (21, 26) pairs
+ * go through MULTIPLICATION_AND_ADD; the remaining terms are copied. */
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ stp1_4 = stp2_4;
+ stp1_7 = stp2_7;
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,
+ stp1_28)
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,
+ stp1_26)
+
+ stp1_22 = stp2_22;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_25 = stp2_25;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+
+ /* Stage6: add/sub butterflies on 0 .. 7 and on 16 .. 31; the (10, 13) and
+ * (11, 12) pairs go through MULTIPLICATION_AND_ADD; 8, 9, 14 and 15 are
+ * copied unchanged. */
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
+
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
+
+ stp2_8 = stp1_8;
+ stp2_9 = stp1_9;
+ stp2_14 = stp1_14;
+ stp2_15 = stp1_15;
+
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,
+ stp2_12)
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
+
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
+ }
+
+ /* Stage7: add/sub butterflies on 0 .. 15; the (20, 27), (21, 26), (22, 25)
+ * and (23, 24) pairs go through MULTIPLICATION_AND_ADD; 16 .. 19 and
+ * 28 .. 31 are copied unchanged. */
+ {
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+ stp1_18 = stp2_18;
+ stp1_19 = stp2_19;
+
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,
+ stp1_26)
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,
+ stp1_24)
+
+ stp1_28 = stp2_28;
+ stp1_29 = stp2_29;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+ /* Output butterfly, written back into in[]: in[k] = stp1_k + stp1_(31 - k)
+ * for k < 16 and in[k] = stp1_(31 - k) - stp1_k for k >= 16. */
+ in[0] = _mm_add_epi16(stp1_0, stp1_31);
+ in[1] = _mm_add_epi16(stp1_1, stp1_30);
+ in[2] = _mm_add_epi16(stp1_2, stp1_29);
+ in[3] = _mm_add_epi16(stp1_3, stp1_28);
+ in[4] = _mm_add_epi16(stp1_4, stp1_27);
+ in[5] = _mm_add_epi16(stp1_5, stp1_26);
+ in[6] = _mm_add_epi16(stp1_6, stp1_25);
+ in[7] = _mm_add_epi16(stp1_7, stp1_24);
+ in[8] = _mm_add_epi16(stp1_8, stp1_23);
+ in[9] = _mm_add_epi16(stp1_9, stp1_22);
+ in[10] = _mm_add_epi16(stp1_10, stp1_21);
+ in[11] = _mm_add_epi16(stp1_11, stp1_20);
+ in[12] = _mm_add_epi16(stp1_12, stp1_19);
+ in[13] = _mm_add_epi16(stp1_13, stp1_18);
+ in[14] = _mm_add_epi16(stp1_14, stp1_17);
+ in[15] = _mm_add_epi16(stp1_15, stp1_16);
+ in[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ in[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ in[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ in[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ in[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ in[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ in[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ in[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ in[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ in[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ in[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ in[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ in[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ in[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ in[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ in[31] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+/* Rounds 32 vectors of 16-bit values and passes them, one per destination row,
+ * to RECON_AND_STORE.
+ *
+ * in: 32 vectors, modified in place: 1 << 5 is added with signed saturation,
+ * then each lane is arithmetically shifted right by 6.
+ * dst: first destination row; advanced by `stride` after every vector.
+ * stride: distance in bytes between consecutive destination rows.
+ *
+ * The loop is unrolled by two, so each iteration handles in[j] and in[j + 1].
+ *
+ * NOTE(review): `zero` is never referenced by name here; presumably the
+ * RECON_AND_STORE macro (defined outside this chunk) picks it up - confirm.
+ */
+static INLINE void store_buffer_8x32(__m128i *in, uint8_t *dst, int stride) {
+ const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+ int j = 0;
+ while (j < 32) {
+ in[j] = _mm_adds_epi16(in[j], final_rounding);
+ in[j + 1] = _mm_adds_epi16(in[j + 1], final_rounding);
+
+ in[j] = _mm_srai_epi16(in[j], 6);
+ in[j + 1] = _mm_srai_epi16(in[j + 1], 6);
+
+ RECON_AND_STORE(dst, in[j]);
+ dst += stride;
+ RECON_AND_STORE(dst, in[j + 1]);
+ dst += stride;
+ j += 2;
+ }
+}
+/* Stores two 32-vector blocks next to each other: in0 starting at `dest` and
+ * in1 starting at `dest + 8`, both with the same row stride. Both input arrays
+ * are modified in place by store_buffer_8x32().
+ */
+static INLINE void recon_and_store(__m128i *in0, __m128i *in1, uint8_t *dest,
+ int stride) {
+ store_buffer_8x32(in0, dest, stride);
+ store_buffer_8x32(in1, dest + 8, stride);
+}
+/* Runs idct32_8x16_135() in place on col0 and then on col1. Each array supplies
+ * 16 input vectors and receives 32 output vectors, so each must hold 32. */
+static INLINE void idct32_135(__m128i *col0, __m128i *col1) {
+ idct32_8x16_135(col0);
+ idct32_8x16_135(col1);
+}
+
+/* Inverse 32x32 DCT with reconstruction for inputs in which only the
+ * upper-left 16x16 coefficients are non-zero.
+ *
+ * input: coefficient buffer; only its top-left 16x16 block is loaded.
+ * dest: destination buffer; results are stored starting at dest, dest + 8,
+ * dest + 16 and dest + 24.
+ * stride: row stride of dest, forwarded to recon_and_store().
+ *
+ * col0/col1 hold the first-pass output (32 vectors each); col2/col3 are used
+ * first as the load target and then as scratch for the second pass.
+ */
+void vpx_idct32x32_135_add_ssse3(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ __m128i col0[32], col1[32], col2[32], col3[32];
+
+ // Load input data. Only need to load the top left 16x16 block.
+ load_buffer_16x16(input, col2, col3);
+
+ // columns
+ array_transpose_16x16_2(col2, col3, col0, col1);
+ idct32_135(col0, col1);
+
+ /* rows: vectors 0 .. 15 of the first-pass output are transposed, transformed
+ * and stored at dest (recon_and_store also writes at dest + 8). */
+ array_transpose_16x16_2(col0, col1, col2, col3);
+ idct32_135(col2, col3);
+ recon_and_store(col2, col3, dest, stride);
+
+ array_transpose_16x16_2(&col0[16], &col1[16], col2, col3);  // vectors 16 .. 31
+ idct32_135(col2, col3);
+ recon_and_store(col2, col3, dest + 16, stride);
+}
+
+// For each 8x32 block __m128i in[32], output __m128i in[32]
+static void idct32_8x32(const __m128i *in, __m128i *out) {
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22, stp1_23,
+ stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29, stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22, stp2_23,
+ stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29, stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+ /* Stage1 */
+ {
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in[1], in[31]);
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in[1], in[31]);
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in[17], in[15]);
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in[17], in[15]);
+
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in[9], in[23]);
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in[9], in[23]);
+ const __m128i lo_25_7 = _mm_unpacklo_epi16(in[25], in[7]);
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in[25], in[7]);
+
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in[5], in[27]);
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in[5], in[27]);
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in[21], in[11]);
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in[21], in[11]);
+
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in[13], in[19]);
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in[13], in[19]);
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in[29], in[3]);
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in[29], in[3]);
+
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0, stg1_1,
+ stg1_2, stg1_3, stp1_16, stp1_31, stp1_17, stp1_30)
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4, stg1_5,
+ stg1_6, stg1_7, stp1_18, stp1_29, stp1_19, stp1_28)
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8, stg1_9,
+ stg1_10, stg1_11, stp1_20, stp1_27, stp1_21, stp1_26)
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25, stp1_23,
+ stp1_24)
+ }
+
+ /* Stage2 */
+ {
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in[2], in[30]);
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in[2], in[30]);
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in[18], in[14]);
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in[18], in[14]);
+
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in[10], in[22]);
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in[10], in[22]);
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in[26], in[6]);
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in[26], in[6]);
+
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0, stg2_1,
+ stg2_2, stg2_3, stp2_8, stp2_15, stp2_9, stp2_14)
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4, stg2_5,
+ stg2_6, stg2_7, stp2_10, stp2_13, stp2_11, stp2_12)
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+ }
+
+ /* Stage3 */
+ {
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in[4], in[28]);
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in[4], in[28]);
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in[20], in[12]);
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in[20], in[12]);
+
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0, stg3_1,
+ stg3_2, stg3_3, stp1_4, stp1_7, stp1_5, stp1_6)
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30, stp1_18,
+ stp1_29)
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26, stp1_22,
+ stp1_25)
+
+ stp1_16 = stp2_16;
+ stp1_31 = stp2_31;
+ stp1_19 = stp2_19;
+ stp1_20 = stp2_20;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_27 = stp2_27;
+ stp1_28 = stp2_28;
+ }
+
+ /* Stage4 */
+ {
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in[0], in[16]);
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in[0], in[16]);
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in[8], in[24]);
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in[8], in[24]);
+
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0, stg4_1,
+ stg4_2, stg4_3, stp2_0, stp2_1, stp2_2, stp2_3)
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4, stg4_5,
+ stg4_6, stg4_4, stp2_9, stp2_14, stp2_10, stp2_13)
+
+ stp2_8 = stp1_8;
+ stp2_15 = stp1_15;
+ stp2_11 = stp1_11;
+ stp2_12 = stp1_12;
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
+ }
+
+ /* Stage5 */
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ stp1_4 = stp2_4;
+ stp1_7 = stp2_7;
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29, stp1_19,
+ stp1_28)
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27, stp1_21,
+ stp1_26)
+
+ stp1_22 = stp2_22;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_25 = stp2_25;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+
+ /* Stage6 */
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
+
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
+
+ stp2_8 = stp1_8;
+ stp2_9 = stp1_9;
+ stp2_14 = stp1_14;
+ stp2_15 = stp1_15;
+
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp2_10, stp2_13, stp2_11,
+ stp2_12)
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
+
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
+ }
+
+ /* Stage7 */
+ {
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+ stp1_18 = stp2_18;
+ stp1_19 = stp2_19;
+
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27, stp1_21,
+ stp1_26)
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25, stp1_23,
+ stp1_24)
+
+ stp1_28 = stp2_28;
+ stp1_29 = stp2_29;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+
+ out[0] = _mm_add_epi16(stp1_0, stp1_31);
+ out[1] = _mm_add_epi16(stp1_1, stp1_30);
+ out[2] = _mm_add_epi16(stp1_2, stp1_29);
+ out[3] = _mm_add_epi16(stp1_3, stp1_28);
+ out[4] = _mm_add_epi16(stp1_4, stp1_27);
+ out[5] = _mm_add_epi16(stp1_5, stp1_26);
+ out[6] = _mm_add_epi16(stp1_6, stp1_25);
+ out[7] = _mm_add_epi16(stp1_7, stp1_24);
+ out[8] = _mm_add_epi16(stp1_8, stp1_23);
+ out[9] = _mm_add_epi16(stp1_9, stp1_22);
+ out[10] = _mm_add_epi16(stp1_10, stp1_21);
+ out[11] = _mm_add_epi16(stp1_11, stp1_20);
+ out[12] = _mm_add_epi16(stp1_12, stp1_19);
+ out[13] = _mm_add_epi16(stp1_13, stp1_18);
+ out[14] = _mm_add_epi16(stp1_14, stp1_17);
+ out[15] = _mm_add_epi16(stp1_15, stp1_16);
+ out[16] = _mm_sub_epi16(stp1_15, stp1_16);
+ out[17] = _mm_sub_epi16(stp1_14, stp1_17);
+ out[18] = _mm_sub_epi16(stp1_13, stp1_18);
+ out[19] = _mm_sub_epi16(stp1_12, stp1_19);
+ out[20] = _mm_sub_epi16(stp1_11, stp1_20);
+ out[21] = _mm_sub_epi16(stp1_10, stp1_21);
+ out[22] = _mm_sub_epi16(stp1_9, stp1_22);
+ out[23] = _mm_sub_epi16(stp1_8, stp1_23);
+ out[24] = _mm_sub_epi16(stp1_7, stp1_24);
+ out[25] = _mm_sub_epi16(stp1_6, stp1_25);
+ out[26] = _mm_sub_epi16(stp1_5, stp1_26);
+ out[27] = _mm_sub_epi16(stp1_4, stp1_27);
+ out[28] = _mm_sub_epi16(stp1_3, stp1_28);
+ out[29] = _mm_sub_epi16(stp1_2, stp1_29);
+ out[30] = _mm_sub_epi16(stp1_1, stp1_30);
+ out[31] = _mm_sub_epi16(stp1_0, stp1_31);
+}
+
+// Fills in[0..31] from 8 consecutive rows of `input`, where rows are 32
+// coefficients apart. For each row r, four loads are taken at column offsets
+// 0, 8, 16 and 24 and stored to in[r], in[r + 8], in[r + 16] and in[r + 24].
+// NOTE(review): presumably each load_input_data() call yields one __m128i of
+// eight 16-bit coefficients -- confirm against its definition.
+static void load_buffer_8x32(const tran_low_t *input, __m128i *in) {
+  int row, blk;
+  for (row = 0; row < 8; ++row, input += 32) {
+    for (blk = 0; blk < 4; ++blk) {
+      const int off = blk << 3;  // 0, 8, 16, 24
+      in[row + off] = load_input_data(input + off);
+    }
+  }
+}
+
+// Two-pass 32x32 inverse transform built from idct32_8x32().
+//   input:  coefficient block, consumed as 4 bands of 8 rows x 32 columns.
+//   dest:   output pixels, written by store_buffer_8x32() in 4 strips that
+//           are 8 bytes apart.
+//   stride: passed through unchanged to store_buffer_8x32().
+void vpx_idct32x32_1024_add_ssse3(const tran_low_t *input, uint8_t *dest,
+                                  int stride) {
+  __m128i col[128], in[32];
+  __m128i *src;
+  int band, k;
+
+  // Row pass: each band of 8 rows is loaded, transposed one 8x8 tile at a
+  // time (32x8 -> 8x32), and transformed into its own 32-entry slice of col[].
+  for (band = 0; band < 4; ++band) {
+    load_buffer_8x32(input, in);
+    input += 8 * 32;
+
+    for (k = 0; k < 32; k += 8) {
+      array_transpose_8x8(in + k, in + k);
+    }
+
+    idct32_8x32(in, col + 32 * band);
+  }
+
+  // Column pass: gather the same 8-entry window from each of the four row
+  // pass slices, transposing every 8x8 tile on the way into in[].
+  for (band = 0; band < 4; ++band) {
+    src = col + 8 * band;
+
+    for (k = 0; k < 4; ++k) {
+      array_transpose_8x8(src + 32 * k, in + 8 * k);
+    }
+
+    // The transform runs in place here, as in the original code.
+    idct32_8x32(in, in);
+    store_buffer_8x32(in, dest, stride);
+    dest += 8;
+  }
+}
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
deleted file mode 100644
index dee64e3ad36..00000000000
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm
+++ /dev/null
@@ -1,1793 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides SSSE3 version of the inverse transformation. Part
-; of the functions are originally derived from the ffmpeg project.
-; Note that the current version applies to x86 64-bit only.
-
-SECTION_RODATA
-
-; pw_<c>x2 tables: eight 16-bit words, each holding 2 * c ("m" in the name
-; marks a negative constant; extra underscores only pad the label width).
-; They are used as pmulhrsw operands below. pmulhrsw computes
-; (a * b + 0x4000) >> 15, so multiplying by 2 * c gives (a * c + 8192) >> 14,
-; i.e. the same rounding and shift as pd_8192 + "psrad 14" in MUL_ADD_2X.
-pw_11585x2: times 8 dw 23170
-
-pw_m2404x2: times 8 dw -2404*2
-pw_m4756x2: times 8 dw -4756*2
-pw_m5520x2: times 8 dw -5520*2
-pw_m8423x2: times 8 dw -8423*2
-pw_m9102x2: times 8 dw -9102*2
-pw_m10394x2: times 8 dw -10394*2
-pw_m11003x2: times 8 dw -11003*2
-
-pw_16364x2: times 8 dw 16364*2
-pw_16305x2: times 8 dw 16305*2
-pw_16207x2: times 8 dw 16207*2
-pw_16069x2: times 8 dw 16069*2
-pw_15893x2: times 8 dw 15893*2
-pw_15679x2: times 8 dw 15679*2
-pw_15426x2: times 8 dw 15426*2
-pw_15137x2: times 8 dw 15137*2
-pw_14811x2: times 8 dw 14811*2
-pw_14449x2: times 8 dw 14449*2
-pw_14053x2: times 8 dw 14053*2
-pw_13623x2: times 8 dw 13623*2
-pw_13160x2: times 8 dw 13160*2
-pw_12665x2: times 8 dw 12665*2
-pw_12140x2: times 8 dw 12140*2
-pw__9760x2: times 8 dw 9760*2
-pw__7723x2: times 8 dw 7723*2
-pw__7005x2: times 8 dw 7005*2
-pw__6270x2: times 8 dw 6270*2
-pw__3981x2: times 8 dw 3981*2
-pw__3196x2: times 8 dw 3196*2
-pw__1606x2: times 8 dw 1606*2
-pw___804x2: times 8 dw 804*2
-
-; pd_8192: 32-bit rounding term (1 << 13) added before the ">> 14" in
-; MUL_ADD_2X. pw_32 / pw_16: 16-bit rounding terms added before the final
-; ">> 6" (RECON_AND_STORE) and ">> 5" (ADD_STORE_8P_2X) respectively.
-pd_8192: times 4 dd 8192
-pw_32: times 8 dw 32
-pw_16: times 8 dw 16
-
-; TRANSFORM_COEFFS a, b
-; Emits three 8-word tables of interleaved coefficient pairs for use as
-; pmaddwd operands in BUTTERFLY_4X / BUTTERFLY_4Xmm:
-;   pw_a_b   = ( a,  b) repeated 4 times
-;   pw_mb_a  = (-b,  a) repeated 4 times
-;   pw_ma_mb = (-a, -b) repeated 4 times
-%macro TRANSFORM_COEFFS 2
-pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
-pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
-pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
-%endmacro
-
-; pairs used by the butterflies in IDCT8_1D (and reused by the 32x32 macros)
-TRANSFORM_COEFFS 6270, 15137
-TRANSFORM_COEFFS 3196, 16069
-TRANSFORM_COEFFS 13623, 9102
-
-; constants for 32x32_34
-TRANSFORM_COEFFS 804, 16364
-TRANSFORM_COEFFS 15426, 5520
-TRANSFORM_COEFFS 3981, 15893
-TRANSFORM_COEFFS 16207, 2404
-TRANSFORM_COEFFS 1606, 16305
-TRANSFORM_COEFFS 15679, 4756
-TRANSFORM_COEFFS 11585, 11585
-
-; constants for 32x32_1024
-TRANSFORM_COEFFS 12140, 11003
-TRANSFORM_COEFFS 7005, 14811
-TRANSFORM_COEFFS 14053, 8423
-TRANSFORM_COEFFS 9760, 13160
-TRANSFORM_COEFFS 12665, 10394
-TRANSFORM_COEFFS 7723, 14449
-
-; PAIR_PP_COEFFS a, b
-; Emits dpw_a_b: four words of a (low half) followed by four words of b
-; (high half), so one multiply applies a different constant to each half.
-%macro PAIR_PP_COEFFS 2
-dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
-%endmacro
-
-; PAIR_MP_COEFFS a, b
-; Emits dpw_ma_b: four words of -a (low half) followed by four words of b
-; (high half).
-%macro PAIR_MP_COEFFS 2
-dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
-%endmacro
-
-; PAIR_MM_COEFFS a, b
-; Emits dpw_ma_mb: four words of -a (low half) followed by four words of -b
-; (high half).
-%macro PAIR_MM_COEFFS 2
-dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
-%endmacro
-
-; Half-and-half tables used with pmulhrsw in idct8x8_12_add. The values are
-; already doubled: 30274 = 2*15137, 12540 = 2*6270, 6392 = 2*3196,
-; 32138 = 2*16069, 18204 = 2*9102, 27246 = 2*13623.
-PAIR_PP_COEFFS 30274, 12540
-PAIR_PP_COEFFS 6392, 32138
-PAIR_MP_COEFFS 18204, 27246
-
-; Same constants with both halves equal (used in the second pass).
-PAIR_PP_COEFFS 12540, 12540
-PAIR_PP_COEFFS 30274, 30274
-PAIR_PP_COEFFS 6392, 6392
-PAIR_PP_COEFFS 32138, 32138
-PAIR_MM_COEFFS 18204, 18204
-PAIR_PP_COEFFS 27246, 27246
-
-SECTION .text
-
-%if ARCH_X86_64
-; SUM_SUB a, b, tmp
-; 16-bit add/subtract butterfly (wrapping paddw/psubw, no saturation).
-; On exit: m<a> = a + b, m<b> = a - b. The difference is computed into the
-; tmp register and then given the name <b> via SWAP (x86inc register
-; renaming), so the register now called m<tmp> is clobbered.
-%macro SUM_SUB 3
- psubw m%3, m%1, m%2
- paddw m%1, m%2
- SWAP %2, %3
-%endmacro
-
-; butterfly operation
-; For each of the two destinations: dst = (pmaddwd(src, coefs) + round) >> 14
-; (arithmetic shift), producing four 32-bit results per register.
-; dst1 is written before src is last read, so dst2 may alias src but dst1
-; must not.
-%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
- pmaddwd m%1, m%3, %5
- pmaddwd m%2, m%3, %6
- paddd m%1, %4
- paddd m%2, %4
- psrad m%1, 14
- psrad m%2, 14
-%endmacro
-
-; Rotation butterfly on two registers of eight 16-bit values (in place).
-; With a = m<dst1>, b = m<dst2> on entry, on exit:
-;   m<dst1> = (a * coef1 - b * coef2 + round) >> 14
-;   m<dst2> = (a * coef2 + b * coef1 + round) >> 14
-; Inputs are interleaved as (b, a) word pairs and multiplied with pmaddwd
-; against pw_m<coef2>_<coef1> and pw_<coef1>_<coef2>; the 32-bit results are
-; packed back to 16 bits with signed saturation. tmp1 and tmp2 are clobbered.
-%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-; Same as BUTTERFLY_4X except the second output uses the negated pair table
-; pw_m<coef1>_m<coef2>. With a = m<dst1>, b = m<dst2> on entry, on exit:
-;   m<dst1> = ( a * coef1 - b * coef2 + round) >> 14
-;   m<dst2> = (-a * coef2 - b * coef1 + round) >> 14
-; tmp1 and tmp2 are clobbered.
-%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
- punpckhwd m%6, m%2, m%1
- MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
- punpcklwd m%2, m%1
- MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
- packssdw m%1, m%7
- packssdw m%2, m%6
-%endmacro
-
-; matrix transpose
-; INTERLEAVE_2X size, a, b, tmp
-; size is the punpck suffix (wd, dq or qdq). On exit m<a> holds the low-half
-; interleave of (a, b) and m<b> holds the high-half interleave; the register
-; now named m<tmp> is clobbered (names exchanged with SWAP).
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
-; Transposes an 8x8 matrix of 16-bit words held in eight registers, using
-; three rounds of interleaves (word, dword, qword) followed by two SWAPs that
-; put the rows back in order. The ninth argument is a scratch register.
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-; One-dimensional 8-point inverse DCT applied in place to m0-m7.
-; Register contract (set up by the callers in this file):
-;   m8  = 32-bit rounding constant (pd_8192) for the BUTTERFLY_4X calls
-;   m12 = pw_11585x2, the pmulhrsw multiplier
-;   m9, m10 = scratch
-; The trailing SWAPs rename registers so results end up in order in m0-m7.
-%macro IDCT8_1D 0
- SUM_SUB 0, 4, 9
- BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
- pmulhrsw m0, m12
- pmulhrsw m4, m12
- BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
- BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
-
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
- SUM_SUB 0, 6, 9
- SUM_SUB 4, 2, 9
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 4, 3, 9
- SUM_SUB 2, 5, 9
- SUM_SUB 6, 1, 9
-
- SWAP 3, 6
- SWAP 1, 4
-%endmacro
-
-; This macro handles 8 pixels per line
-; Reconstructs two output rows of 8 pixels each:
-;   1. round the residuals: (src + m11) >> 5 (m11 is pw_16 in the callers)
-;   2. load 8 bytes from [outputq] and [outputq + strideq] and widen them to
-;      words by interleaving with the zero register
-;   3. add, pack back to bytes with unsigned saturation, store 8 bytes per row
-; src1/src2 are modified; tmp1/tmp2 are clobbered; zero must hold all zeros.
-%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
- paddw m%1, m11
- paddw m%2, m11
- psraw m%1, 5
- psraw m%2, 5
-
- movh m%3, [outputq]
- movh m%4, [outputq + strideq]
- punpcklbw m%3, m%5
- punpcklbw m%4, m%5
- paddw m%3, m%1
- paddw m%4, m%2
- packuswb m%3, m%5
- packuswb m%4, m%5
- movh [outputq], m%3
- movh [outputq + strideq], m%4
-%endmacro
-
-INIT_XMM ssse3
-; full inverse 8x8 2D-DCT transform
-; Arguments: input (coefficients), output (pixels, updated in place), stride.
-; Flow: load 8 rows -> transpose -> IDCT8_1D -> transpose -> IDCT8_1D ->
-; round, add to the existing pixels and store, two rows at a time.
-cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192] ; rounding for BUTTERFLY_4X
- mova m11, [pw_16] ; rounding for the final >> 5
- mova m12, [pw_11585x2] ; pmulhrsw multiplier used by IDCT8_1D
-
- lea r3, [2 * strideq] ; r3 = byte step for two output rows
-%if CONFIG_VP9_HIGHBITDEPTH
- ; Each row of 8 coefficients spans 32 bytes here (two 16-byte loads);
- ; packssdw narrows the dwords to words with signed saturation.
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
- mova m4, [inputq + 128]
- packssdw m4, [inputq + 144]
- mova m5, [inputq + 160]
- packssdw m5, [inputq + 176]
- mova m6, [inputq + 192]
- packssdw m6, [inputq + 208]
- mova m7, [inputq + 224]
- packssdw m7, [inputq + 240]
-%else
- ; One 16-byte load per row of eight 16-bit coefficients.
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
- mova m4, [inputq + 64]
- mova m5, [inputq + 80]
- mova m6, [inputq + 96]
- mova m7, [inputq + 112]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- IDCT8_1D
-
- pxor m12, m12 ; m12 is reused as the zero register for ADD_STORE_8P_2X
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
-; Arguments: input (coefficients), output (pixels, updated in place), stride.
-; Only the first four input rows are loaded, and only their low four words
-; are used (punpcklwd), so the first pass works on the top-left 4x4 corner.
-; It relies on INIT_XMM ssse3 still being in effect from the function above.
-cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
- mova m8, [pd_8192]
- mova m11, [pw_16] ; rounding for the final >> 5
- mova m12, [pw_11585x2] ; pmulhrsw multiplier
-
- lea r3, [2 * strideq] ; r3 = byte step for two output rows
-
-%if CONFIG_VP9_HIGHBITDEPTH
- ; Rows span 32 bytes each here; packssdw narrows dwords to words with
- ; signed saturation.
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
- mova m2, [inputq + 64]
- packssdw m2, [inputq + 80]
- mova m3, [inputq + 96]
- packssdw m3, [inputq + 112]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
- mova m2, [inputq + 32]
- mova m3, [inputq + 48]
-%endif
-
- ; 4x4 transpose of the low words of rows 0-3
- punpcklwd m0, m1
- punpcklwd m2, m3
- punpckhdq m9, m0, m2
- punpckldq m0, m2
- SWAP 2, 9
-
- ; m0 -> [0], [0]
- ; m1 -> [1], [1]
- ; m2 -> [2], [2]
- ; m3 -> [3], [3]
- punpckhqdq m10, m0, m0
- punpcklqdq m0, m0
- punpckhqdq m9, m2, m2
- punpcklqdq m2, m2
- SWAP 1, 10
- SWAP 3, 9
-
- ; first pass: each register carries the same column in both halves, so the
- ; half-and-half dpw_* tables apply two different constants per multiply
- pmulhrsw m0, m12
- pmulhrsw m2, [dpw_30274_12540]
- pmulhrsw m1, [dpw_6392_32138]
- pmulhrsw m3, [dpw_m18204_27246]
-
- SUM_SUB 0, 2, 9
- SUM_SUB 1, 3, 9
-
- punpcklqdq m9, m3, m3
- punpckhqdq m5, m3, m9
-
- SUM_SUB 3, 5, 9
- punpckhqdq m5, m3
- pmulhrsw m5, m12
-
- punpckhqdq m9, m1, m5
- punpcklqdq m1, m5
- SWAP 5, 9
-
- SUM_SUB 0, 5, 9
- SUM_SUB 2, 1, 9
-
- ; rearrange the first-pass results for the second pass
- punpckhqdq m3, m0, m0
- punpckhqdq m4, m1, m1
- punpckhqdq m6, m5, m5
- punpckhqdq m7, m2, m2
-
- punpcklwd m0, m3
- punpcklwd m7, m2
- punpcklwd m1, m4
- punpcklwd m6, m5
-
- punpckhdq m4, m0, m7
- punpckldq m0, m7
- punpckhdq m10, m1, m6
- punpckldq m5, m1, m6
-
- punpckhqdq m1, m0, m5
- punpcklqdq m0, m5
- punpckhqdq m3, m4, m10
- punpcklqdq m2, m4, m10
-
-
- ; second pass on full 8-word registers
- pmulhrsw m0, m12
- pmulhrsw m6, m2, [dpw_30274_30274]
- pmulhrsw m4, m2, [dpw_12540_12540]
-
- pmulhrsw m7, m1, [dpw_32138_32138]
- pmulhrsw m1, [dpw_6392_6392]
- pmulhrsw m5, m3, [dpw_m18204_m18204]
- pmulhrsw m3, [dpw_27246_27246]
-
- mova m2, m0
- SUM_SUB 0, 6, 9
- SUM_SUB 2, 4, 9
- SUM_SUB 1, 5, 9
- SUM_SUB 7, 3, 9
-
- SUM_SUB 3, 5, 9
- pmulhrsw m3, m12
- pmulhrsw m5, m12
-
- SUM_SUB 0, 7, 9
- SUM_SUB 2, 3, 9
- SUM_SUB 4, 5, 9
- SUM_SUB 6, 1, 9
-
- ; rename registers so the output rows are in order in m0-m7
- SWAP 3, 6
- SWAP 1, 2
- SWAP 2, 4
-
-
- pxor m12, m12 ; m12 is reused as the zero register for ADD_STORE_8P_2X
- ADD_STORE_8P_2X 0, 1, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 2, 3, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 4, 5, 9, 10, 12
- lea outputq, [outputq + r3]
- ADD_STORE_8P_2X 6, 7, 9, 10, 12
-
- RET
-
-; idxN = byte offset of 16-byte slot (N mod 8) inside an 8-slot group.
-; The 32x32 macros address a stage value as [stp + <group base> + idxN],
-; where the group base for each run of eight indices (0-7, 8-15, 16-23,
-; 24-31) is passed in as macro arguments %1-%4.
-%define idx0 16 * 0
-%define idx1 16 * 1
-%define idx2 16 * 2
-%define idx3 16 * 3
-%define idx4 16 * 4
-%define idx5 16 * 5
-%define idx6 16 * 6
-%define idx7 16 * 7
-%define idx8 16 * 0
-%define idx9 16 * 1
-%define idx10 16 * 2
-%define idx11 16 * 3
-%define idx12 16 * 4
-%define idx13 16 * 5
-%define idx14 16 * 6
-%define idx15 16 * 7
-%define idx16 16 * 0
-%define idx17 16 * 1
-%define idx18 16 * 2
-%define idx19 16 * 3
-%define idx20 16 * 4
-%define idx21 16 * 5
-%define idx22 16 * 6
-%define idx23 16 * 7
-%define idx24 16 * 0
-%define idx25 16 * 1
-%define idx26 16 * 2
-%define idx27 16 * 3
-%define idx28 16 * 4
-%define idx29 16 * 5
-%define idx30 16 * 6
-%define idx31 16 * 7
-
-; FROM idct32x32_add_neon.asm
-;
-; Instead of doing the transforms stage by stage, it is done by loading
-; some input values and doing as many stages as possible to minimize the
-; storing/loading of intermediate results. To fit within registers, the
-; final coefficients are cut into four blocks:
-; BLOCK A: 16-19,28-31
-; BLOCK B: 20-23,24-27
-; BLOCK C: 8-11,12-15
-; BLOCK D: 0-3,4-7
-; Blocks A and C are straight calculation through the various stages. In
-; block B, further calculations are performed using the results from
-; block A. In block D, further calculations are performed using the results
-; from block C and then the final calculations are done using results from
-; block A and B which have been combined at the end of block B.
-;
-
-; IDCT32X32_34 base0, base1, base2, base3
-; One-dimensional 32-point inverse DCT that consumes only input vectors 0-7.
-; On entry m0-m7 hold those eight vectors, r4 points at the transposed_in
-; scratch area (the even vectors are spilled there and reloaded through
-; [rsp + transposed_in + ...]), m8 holds the pd_8192 rounding constant and
-; stp is the output base pointer.
-; The four arguments are byte offsets from stp for the output groups
-; 0-7 (%1), 8-15 (%2), 16-23 (%3) and 24-31 (%4); the groups at %3 and %4
-; are also used for intermediate spills. m9 and m10 are scratch throughout.
-%macro IDCT32X32_34 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- mova [r4 + 0], m0
- pmulhrsw m11, [pw_16364x2] ; stp2_31
- mova [r4 + 16 * 2], m2
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- mova [r4 + 16 * 4], m4
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
- mova [r4 + 16 * 6], m6
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, m1 ; stp1_16
- mova m0, m11 ; stp1_31
- mova m4, m7 ; stp1_28
- mova m15, m12 ; stp1_19
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m15
- pmulhrsw m6, [pw_15893x2] ; stp2_27
- mova [stp + %4 + idx30], m2
- mova m2, m3
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- mova [stp + %4 + idx31], m11
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m13, m5 ; stp1_20
- mova m14, m6 ; stp1_27
- mova m15, m3 ; stp1_23
- mova m11, m2 ; stp1_24
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
- SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m10, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m10
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 11, 15, 9
- pmulhrsw m11, m10 ; stp1_25
- pmulhrsw m15, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_24
- pmulhrsw m3, m10 ; stp1_23
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 11, 15
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
-
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m11
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m6, [rsp + transposed_in + 16 * 6]
-
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- pmulhrsw m1, [pw_16305x2] ; stp2_15
- mova [stp + %3 + idx22], m15
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- mova [stp + %3 + idx23], m3
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m3, m0 ; stp1_8
- mova m2, m1 ; stp1_15
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- mova m4, m7 ; stp1_11
- mova m5, m6 ; stp1_12
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
-
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m10, [pw_11585x2]
- pmulhrsw m0, m10 ; stp1_1
-
- mova m14, m11 ; stp1_4
- mova m13, m12 ; stp1_7
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m7, m0 ; stp1_0 = stp1_1
- mova m4, m0 ; stp1_1
- mova m2, m7 ; stp1_0
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
- SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m15, [stp + %4 + idx30]
- mova m10, [stp + %4 + idx31]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m7
- mova [stp + %4 + idx30], m15
- mova [stp + %4 + idx31], m10
- mova m7, [stp + %4 + idx28]
- mova m0, [stp + %4 + idx29]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m4
- mova [stp + %4 + idx28], m7
- mova [stp + %4 + idx29], m0
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m4, [stp + %3 + idx19]
- SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m4
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m0, [stp + %4 + idx27]
- mova m1, [stp + %4 + idx26]
- mova m2, [stp + %4 + idx25]
- mova m3, [stp + %4 + idx24]
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- mova [stp + %4 + idx27], m0
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx24], m3
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-; RECON_AND_STORE offset
-; Final reconstruction for a 32-pixel-wide block, one row per iteration for
-; 32 iterations. %1 is the byte offset from rsp of the transform output.
-; Each iteration reads four vectors that lie 16*32 bytes apart (eight 16-bit
-; residuals each), rounds them with (x + 32) >> 6, adds them to the 32 bytes
-; at [outputq], packs with unsigned saturation and writes the row back.
-; Clobbers stp, r6, m0-m8 and m11, and advances outputq by 32 * strideq.
-; NOTE(review): the row is stored with mova, which presumably requires
-; outputq to be 16-byte aligned on every row -- confirm against callers.
-%macro RECON_AND_STORE 1
- mova m11, [pw_32] ; rounding for >> 6
- lea stp, [rsp + %1]
- mov r6, 32 ; row counter
- pxor m8, m8 ; zero register for byte -> word widening
-%%recon_and_store:
- mova m0, [stp + 16 * 32 * 0]
- mova m1, [stp + 16 * 32 * 1]
- mova m2, [stp + 16 * 32 * 2]
- mova m3, [stp + 16 * 32 * 3]
- add stp, 16 ; next row of residuals
-
- paddw m0, m11
- paddw m1, m11
- paddw m2, m11
- paddw m3, m11
- psraw m0, 6
- psraw m1, 6
- psraw m2, 6
- psraw m3, 6
- movh m4, [outputq + 0]
- movh m5, [outputq + 8]
- movh m6, [outputq + 16]
- movh m7, [outputq + 24]
- punpcklbw m4, m8
- punpcklbw m5, m8
- punpcklbw m6, m8
- punpcklbw m7, m8
- paddw m0, m4
- paddw m1, m5
- paddw m2, m6
- paddw m3, m7
- packuswb m0, m1
- packuswb m2, m3
- mova [outputq + 0], m0
- mova [outputq + 16], m2
- lea outputq, [outputq + strideq]
- dec r6
- jnz %%recon_and_store
-%endmacro
-
-; Stack layout for the 32x32 functions (byte offsets from rsp).
-; pass_one_start and pass_two_start are both 0, so the second pass
-; overwrites the first-pass results in place; transposed_in is the scratch
-; area for transposed input vectors; stp (r8) is the working base pointer.
-%define i32x32_size 16*32*5
-%define pass_two_start 16*32*0
-%define transposed_in 16*32*4
-%define pass_one_start 16*32*0
-%define stp r8
-
-INIT_XMM ssse3
-; Inverse 32x32 transform for blocks where only one 8x8 group of
-; coefficients is read from the input.
-; Arguments: input (coefficients), output (pixels, updated in place), stride.
-; Pass one runs IDCT32X32_34 once on that 8x8 group; pass two runs it four
-; times, once per group of eight first-pass vectors; RECON_AND_STORE then
-; rounds the result and adds it to the output pixels.
-cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192] ; rounding for the BUTTERFLY_4X macros
- lea stp, [rsp + pass_one_start]
-
-idct32x32_34:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
-
-idct32x32_34_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- ; Loads 8 coefficients from each of 8 lines that are 32 * 4 bytes apart
- ; and narrows them to 16-bit words with signed saturation.
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- ; Loads the first 16 bytes of each of 8 lines that are 16 * 4 bytes apart.
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- ; Pass one: output groups 0-7 / 8-15 / 16-23 / 24-31 land 16*32 bytes apart.
- IDCT32X32_34 16*0, 16*32, 16*64, 16*96
- ; NOTE(review): the next lea has no effect; stp is reloaded two
- ; instructions later before being used.
- lea stp, [stp + 16 * 8]
- mov r6, 4 ; pass-two iteration count
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_34_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
-
-idct32x32_34_transpose_2:
- ; Pass two: take eight consecutive first-pass vectors from the current group.
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- ; The 32 outputs are written contiguously starting at stp.
- IDCT32X32_34 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32] ; next output group
- add r9, 16 * 32 ; next input group
- dec r6
- jnz idct32x32_34_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_135 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, m1
- pmulhrsw m1, [pw___804x2] ; stp1_16
- pmulhrsw m11, [pw_16364x2] ; stp2_31
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, m7
- pmulhrsw m7, [pw_15426x2] ; stp1_28
- pmulhrsw m12, [pw_m5520x2] ; stp2_19
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, m3
- pmulhrsw m3, [pw__7005x2] ; stp1_18
- pmulhrsw m4, [pw_14811x2] ; stp2_29
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, m0
- pmulhrsw m0, [pw_12140x2] ; stp1_30
- pmulhrsw m2, [pw_m11003x2] ; stp2_17
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, m2
- pmulhrsw m3, [pw_m2404x2] ; stp1_23
- pmulhrsw m2, [pw_16207x2] ; stp2_24
-
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, m5
- pmulhrsw m5, [pw__3981x2] ; stp1_20
- pmulhrsw m6, [pw_15893x2] ; stp2_27
-
- mova m14, [rsp + transposed_in + 16 * 11]
- mova m13, m14
- pmulhrsw m13, [pw_m8423x2] ; stp1_21
- pmulhrsw m14, [pw_14053x2] ; stp2_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, m0
- pmulhrsw m0, [pw__9760x2] ; stp1_22
- pmulhrsw m1, [pw_13160x2] ; stp2_25
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, m0
- pmulhrsw m0, [pw__1606x2] ; stp1_8
- pmulhrsw m1, [pw_16305x2] ; stp2_15
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, m6
- pmulhrsw m7, [pw_m4756x2] ; stp2_11
- pmulhrsw m6, [pw_15679x2] ; stp1_12
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, m4
- pmulhrsw m4, [pw__7723x2] ; stp1_10
- pmulhrsw m5, [pw_14449x2] ; stp2_13
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, m2
- pmulhrsw m3, [pw_m10394x2] ; stp1_9
- pmulhrsw m2, [pw_12665x2] ; stp2_14
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, m11
- pmulhrsw m11, [pw__3196x2] ; stp1_4
- pmulhrsw m12, [pw_16069x2] ; stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, m13
- pmulhrsw m13, [pw_13623x2] ; stp1_6
- pmulhrsw m14, [pw_m9102x2] ; stp1_5
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m2, [rsp + transposed_in + 16 * 8]
- pmulhrsw m0, [pw_11585x2] ; stp1_1
- mova m3, m2
- pmulhrsw m2, [pw__6270x2] ; stp1_2
- pmulhrsw m3, [pw_15137x2] ; stp1_3
-
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- mova m1, m0 ; stp1_0 = stp1_1
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 2
- lea stp, [rsp + pass_one_start]
-
-idct32x32_135:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 2
-
-idct32x32_135_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose
-
- IDCT32X32_135 16*0, 16*32, 16*64, 16*96
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_135
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_135_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 2
-
-idct32x32_135_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_135_transpose_2
-
- IDCT32X32_135 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_135_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-
-%macro IDCT32X32_1024 4
- ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m1, [rsp + transposed_in + 16 * 1]
- mova m11, [rsp + transposed_in + 16 * 31]
- BUTTERFLY_4X 1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31
-
- mova m0, [rsp + transposed_in + 16 * 15]
- mova m2, [rsp + transposed_in + 16 * 17]
- BUTTERFLY_4X 2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30
-
- mova m7, [rsp + transposed_in + 16 * 7]
- mova m12, [rsp + transposed_in + 16 * 25]
- BUTTERFLY_4X 12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28
-
- mova m3, [rsp + transposed_in + 16 * 9]
- mova m4, [rsp + transposed_in + 16 * 23]
- BUTTERFLY_4X 3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29
-
- ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
- SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
- SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
- SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
-
- ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
- BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
-
- ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
- SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
- SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
- SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
-
- ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
- BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
-
- mova [stp + %3 + idx16], m1
- mova [stp + %3 + idx17], m0
- mova [stp + %3 + idx18], m4
- mova [stp + %3 + idx19], m7
- mova [stp + %4 + idx28], m12
- mova [stp + %4 + idx29], m3
- mova [stp + %4 + idx30], m2
- mova [stp + %4 + idx31], m11
-
- ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m5, [rsp + transposed_in + 16 * 5]
- mova m6, [rsp + transposed_in + 16 * 27]
- BUTTERFLY_4X 5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27
-
- mova m13, [rsp + transposed_in + 16 * 21]
- mova m14, [rsp + transposed_in + 16 * 11]
- BUTTERFLY_4X 13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26
-
- mova m0, [rsp + transposed_in + 16 * 13]
- mova m1, [rsp + transposed_in + 16 * 19]
- BUTTERFLY_4X 0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25
-
- mova m2, [rsp + transposed_in + 16 * 3]
- mova m3, [rsp + transposed_in + 16 * 29]
- BUTTERFLY_4X 3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24
-
- ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
- SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
- SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
- SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
-
- ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
- BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
-
- ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
- SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
- SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
- SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
-
- ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
- BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
-
- ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %3 + idx16]
- mova m7, [stp + %3 + idx17]
- mova m11, [stp + %3 + idx18]
- mova m12, [stp + %3 + idx19]
- SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
- SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
- SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
- SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
- mova [stp + %3 + idx16], m4
- mova [stp + %3 + idx17], m7
- mova [stp + %3 + idx18], m11
- mova [stp + %3 + idx19], m12
-
- mova m4, [stp + %4 + idx28]
- mova m7, [stp + %4 + idx29]
- mova m11, [stp + %4 + idx30]
- mova m12, [stp + %4 + idx31]
- SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
- SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
- SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
- SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
- mova [stp + %4 + idx28], m4
- mova [stp + %4 + idx29], m7
- mova [stp + %4 + idx30], m11
- mova [stp + %4 + idx31], m12
-
- ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 6, 5, 9
- pmulhrsw m6, m10 ; stp1_27
- pmulhrsw m5, m10 ; stp1_20
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_26
- pmulhrsw m14, m10 ; stp1_21
- SUM_SUB 1, 0, 9
- pmulhrsw m1, m10 ; stp1_25
- pmulhrsw m0, m10 ; stp1_22
- SUM_SUB 2, 3, 9
- pmulhrsw m2, m10 ; stp1_25
- pmulhrsw m3, m10 ; stp1_22
-%else
- BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
- SWAP 6, 5
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
- SWAP 13, 14
- BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
- SWAP 1, 0
- BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
- SWAP 2, 3
-%endif
- mova [stp + %3 + idx20], m5
- mova [stp + %3 + idx21], m14
- mova [stp + %3 + idx22], m0
- mova [stp + %3 + idx23], m3
- mova [stp + %4 + idx24], m2
- mova [stp + %4 + idx25], m1
- mova [stp + %4 + idx26], m13
- mova [stp + %4 + idx27], m6
-
- ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 2]
- mova m1, [rsp + transposed_in + 16 * 30]
- BUTTERFLY_4X 0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15
-
- mova m2, [rsp + transposed_in + 16 * 14]
- mova m3, [rsp + transposed_in + 16 * 18]
- BUTTERFLY_4X 3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14
-
- mova m4, [rsp + transposed_in + 16 * 10]
- mova m5, [rsp + transposed_in + 16 * 22]
- BUTTERFLY_4X 4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13
-
- mova m6, [rsp + transposed_in + 16 * 6]
- mova m7, [rsp + transposed_in + 16 * 26]
- BUTTERFLY_4X 7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12
-
- ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
- SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
- SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
- SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
-
- ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
- BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
-
- ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
- SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
- SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
- SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
-
- ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 5, 4, 9
- pmulhrsw m5, m10 ; stp1_13
- pmulhrsw m4, m10 ; stp1_10
- SUM_SUB 6, 7, 9
- pmulhrsw m6, m10 ; stp1_12
- pmulhrsw m7, m10 ; stp1_11
-%else
- BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
- SWAP 5, 4
- BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
- SWAP 6, 7
-%endif
- ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova [stp + %2 + idx8], m0
- mova [stp + %2 + idx9], m2
- mova [stp + %2 + idx10], m4
- mova [stp + %2 + idx11], m7
- mova [stp + %2 + idx12], m6
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m3
- mova [stp + %2 + idx15], m1
-
- ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ;
- ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m11, [rsp + transposed_in + 16 * 4]
- mova m12, [rsp + transposed_in + 16 * 28]
- BUTTERFLY_4X 11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7
-
- mova m13, [rsp + transposed_in + 16 * 12]
- mova m14, [rsp + transposed_in + 16 * 20]
- BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6
-
- ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m0, [rsp + transposed_in + 16 * 0]
- mova m1, [rsp + transposed_in + 16 * 16]
-
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- mova m10, [pw_11585x2]
- SUM_SUB 0, 1, 9
- pmulhrsw m0, m10 ; stp1_1
- pmulhrsw m1, m10 ; stp1_0
-%else
- BUTTERFLY_4X 0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
- SWAP 0, 1
-%endif
- mova m2, [rsp + transposed_in + 16 * 8]
- mova m3, [rsp + transposed_in + 16 * 24]
- BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3
-
- mova m10, [pw_11585x2]
- SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
- SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
-
- ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-%if 0 ; overflow occurs in SUM_SUB when using test streams
- SUM_SUB 13, 14, 9
- pmulhrsw m13, m10 ; stp1_6
- pmulhrsw m14, m10 ; stp1_5
-%else
- BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
- SWAP 13, 14
-%endif
- SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
- SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
-
- ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
- SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
- SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
- SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
-
- ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- mova m4, [stp + %2 + idx12]
- mova m5, [stp + %2 + idx13]
- mova m6, [stp + %2 + idx14]
- mova m7, [stp + %2 + idx15]
- SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
- SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
- SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
- SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
-
- ; 0-3, 28-31 final stage
- mova m10, [stp + %4 + idx31]
- mova m15, [stp + %4 + idx30]
- SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
- SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
- mova [stp + %1 + idx0], m0
- mova [stp + %1 + idx1], m1
- mova [stp + %4 + idx31], m10
- mova [stp + %4 + idx30], m15
- mova m0, [stp + %4 + idx29]
- mova m1, [stp + %4 + idx28]
- SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
- SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
- mova [stp + %1 + idx2], m2
- mova [stp + %1 + idx3], m3
- mova [stp + %4 + idx29], m0
- mova [stp + %4 + idx28], m1
-
- ; 12-15, 16-19 final stage
- mova m0, [stp + %3 + idx16]
- mova m1, [stp + %3 + idx17]
- mova m2, [stp + %3 + idx18]
- mova m3, [stp + %3 + idx19]
- SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
- SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
- SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
- SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
- mova [stp + %2 + idx12], m4
- mova [stp + %2 + idx13], m5
- mova [stp + %2 + idx14], m6
- mova [stp + %2 + idx15], m7
- mova [stp + %3 + idx16], m0
- mova [stp + %3 + idx17], m1
- mova [stp + %3 + idx18], m2
- mova [stp + %3 + idx19], m3
-
- mova m4, [stp + %2 + idx8]
- mova m5, [stp + %2 + idx9]
- mova m6, [stp + %2 + idx10]
- mova m7, [stp + %2 + idx11]
- SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
- SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
- SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
- SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
-
- ; 4-7, 24-27 final stage
- mova m3, [stp + %4 + idx24]
- mova m2, [stp + %4 + idx25]
- mova m1, [stp + %4 + idx26]
- mova m0, [stp + %4 + idx27]
- SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
- SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
- SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
- SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
- mova [stp + %4 + idx24], m3
- mova [stp + %4 + idx25], m2
- mova [stp + %4 + idx26], m1
- mova [stp + %4 + idx27], m0
- mova [stp + %1 + idx4], m11
- mova [stp + %1 + idx5], m14
- mova [stp + %1 + idx6], m13
- mova [stp + %1 + idx7], m12
-
- ; 8-11, 20-23 final stage
- mova m0, [stp + %3 + idx20]
- mova m1, [stp + %3 + idx21]
- mova m2, [stp + %3 + idx22]
- mova m3, [stp + %3 + idx23]
- SUM_SUB 7, 0, 9 ; stp1_11, stp_20
- SUM_SUB 6, 1, 9 ; stp1_10, stp_21
- SUM_SUB 5, 2, 9 ; stp1_9, stp_22
- SUM_SUB 4, 3, 9 ; stp1_8, stp_23
- mova [stp + %2 + idx8], m4
- mova [stp + %2 + idx9], m5
- mova [stp + %2 + idx10], m6
- mova [stp + %2 + idx11], m7
- mova [stp + %3 + idx20], m0
- mova [stp + %3 + idx21], m1
- mova [stp + %3 + idx22], m2
- mova [stp + %3 + idx23], m3
-%endmacro
-
-INIT_XMM ssse3
-cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
- mova m8, [pd_8192]
- mov r6, 4
- lea stp, [rsp + pass_one_start]
-
-idct32x32_1024:
- mov r3, inputq
- lea r4, [rsp + transposed_in]
- mov r7, 4
-
-idct32x32_1024_transpose:
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [r3 + 0]
- packssdw m0, [r3 + 16]
- mova m1, [r3 + 32 * 4]
- packssdw m1, [r3 + 32 * 4 + 16]
- mova m2, [r3 + 32 * 8]
- packssdw m2, [r3 + 32 * 8 + 16]
- mova m3, [r3 + 32 * 12]
- packssdw m3, [r3 + 32 * 12 + 16]
- mova m4, [r3 + 32 * 16]
- packssdw m4, [r3 + 32 * 16 + 16]
- mova m5, [r3 + 32 * 20]
- packssdw m5, [r3 + 32 * 20 + 16]
- mova m6, [r3 + 32 * 24]
- packssdw m6, [r3 + 32 * 24 + 16]
- mova m7, [r3 + 32 * 28]
- packssdw m7, [r3 + 32 * 28 + 16]
-%else
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 4]
- mova m2, [r3 + 16 * 8]
- mova m3, [r3 + 16 * 12]
- mova m4, [r3 + 16 * 16]
- mova m5, [r3 + 16 * 20]
- mova m6, [r3 + 16 * 24]
- mova m7, [r3 + 16 * 28]
-%endif
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-%if CONFIG_VP9_HIGHBITDEPTH
- add r3, 32
-%else
- add r3, 16
-%endif
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose
-
- IDCT32X32_1024 16*0, 16*32, 16*64, 16*96
-
- lea stp, [stp + 16 * 8]
-%if CONFIG_VP9_HIGHBITDEPTH
- lea inputq, [inputq + 32 * 32]
-%else
- lea inputq, [inputq + 16 * 32]
-%endif
- dec r6
- jnz idct32x32_1024
-
- mov r6, 4
- lea stp, [rsp + pass_one_start]
- lea r9, [rsp + pass_one_start]
-
-idct32x32_1024_2:
- lea r4, [rsp + transposed_in]
- mov r3, r9
- mov r7, 4
-
-idct32x32_1024_transpose_2:
- mova m0, [r3 + 0]
- mova m1, [r3 + 16 * 1]
- mova m2, [r3 + 16 * 2]
- mova m3, [r3 + 16 * 3]
- mova m4, [r3 + 16 * 4]
- mova m5, [r3 + 16 * 5]
- mova m6, [r3 + 16 * 6]
- mova m7, [r3 + 16 * 7]
-
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
-
- mova [r4 + 0], m0
- mova [r4 + 16 * 1], m1
- mova [r4 + 16 * 2], m2
- mova [r4 + 16 * 3], m3
- mova [r4 + 16 * 4], m4
- mova [r4 + 16 * 5], m5
- mova [r4 + 16 * 6], m6
- mova [r4 + 16 * 7], m7
-
- add r3, 16 * 8
- add r4, 16 * 8
- dec r7
- jne idct32x32_1024_transpose_2
-
- IDCT32X32_1024 16*0, 16*8, 16*16, 16*24
-
- lea stp, [stp + 16 * 32]
- add r9, 16 * 32
- dec r6
- jnz idct32x32_1024_2
-
- RECON_AND_STORE pass_two_start
-
- RET
-%endif
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
index fbbcd76bd7b..bcf1a6ef989 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/inv_wht_sse2.asm
@@ -9,6 +9,7 @@
;
%include "third_party/x86inc/x86inc.asm"
+%include "vpx_dsp/x86/bitdepth_conversion_sse2.asm"
SECTION .text
@@ -82,15 +83,8 @@ SECTION .text
INIT_XMM sse2
cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
-%if CONFIG_VP9_HIGHBITDEPTH
- mova m0, [inputq + 0]
- packssdw m0, [inputq + 16]
- mova m1, [inputq + 32]
- packssdw m1, [inputq + 48]
-%else
- mova m0, [inputq + 0]
- mova m1, [inputq + 16]
-%endif
+ LOAD_TRAN_LOW 0, inputq, 0
+ LOAD_TRAN_LOW 1, inputq, 8
psraw m0, 2
psraw m1, 2
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
index 0580a7bd7b6..32721beb3a6 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -13,7 +13,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-#include "vpx_dsp/x86/fdct.h"
+#include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
int skip_block, const int16_t *zbin_ptr,
diff --git a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
index ca215391739..ec2cafb94cd 100644
--- a/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
+++ b/chromium/third_party/libvpx/source/libvpx/vpx_dsp/x86/quantize_ssse3_x86_64.asm
@@ -200,7 +200,6 @@ cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
pand m13, m12
%if CONFIG_VP9_HIGHBITDEPTH
; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
- pxor m11, m11
mova m11, m14
mova m6, m14
pcmpgtw m5, m14