Diffstat (limited to 'vp10/encoder/x86')
-rw-r--r--  vp10/encoder/x86/avg_intrin_sse2.c                  424
-rw-r--r--  vp10/encoder/x86/dct_mmx.asm                        104
-rw-r--r--  vp10/encoder/x86/dct_sse2.c                        2058
-rw-r--r--  vp10/encoder/x86/dct_ssse3.c                        472
-rw-r--r--  vp10/encoder/x86/dct_ssse3_x86_64.asm               121
-rw-r--r--  vp10/encoder/x86/denoiser_sse2.c                    375
-rw-r--r--  vp10/encoder/x86/error_intrin_avx2.c                 73
-rw-r--r--  vp10/encoder/x86/error_sse2.asm                     122
-rw-r--r--  vp10/encoder/x86/highbd_block_error_intrin_sse2.c    71
-rw-r--r--  vp10/encoder/x86/quantize_sse2.c                    211
-rw-r--r--  vp10/encoder/x86/quantize_ssse3_x86_64.asm          201
-rw-r--r--  vp10/encoder/x86/ssim_opt_x86_64.asm                216
-rw-r--r--  vp10/encoder/x86/temporal_filter_apply_sse2.asm     212
13 files changed, 0 insertions, 4660 deletions
diff --git a/vp10/encoder/x86/avg_intrin_sse2.c b/vp10/encoder/x86/avg_intrin_sse2.c
deleted file mode 100644
index cf23013f9..000000000
--- a/vp10/encoder/x86/avg_intrin_sse2.c
+++ /dev/null
@@ -1,424 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-
-#include "./vp10_rtcd.h"
-#include "vpx_ports/mem.h"
-
-void vp10_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
- int *min, int *max) {
- __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
- u0 = _mm_setzero_si128();
- // Row 0
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff0 = _mm_max_epi16(diff, negdiff);
- // Row 1
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
- minabsdiff = _mm_min_epi16(absdiff0, absdiff);
- // Row 2
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
- // Row 3
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
- // Row 4
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
- // Row 5
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
- // Row 6
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
- // Row 7
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
- d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
- diff = _mm_subs_epi16(s0, d0);
- negdiff = _mm_subs_epi16(u0, diff);
- absdiff = _mm_max_epi16(diff, negdiff);
- maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
- minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
-
- maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
- maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
- maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
- *max = _mm_extract_epi16(maxabsdiff, 0);
-
- minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
- minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
- minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
- *min = _mm_extract_epi16(minabsdiff, 0);
-}
-
-unsigned int vp10_avg_8x8_sse2(const uint8_t *s, int p) {
- __m128i s0, s1, u0;
- unsigned int avg = 0;
- u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
-
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
- return (avg + 32) >> 6;
-}
-
-unsigned int vp10_avg_4x4_sse2(const uint8_t *s, int p) {
- __m128i s0, s1, u0;
- unsigned int avg = 0;
- u0 = _mm_setzero_si128();
- s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
- s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
- s0 = _mm_adds_epu16(s0, s1);
-
- s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
- s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
- avg = _mm_extract_epi16(s0, 0);
- return (avg + 8) >> 4;
-}
-
-static void hadamard_col8_sse2(__m128i *in, int iter) {
- __m128i a0 = in[0];
- __m128i a1 = in[1];
- __m128i a2 = in[2];
- __m128i a3 = in[3];
- __m128i a4 = in[4];
- __m128i a5 = in[5];
- __m128i a6 = in[6];
- __m128i a7 = in[7];
-
- __m128i b0 = _mm_add_epi16(a0, a1);
- __m128i b1 = _mm_sub_epi16(a0, a1);
- __m128i b2 = _mm_add_epi16(a2, a3);
- __m128i b3 = _mm_sub_epi16(a2, a3);
- __m128i b4 = _mm_add_epi16(a4, a5);
- __m128i b5 = _mm_sub_epi16(a4, a5);
- __m128i b6 = _mm_add_epi16(a6, a7);
- __m128i b7 = _mm_sub_epi16(a6, a7);
-
- a0 = _mm_add_epi16(b0, b2);
- a1 = _mm_add_epi16(b1, b3);
- a2 = _mm_sub_epi16(b0, b2);
- a3 = _mm_sub_epi16(b1, b3);
- a4 = _mm_add_epi16(b4, b6);
- a5 = _mm_add_epi16(b5, b7);
- a6 = _mm_sub_epi16(b4, b6);
- a7 = _mm_sub_epi16(b5, b7);
-
- if (iter == 0) {
- b0 = _mm_add_epi16(a0, a4);
- b7 = _mm_add_epi16(a1, a5);
- b3 = _mm_add_epi16(a2, a6);
- b4 = _mm_add_epi16(a3, a7);
- b2 = _mm_sub_epi16(a0, a4);
- b6 = _mm_sub_epi16(a1, a5);
- b1 = _mm_sub_epi16(a2, a6);
- b5 = _mm_sub_epi16(a3, a7);
-
- a0 = _mm_unpacklo_epi16(b0, b1);
- a1 = _mm_unpacklo_epi16(b2, b3);
- a2 = _mm_unpackhi_epi16(b0, b1);
- a3 = _mm_unpackhi_epi16(b2, b3);
- a4 = _mm_unpacklo_epi16(b4, b5);
- a5 = _mm_unpacklo_epi16(b6, b7);
- a6 = _mm_unpackhi_epi16(b4, b5);
- a7 = _mm_unpackhi_epi16(b6, b7);
-
- b0 = _mm_unpacklo_epi32(a0, a1);
- b1 = _mm_unpacklo_epi32(a4, a5);
- b2 = _mm_unpackhi_epi32(a0, a1);
- b3 = _mm_unpackhi_epi32(a4, a5);
- b4 = _mm_unpacklo_epi32(a2, a3);
- b5 = _mm_unpacklo_epi32(a6, a7);
- b6 = _mm_unpackhi_epi32(a2, a3);
- b7 = _mm_unpackhi_epi32(a6, a7);
-
- in[0] = _mm_unpacklo_epi64(b0, b1);
- in[1] = _mm_unpackhi_epi64(b0, b1);
- in[2] = _mm_unpacklo_epi64(b2, b3);
- in[3] = _mm_unpackhi_epi64(b2, b3);
- in[4] = _mm_unpacklo_epi64(b4, b5);
- in[5] = _mm_unpackhi_epi64(b4, b5);
- in[6] = _mm_unpacklo_epi64(b6, b7);
- in[7] = _mm_unpackhi_epi64(b6, b7);
- } else {
- in[0] = _mm_add_epi16(a0, a4);
- in[7] = _mm_add_epi16(a1, a5);
- in[3] = _mm_add_epi16(a2, a6);
- in[4] = _mm_add_epi16(a3, a7);
- in[2] = _mm_sub_epi16(a0, a4);
- in[6] = _mm_sub_epi16(a1, a5);
- in[1] = _mm_sub_epi16(a2, a6);
- in[5] = _mm_sub_epi16(a3, a7);
- }
-}
-
-void vp10_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
- int16_t *coeff) {
- __m128i src[8];
- src[0] = _mm_load_si128((const __m128i *)src_diff);
- src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
- src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
-
- hadamard_col8_sse2(src, 0);
- hadamard_col8_sse2(src, 1);
-
- _mm_store_si128((__m128i *)coeff, src[0]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[1]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[2]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[3]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[4]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[5]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[6]);
- coeff += 8;
- _mm_store_si128((__m128i *)coeff, src[7]);
-}
-
-void vp10_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
- int16_t *coeff) {
- int idx;
- for (idx = 0; idx < 4; ++idx) {
- int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
- + (idx & 0x01) * 8;
- vp10_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
- }
-
- for (idx = 0; idx < 64; idx += 8) {
- __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
- __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
- __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
- __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
-
- __m128i b0 = _mm_add_epi16(coeff0, coeff1);
- __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
- __m128i b2 = _mm_add_epi16(coeff2, coeff3);
- __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
-
- b0 = _mm_srai_epi16(b0, 1);
- b1 = _mm_srai_epi16(b1, 1);
- b2 = _mm_srai_epi16(b2, 1);
- b3 = _mm_srai_epi16(b3, 1);
-
- coeff0 = _mm_add_epi16(b0, b2);
- coeff1 = _mm_add_epi16(b1, b3);
- _mm_store_si128((__m128i *)coeff, coeff0);
- _mm_store_si128((__m128i *)(coeff + 64), coeff1);
-
- coeff2 = _mm_sub_epi16(b0, b2);
- coeff3 = _mm_sub_epi16(b1, b3);
- _mm_store_si128((__m128i *)(coeff + 128), coeff2);
- _mm_store_si128((__m128i *)(coeff + 192), coeff3);
-
- coeff += 8;
- }
-}
-
-int16_t vp10_satd_sse2(const int16_t *coeff, int length) {
- int i;
- __m128i sum = _mm_load_si128((const __m128i *)coeff);
- __m128i sign = _mm_srai_epi16(sum, 15);
- __m128i val = _mm_xor_si128(sum, sign);
- sum = _mm_sub_epi16(val, sign);
- coeff += 8;
-
- for (i = 8; i < length; i += 8) {
- __m128i src_line = _mm_load_si128((const __m128i *)coeff);
- sign = _mm_srai_epi16(src_line, 15);
- val = _mm_xor_si128(src_line, sign);
- val = _mm_sub_epi16(val, sign);
- sum = _mm_add_epi16(sum, val);
- coeff += 8;
- }
-
- val = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi16(sum, val);
- val = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi16(sum, val);
- val = _mm_srli_epi32(sum, 16);
- sum = _mm_add_epi16(sum, val);
-
- return _mm_extract_epi16(sum, 0);
-}
-
-void vp10_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
- const int ref_stride, const int height) {
- int idx;
- __m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_loadu_si128((const __m128i *)ref);
- __m128i s0 = _mm_unpacklo_epi8(src_line, zero);
- __m128i s1 = _mm_unpackhi_epi8(src_line, zero);
- __m128i t0, t1;
- int height_1 = height - 1;
- ref += ref_stride;
-
- for (idx = 1; idx < height_1; idx += 2) {
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
-
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
- ref += ref_stride;
- }
-
- src_line = _mm_loadu_si128((const __m128i *)ref);
- t0 = _mm_unpacklo_epi8(src_line, zero);
- t1 = _mm_unpackhi_epi8(src_line, zero);
- s0 = _mm_adds_epu16(s0, t0);
- s1 = _mm_adds_epu16(s1, t1);
-
- if (height == 64) {
- s0 = _mm_srai_epi16(s0, 5);
- s1 = _mm_srai_epi16(s1, 5);
- } else if (height == 32) {
- s0 = _mm_srai_epi16(s0, 4);
- s1 = _mm_srai_epi16(s1, 4);
- } else {
- s0 = _mm_srai_epi16(s0, 3);
- s1 = _mm_srai_epi16(s1, 3);
- }
-
- _mm_storeu_si128((__m128i *)hbuf, s0);
- hbuf += 8;
- _mm_storeu_si128((__m128i *)hbuf, s1);
-}
-
-int16_t vp10_int_pro_col_sse2(uint8_t const *ref, const int width) {
- __m128i zero = _mm_setzero_si128();
- __m128i src_line = _mm_load_si128((const __m128i *)ref);
- __m128i s0 = _mm_sad_epu8(src_line, zero);
- __m128i s1;
- int i;
-
- for (i = 16; i < width; i += 16) {
- ref += 16;
- src_line = _mm_load_si128((const __m128i *)ref);
- s1 = _mm_sad_epu8(src_line, zero);
- s0 = _mm_adds_epu16(s0, s1);
- }
-
- s1 = _mm_srli_si128(s0, 8);
- s0 = _mm_adds_epu16(s0, s1);
-
- return _mm_extract_epi16(s0, 0);
-}
-
-int vp10_vector_var_sse2(int16_t const *ref, int16_t const *src,
- const int bwl) {
- int idx;
- int width = 4 << bwl;
- int16_t mean;
- __m128i v0 = _mm_loadu_si128((const __m128i *)ref);
- __m128i v1 = _mm_load_si128((const __m128i *)src);
- __m128i diff = _mm_subs_epi16(v0, v1);
- __m128i sum = diff;
- __m128i sse = _mm_madd_epi16(diff, diff);
-
- ref += 8;
- src += 8;
-
- for (idx = 8; idx < width; idx += 8) {
- v0 = _mm_loadu_si128((const __m128i *)ref);
- v1 = _mm_load_si128((const __m128i *)src);
- diff = _mm_subs_epi16(v0, v1);
-
- sum = _mm_add_epi16(sum, diff);
- v0 = _mm_madd_epi16(diff, diff);
- sse = _mm_add_epi32(sse, v0);
-
- ref += 8;
- src += 8;
- }
-
- v0 = _mm_srli_si128(sum, 8);
- sum = _mm_add_epi16(sum, v0);
- v0 = _mm_srli_epi64(sum, 32);
- sum = _mm_add_epi16(sum, v0);
- v0 = _mm_srli_epi32(sum, 16);
- sum = _mm_add_epi16(sum, v0);
-
- v1 = _mm_srli_si128(sse, 8);
- sse = _mm_add_epi32(sse, v1);
- v1 = _mm_srli_epi64(sse, 32);
- sse = _mm_add_epi32(sse, v1);
-
- mean = _mm_extract_epi16(sum, 0);
-
- return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2));
-}
diff --git a/vp10/encoder/x86/dct_mmx.asm b/vp10/encoder/x86/dct_mmx.asm
deleted file mode 100644
index 2327fe9e6..000000000
--- a/vp10/encoder/x86/dct_mmx.asm
+++ /dev/null
@@ -1,104 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-%macro TRANSFORM_COLS 0
- paddw m0, m1
- movq m4, m0
- psubw m3, m2
- psubw m4, m3
- psraw m4, 1
- movq m5, m4
- psubw m5, m1 ;b1
- psubw m4, m2 ;c1
- psubw m0, m4
- paddw m3, m5
- ; m0 a0
- SWAP 1, 4 ; m1 c1
- SWAP 2, 3 ; m2 d1
- SWAP 3, 5 ; m3 b1
-%endmacro
-
-%macro TRANSPOSE_4X4 0
- movq m4, m0
- movq m5, m2
- punpcklwd m4, m1
- punpckhwd m0, m1
- punpcklwd m5, m3
- punpckhwd m2, m3
- movq m1, m4
- movq m3, m0
- punpckldq m1, m5
- punpckhdq m4, m5
- punpckldq m3, m2
- punpckhdq m0, m2
- SWAP 2, 3, 0, 1, 4
-%endmacro
-
-INIT_MMX mmx
-cglobal fwht4x4, 3, 4, 8, input, output, stride
- lea r3q, [inputq + strideq*4]
- movq m0, [inputq] ;a1
- movq m1, [inputq + strideq*2] ;b1
- movq m2, [r3q] ;c1
- movq m3, [r3q + strideq*2] ;d1
-
- TRANSFORM_COLS
- TRANSPOSE_4X4
- TRANSFORM_COLS
- TRANSPOSE_4X4
-
- psllw m0, 2
- psllw m1, 2
- psllw m2, 2
- psllw m3, 2
-
-%if CONFIG_VP9_HIGHBITDEPTH
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m0
- pcmpgtw m5, m1
- movq m6, m0
- movq m7, m1
- punpcklwd m0, m4
- punpcklwd m1, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq], m0
- movq [outputq + 8], m6
- movq [outputq + 16], m1
- movq [outputq + 24], m7
- pxor m4, m4
- pxor m5, m5
- pcmpgtw m4, m2
- pcmpgtw m5, m3
- movq m6, m2
- movq m7, m3
- punpcklwd m2, m4
- punpcklwd m3, m5
- punpckhwd m6, m4
- punpckhwd m7, m5
- movq [outputq + 32], m2
- movq [outputq + 40], m6
- movq [outputq + 48], m3
- movq [outputq + 56], m7
-%else
- movq [outputq], m0
- movq [outputq + 8], m1
- movq [outputq + 16], m2
- movq [outputq + 24], m3
-%endif
-
- RET
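For reference, a scalar sketch of the butterfly implemented by the TRANSFORM_COLS macro above (one column of the 4x4 Walsh-Hadamard transform), assuming 16-bit inputs as in the MMX code. wht4_col is an illustrative name; the macro leaves its results in the order a, c, d, b, which the SWAPs at its end account for.

static void wht4_col(int16_t *v) {
  int a = v[0], b = v[1], c = v[2], d = v[3], e;
  a += b;            /* paddw m0, m1              */
  d -= c;            /* psubw m3, m2              */
  e = (a - d) >> 1;  /* psubw m4, m3; psraw m4, 1 */
  b = e - b;         /* psubw m5, m1              */
  c = e - c;         /* psubw m4, m2              */
  a -= c;            /* psubw m0, m4              */
  d += b;            /* paddw m3, m5              */
  v[0] = (int16_t)a; v[1] = (int16_t)c;
  v[2] = (int16_t)d; v[3] = (int16_t)b;
}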
diff --git a/vp10/encoder/x86/dct_sse2.c b/vp10/encoder/x86/dct_sse2.c
deleted file mode 100644
index e1111570a..000000000
--- a/vp10/encoder/x86/dct_sse2.c
+++ /dev/null
@@ -1,2058 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h> // SSE2
-
-#include "./vp10_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/txfm_common.h"
-#include "vpx_dsp/x86/fwd_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-#include "vpx_ports/mem.h"
-
-static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
- int stride) {
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i mask;
-
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 4);
- in[1] = _mm_slli_epi16(in[1], 4);
- in[2] = _mm_slli_epi16(in[2], 4);
- in[3] = _mm_slli_epi16(in[3], 4);
-
- mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
- in[0] = _mm_add_epi16(in[0], mask);
- in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
-}
-
-static INLINE void write_buffer_4x4(tran_low_t *output, __m128i *res) {
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
- __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
- __m128i out01 = _mm_add_epi16(in01, kOne);
- __m128i out23 = _mm_add_epi16(in23, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- store_output(&out01, (output + 0 * 8));
- store_output(&out23, (output + 1 * 8));
-}
-
-static INLINE void transpose_4x4(__m128i *res) {
- // Combine and transpose
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
-
- // 00 10 20 30 01 11 21 31
- // 02 12 22 32 03 13 23 33
- // only use the first 4 16-bit integers
- res[1] = _mm_unpackhi_epi64(res[0], res[0]);
- res[3] = _mm_unpackhi_epi64(res[2], res[2]);
-}
-
-static void fdct4_sse2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u[4], v[4];
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[3], in[2]);
-
- v[0] = _mm_add_epi16(u[0], u[1]);
- v[1] = _mm_sub_epi16(u[0], u[1]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
- u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
- transpose_4x4(in);
-}
-
-static void fadst4_sse2(__m128i *in) {
- const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
- const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
- const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16((int16_t)sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
- __m128i in7 = _mm_add_epi16(in[0], in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[2], in[3]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpacklo_epi16(in[2], kZero);
- u[4] = _mm_unpacklo_epi16(in[3], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
- v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
- v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_sub_epi32(v[2], v[6]);
- u[2] = _mm_add_epi32(v[3], v[4]);
- u[3] = _mm_sub_epi32(u[2], u[0]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_sub_epi32(u[4], v[5]);
- u[6] = _mm_add_epi32(u[3], u[5]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[2]);
- in[1] = _mm_packs_epi32(u[1], u[3]);
- transpose_4x4(in);
-}
-
-void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
- __m128i in[4];
-
- switch (tx_type) {
- case DCT_DCT:
- vpx_fdct4x4_sse2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_4x4(input, in, stride);
- fadst4_sse2(in);
- fdct4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case DCT_ADST:
- load_buffer_4x4(input, in, stride);
- fdct4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- case ADST_ADST:
- load_buffer_4x4(input, in, stride);
- fadst4_sse2(in);
- fadst4_sse2(in);
- write_buffer_4x4(output, in);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp10_fdct8x8_quant_sse2(const int16_t *input, int stride,
- int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
- __m128i zero;
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- __m128i *in[8];
- int index = 0;
-
- (void)scan_ptr;
- (void)zbin_ptr;
- (void)quant_shift_ptr;
- (void)coeff_ptr;
-
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- in[0] = &in0;
- in[1] = &in1;
- in[2] = &in2;
- in[3] = &in3;
- in[4] = &in4;
- in[5] = &in5;
- in[6] = &in6;
- in[7] = &in7;
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(in0, in7);
- const __m128i q1 = _mm_add_epi16(in1, in6);
- const __m128i q2 = _mm_add_epi16(in2, in5);
- const __m128i q3 = _mm_add_epi16(in3, in4);
- const __m128i q4 = _mm_sub_epi16(in3, in4);
- const __m128i q5 = _mm_sub_epi16(in2, in5);
- const __m128i q6 = _mm_sub_epi16(in1, in6);
- const __m128i q7 = _mm_sub_epi16(in0, in7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of two 16 bits signed numbers using shifts
- // n / 2 = (n - (n >> 15)) >> 1
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- }
-
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
-
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = *in[0];
- coeff1 = *in[1];
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- // AC only loop
- index = 2;
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
- coeff0 = *in[index];
- coeff1 = *in[index + 1];
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- index += 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
- }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
- int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 2);
- in[1] = _mm_slli_epi16(in[1], 2);
- in[2] = _mm_slli_epi16(in[2], 2);
- in[3] = _mm_slli_epi16(in[3], 2);
- in[4] = _mm_slli_epi16(in[4], 2);
- in[5] = _mm_slli_epi16(in[5], 2);
- in[6] = _mm_slli_epi16(in[6], 2);
- in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
-static INLINE void right_shift_8x8(__m128i *res, const int bit) {
- __m128i sign0 = _mm_srai_epi16(res[0], 15);
- __m128i sign1 = _mm_srai_epi16(res[1], 15);
- __m128i sign2 = _mm_srai_epi16(res[2], 15);
- __m128i sign3 = _mm_srai_epi16(res[3], 15);
- __m128i sign4 = _mm_srai_epi16(res[4], 15);
- __m128i sign5 = _mm_srai_epi16(res[5], 15);
- __m128i sign6 = _mm_srai_epi16(res[6], 15);
- __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
- if (bit == 2) {
- const __m128i const_rounding = _mm_set1_epi16(1);
- res[0] = _mm_add_epi16(res[0], const_rounding);
- res[1] = _mm_add_epi16(res[1], const_rounding);
- res[2] = _mm_add_epi16(res[2], const_rounding);
- res[3] = _mm_add_epi16(res[3], const_rounding);
- res[4] = _mm_add_epi16(res[4], const_rounding);
- res[5] = _mm_add_epi16(res[5], const_rounding);
- res[6] = _mm_add_epi16(res[6], const_rounding);
- res[7] = _mm_add_epi16(res[7], const_rounding);
- }
-
- res[0] = _mm_sub_epi16(res[0], sign0);
- res[1] = _mm_sub_epi16(res[1], sign1);
- res[2] = _mm_sub_epi16(res[2], sign2);
- res[3] = _mm_sub_epi16(res[3], sign3);
- res[4] = _mm_sub_epi16(res[4], sign4);
- res[5] = _mm_sub_epi16(res[5], sign5);
- res[6] = _mm_sub_epi16(res[6], sign6);
- res[7] = _mm_sub_epi16(res[7], sign7);
-
- if (bit == 1) {
- res[0] = _mm_srai_epi16(res[0], 1);
- res[1] = _mm_srai_epi16(res[1], 1);
- res[2] = _mm_srai_epi16(res[2], 1);
- res[3] = _mm_srai_epi16(res[3], 1);
- res[4] = _mm_srai_epi16(res[4], 1);
- res[5] = _mm_srai_epi16(res[5], 1);
- res[6] = _mm_srai_epi16(res[6], 1);
- res[7] = _mm_srai_epi16(res[7], 1);
- } else {
- res[0] = _mm_srai_epi16(res[0], 2);
- res[1] = _mm_srai_epi16(res[1], 2);
- res[2] = _mm_srai_epi16(res[2], 2);
- res[3] = _mm_srai_epi16(res[3], 2);
- res[4] = _mm_srai_epi16(res[4], 2);
- res[5] = _mm_srai_epi16(res[5], 2);
- res[6] = _mm_srai_epi16(res[6], 2);
- res[7] = _mm_srai_epi16(res[7], 2);
- }
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8(tran_low_t *output, __m128i *res,
- int stride) {
- store_output(&res[0], (output + 0 * stride));
- store_output(&res[1], (output + 1 * stride));
- store_output(&res[2], (output + 2 * stride));
- store_output(&res[3], (output + 3 * stride));
- store_output(&res[4], (output + 4 * stride));
- store_output(&res[5], (output + 5 * stride));
- store_output(&res[6], (output + 6 * stride));
- store_output(&res[7], (output + 7 * stride));
-}
-
-// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
-static void fdct8_sse2(__m128i *in) {
- // constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
- // stage 1
- s0 = _mm_add_epi16(in[0], in[7]);
- s1 = _mm_add_epi16(in[1], in[6]);
- s2 = _mm_add_epi16(in[2], in[5]);
- s3 = _mm_add_epi16(in[3], in[4]);
- s4 = _mm_sub_epi16(in[3], in[4]);
- s5 = _mm_sub_epi16(in[2], in[5]);
- s6 = _mm_sub_epi16(in[1], in[6]);
- s7 = _mm_sub_epi16(in[0], in[7]);
-
- u0 = _mm_add_epi16(s0, s3);
- u1 = _mm_add_epi16(s1, s2);
- u2 = _mm_sub_epi16(s1, s2);
- u3 = _mm_sub_epi16(s0, s3);
- // interleave and perform butterfly multiplication/addition
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpackhi_epi16(u0, u1);
- v2 = _mm_unpacklo_epi16(u2, u3);
- v3 = _mm_unpackhi_epi16(u2, u3);
-
- u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
- u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
- u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
- u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
- u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
- u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
- u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
- u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
- // shift and rounding
- v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u0, u1);
- in[2] = _mm_packs_epi32(u4, u5);
- in[4] = _mm_packs_epi32(u2, u3);
- in[6] = _mm_packs_epi32(u6, u7);
-
- // stage 2
- // interleave and perform butterfly multiplication/addition
- u0 = _mm_unpacklo_epi16(s6, s5);
- u1 = _mm_unpackhi_epi16(s6, s5);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
- u0 = _mm_packs_epi32(v0, v1);
- u1 = _mm_packs_epi32(v2, v3);
-
- // stage 3
- s0 = _mm_add_epi16(s4, u0);
- s1 = _mm_sub_epi16(s4, u0);
- s2 = _mm_sub_epi16(s7, u1);
- s3 = _mm_add_epi16(s7, u1);
-
- // stage 4
- u0 = _mm_unpacklo_epi16(s0, s3);
- u1 = _mm_unpackhi_epi16(s0, s3);
- u2 = _mm_unpacklo_epi16(s1, s2);
- u3 = _mm_unpackhi_epi16(s1, s2);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
- v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
- v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
- v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
- v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
- v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
- v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
- v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v0, v1);
- in[3] = _mm_packs_epi32(v4, v5);
- in[5] = _mm_packs_epi32(v2, v3);
- in[7] = _mm_packs_epi32(v6, v7);
-
- // transpose
- array_transpose_8x8(in, in);
-}
-
-static void fadst8_sse2(__m128i *in) {
- // Constants
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // properly aligned for butterfly input
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integer
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit integers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- // FIXME(jingning): do subtract using bit inversion?
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-
- // transpose
- array_transpose_8x8(in, in);
-}
-
-void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
- __m128i in[8];
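- // DCT_DCT is handled by the standalone vpx_fdct8x8_sse2; the hybrid types
- // run the two 1-D transforms back to back on the block held in in[], then
- // scale down by one bit before writing out.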
-
- switch (tx_type) {
- case DCT_DCT:
- vpx_fdct8x8_sse2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_8x8(input, in, stride);
- fadst8_sse2(in);
- fdct8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case DCT_ADST:
- load_buffer_8x8(input, in, stride);
- fdct8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- case ADST_ADST:
- load_buffer_8x8(input, in, stride);
- fadst8_sse2(in);
- fadst8_sse2(in);
- right_shift_8x8(in, 1);
- write_buffer_8x8(output, in, 8);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
- __m128i *in1, int stride) {
- // load first 8 columns
- load_buffer_8x8(input, in0, stride);
- load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
-
- input += 8;
- // load second 8 columns
- load_buffer_8x8(input, in1, stride);
- load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
- __m128i *in1, int stride) {
- // write first 8 columns
- write_buffer_8x8(output, in0, stride);
- write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
- // write second 8 columns
- output += 8;
- write_buffer_8x8(output, in1, stride);
- write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
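- // The top-right 8x8 block is transposed into tbuf first so res1 is not
- // overwritten before res0 + 8 has been transposed into it; tbuf is copied
- // into res0 + 8 at the end.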
- array_transpose_8x8(res0, res0);
- array_transpose_8x8(res1, tbuf);
- array_transpose_8x8(res0 + 8, res1);
- array_transpose_8x8(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16(__m128i *res0, __m128i *res1) {
- // rounding right shift by 2 on each of the four 8x8 quadrants
- right_shift_8x8(res0, 2);
- right_shift_8x8(res0 + 8, 2);
- right_shift_8x8(res1, 2);
- right_shift_8x8(res1 + 8, 2);
-}
-
-static void fdct16_8col(__m128i *in) {
- // perform 16x16 1-D DCT for 8 columns
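- // Each __m128i in in[] holds one row of the 8 columns being transformed;
- // results are written back into in[] in place.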
- __m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- // stage 1
- i[0] = _mm_add_epi16(in[0], in[15]);
- i[1] = _mm_add_epi16(in[1], in[14]);
- i[2] = _mm_add_epi16(in[2], in[13]);
- i[3] = _mm_add_epi16(in[3], in[12]);
- i[4] = _mm_add_epi16(in[4], in[11]);
- i[5] = _mm_add_epi16(in[5], in[10]);
- i[6] = _mm_add_epi16(in[6], in[9]);
- i[7] = _mm_add_epi16(in[7], in[8]);
-
- s[0] = _mm_sub_epi16(in[7], in[8]);
- s[1] = _mm_sub_epi16(in[6], in[9]);
- s[2] = _mm_sub_epi16(in[5], in[10]);
- s[3] = _mm_sub_epi16(in[4], in[11]);
- s[4] = _mm_sub_epi16(in[3], in[12]);
- s[5] = _mm_sub_epi16(in[2], in[13]);
- s[6] = _mm_sub_epi16(in[1], in[14]);
- s[7] = _mm_sub_epi16(in[0], in[15]);
-
- p[0] = _mm_add_epi16(i[0], i[7]);
- p[1] = _mm_add_epi16(i[1], i[6]);
- p[2] = _mm_add_epi16(i[2], i[5]);
- p[3] = _mm_add_epi16(i[3], i[4]);
- p[4] = _mm_sub_epi16(i[3], i[4]);
- p[5] = _mm_sub_epi16(i[2], i[5]);
- p[6] = _mm_sub_epi16(i[1], i[6]);
- p[7] = _mm_sub_epi16(i[0], i[7]);
-
- u[0] = _mm_add_epi16(p[0], p[3]);
- u[1] = _mm_add_epi16(p[1], p[2]);
- u[2] = _mm_sub_epi16(p[1], p[2]);
- u[3] = _mm_sub_epi16(p[0], p[3]);
-
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpackhi_epi16(u[0], u[1]);
- v[2] = _mm_unpacklo_epi16(u[2], u[3]);
- v[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
- u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
- u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
- u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
- u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
- u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
- u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
- u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[4] = _mm_packs_epi32(u[4], u[5]);
- in[8] = _mm_packs_epi32(u[2], u[3]);
- in[12] = _mm_packs_epi32(u[6], u[7]);
-
- u[0] = _mm_unpacklo_epi16(p[5], p[6]);
- u[1] = _mm_unpackhi_epi16(p[5], p[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[2], v[3]);
-
- t[0] = _mm_add_epi16(p[4], u[0]);
- t[1] = _mm_sub_epi16(p[4], u[0]);
- t[2] = _mm_sub_epi16(p[7], u[1]);
- t[3] = _mm_add_epi16(p[7], u[1]);
-
- u[0] = _mm_unpacklo_epi16(t[0], t[3]);
- u[1] = _mm_unpackhi_epi16(t[0], t[3]);
- u[2] = _mm_unpacklo_epi16(t[1], t[2]);
- u[3] = _mm_unpackhi_epi16(t[1], t[2]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
- v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
- v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- in[2] = _mm_packs_epi32(v[0], v[1]);
- in[6] = _mm_packs_epi32(v[4], v[5]);
- in[10] = _mm_packs_epi32(v[2], v[3]);
- in[14] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[2], s[5]);
- u[1] = _mm_unpackhi_epi16(s[2], s[5]);
- u[2] = _mm_unpacklo_epi16(s[3], s[4]);
- u[3] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[2] = _mm_packs_epi32(v[0], v[1]);
- t[3] = _mm_packs_epi32(v[2], v[3]);
- t[4] = _mm_packs_epi32(v[4], v[5]);
- t[5] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 3
- p[0] = _mm_add_epi16(s[0], t[3]);
- p[1] = _mm_add_epi16(s[1], t[2]);
- p[2] = _mm_sub_epi16(s[1], t[2]);
- p[3] = _mm_sub_epi16(s[0], t[3]);
- p[4] = _mm_sub_epi16(s[7], t[4]);
- p[5] = _mm_sub_epi16(s[6], t[5]);
- p[6] = _mm_add_epi16(s[6], t[5]);
- p[7] = _mm_add_epi16(s[7], t[4]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(p[1], p[6]);
- u[1] = _mm_unpackhi_epi16(p[1], p[6]);
- u[2] = _mm_unpacklo_epi16(p[2], p[5]);
- u[3] = _mm_unpackhi_epi16(p[2], p[5]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[1] = _mm_packs_epi32(v[0], v[1]);
- t[2] = _mm_packs_epi32(v[2], v[3]);
- t[5] = _mm_packs_epi32(v[4], v[5]);
- t[6] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 5
- s[0] = _mm_add_epi16(p[0], t[1]);
- s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_add_epi16(p[3], t[2]);
- s[3] = _mm_sub_epi16(p[3], t[2]);
- s[4] = _mm_sub_epi16(p[4], t[5]);
- s[5] = _mm_add_epi16(p[4], t[5]);
- s[6] = _mm_sub_epi16(p[7], t[6]);
- s[7] = _mm_add_epi16(p[7], t[6]);
-
- // stage 6
- u[0] = _mm_unpacklo_epi16(s[0], s[7]);
- u[1] = _mm_unpackhi_epi16(s[0], s[7]);
- u[2] = _mm_unpacklo_epi16(s[1], s[6]);
- u[3] = _mm_unpackhi_epi16(s[1], s[6]);
- u[4] = _mm_unpacklo_epi16(s[2], s[5]);
- u[5] = _mm_unpackhi_epi16(s[2], s[5]);
- u[6] = _mm_unpacklo_epi16(s[3], s[4]);
- u[7] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
- v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
- v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
- v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
- v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
- v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
- v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
- v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
- v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
- v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
- v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v[0], v[1]);
- in[9] = _mm_packs_epi32(v[2], v[3]);
- in[5] = _mm_packs_epi32(v[4], v[5]);
- in[13] = _mm_packs_epi32(v[6], v[7]);
- in[3] = _mm_packs_epi32(v[8], v[9]);
- in[11] = _mm_packs_epi32(v[10], v[11]);
- in[7] = _mm_packs_epi32(v[12], v[13]);
- in[15] = _mm_packs_epi32(v[14], v[15]);
-}
-
-static void fadst16_8col(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16((int16_t)-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-static void fdct16_sse2(__m128i *in0, __m128i *in1) {
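- // The 1-D transform runs on both 8-column halves; the 16x16 transpose then
- // lets a second call process rows with the same column code.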
- fdct16_8col(in0);
- fdct16_8col(in1);
- array_transpose_16x16(in0, in1);
-}
-
-static void fadst16_sse2(__m128i *in0, __m128i *in1) {
- fadst16_8col(in0);
- fadst16_8col(in1);
- array_transpose_16x16(in0, in1);
-}
-
-void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
- int stride, int tx_type) {
- __m128i in0[16], in1[16];
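- // Same dispatch pattern as the 8x8 case: DCT_DCT goes to vpx_fdct16x16_sse2,
- // while the hybrid types run two 1-D passes with a rounding right shift
- // between them.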
-
- switch (tx_type) {
- case DCT_DCT:
- vpx_fdct16x16_sse2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_16x16(input, in0, in1, stride);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fdct16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case DCT_ADST:
- load_buffer_16x16(input, in0, in1, stride);
- fdct16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- case ADST_ADST:
- load_buffer_16x16(input, in0, in1, stride);
- fadst16_sse2(in0, in1);
- right_shift_16x16(in0, in1);
- fadst16_sse2(in0, in1);
- write_buffer_16x16(output, in0, in1, 16);
- break;
- default:
- assert(0);
- break;
- }
-}
diff --git a/vp10/encoder/x86/dct_ssse3.c b/vp10/encoder/x86/dct_ssse3.c
deleted file mode 100644
index df298d871..000000000
--- a/vp10/encoder/x86/dct_ssse3.c
+++ /dev/null
@@ -1,472 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
-#include <tmmintrin.h> // SSSE3
-
-#include "./vp10_rtcd.h"
-#include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-
-void vp10_fdct8x8_quant_ssse3(const int16_t *input, int stride,
- int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr,
- int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
- __m128i zero;
- int pass;
- // Constants
- // In one case the lanes are all the same value. In all others a pair of
- // values is repeated four times; this is done by constructing the 32-bit
- // constant corresponding to that pair.
- const __m128i k__dual_p16_p16 = dual_set_epi16(23170, 23170);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- __m128i *in[8];
- int index = 0;
-
- (void)scan_ptr;
- (void)zbin_ptr;
- (void)quant_shift_ptr;
- (void)coeff_ptr;
-
- // Pre-condition input (shift by two)
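- // Scaling the input up by 4 keeps extra precision through the two passes;
- // it is partially undone by the divide-by-two post-condition at the end.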
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- in[0] = &in0;
- in[1] = &in1;
- in[2] = &in2;
- in[3] = &in3;
- in[4] = &in4;
- in[5] = &in5;
- in[6] = &in6;
- in[7] = &in7;
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(in0, in7);
- const __m128i q1 = _mm_add_epi16(in1, in6);
- const __m128i q2 = _mm_add_epi16(in2, in5);
- const __m128i q3 = _mm_add_epi16(in3, in4);
- const __m128i q4 = _mm_sub_epi16(in3, in4);
- const __m128i q5 = _mm_sub_epi16(in2, in5);
- const __m128i q6 = _mm_sub_epi16(in1, in6);
- const __m128i q7 = _mm_sub_epi16(in0, in7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
-
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
-
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
-
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
-
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Multiply (q6 - q5) and (q6 + q5) by cospi_16_64 with a 16-bit rounding
- // multiply (mulhrs), so no 32-bit interleave is needed here
- const __m128i d0 = _mm_sub_epi16(q6, q5);
- const __m128i d1 = _mm_add_epi16(q6, q5);
- const __m128i r0 = _mm_mulhrs_epi16(d0, k__dual_p16_p16);
- const __m128i r1 = _mm_mulhrs_epi16(d1, k__dual_p16_p16);
-
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // division of 16-bit signed numbers by two using shifts
- // n / 2 = (n - (n >> 15)) >> 1
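- // e.g. n = -3: (-3 - (-1)) >> 1 = -1, matching C's truncation toward zero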
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- }
-
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
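- // The pointers now reference the end of each array; the negative n_coeffs
- // offset walks forward through them, and the loops run until it reaches zero.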
- zero = _mm_setzero_si128();
-
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant, thr;
- int16_t nzflag;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = *in[0];
- coeff1 = *in[1];
-
- // Poor man's sign extract
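- // abs(x) computed as (x ^ sign) - sign, where sign = x >> 15 is all ones
- // for negative lanes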
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- // AC only loop
- index = 2;
- thr = _mm_srai_epi16(dequant, 1);
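- // thr is half the dequant step; groups whose magnitudes never exceed it are
- // written out as zero without taking the full quantize path.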
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- assert(index < (int)(sizeof(in) / sizeof(in[0])) - 1);
- coeff0 = *in[index];
- coeff1 = *in[index + 1];
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- } else {
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- }
- }
-
- if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- index += 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
- }
-}
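
Note: the deleted quantizer above relies on the "poor man's sign extract" (xor with the sign mask, then subtract it), a rounding add, a 16-bit high multiply by the quantizer, and a sign reinsertion. A minimal scalar C sketch of that per-coefficient step (illustrative only; the helper name is hypothetical and the scan order is omitted) is:

#include <stdint.h>

/* Illustrative scalar sketch of the quantize/dequantize step done 8 lanes at
 * a time above; round/quant/dequant index 0 is the DC entry, index 1 the AC
 * entry, as in the SIMD code. */
static void quantize_fp_scalar(const int16_t *coeff, int n,
                               const int16_t *round, const int16_t *quant,
                               const int16_t *dequant,
                               int16_t *qcoeff, int16_t *dqcoeff) {
  int i;
  for (i = 0; i < n; ++i) {
    const int16_t sign = coeff[i] >> 15;             /* 0 or -1 */
    const int32_t abs_coeff = (coeff[i] ^ sign) - sign;
    int32_t tmp = abs_coeff + round[i != 0];         /* saturating add in SIMD */
    tmp = (tmp * quant[i != 0]) >> 16;               /* matches _mm_mulhi_epi16 */
    qcoeff[i] = (int16_t)((tmp ^ sign) - sign);      /* reinsert the sign */
    dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[i != 0]);
  }
}
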
diff --git a/vp10/encoder/x86/dct_ssse3_x86_64.asm b/vp10/encoder/x86/dct_ssse3_x86_64.asm
deleted file mode 100644
index 5e8adab3d..000000000
--- a/vp10/encoder/x86/dct_ssse3_x86_64.asm
+++ /dev/null
@@ -1,121 +0,0 @@
-;
-; Copyright (c) 2014 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-; This file provides the SSSE3 version of the forward transformation. Some
-; of the macro definitions were originally derived from the FFmpeg project.
-; The current version applies to x86 64-bit only.
-
-SECTION .text
-
-%if ARCH_X86_64
-; matrix transpose
-%macro INTERLEAVE_2X 4
- punpckh%1 m%4, m%2, m%3
- punpckl%1 m%2, m%3
- SWAP %3, %4
-%endmacro
-
-%macro TRANSPOSE8X8 9
- INTERLEAVE_2X wd, %1, %2, %9
- INTERLEAVE_2X wd, %3, %4, %9
- INTERLEAVE_2X wd, %5, %6, %9
- INTERLEAVE_2X wd, %7, %8, %9
-
- INTERLEAVE_2X dq, %1, %3, %9
- INTERLEAVE_2X dq, %2, %4, %9
- INTERLEAVE_2X dq, %5, %7, %9
- INTERLEAVE_2X dq, %6, %8, %9
-
- INTERLEAVE_2X qdq, %1, %5, %9
- INTERLEAVE_2X qdq, %3, %7, %9
- INTERLEAVE_2X qdq, %2, %6, %9
- INTERLEAVE_2X qdq, %4, %8, %9
-
- SWAP %2, %5
- SWAP %4, %7
-%endmacro
-
-%macro HMD8_1D 0
- psubw m8, m0, m1
- psubw m9, m2, m3
- paddw m0, m1
- paddw m2, m3
- SWAP 1, 8
- SWAP 3, 9
- psubw m8, m4, m5
- psubw m9, m6, m7
- paddw m4, m5
- paddw m6, m7
- SWAP 5, 8
- SWAP 7, 9
-
- psubw m8, m0, m2
- psubw m9, m1, m3
- paddw m0, m2
- paddw m1, m3
- SWAP 2, 8
- SWAP 3, 9
- psubw m8, m4, m6
- psubw m9, m5, m7
- paddw m4, m6
- paddw m5, m7
- SWAP 6, 8
- SWAP 7, 9
-
- psubw m8, m0, m4
- psubw m9, m1, m5
- paddw m0, m4
- paddw m1, m5
- SWAP 4, 8
- SWAP 5, 9
- psubw m8, m2, m6
- psubw m9, m3, m7
- paddw m2, m6
- paddw m3, m7
- SWAP 6, 8
- SWAP 7, 9
-%endmacro
-
-INIT_XMM ssse3
-cglobal hadamard_8x8, 3, 5, 10, input, stride, output
- lea r3, [2 * strideq]
- lea r4, [4 * strideq]
-
- mova m0, [inputq]
- mova m1, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m2, [inputq]
- mova m3, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m4, [inputq]
- mova m5, [inputq + r3]
- lea inputq, [inputq + r4]
- mova m6, [inputq]
- mova m7, [inputq + r3]
-
- HMD8_1D
- TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
- HMD8_1D
-
- mova [outputq + 0], m0
- mova [outputq + 16], m1
- mova [outputq + 32], m2
- mova [outputq + 48], m3
- mova [outputq + 64], m4
- mova [outputq + 80], m5
- mova [outputq + 96], m6
- mova [outputq + 112], m7
-
- RET
-%endif
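
Note: HMD8_1D above is a plain 8-point Hadamard butterfly in word arithmetic; TRANSPOSE8X8 between the two passes turns the row transform into a column transform. A scalar sketch of one 1-D pass (illustrative only; the helper name is hypothetical) is:

#include <stdint.h>

/* One 8-point Hadamard pass, mirroring the three add/subtract stages of
 * HMD8_1D (word arithmetic, wrapping like paddw/psubw). */
static void hadamard_1d_8(int16_t v[8]) {
  const int16_t a0 = v[0] + v[1], a1 = v[0] - v[1];
  const int16_t a2 = v[2] + v[3], a3 = v[2] - v[3];
  const int16_t a4 = v[4] + v[5], a5 = v[4] - v[5];
  const int16_t a6 = v[6] + v[7], a7 = v[6] - v[7];
  const int16_t b0 = a0 + a2, b1 = a1 + a3, b2 = a0 - a2, b3 = a1 - a3;
  const int16_t b4 = a4 + a6, b5 = a5 + a7, b6 = a4 - a6, b7 = a5 - a7;
  v[0] = b0 + b4; v[1] = b1 + b5; v[2] = b2 + b6; v[3] = b3 + b7;
  v[4] = b0 - b4; v[5] = b1 - b5; v[6] = b2 - b6; v[7] = b3 - b7;
}
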
diff --git a/vp10/encoder/x86/denoiser_sse2.c b/vp10/encoder/x86/denoiser_sse2.c
deleted file mode 100644
index 047974ef8..000000000
--- a/vp10/encoder/x86/denoiser_sse2.c
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-
-#include "./vpx_config.h"
-#include "./vp10_rtcd.h"
-
-#include "vpx_ports/emmintrin_compat.h"
-#include "vpx/vpx_integer.h"
-#include "vp10/common/reconinter.h"
-#include "vp10/encoder/context_tree.h"
-#include "vp10/encoder/denoiser.h"
-#include "vpx_mem/vpx_mem.h"
-
-// Compute the sum of all pixel differences of this MB.
-static INLINE int sum_diff_16x1(__m128i acc_diff) {
- const __m128i k_1 = _mm_set1_epi16(1);
- const __m128i acc_diff_lo =
- _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
- const __m128i acc_diff_hi =
- _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
- const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
- const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
- const __m128i hgfe_dcba =
- _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
- const __m128i hgfedcba =
- _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
- return _mm_cvtsi128_si32(hgfedcba);
-}
-
-// Denoise a 16x1 vector.
-static INLINE __m128i vp10_denoiser_16x1_sse2(const uint8_t *sig,
- const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y,
- const __m128i *k_0,
- const __m128i *k_4,
- const __m128i *k_8,
- const __m128i *k_16,
- const __m128i *l3,
- const __m128i *l32,
- const __m128i *l21,
- __m128i acc_diff) {
- // Calculate differences
- const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
- const __m128i v_mc_running_avg_y =
- _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
- __m128i v_running_avg_y;
- const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
- const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
- // Obtain the sign. FF if diff is negative.
- const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
- // Clamp the absolute difference to 16 so it can be used to build the mask.
- // Doing this allows us to use _mm_cmpgt_epi8, which operates on signed bytes.
- const __m128i clamped_absdiff =
- _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
- // Get masks for l2 l1 and l0 adjustments.
- const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
- const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
- const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
- // Get adjustments for l2, l1, and l0.
- __m128i adj2 = _mm_and_si128(mask2, *l32);
- const __m128i adj1 = _mm_and_si128(mask1, *l21);
- const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
- __m128i adj, padj, nadj;
-
- // Combine the adjustments and get absolute adjustments.
- adj2 = _mm_add_epi8(adj2, adj1);
- adj = _mm_sub_epi8(*l3, adj2);
- adj = _mm_andnot_si128(mask0, adj);
- adj = _mm_or_si128(adj, adj0);
-
- // Restore the sign and get positive and negative adjustments.
- padj = _mm_andnot_si128(diff_sign, adj);
- nadj = _mm_and_si128(diff_sign, adj);
-
- // Calculate filtered value.
- v_running_avg_y = _mm_adds_epu8(v_sig, padj);
- v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
- _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
-
- // Adjustments <=7, and each element in acc_diff can fit in signed
- // char.
- acc_diff = _mm_adds_epi8(acc_diff, padj);
- acc_diff = _mm_subs_epi8(acc_diff, nadj);
- return acc_diff;
-}
-
-// Denoise a 16x1 vector with a weaker filter.
-static INLINE __m128i vp10_denoiser_adj_16x1_sse2(
- const uint8_t *sig, const uint8_t *mc_running_avg_y,
- uint8_t *running_avg_y, const __m128i k_0,
- const __m128i k_delta, __m128i acc_diff) {
- __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
- // Calculate differences.
- const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
- const __m128i v_mc_running_avg_y =
- _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
- const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
- const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
- // Obtain the sign. FF if diff is negative.
- const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
- // Clamp absolute difference to delta to get the adjustment.
- const __m128i adj =
- _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
- // Restore the sign and get positive and negative adjustments.
- __m128i padj, nadj;
- padj = _mm_andnot_si128(diff_sign, adj);
- nadj = _mm_and_si128(diff_sign, adj);
- // Calculate filtered value.
- v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
- v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
- _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
-
- // Accumulate the adjustments.
- acc_diff = _mm_subs_epi8(acc_diff, padj);
- acc_diff = _mm_adds_epi8(acc_diff, nadj);
- return acc_diff;
-}
-
-// Denoiser for 4xM and 8xM blocks.
-static int vp10_denoiser_NxM_sse2_small(
- const uint8_t *sig, int sig_stride, const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride, uint8_t *running_avg_y, int avg_y_stride,
- int increase_denoising, BLOCK_SIZE bs, int motion_magnitude, int width) {
- int sum_diff_thresh, r, sum_diff = 0;
- const int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 1 : 0;
- uint8_t sig_buffer[8][16], mc_running_buffer[8][16], running_buffer[8][16];
- __m128i acc_diff = _mm_setzero_si128();
- const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
- const __m128i k_8 = _mm_set1_epi8(8);
- const __m128i k_16 = _mm_set1_epi8(16);
- // Modify each level's adjustment according to motion_magnitude.
- const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
- // Difference between level 3 and level 2 is 2.
- const __m128i l32 = _mm_set1_epi8(2);
- // Difference between level 2 and level 1 is 1.
- const __m128i l21 = _mm_set1_epi8(1);
- const uint8_t shift = (width == 4) ? 2 : 1;
-
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
- memcpy(sig_buffer[r], sig, width);
- memcpy(sig_buffer[r] + width, sig + sig_stride, width);
- memcpy(mc_running_buffer[r], mc_running_avg_y, width);
- memcpy(mc_running_buffer[r] + width,
- mc_running_avg_y + mc_avg_y_stride, width);
- memcpy(running_buffer[r], running_avg_y, width);
- memcpy(running_buffer[r] + width, running_avg_y + avg_y_stride, width);
- if (width == 4) {
- memcpy(sig_buffer[r] + width * 2, sig + sig_stride * 2, width);
- memcpy(sig_buffer[r] + width * 3, sig + sig_stride * 3, width);
- memcpy(mc_running_buffer[r] + width * 2,
- mc_running_avg_y + mc_avg_y_stride * 2, width);
- memcpy(mc_running_buffer[r] + width * 3,
- mc_running_avg_y + mc_avg_y_stride * 3, width);
- memcpy(running_buffer[r] + width * 2,
- running_avg_y + avg_y_stride * 2, width);
- memcpy(running_buffer[r] + width * 3,
- running_avg_y + avg_y_stride * 3, width);
- }
- acc_diff = vp10_denoiser_16x1_sse2(sig_buffer[r],
- mc_running_buffer[r],
- running_buffer[r],
- &k_0, &k_4, &k_8, &k_16,
- &l3, &l32, &l21, acc_diff);
- memcpy(running_avg_y, running_buffer[r], width);
- memcpy(running_avg_y + avg_y_stride, running_buffer[r] + width, width);
- if (width == 4) {
- memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + width * 2, width);
- memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + width * 3, width);
- }
- // Update pointers for next iteration.
- sig += (sig_stride << shift);
- mc_running_avg_y += (mc_avg_y_stride << shift);
- running_avg_y += (avg_y_stride << shift);
- }
-
- {
- sum_diff = sum_diff_16x1(acc_diff);
- sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
- if (abs(sum_diff) > sum_diff_thresh) {
- // Before returning to copy the block (i.e., apply no denoising),
- // check if we can still apply some (weaker) temporal filtering to
- // this block, that would otherwise not be denoised at all. Simplest
- // is to apply an additional adjustment to running_avg_y to bring it
- // closer to sig. The adjustment is capped by a maximum delta, and
- // chosen such that in most cases the resulting sum_diff will be
- // within the acceptable range given by sum_diff_thresh.
-
- // The delta is set by the excess of absolute pixel diff over the
- // threshold.
- const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
- num_pels_log2_lookup[bs]) + 1;
- // Only apply the adjustment for max delta up to 3.
- if (delta < 4) {
- const __m128i k_delta = _mm_set1_epi8(delta);
- running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- for (r = 0; r < ((4 << b_height_log2_lookup[bs]) >> shift); ++r) {
- acc_diff = vp10_denoiser_adj_16x1_sse2(
- sig_buffer[r], mc_running_buffer[r], running_buffer[r],
- k_0, k_delta, acc_diff);
- memcpy(running_avg_y, running_buffer[r], width);
- memcpy(running_avg_y + avg_y_stride,
- running_buffer[r] + width, width);
- if (width == 4) {
- memcpy(running_avg_y + avg_y_stride * 2,
- running_buffer[r] + width * 2, width);
- memcpy(running_avg_y + avg_y_stride * 3,
- running_buffer[r] + width * 3, width);
- }
- // Update pointers for next iteration.
- running_avg_y += (avg_y_stride << shift);
- }
- sum_diff = sum_diff_16x1(acc_diff);
- if (abs(sum_diff) > sum_diff_thresh) {
- return COPY_BLOCK;
- }
- } else {
- return COPY_BLOCK;
- }
- }
- }
- return FILTER_BLOCK;
-}
-
-// Denoiser for 16xM, 32xM and 64xM blocks
-static int vp10_denoiser_NxM_sse2_big(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_running_avg_y,
- int mc_avg_y_stride,
- uint8_t *running_avg_y,
- int avg_y_stride,
- int increase_denoising, BLOCK_SIZE bs,
- int motion_magnitude) {
- int sum_diff_thresh, r, c, sum_diff = 0;
- const int shift_inc = (increase_denoising &&
- motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ?
- 1 : 0;
- __m128i acc_diff[4][4];
- const __m128i k_0 = _mm_setzero_si128();
- const __m128i k_4 = _mm_set1_epi8(4 + shift_inc);
- const __m128i k_8 = _mm_set1_epi8(8);
- const __m128i k_16 = _mm_set1_epi8(16);
- // Modify each level's adjustment according to motion_magnitude.
- const __m128i l3 = _mm_set1_epi8(
- (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) ? 7 + shift_inc : 6);
- // Difference between level 3 and level 2 is 2.
- const __m128i l32 = _mm_set1_epi8(2);
- // Difference between level 2 and level 1 is 1.
- const __m128i l21 = _mm_set1_epi8(1);
-
- for (c = 0; c < 4; ++c) {
- for (r = 0; r < 4; ++r) {
- acc_diff[c][r] = _mm_setzero_si128();
- }
- }
-
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
- acc_diff[c>>4][r>>4] = vp10_denoiser_16x1_sse2(
- sig, mc_running_avg_y, running_avg_y, &k_0, &k_4,
- &k_8, &k_16, &l3, &l32, &l21, acc_diff[c>>4][r>>4]);
- // Update pointers for next iteration.
- sig += 16;
- mc_running_avg_y += 16;
- running_avg_y += 16;
- }
-
- if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
- for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
- sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
- }
- }
-
- // Update pointers for next iteration.
- sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
- mc_running_avg_y = mc_running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- mc_avg_y_stride;
- running_avg_y = running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- avg_y_stride;
- }
-
- {
- sum_diff_thresh = total_adj_strong_thresh(bs, increase_denoising);
- if (abs(sum_diff) > sum_diff_thresh) {
- const int delta = ((abs(sum_diff) - sum_diff_thresh) >>
- num_pels_log2_lookup[bs]) + 1;
-
- // Only apply the adjustment for max delta up to 3.
- if (delta < 4) {
- const __m128i k_delta = _mm_set1_epi8(delta);
- sig -= sig_stride * (4 << b_height_log2_lookup[bs]);
- mc_running_avg_y -= mc_avg_y_stride * (4 << b_height_log2_lookup[bs]);
- running_avg_y -= avg_y_stride * (4 << b_height_log2_lookup[bs]);
- sum_diff = 0;
- for (r = 0; r < (4 << b_height_log2_lookup[bs]); ++r) {
- for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
- acc_diff[c>>4][r>>4] = vp10_denoiser_adj_16x1_sse2(
- sig, mc_running_avg_y, running_avg_y, k_0,
- k_delta, acc_diff[c>>4][r>>4]);
- // Update pointers for next iteration.
- sig += 16;
- mc_running_avg_y += 16;
- running_avg_y += 16;
- }
-
- if ((r + 1) % 16 == 0 || (bs == BLOCK_16X8 && r == 7)) {
- for (c = 0; c < (4 << b_width_log2_lookup[bs]); c += 16) {
- sum_diff += sum_diff_16x1(acc_diff[c>>4][r>>4]);
- }
- }
- sig = sig - 16 * ((4 << b_width_log2_lookup[bs]) >> 4) + sig_stride;
- mc_running_avg_y = mc_running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- mc_avg_y_stride;
- running_avg_y = running_avg_y -
- 16 * ((4 << b_width_log2_lookup[bs]) >> 4) +
- avg_y_stride;
- }
- if (abs(sum_diff) > sum_diff_thresh) {
- return COPY_BLOCK;
- }
- } else {
- return COPY_BLOCK;
- }
- }
- }
- return FILTER_BLOCK;
-}
-
-int vp10_denoiser_filter_sse2(const uint8_t *sig, int sig_stride,
- const uint8_t *mc_avg,
- int mc_avg_stride,
- uint8_t *avg, int avg_stride,
- int increase_denoising,
- BLOCK_SIZE bs,
- int motion_magnitude) {
- if (bs == BLOCK_4X4 || bs == BLOCK_4X8) {
- return vp10_denoiser_NxM_sse2_small(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude, 4);
- } else if (bs == BLOCK_8X4 || bs == BLOCK_8X8 || bs == BLOCK_8X16) {
- return vp10_denoiser_NxM_sse2_small(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude, 8);
- } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
- bs == BLOCK_32X16 || bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
- bs == BLOCK_64X32 || bs == BLOCK_64X64) {
- return vp10_denoiser_NxM_sse2_big(sig, sig_stride,
- mc_avg, mc_avg_stride,
- avg, avg_stride,
- increase_denoising,
- bs, motion_magnitude);
- } else {
- return COPY_BLOCK;
- }
-}
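
Note: the 16x1 kernels above apply a per-pixel adjustment whose size depends on how far the motion-compensated average is from the source, using the 4/8/16 thresholds and the l3/l2/l1 levels. A scalar sketch of that decision for one pixel (illustrative only; l3 is the same value the code derives from motion_magnitude) is:

#include <stdint.h>
#include <stdlib.h>

/* Small differences are copied through; larger ones pull the output toward
 * the motion-compensated average by at most l3. */
static uint8_t denoise_pixel(uint8_t sig, uint8_t mc_avg, int shift_inc,
                             int l3) {
  const int l2 = l3 - 2, l1 = l2 - 1;            /* level spacing matches l32/l21 */
  const int diff = (int)mc_avg - (int)sig;
  const int absdiff = abs(diff);
  int adj;
  if (absdiff < 4 + shift_inc) adj = absdiff;    /* take the full difference */
  else if (absdiff < 8)        adj = l1;
  else if (absdiff < 16)       adj = l2;
  else                         adj = l3;
  return (uint8_t)(diff > 0 ? sig + adj : sig - adj);
}
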
diff --git a/vp10/encoder/x86/error_intrin_avx2.c b/vp10/encoder/x86/error_intrin_avx2.c
deleted file mode 100644
index 9766be27b..000000000
--- a/vp10/encoder/x86/error_intrin_avx2.c
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h> // AVX2
-
-#include "./vp10_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-int64_t vp10_block_error_avx2(const int16_t *coeff,
- const int16_t *dqcoeff,
- intptr_t block_size,
- int64_t *ssz) {
- __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
- __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
- __m256i sse_reg_64hi, ssz_reg_64hi;
- __m128i sse_reg128, ssz_reg128;
- int64_t sse;
- int i;
- const __m256i zero_reg = _mm256_set1_epi16(0);
-
- // init sse and ssz registers to zero
- sse_reg = _mm256_set1_epi16(0);
- ssz_reg = _mm256_set1_epi16(0);
-
- for (i = 0; i < block_size; i += 16) {
- // load 32 bytes from coeff and dqcoeff
- coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
- dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
- // dqcoeff - coeff
- dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
- // madd (dqcoeff - coeff)
- dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
- // madd coeff
- coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
- // expand each double word of madd (dqcoeff - coeff) to quad word
- exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
- exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
- // expand each double word of madd (coeff) to quad word
- exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
- exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
- // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
- sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
- ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
- sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
- ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
- }
- // save the higher 64 bit of each 128 bit lane
- sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
- ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
- // add the higher 64 bit to the low 64 bit
- sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
- ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
-
- // add each 64 bit from each of the 128 bit lane of the 256 bit
- sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
- _mm256_extractf128_si256(sse_reg, 1));
-
- ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
- _mm256_extractf128_si256(ssz_reg, 1));
-
- // store the results
- _mm_storel_epi64((__m128i*)(&sse), sse_reg128);
-
- _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
- return sse;
-}
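
Note: the AVX2 routine above computes two sums per block: the squared error between dqcoeff and coeff, and the sum of squared coefficients returned through ssz. A scalar reference (illustrative only; hypothetical name) is:

#include <stdint.h>

static int64_t block_error_scalar(const int16_t *coeff, const int16_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t sse = 0, sqc = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int d = dqcoeff[i] - coeff[i];
    sse += (int64_t)d * d;                /* squared reconstruction error */
    sqc += (int64_t)coeff[i] * coeff[i];  /* energy of the source coefficients */
  }
  *ssz = sqc;
  return sse;
}
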
diff --git a/vp10/encoder/x86/error_sse2.asm b/vp10/encoder/x86/error_sse2.asm
deleted file mode 100644
index 0772da418..000000000
--- a/vp10/encoder/x86/error_sse2.asm
+++ /dev/null
@@ -1,122 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-
-; int64_t vp10_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size,
-; int64_t *ssz)
-
-INIT_XMM sse2
-cglobal block_error, 3, 3, 8, uqc, dqc, size, ssz
- pxor m4, m4 ; sse accumulator
- pxor m6, m6 ; ssz accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
-.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
- psubw m0, m2
- psubw m1, m3
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
- pmaddwd m0, m0
- pmaddwd m1, m1
- pmaddwd m2, m2
- pmaddwd m3, m3
- ; accumulate in 64bit
- punpckldq m7, m0, m5
- punpckhdq m0, m5
- paddq m4, m7
- punpckldq m7, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m7
- punpckldq m7, m2, m5
- paddq m4, m1
- punpckhdq m2, m5
- paddq m6, m7
- punpckldq m7, m3, m5
- paddq m6, m2
- punpckhdq m3, m5
- paddq m6, m7
- paddq m6, m3
- add sizeq, mmsize
- jl .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- movhlps m7, m6
- paddq m4, m5
- paddq m6, m7
-%if ARCH_X86_64
- movq rax, m4
- movq [sszq], m6
-%else
- mov eax, sszm
- pshufd m5, m4, 0x1
- movq [eax], m6
- movd eax, m4
- movd edx, m5
-%endif
- RET
-
-; Compute the sum of squared difference between two int16_t vectors.
-; int64_t vp10_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
-; intptr_t block_size)
-
-INIT_XMM sse2
-cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
- pxor m4, m4 ; sse accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*2]
- lea dqcq, [dqcq+sizeq*2]
- neg sizeq
-.loop:
- mova m2, [uqcq+sizeq*2]
- mova m0, [dqcq+sizeq*2]
- mova m3, [uqcq+sizeq*2+mmsize]
- mova m1, [dqcq+sizeq*2+mmsize]
- psubw m0, m2
- psubw m1, m3
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
- pmaddwd m0, m0
- pmaddwd m1, m1
- ; accumulate in 64bit
- punpckldq m3, m0, m5
- punpckhdq m0, m5
- paddq m4, m3
- punpckldq m3, m1, m5
- paddq m4, m0
- punpckhdq m1, m5
- paddq m4, m3
- paddq m4, m1
- add sizeq, mmsize
- jl .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- paddq m4, m5
-%if ARCH_X86_64
- movq rax, m4
-%else
- pshufd m5, m4, 0x1
- movd eax, m4
- movd edx, m5
-%endif
- RET
diff --git a/vp10/encoder/x86/highbd_block_error_intrin_sse2.c b/vp10/encoder/x86/highbd_block_error_intrin_sse2.c
deleted file mode 100644
index 6b4cf5099..000000000
--- a/vp10/encoder/x86/highbd_block_error_intrin_sse2.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-#include <stdio.h>
-
-#include "vp10/common/common.h"
-
-int64_t vp10_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz,
- int bps) {
- int i, j, test;
- uint32_t temp[4];
- __m128i max, min, cmp0, cmp1, cmp2, cmp3;
- int64_t error = 0, sqcoeff = 0;
- const int shift = 2 * (bps - 8);
- const int rounding = shift > 0 ? 1 << (shift - 1) : 0;
-
- for (i = 0; i < block_size; i+=8) {
- // Load the data into xmm registers
- __m128i mm_coeff = _mm_load_si128((__m128i*) (coeff + i));
- __m128i mm_coeff2 = _mm_load_si128((__m128i*) (coeff + i + 4));
- __m128i mm_dqcoeff = _mm_load_si128((__m128i*) (dqcoeff + i));
- __m128i mm_dqcoeff2 = _mm_load_si128((__m128i*) (dqcoeff + i + 4));
- // Check if any values require more than 15 bit
- max = _mm_set1_epi32(0x3fff);
- min = _mm_set1_epi32(0xffffc000);
- cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
- _mm_cmplt_epi32(mm_coeff, min));
- cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
- _mm_cmplt_epi32(mm_coeff2, min));
- cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
- _mm_cmplt_epi32(mm_dqcoeff, min));
- cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
- _mm_cmplt_epi32(mm_dqcoeff2, min));
- test = _mm_movemask_epi8(_mm_or_si128(_mm_or_si128(cmp0, cmp1),
- _mm_or_si128(cmp2, cmp3)));
-
- if (!test) {
- __m128i mm_diff, error_sse2, sqcoeff_sse2;
- mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
- mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
- mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
- error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
- sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
- _mm_storeu_si128((__m128i*)temp, error_sse2);
- error = error + temp[0] + temp[1] + temp[2] + temp[3];
- _mm_storeu_si128((__m128i*)temp, sqcoeff_sse2);
- sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
- } else {
- for (j = 0; j < 8; j++) {
- const int64_t diff = coeff[i + j] - dqcoeff[i + j];
- error += diff * diff;
- sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
- }
- }
- }
- assert(error >= 0 && sqcoeff >= 0);
- error = (error + rounding) >> shift;
- sqcoeff = (sqcoeff + rounding) >> shift;
-
- *ssz = sqcoeff;
- return error;
-}
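
Note: the high-bit-depth version above takes the 16-bit pmaddwd path only when every coefficient fits in 15 bits, then rescales the accumulated sums back to 8-bit units with a rounded right shift of 2 * (bps - 8). A scalar sketch of the range test (illustrative only; hypothetical name) is:

#include <stdint.h>

/* Values in [-0x4000, 0x3fff] can be packed to int16 and squared with
 * pmaddwd without overflowing the 32-bit partial sums. */
static int fits_fast_path(const int32_t *coeff, const int32_t *dqcoeff, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if (coeff[i] > 0x3fff || coeff[i] < -0x4000) return 0;
    if (dqcoeff[i] > 0x3fff || dqcoeff[i] < -0x4000) return 0;
  }
  return 1;
}
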
diff --git a/vp10/encoder/x86/quantize_sse2.c b/vp10/encoder/x86/quantize_sse2.c
deleted file mode 100644
index dabd3bd12..000000000
--- a/vp10/encoder/x86/quantize_sse2.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>
-#include <xmmintrin.h>
-
-#include "./vp10_rtcd.h"
-#include "vpx/vpx_integer.h"
-
-void vp10_quantize_fp_sse2(const int16_t* coeff_ptr, intptr_t n_coeffs,
- int skip_block, const int16_t* zbin_ptr,
- const int16_t* round_ptr, const int16_t* quant_ptr,
- const int16_t* quant_shift_ptr, int16_t* qcoeff_ptr,
- int16_t* dqcoeff_ptr, const int16_t* dequant_ptr,
- uint16_t* eob_ptr,
- const int16_t* scan_ptr,
- const int16_t* iscan_ptr) {
- __m128i zero;
- __m128i thr;
- int16_t nzflag;
- (void)scan_ptr;
- (void)zbin_ptr;
- (void)quant_shift_ptr;
-
- coeff_ptr += n_coeffs;
- iscan_ptr += n_coeffs;
- qcoeff_ptr += n_coeffs;
- dqcoeff_ptr += n_coeffs;
- n_coeffs = -n_coeffs;
- zero = _mm_setzero_si128();
-
- if (!skip_block) {
- __m128i eob;
- __m128i round, quant, dequant;
- {
- __m128i coeff0, coeff1;
-
- // Setup global values
- {
- round = _mm_load_si128((const __m128i*)round_ptr);
- quant = _mm_load_si128((const __m128i*)quant_ptr);
- dequant = _mm_load_si128((const __m128i*)dequant_ptr);
- }
-
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
- // Do DC and first 15 AC
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- round = _mm_unpackhi_epi64(round, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- quant = _mm_unpackhi_epi64(quant, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- dequant = _mm_unpackhi_epi64(dequant, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- }
-
- {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob = _mm_max_epi16(eob, eob1);
- }
- n_coeffs += 8 * 2;
- }
-
- thr = _mm_srai_epi16(dequant, 1);
-
- // AC only loop
- while (n_coeffs < 0) {
- __m128i coeff0, coeff1;
- {
- __m128i coeff0_sign, coeff1_sign;
- __m128i qcoeff0, qcoeff1;
- __m128i qtmp0, qtmp1;
-
- coeff0 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs));
- coeff1 = _mm_load_si128((const __m128i*)(coeff_ptr + n_coeffs) + 1);
-
- // Poor man's sign extract
- coeff0_sign = _mm_srai_epi16(coeff0, 15);
- coeff1_sign = _mm_srai_epi16(coeff1, 15);
- qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
- _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
- if (nzflag) {
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
-
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
- } else {
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
-
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- }
- }
-
- if (nzflag) {
- // Scan for eob
- __m128i zero_coeff0, zero_coeff1;
- __m128i nzero_coeff0, nzero_coeff1;
- __m128i iscan0, iscan1;
- __m128i eob0, eob1;
- zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
- zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
- nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
- nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
- iscan0 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs));
- iscan1 = _mm_load_si128((const __m128i*)(iscan_ptr + n_coeffs) + 1);
- // Add one to convert from indices to counts
- iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0);
- iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1);
- eob0 = _mm_and_si128(iscan0, nzero_coeff0);
- eob1 = _mm_and_si128(iscan1, nzero_coeff1);
- eob0 = _mm_max_epi16(eob0, eob1);
- eob = _mm_max_epi16(eob, eob0);
- }
- n_coeffs += 8 * 2;
- }
-
- // Accumulate EOB
- {
- __m128i eob_shuffled;
- eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
- eob = _mm_max_epi16(eob, eob_shuffled);
- eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
- eob = _mm_max_epi16(eob, eob_shuffled);
- *eob_ptr = _mm_extract_epi16(eob, 1);
- }
- } else {
- do {
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
- n_coeffs += 8 * 2;
- } while (n_coeffs < 0);
- *eob_ptr = 0;
- }
-}
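
Note: the eob logic above subtracts the 0xFFFF non-zero mask from iscan (which adds one, turning a scan index into a count), masks out zero coefficients, and keeps a running maximum. A scalar equivalent (illustrative only; hypothetical name) is:

#include <stdint.h>

static uint16_t compute_eob(const int16_t *qcoeff, const int16_t *iscan, int n) {
  int eob = 0, i;
  for (i = 0; i < n; ++i) {
    if (qcoeff[i] != 0) {
      const int count = iscan[i] + 1;   /* index -> count */
      if (count > eob) eob = count;
    }
  }
  return (uint16_t)eob;
}
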
diff --git a/vp10/encoder/x86/quantize_ssse3_x86_64.asm b/vp10/encoder/x86/quantize_ssse3_x86_64.asm
deleted file mode 100644
index b8fefa2f1..000000000
--- a/vp10/encoder/x86/quantize_ssse3_x86_64.asm
+++ /dev/null
@@ -1,201 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp10
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-pw_1: times 8 dw 1
-
-SECTION .text
-
-%macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
- shift, qcoeff, dqcoeff, dequant, \
- eob, scan, iscan
- cmp dword skipm, 0
- jne .blank
-
- ; actual quantize loop - setup pointers, rounders, etc.
- movifnidn coeffq, coeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, dequantmp
- movifnidn zbinq, zbinmp
- movifnidn roundq, roundmp
- movifnidn quantq, quantmp
- mova m1, [roundq] ; m1 = round
- mova m2, [quantq] ; m2 = quant
-%ifidn %1, fp_32x32
- pcmpeqw m5, m5
- psrlw m5, 15
- paddw m1, m5
- psrlw m1, 1 ; m1 = (m1 + 1) / 2
-%endif
- mova m3, [r2q] ; m3 = dequant
- mov r3, qcoeffmp
- mov r4, dqcoeffmp
- mov r5, iscanmp
-%ifidn %1, fp_32x32
- psllw m2, 1
-%endif
- pxor m5, m5 ; m5 = dedicated zero
-
- lea coeffq, [ coeffq+ncoeffq*2]
- lea r5q, [ r5q+ncoeffq*2]
- lea r3q, [ r3q+ncoeffq*2]
- lea r4q, [r4q+ncoeffq*2]
- neg ncoeffq
-
- ; get DC and first 15 AC coeffs
- mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- punpckhqdq m1, m1
- paddsw m11, m1 ; m11 += round
- pmulhw m8, m6, m2 ; m8 = m6*q>>16
- punpckhqdq m2, m2
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m8, m9 ; m8 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- mova [r3q+ncoeffq*2+ 0], m8
- mova [r3q+ncoeffq*2+16], m13
-%ifidn %1, fp_32x32
- pabsw m8, m8
- pabsw m13, m13
-%endif
- pmullw m8, m3 ; r4[i] = r3[i] * q
- punpckhqdq m3, m3
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m8, 1
- psrlw m13, 1
- psignw m8, m9
- psignw m13, m10
- psrlw m0, m3, 2
-%else
- psrlw m0, m3, 1
-%endif
- mova [r4q+ncoeffq*2+ 0], m8
- mova [r4q+ncoeffq*2+16], m13
- pcmpeqw m8, m5 ; m8 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m8, m6 ; m8 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jz .accumulate_eob
-
-.ac_only_loop:
- mova m9, [ coeffq+ncoeffq*2+ 0] ; m9 = c[i]
- mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
- pabsw m6, m9 ; m6 = abs(m9)
- pabsw m11, m10 ; m11 = abs(m10)
-
- pcmpgtw m7, m6, m0
- pcmpgtw m12, m11, m0
- pmovmskb r6d, m7
- pmovmskb r2d, m12
-
- or r6, r2
- jz .skip_iter
-
- pcmpeqw m7, m7
-
- paddsw m6, m1 ; m6 += round
- paddsw m11, m1 ; m11 += round
- pmulhw m14, m6, m2 ; m14 = m6*q>>16
- pmulhw m13, m11, m2 ; m13 = m11*q>>16
- psignw m14, m9 ; m14 = reinsert sign
- psignw m13, m10 ; m13 = reinsert sign
- mova [r3q+ncoeffq*2+ 0], m14
- mova [r3q+ncoeffq*2+16], m13
-%ifidn %1, fp_32x32
- pabsw m14, m14
- pabsw m13, m13
-%endif
- pmullw m14, m3 ; r4[i] = r3[i] * q
- pmullw m13, m3 ; r4[i] = r3[i] * q
-%ifidn %1, fp_32x32
- psrlw m14, 1
- psrlw m13, 1
- psignw m14, m9
- psignw m13, m10
-%endif
- mova [r4q+ncoeffq*2+ 0], m14
- mova [r4q+ncoeffq*2+16], m13
- pcmpeqw m14, m5 ; m14 = c[i] == 0
- pcmpeqw m13, m5 ; m13 = c[i] == 0
- mova m6, [ r5q+ncoeffq*2+ 0] ; m6 = scan[i]
- mova m11, [ r5q+ncoeffq*2+16] ; m11 = scan[i]
- psubw m6, m7 ; m6 = scan[i] + 1
- psubw m11, m7 ; m11 = scan[i] + 1
- pandn m14, m6 ; m14 = max(eob)
- pandn m13, m11 ; m13 = max(eob)
- pmaxsw m8, m14
- pmaxsw m8, m13
- add ncoeffq, mmsize
- jl .ac_only_loop
-
- jmp .accumulate_eob
-.skip_iter:
- mova [r3q+ncoeffq*2+ 0], m5
- mova [r3q+ncoeffq*2+16], m5
- mova [r4q+ncoeffq*2+ 0], m5
- mova [r4q+ncoeffq*2+16], m5
- add ncoeffq, mmsize
- jl .ac_only_loop
-
-.accumulate_eob:
- ; horizontally accumulate/max eobs and write into [eob] memory pointer
- mov r2, eobmp
- pshufd m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0xe
- pmaxsw m8, m7
- pshuflw m7, m8, 0x1
- pmaxsw m8, m7
- pextrw r6, m8, 0
- mov [r2], r6
- RET
-
- ; skip-block, i.e. just write all zeroes
-.blank:
- mov r0, dqcoeffmp
- movifnidn ncoeffq, ncoeffmp
- mov r2, qcoeffmp
- mov r3, eobmp
-
- lea r0q, [r0q+ncoeffq*2]
- lea r2q, [r2q+ncoeffq*2]
- neg ncoeffq
- pxor m7, m7
-.blank_loop:
- mova [r0q+ncoeffq*2+ 0], m7
- mova [r0q+ncoeffq*2+16], m7
- mova [r2q+ncoeffq*2+ 0], m7
- mova [r2q+ncoeffq*2+16], m7
- add ncoeffq, mmsize
- jl .blank_loop
- mov word [r3q], 0
- RET
-%endmacro
-
-INIT_XMM ssse3
-QUANTIZE_FP fp, 7
-QUANTIZE_FP fp_32x32, 7
diff --git a/vp10/encoder/x86/ssim_opt_x86_64.asm b/vp10/encoder/x86/ssim_opt_x86_64.asm
deleted file mode 100644
index b45f0095d..000000000
--- a/vp10/encoder/x86/ssim_opt_x86_64.asm
+++ /dev/null
@@ -1,216 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr
-%macro TABULATE_SSIM 0
- paddusw xmm15, xmm3 ; sum_s
- paddusw xmm14, xmm4 ; sum_r
- movdqa xmm1, xmm3
- pmaddwd xmm1, xmm1
- paddd xmm13, xmm1 ; sum_sq_s
- movdqa xmm2, xmm4
- pmaddwd xmm2, xmm2
- paddd xmm12, xmm2 ; sum_sq_r
- pmaddwd xmm3, xmm4
- paddd xmm11, xmm3 ; sum_sxr
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_Q 1
- movdqa xmm2,%1
- punpckldq %1,xmm0
- punpckhdq xmm2,xmm0
- paddq %1,xmm2
- movdqa xmm2,%1
- punpcklqdq %1,xmm0
- punpckhqdq xmm2,xmm0
- paddq %1,xmm2
-%endmacro
-
-; Sum across the register %1 starting with q words
-%macro SUM_ACROSS_W 1
- movdqa xmm1, %1
- punpcklwd %1,xmm0
- punpckhwd xmm1,xmm0
- paddd %1, xmm1
- SUM_ACROSS_Q %1
-%endmacro
-;void vp10_ssim_parms_16x16_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp,
-; unsigned long *sum_s,
-; unsigned long *sum_r,
-; unsigned long *sum_sq_s,
-; unsigned long *sum_sq_r,
-; unsigned long *sum_sxr);
-;
-; TODO: Pass the parameters through a structure. The pxors are probably not
-; needed (the calling app will initialize to 0), everything could easily fit
-; in sse2 without too much hassle, and better estimates could probably be made
-; with psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the parms needed for 16x16 ssim so we can play with dssim as
-; distortion in the mode selection code.
-global sym(vp10_ssim_parms_16x16_sse2) PRIVATE
-sym(vp10_ssim_parms_16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 16 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movdqu xmm5, [rsi]
- movdqu xmm6, [rdi]
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpckhbw xmm3, xmm0 ; high_s
- punpckhbw xmm4, xmm0 ; high_r
-
- TABULATE_SSIM
-
- movdqa xmm3, xmm5
- movdqa xmm4, xmm6
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp10_ssim_parms_8x8_sse2(
-; unsigned char *s,
-; int sp,
-; unsigned char *r,
-; int rp,
-; unsigned long *sum_s,
-; unsigned long *sum_r,
-; unsigned long *sum_sq_s,
-; unsigned long *sum_sq_r,
-; unsigned long *sum_sxr);
-;
-; TODO: Pass the parameters through a structure. The pxors are probably not
-; needed (the calling app will initialize to 0), everything could easily fit
-; in sse2 without too much hassle, and better estimates could probably be made
-; with psadw or pavgb. At this point this is just meant to be a first pass for
-; calculating all the parms needed for 16x16 ssim so we can play with dssim as
-; distortion in the mode selection code.
-global sym(vp10_ssim_parms_8x8_sse2) PRIVATE
-sym(vp10_ssim_parms_8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 15
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;s
- mov rcx, arg(1) ;sp
- mov rdi, arg(2) ;r
- mov rax, arg(3) ;rp
-
- pxor xmm0, xmm0
- pxor xmm15,xmm15 ;sum_s
- pxor xmm14,xmm14 ;sum_r
- pxor xmm13,xmm13 ;sum_sq_s
- pxor xmm12,xmm12 ;sum_sq_r
- pxor xmm11,xmm11 ;sum_sxr
-
- mov rdx, 8 ;row counter
-.NextRow:
-
- ;grab source and reference pixels
- movq xmm3, [rsi]
- movq xmm4, [rdi]
- punpcklbw xmm3, xmm0 ; low_s
- punpcklbw xmm4, xmm0 ; low_r
-
- TABULATE_SSIM
-
- add rsi, rcx ; next s row
- add rdi, rax ; next r row
-
- dec rdx ; counter
- jnz .NextRow
-
- SUM_ACROSS_W xmm15
- SUM_ACROSS_W xmm14
- SUM_ACROSS_Q xmm13
- SUM_ACROSS_Q xmm12
- SUM_ACROSS_Q xmm11
-
- mov rdi,arg(4)
- movd [rdi], xmm15;
- mov rdi,arg(5)
- movd [rdi], xmm14;
- mov rdi,arg(6)
- movd [rdi], xmm13;
- mov rdi,arg(7)
- movd [rdi], xmm12;
- mov rdi,arg(8)
- movd [rdi], xmm11;
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
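
Note: TABULATE_SSIM above accumulates the five raw sums the SSIM formula needs over the 16x16 (or 8x8) window. A scalar sketch (illustrative only; hypothetical name) is:

#include <stdint.h>

static void ssim_parms_scalar(const uint8_t *s, int sp,
                              const uint8_t *r, int rp, int size,
                              uint32_t *sum_s, uint32_t *sum_r,
                              uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                              uint32_t *sum_sxr) {
  int i, j;
  for (i = 0; i < size; ++i, s += sp, r += rp) {
    for (j = 0; j < size; ++j) {
      *sum_s += s[j];
      *sum_r += r[j];
      *sum_sq_s += s[j] * s[j];
      *sum_sq_r += r[j] * r[j];
      *sum_sxr += s[j] * r[j];
    }
  }
}
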
diff --git a/vp10/encoder/x86/temporal_filter_apply_sse2.asm b/vp10/encoder/x86/temporal_filter_apply_sse2.asm
deleted file mode 100644
index 717180713..000000000
--- a/vp10/encoder/x86/temporal_filter_apply_sse2.asm
+++ /dev/null
@@ -1,212 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-; void vp10_temporal_filter_apply_sse2 | arg
-; (unsigned char *frame1, | 0
-; unsigned int stride, | 1
-; unsigned char *frame2, | 2
-; unsigned int block_width, | 3
-; unsigned int block_height, | 4
-; int strength, | 5
-; int filter_weight, | 6
-; unsigned int *accumulator, | 7
-; unsigned short *count) | 8
-global sym(vp10_temporal_filter_apply_sse2) PRIVATE
-sym(vp10_temporal_filter_apply_sse2):
-
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ALIGN_STACK 16, rax
- %define block_width 0
- %define block_height 16
- %define strength 32
- %define filter_weight 48
- %define rounding_bit 64
- %define rbp_backup 80
- %define stack_size 96
- sub rsp, stack_size
- mov [rsp + rbp_backup], rbp
- ; end prolog
-
- mov edx, arg(3)
- mov [rsp + block_width], rdx
- mov edx, arg(4)
- mov [rsp + block_height], rdx
- movd xmm6, arg(5)
- movdqa [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
-
- ; calculate the rounding bit outside the loop
- ; 0x8000 >> (16 - strength)
- mov rdx, 16
- sub rdx, arg(5) ; 16 - strength
- movq xmm4, rdx ; can't use rdx w/ shift
- movdqa xmm5, [GLOBAL(_const_top_bit)]
- psrlw xmm5, xmm4
- movdqa [rsp + rounding_bit], xmm5
-
- mov rsi, arg(0) ; src/frame1
- mov rdx, arg(2) ; predictor frame
- mov rdi, arg(7) ; accumulator
- mov rax, arg(8) ; count
-
- ; dup the filter weight and store for later
- movd xmm0, arg(6) ; filter_weight
- pshuflw xmm0, xmm0, 0
- punpcklwd xmm0, xmm0
- movdqa [rsp + filter_weight], xmm0
-
- mov rbp, arg(1) ; stride
- pxor xmm7, xmm7 ; zero for extraction
-
- mov rcx, [rsp + block_width]
- imul rcx, [rsp + block_height]
- add rcx, rdx
- cmp dword ptr [rsp + block_width], 8
- jne .temporal_filter_apply_load_16
-
-.temporal_filter_apply_load_8:
- movq xmm0, [rsi] ; first row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- movq xmm1, [rsi] ; second row
- lea rsi, [rsi + rbp] ; += stride
- punpcklbw xmm1, xmm7 ; src[ 8-15]
- jmp .temporal_filter_apply_load_finished
-
-.temporal_filter_apply_load_16:
- movdqa xmm0, [rsi] ; src (frame1)
- lea rsi, [rsi + rbp] ; += stride
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; src[ 0- 7]
- punpckhbw xmm1, xmm7 ; src[ 8-15]
-
-.temporal_filter_apply_load_finished:
- movdqa xmm2, [rdx] ; predictor (frame2)
- movdqa xmm3, xmm2
- punpcklbw xmm2, xmm7 ; pred[ 0- 7]
- punpckhbw xmm3, xmm7 ; pred[ 8-15]
-
- ; modifier = src_byte - pixel_value
- psubw xmm0, xmm2 ; src - pred[ 0- 7]
- psubw xmm1, xmm3 ; src - pred[ 8-15]
-
- ; modifier *= modifier
- pmullw xmm0, xmm0 ; modifier[ 0- 7]^2
- pmullw xmm1, xmm1 ; modifier[ 8-15]^2
-
- ; modifier *= 3
- pmullw xmm0, [GLOBAL(_const_3w)]
- pmullw xmm1, [GLOBAL(_const_3w)]
-
- ; modifier += 0x8000 >> (16 - strength)
- paddw xmm0, [rsp + rounding_bit]
- paddw xmm1, [rsp + rounding_bit]
-
- ; modifier >>= strength
- psrlw xmm0, [rsp + strength]
- psrlw xmm1, [rsp + strength]
-
- ; modifier = 16 - modifier
- ; saturation takes care of modifier > 16
- movdqa xmm3, [GLOBAL(_const_16w)]
- movdqa xmm2, [GLOBAL(_const_16w)]
- psubusw xmm3, xmm1
- psubusw xmm2, xmm0
-
- ; modifier *= filter_weight
- pmullw xmm2, [rsp + filter_weight]
- pmullw xmm3, [rsp + filter_weight]
-
- ; count
- movdqa xmm4, [rax]
- movdqa xmm5, [rax+16]
- ; += modifier
- paddw xmm4, xmm2
- paddw xmm5, xmm3
- ; write back
- movdqa [rax], xmm4
- movdqa [rax+16], xmm5
- lea rax, [rax + 16*2] ; count += 16*(sizeof(short))
-
- ; load and extract the predictor up to shorts
- pxor xmm7, xmm7
- movdqa xmm0, [rdx]
- lea rdx, [rdx + 16*1] ; pred += 16*(sizeof(char))
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm7 ; pred[ 0- 7]
- punpckhbw xmm1, xmm7 ; pred[ 8-15]
-
- ; modifier *= pixel_value
- pmullw xmm0, xmm2
- pmullw xmm1, xmm3
-
- ; expand to double words
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm7 ; [ 0- 3]
- punpckhwd xmm2, xmm7 ; [ 4- 7]
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm7 ; [ 8-11]
- punpckhwd xmm3, xmm7 ; [12-15]
-
- ; accumulator
- movdqa xmm4, [rdi]
- movdqa xmm5, [rdi+16]
- movdqa xmm6, [rdi+32]
- movdqa xmm7, [rdi+48]
- ; += modifier
- paddd xmm4, xmm0
- paddd xmm5, xmm2
- paddd xmm6, xmm1
- paddd xmm7, xmm3
- ; write back
- movdqa [rdi], xmm4
- movdqa [rdi+16], xmm5
- movdqa [rdi+32], xmm6
- movdqa [rdi+48], xmm7
- lea rdi, [rdi + 16*4] ; accumulator += 16*(sizeof(int))
-
- cmp rdx, rcx
- je .temporal_filter_apply_epilog
- pxor xmm7, xmm7 ; zero for extraction
- cmp dword ptr [rsp + block_width], 16
- je .temporal_filter_apply_load_16
- jmp .temporal_filter_apply_load_8
-
-.temporal_filter_apply_epilog:
- ; begin epilog
- mov rbp, [rsp + rbp_backup]
- add rsp, stack_size
- pop rsp
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-_const_3w:
- times 8 dw 3
-align 16
-_const_top_bit:
- times 8 dw 1<<15
-align 16
-_const_16w:
- times 8 dw 16
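
Note: the per-pixel math above — squared difference scaled by 3, a strength-dependent rounding bit and shift, saturation at 16, and a filter-weight multiply that feeds both count and accumulator — reduces to the following scalar sketch (illustrative only; hypothetical name):

#include <stdint.h>

static void temporal_filter_pixel(uint8_t src, uint8_t pred, int strength,
                                  int filter_weight,
                                  unsigned int *accumulator,
                                  unsigned short *count) {
  const int diff = (int)src - (int)pred;
  int modifier = diff * diff * 3;               /* 3 * (src - pred)^2 */
  modifier += 0x8000 >> (16 - strength);        /* rounding bit */
  modifier >>= strength;
  if (modifier > 16) modifier = 16;             /* psubusw saturation */
  modifier = 16 - modifier;
  modifier *= filter_weight;
  *count += (unsigned short)modifier;
  *accumulator += (unsigned int)(modifier * pred);
}
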