diff options
author | Erik de Castro Lopo <erikd@mega-nerd.com> | 2014-01-30 22:17:01 +1100 |
---|---|---|
committer | Erik de Castro Lopo <erikd@mega-nerd.com> | 2014-01-30 22:17:08 +1100 |
commit | a03999f570b899da03de6095b22f9562ef399fe0 (patch) | |
tree | dde5538a8b3bf0cc31854033507d97b5987f4e45 | |
parent | 1d920993f1a07a5ebcf3c3f29a7b07c6c946efb4 (diff) | |
download | flac-a03999f570b899da03de6095b22f9562ef399fe0.tar.gz |
lpc_intrin_sse2.c : Add RESIDUAL16_RESULT macro.
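RESIDUAL16_RESULT is analogous to the existing RESIDUAL_RESULT macro
and simplifies the code a little: each repeated pair of statements
`curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);`
becomes a single `RESIDUAL16_RESULT(xmm6);`. The patch also adds the
DATA16_RESULT and DATA_RESULT macros and moves the RESIDUAL_RESULT
definition to the top of the file, next to the new macros.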
Patch-from: lvqcl <lvqcl.mail@gmail.com>
-rw-r--r-- | src/libFLAC/lpc_intrin_sse2.c | 114 |
1 file changed, 41 insertions, 73 deletions
diff --git a/src/libFLAC/lpc_intrin_sse2.c b/src/libFLAC/lpc_intrin_sse2.c index 98d51bd5..3ccebd00 100644 --- a/src/libFLAC/lpc_intrin_sse2.c +++ b/src/libFLAC/lpc_intrin_sse2.c @@ -45,6 +45,12 @@ #include <emmintrin.h> /* SSE2 */ +#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); +#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr; + +#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); +#define DATA_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); + FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) { @@ -59,7 +65,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ FLAC__int32 curr; if(order > 8) { /* order == 9, 10, 11, 12 */ #ifdef FLAC__CPU_IA32 /* 8 XMM registers available */ - /* can be modified to work with order <= 15 but the subset limit is 12 */ int r; __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0)); @@ -103,8 +108,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 2; @@ -124,8 +128,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -146,8 
+149,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); @@ -159,8 +161,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=2; } @@ -218,8 +219,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 4; @@ -239,8 +239,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r--; } @@ -261,8 +260,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 2); @@ -274,8 +272,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - 
(_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 1); @@ -287,8 +284,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); @@ -300,8 +296,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=4; } @@ -329,8 +324,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; @@ -343,8 +337,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -373,8 +366,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 2; @@ -388,8 +380,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = 
_mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -403,8 +394,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); xmm6 = xmm3; @@ -412,8 +402,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=2; } @@ -446,8 +435,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 3; @@ -461,8 +449,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r--; } @@ -476,8 +463,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 1); @@ -486,8 +472,7 @@ void 
FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); @@ -496,8 +481,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=3; } @@ -530,8 +514,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 4; @@ -545,8 +528,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r--; } @@ -560,8 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 2); @@ -570,8 +551,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> 
lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 1); @@ -580,8 +560,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); @@ -590,8 +569,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8)); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=4; } @@ -617,8 +595,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm0); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; @@ -630,8 +607,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm0); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -657,8 +633,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm0); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; r = data_len % 2; @@ -671,8 +646,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm0); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - 
(_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -686,8 +660,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm1); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); xmm3 = _mm_insert_epi16(xmm3, curr, 0); @@ -695,8 +668,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = _mm_madd_epi16(xmm6, xmm0); xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4)); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len-=2; } @@ -720,8 +692,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = xmm3; xmm6 = _mm_madd_epi16(xmm6, xmm0); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; @@ -732,8 +703,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ xmm6 = xmm3; xmm6 = _mm_madd_epi16(xmm6, xmm0); - curr = *data++; - *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization); + RESIDUAL16_RESULT(xmm6); data_len--; } @@ -787,8 +757,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_ } } -#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization); - FLAC__SSE_TARGET("sse2") void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[]) { |