summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Erik de Castro Lopo <erikd@mega-nerd.com> 2014-01-30 22:17:01 +1100
committer: Erik de Castro Lopo <erikd@mega-nerd.com> 2014-01-30 22:17:08 +1100
commit: a03999f570b899da03de6095b22f9562ef399fe0 (patch)
tree: dde5538a8b3bf0cc31854033507d97b5987f4e45
parent: 1d920993f1a07a5ebcf3c3f29a7b07c6c946efb4 (diff)
download: flac-a03999f570b899da03de6095b22f9562ef399fe0.tar.gz
lpc_intrin_sse2.c : Add RESIDUAL16_RESULT macro.
RESIDUAL16_RESULT is analogous to the existing RESIDUAL_RESULT macro and simplifies the code a little.
Patch-from: lvqcl <lvqcl.mail@gmail.com>
-rw-r--r-- src/libFLAC/lpc_intrin_sse2.c 114
1 files changed, 41 insertions, 73 deletions
diff --git a/src/libFLAC/lpc_intrin_sse2.c b/src/libFLAC/lpc_intrin_sse2.c
index 98d51bd5..3ccebd00 100644
--- a/src/libFLAC/lpc_intrin_sse2.c
+++ b/src/libFLAC/lpc_intrin_sse2.c
@@ -45,6 +45,12 @@
#include <emmintrin.h> /* SSE2 */
+#define RESIDUAL16_RESULT(xmmN) curr = *data++; *residual++ = curr - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define DATA16_RESULT(xmmN) curr = *residual++ + (_mm_cvtsi128_si32(xmmN) >> lp_quantization); *data++ = curr;
+
+#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+#define DATA_RESULT(xmmN) data[i] = residual[i] + (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
+
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{
@@ -59,7 +65,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
FLAC__int32 curr;
if(order > 8) { /* order == 9, 10, 11, 12 */
#ifdef FLAC__CPU_IA32 /* 8 XMM registers available */
- /* can be modified to work with order <= 15 but the subset limit is 12 */
int r;
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
xmm0 = _mm_loadu_si128((const __m128i*)(qlp_coeff+0));
@@ -103,8 +108,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
@@ -124,8 +128,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -146,8 +149,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
@@ -159,8 +161,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=2;
}
@@ -218,8 +219,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 4;
@@ -239,8 +239,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
@@ -261,8 +260,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 2);
@@ -274,8 +272,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
@@ -287,8 +284,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
@@ -300,8 +296,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=4;
}
@@ -329,8 +324,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
@@ -343,8 +337,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -373,8 +366,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
@@ -388,8 +380,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -403,8 +394,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
xmm6 = xmm3;
@@ -412,8 +402,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=2;
}
@@ -446,8 +435,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 3;
@@ -461,8 +449,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
@@ -476,8 +463,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
@@ -486,8 +472,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
@@ -496,8 +481,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=3;
}
@@ -530,8 +514,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 4;
@@ -545,8 +528,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--; r--;
}
@@ -560,8 +542,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 2);
@@ -570,8 +551,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 1);
@@ -580,8 +560,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
@@ -590,8 +569,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 8));
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=4;
}
@@ -617,8 +595,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
@@ -630,8 +607,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -657,8 +633,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
r = data_len % 2;
@@ -671,8 +646,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -686,8 +660,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm1);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
xmm3 = _mm_insert_epi16(xmm3, curr, 0);
@@ -695,8 +668,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = _mm_madd_epi16(xmm6, xmm0);
xmm6 = _mm_add_epi32(xmm6, _mm_srli_si128(xmm6, 4));
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len-=2;
}
@@ -720,8 +692,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
@@ -732,8 +703,7 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
xmm6 = xmm3;
xmm6 = _mm_madd_epi16(xmm6, xmm0);
- curr = *data++;
- *residual++ = curr - (_mm_cvtsi128_si32(xmm6) >> lp_quantization);
+ RESIDUAL16_RESULT(xmm6);
data_len--;
}
@@ -787,8 +757,6 @@ void FLAC__lpc_compute_residual_from_qlp_coefficients_16_intrin_sse2(const FLAC_
}
}
-#define RESIDUAL_RESULT(xmmN) residual[i] = data[i] - (_mm_cvtsi128_si32(xmmN) >> lp_quantization);
-
FLAC__SSE_TARGET("sse2")
void FLAC__lpc_compute_residual_from_qlp_coefficients_intrin_sse2(const FLAC__int32 *data, unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 residual[])
{