summaryrefslogtreecommitdiff
path: root/celt
diff options
context:
space:
mode:
authorFelicia Lim <flim@google.com>2021-12-09 12:54:43 -0800
committerFelicia Lim <flim@google.com>2021-12-14 22:46:42 -0800
commitec64b3c5b7abd621dfddee6b4cc115298e5d6803 (patch)
treef96dd95f408f3868169d6df857b2d1badb6a6afa /celt
parenta8e6a77c5fe0c37aa6788f939f24f8cd22ae2652 (diff)
downloadopus-ec64b3c5b7abd621dfddee6b4cc115298e5d6803.tar.gz
Fix buffer overflow in xcorr_kernel_sse4_1
Before, an overflow can occur in the last loop if `len` is not a multiple of 4 as OP_CVTEPI16_EPI32_M64 tries to load 64 bits, but there are insufficient bits allocated in `x`.
Diffstat (limited to 'celt')
-rw-r--r--celt/x86/pitch_sse4_1.c48
1 files changed, 45 insertions, 3 deletions
diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c
index a092c68b..58db6c7f 100644
--- a/celt/x86/pitch_sse4_1.c
+++ b/celt/x86/pitch_sse4_1.c
@@ -117,6 +117,11 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
__m128i sum0, sum1, sum2, sum3, vecSum;
__m128i initSum;
+#ifdef OPUS_CHECK_ASM
+ opus_val32 sum_c[4]={0,0,0,0};
+ xcorr_kernel_c(x, y, sum_c, len);
+#endif
+
celt_assert(len >= 3);
sum0 = _mm_setzero_si128();
@@ -177,19 +182,56 @@ void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32
vecSum = _mm_add_epi32(vecSum, sum2);
}
- for (;j<len;j++)
+ vecX = OP_CVTEPI16_EPI32_M64(&x[len - 4]);
+ if (len - j == 3)
{
- vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]);
- vecX0 = _mm_shuffle_epi32(vecX, 0x00);
+ vecX0 = _mm_shuffle_epi32(vecX, 0x55);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX2 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
+ vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+ sum2 = _mm_mullo_epi32(vecX2, vecY2);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ vecSum = _mm_add_epi32(vecSum, sum2);
+ }
+ else if (len - j == 2)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xaa);
+ vecX1 = _mm_shuffle_epi32(vecX, 0xff);
vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+ vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]);
sum0 = _mm_mullo_epi32(vecX0, vecY0);
+ sum1 = _mm_mullo_epi32(vecX1, vecY1);
+
+ vecSum = _mm_add_epi32(vecSum, sum0);
+ vecSum = _mm_add_epi32(vecSum, sum1);
+ }
+ else if (len - j == 1)
+ {
+ vecX0 = _mm_shuffle_epi32(vecX, 0xff);
+
+ vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]);
+
+ sum0 = _mm_mullo_epi32(vecX0, vecY0);
+
vecSum = _mm_add_epi32(vecSum, sum0);
}
initSum = _mm_loadu_si128((__m128i *)(&sum[0]));
initSum = _mm_add_epi32(initSum, vecSum);
_mm_storeu_si128((__m128i *)sum, initSum);
+
+#ifdef OPUS_CHECK_ASM
+ celt_assert(!memcmp(sum_c, sum, sizeof(sum_c)));
+#endif
}
#endif