diff options
author | Jonathan Lennox <jonathan@vidyo.com> | 2015-08-03 17:04:26 -0400 |
---|---|---|
committer | Jean-Marc Valin <jmvalin@jmvalin.ca> | 2015-09-01 17:21:31 -0400 |
commit | cb0875e07d7cac701b465863f532dc5bb8b0eb59 (patch) | |
tree | d027676a89c64b0d1bd1158581b25bb1d3bf27fc | |
parent | 26fc5c37759a2336b9256f60006f2ef3d59c4fb9 (diff) | |
download | opus-cb0875e07d7cac701b465863f532dc5bb8b0eb59.tar.gz |
Move SSE2 and SSE4.1 intrinsics functions to separate files, to be compiled with appropriate compiler flags. Otherwise, compilers are allowed to take advantage of (e.g.) -msse4.1 to generate code that uses SSE4.1 instructions, even when no SSE4.1 intrinsics are explicitly used in the source.
-rw-r--r-- | Makefile.am | 37 | ||||
-rw-r--r-- | celt/tests/test_unit_mathops.c | 9 | ||||
-rw-r--r-- | celt/tests/test_unit_rotation.c | 9 | ||||
-rw-r--r-- | celt/x86/pitch_sse.c | 214 | ||||
-rw-r--r-- | celt/x86/pitch_sse2.c | 95 | ||||
-rw-r--r-- | celt/x86/pitch_sse4_1.c | 195 | ||||
-rw-r--r-- | celt_sources.mk | 5 | ||||
-rw-r--r-- | configure.ac | 37 |
8 files changed, 372 insertions, 229 deletions
diff --git a/Makefile.am b/Makefile.am index 51bce63c..efff3371 100644 --- a/Makefile.am +++ b/Makefile.am @@ -30,12 +30,14 @@ else OPUS_SOURCES += $(OPUS_SOURCES_FLOAT) endif -if HAVE_SSE4_1 -CELT_SOURCES += $(CELT_SOURCES_SSE) $(CELT_SOURCES_SSE4_1) -else -if HAVE_SSE2 +if HAVE_SSE CELT_SOURCES += $(CELT_SOURCES_SSE) endif +if HAVE_SSE2 +CELT_SOURCES += $(CELT_SOURCES_SSE2) +endif +if HAVE_SSE4_1 +CELT_SOURCES += $(CELT_SOURCES_SSE4_1) endif if CPU_ARM @@ -257,18 +259,29 @@ $(CELT_SOURCES_ARM_ASM:%.s=%-gnu.S): $(top_srcdir)/celt/arm/arm2gnu.pl %-gnu.S: %.s $(top_srcdir)/celt/arm/arm2gnu.pl @ARM2GNU_PARAMS@ < $< > $@ -SSE_OBJ = %_sse.o %_sse.lo %test_unit_mathops.o %test_unit_rotation.o +OPT_UNIT_TEST_OBJ = $(celt_tests_test_unit_mathops_SOURCES:.c=.o) \ + $(celt_tests_test_unit_rotation_SOURCES:.c=.o) \ + $(celt_tests_test_unit_mdct_SOURCES:.c=.o) \ + $(celt_tests_test_unit_dft_SOURCES:.c=.o) + +if HAVE_SSE +SSE_OBJ = $(CELT_SOURCES_SSE:.c=.lo) +$(SSE_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS) +endif -if HAVE_SSE4_1 -$(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) -else if HAVE_SSE2 -$(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) +SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) +$(SSE2_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) endif + +if HAVE_SSE4_1 +SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ + $(SILK_SOURCES_SSE4_1:.c=.lo) \ + $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) +$(SSE4_1_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) endif if OPUS_ARM_NEON_INTR -CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) \ - %test_unit_rotation.o %test_unit_mathops.o -$(CELT_ARM_NEON_INTR_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CFLAGS) +CELT_ARM_NEON_INTR_OBJ = $(CELT_SOURCES_ARM_NEON_INTR:.c=.lo) +$(CELT_ARM_NEON_INTR_OBJ) $(OPT_UNIT_TEST_OBJ): CFLAGS += $(OPUS_ARM_NEON_INTR_CFLAGS) endif diff --git a/celt/tests/test_unit_mathops.c b/celt/tests/test_unit_mathops.c index b9b1bcf1..2de39baf 100644 --- a/celt/tests/test_unit_mathops.c +++ b/celt/tests/test_unit_mathops.c @@ -49,10 +49,17 @@ #include "cwrs.c" #include "pitch.c" #include "celt_lpc.c" +#include "celt.c" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE) #include "x86/pitch_sse.c" +#endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse2.c" +#endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/pitch_sse4_1.c" #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" diff --git a/celt/tests/test_unit_rotation.c b/celt/tests/test_unit_rotation.c index 5507884f..4780005f 100644 --- a/celt/tests/test_unit_rotation.c +++ b/celt/tests/test_unit_rotation.c @@ -46,11 +46,18 @@ #include "bands.h" #include "pitch.c" #include "celt_lpc.c" +#include "celt.c" #include <math.h> -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2) +#if defined(OPUS_X86_MAY_HAVE_SSE) || defined(OPUS_X86_MAY_HAVE_SSE2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) +#if defined(OPUS_X86_MAY_HAVE_SSE) #include "x86/pitch_sse.c" +#endif +#if defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/pitch_sse2.c" +#endif #if defined(OPUS_X86_MAY_HAVE_SSE4_1) +#include "x86/pitch_sse4_1.c" #include "x86/celt_lpc_sse.c" #endif #include "x86/x86_celt_map.c" diff --git a/celt/x86/pitch_sse.c b/celt/x86/pitch_sse.c index e3bc6d7d..13d739e2 100644 --- a/celt/x86/pitch_sse.c +++ b/celt/x86/pitch_sse.c @@ -29,223 +29,9 @@ #include "config.h" #endif -#include <xmmintrin.h> -#include <emmintrin.h> - #include "macros.h" #include "celt_lpc.h" #include "stack_alloc.h" #include "mathops.h" #include "pitch.h" -#if defined(OPUS_X86_MAY_HAVE_SSE4_1) -#include <smmintrin.h> -#include "x86cpu.h" - -opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, - int N) -{ - opus_int i, dataSize16; - opus_int32 sum; - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - __m128i inVec1_3210, inVec2_3210; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); - } - - acc1 = _mm_add_epi32(acc1, acc2); - - if (N - i >= 8) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - i += 8; - } - - if (N - i >= 4) - { - inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); - inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); - - inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); - - acc1 = _mm_add_epi32(acc1, inVec1_3210); - i += 4; - } - - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); - - sum += _mm_cvtsi128_si32(acc1); - - for (;i<N;i++) - { - sum = silk_SMLABB(sum, x[i], y[i]); - } - - return sum; -} - -void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) -{ - int j; - - __m128i vecX, vecX0, vecX1, vecX2, vecX3; - __m128i vecY0, vecY1, vecY2, vecY3; - __m128i sum0, sum1, sum2, sum3, vecSum; - __m128i initSum; - - celt_assert(len >= 3); - - sum0 = _mm_setzero_si128(); - sum1 = _mm_setzero_si128(); - sum2 = _mm_setzero_si128(); - sum3 = _mm_setzero_si128(); - - for (j=0;j<(len-7);j+=8) - { - vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); - vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); - vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); - vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); - vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); - - sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); - sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); - sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); - sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); - } - - sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); - sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); - - sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); - sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); - - sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); - sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); - - sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); - sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); - - vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), - _mm_unpacklo_epi32(sum2, sum3)); - - for (;j<(len-3);j+=4) - { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); - vecX1 = _mm_shuffle_epi32(vecX, 0x55); - vecX2 = _mm_shuffle_epi32(vecX, 0xaa); - vecX3 = _mm_shuffle_epi32(vecX, 0xff); - - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); - vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); - vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); - vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); - - sum0 = _mm_mullo_epi32(vecX0, vecY0); - sum1 = _mm_mullo_epi32(vecX1, vecY1); - sum2 = _mm_mullo_epi32(vecX2, vecY2); - sum3 = _mm_mullo_epi32(vecX3, vecY3); - - sum0 = _mm_add_epi32(sum0, sum1); - sum2 = _mm_add_epi32(sum2, sum3); - vecSum = _mm_add_epi32(vecSum, sum0); - vecSum = _mm_add_epi32(vecSum, sum2); - } - - for (;j<len;j++) - { - vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); - vecX0 = _mm_shuffle_epi32(vecX, 0x00); - - vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); - - sum0 = _mm_mullo_epi32(vecX0, vecY0); - vecSum = _mm_add_epi32(vecSum, sum0); - } - - initSum = _mm_loadu_si128((__m128i *)(&sum[0])); - initSum = _mm_add_epi32(initSum, vecSum); - _mm_storeu_si128((__m128i *)sum, initSum); -} -#endif - -#if defined(OPUS_X86_MAY_HAVE_SSE2) -opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, - int N) -{ - opus_int i, dataSize16; - opus_int32 sum; - - __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; - __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; - - sum = 0; - dataSize16 = N & ~15; - - acc1 = _mm_setzero_si128(); - acc2 = _mm_setzero_si128(); - - for (i=0;i<dataSize16;i+=16) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); - inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); - } - - acc1 = _mm_add_epi32( acc1, acc2 ); - - if (N - i >= 8) - { - inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); - inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); - - inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); - - acc1 = _mm_add_epi32(acc1, inVec1_76543210); - i += 8; - } - - acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); - acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); - sum += _mm_cvtsi128_si32(acc1); - - for (;i<N;i++) { - sum = silk_SMLABB(sum, x[i], y[i]); - } - - return sum; -} -#endif diff --git a/celt/x86/pitch_sse2.c b/celt/x86/pitch_sse2.c new file mode 100644 index 00000000..a0e7d1be --- /dev/null +++ b/celt/x86/pitch_sse2.c @@ -0,0 +1,95 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) && defined(FIXED_POINT) +opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32( acc1, acc2 ); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E)); + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} +#endif diff --git a/celt/x86/pitch_sse4_1.c b/celt/x86/pitch_sse4_1.c new file mode 100644 index 00000000..a092c68b --- /dev/null +++ b/celt/x86/pitch_sse4_1.c @@ -0,0 +1,195 @@ +/* Copyright (c) 2014, Cisco Systems, INC + Written by XiangMingZhu WeiZhou MinPeng YanWang + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER + OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include <xmmintrin.h> +#include <emmintrin.h> + +#include "macros.h" +#include "celt_lpc.h" +#include "stack_alloc.h" +#include "mathops.h" +#include "pitch.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) && defined(FIXED_POINT) +#include <smmintrin.h> +#include "x86cpu.h" + +opus_val32 celt_inner_prod_sse4_1(const opus_val16 *x, const opus_val16 *y, + int N) +{ + opus_int i, dataSize16; + opus_int32 sum; + __m128i inVec1_76543210, inVec1_FEDCBA98, acc1; + __m128i inVec2_76543210, inVec2_FEDCBA98, acc2; + __m128i inVec1_3210, inVec2_3210; + + sum = 0; + dataSize16 = N & ~15; + + acc1 = _mm_setzero_si128(); + acc2 = _mm_setzero_si128(); + + for (i=0;i<dataSize16;i+=16) { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8])); + inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98); + } + + acc1 = _mm_add_epi32(acc1, acc2); + + if (N - i >= 8) + { + inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0])); + inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0])); + + inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210); + + acc1 = _mm_add_epi32(acc1, inVec1_76543210); + i += 8; + } + + if (N - i >= 4) + { + inVec1_3210 = OP_CVTEPI16_EPI32_M64(&x[i + 0]); + inVec2_3210 = OP_CVTEPI16_EPI32_M64(&y[i + 0]); + + inVec1_3210 = _mm_mullo_epi32(inVec1_3210, inVec2_3210); + + acc1 = _mm_add_epi32(acc1, inVec1_3210); + i += 4; + } + + acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1)); + acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E)); + + sum += _mm_cvtsi128_si32(acc1); + + for (;i<N;i++) + { + sum = silk_SMLABB(sum, x[i], y[i]); + } + + return sum; +} + +void xcorr_kernel_sse4_1(const opus_val16 * x, const opus_val16 * y, opus_val32 sum[ 4 ], int len) +{ + int j; + + __m128i vecX, vecX0, vecX1, vecX2, vecX3; + __m128i vecY0, vecY1, vecY2, vecY3; + __m128i sum0, sum1, sum2, sum3, vecSum; + __m128i initSum; + + celt_assert(len >= 3); + + sum0 = _mm_setzero_si128(); + sum1 = _mm_setzero_si128(); + sum2 = _mm_setzero_si128(); + sum3 = _mm_setzero_si128(); + + for (j=0;j<(len-7);j+=8) + { + vecX = _mm_loadu_si128((__m128i *)(&x[j + 0])); + vecY0 = _mm_loadu_si128((__m128i *)(&y[j + 0])); + vecY1 = _mm_loadu_si128((__m128i *)(&y[j + 1])); + vecY2 = _mm_loadu_si128((__m128i *)(&y[j + 2])); + vecY3 = _mm_loadu_si128((__m128i *)(&y[j + 3])); + + sum0 = _mm_add_epi32(sum0, _mm_madd_epi16(vecX, vecY0)); + sum1 = _mm_add_epi32(sum1, _mm_madd_epi16(vecX, vecY1)); + sum2 = _mm_add_epi32(sum2, _mm_madd_epi16(vecX, vecY2)); + sum3 = _mm_add_epi32(sum3, _mm_madd_epi16(vecX, vecY3)); + } + + sum0 = _mm_add_epi32(sum0, _mm_unpackhi_epi64( sum0, sum0)); + sum0 = _mm_add_epi32(sum0, _mm_shufflelo_epi16( sum0, 0x0E)); + + sum1 = _mm_add_epi32(sum1, _mm_unpackhi_epi64( sum1, sum1)); + sum1 = _mm_add_epi32(sum1, _mm_shufflelo_epi16( sum1, 0x0E)); + + sum2 = _mm_add_epi32(sum2, _mm_unpackhi_epi64( sum2, sum2)); + sum2 = _mm_add_epi32(sum2, _mm_shufflelo_epi16( sum2, 0x0E)); + + sum3 = _mm_add_epi32(sum3, _mm_unpackhi_epi64( sum3, sum3)); + sum3 = _mm_add_epi32(sum3, _mm_shufflelo_epi16( sum3, 0x0E)); + + vecSum = _mm_unpacklo_epi64(_mm_unpacklo_epi32(sum0, sum1), + _mm_unpacklo_epi32(sum2, sum3)); + + for (;j<(len-3);j+=4) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + vecX1 = _mm_shuffle_epi32(vecX, 0x55); + vecX2 = _mm_shuffle_epi32(vecX, 0xaa); + vecX3 = _mm_shuffle_epi32(vecX, 0xff); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + vecY1 = OP_CVTEPI16_EPI32_M64(&y[j + 1]); + vecY2 = OP_CVTEPI16_EPI32_M64(&y[j + 2]); + vecY3 = OP_CVTEPI16_EPI32_M64(&y[j + 3]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + sum1 = _mm_mullo_epi32(vecX1, vecY1); + sum2 = _mm_mullo_epi32(vecX2, vecY2); + sum3 = _mm_mullo_epi32(vecX3, vecY3); + + sum0 = _mm_add_epi32(sum0, sum1); + sum2 = _mm_add_epi32(sum2, sum3); + vecSum = _mm_add_epi32(vecSum, sum0); + vecSum = _mm_add_epi32(vecSum, sum2); + } + + for (;j<len;j++) + { + vecX = OP_CVTEPI16_EPI32_M64(&x[j + 0]); + vecX0 = _mm_shuffle_epi32(vecX, 0x00); + + vecY0 = OP_CVTEPI16_EPI32_M64(&y[j + 0]); + + sum0 = _mm_mullo_epi32(vecX0, vecY0); + vecSum = _mm_add_epi32(vecSum, sum0); + } + + initSum = _mm_loadu_si128((__m128i *)(&sum[0])); + initSum = _mm_add_epi32(initSum, vecSum); + _mm_storeu_si128((__m128i *)sum, initSum); +} +#endif diff --git a/celt_sources.mk b/celt_sources.mk index 29ec9374..c92693fe 100644 --- a/celt_sources.mk +++ b/celt_sources.mk @@ -21,7 +21,10 @@ CELT_SOURCES_SSE = celt/x86/x86cpu.c \ celt/x86/x86_celt_map.c \ celt/x86/pitch_sse.c -CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c +CELT_SOURCES_SSE2 = celt/x86/pitch_sse2.c + +CELT_SOURCES_SSE4_1 = celt/x86/celt_lpc_sse.c \ +celt/x86/pitch_sse4_1.c CELT_SOURCES_ARM = \ celt/arm/armcpu.c \ diff --git a/configure.ac b/configure.ac index c19a6a7c..ea91ab2e 100644 --- a/configure.ac +++ b/configure.ac @@ -348,9 +348,11 @@ AM_CONDITIONAL([OPUS_ARM_INLINE_ASM], AM_CONDITIONAL([OPUS_ARM_EXTERNAL_ASM], [test x"${asm_optimization%% *}" = x"ARM"]) +AM_CONDITIONAL([HAVE_SSE], [false]) AM_CONDITIONAL([HAVE_SSE2], [false]) AM_CONDITIONAL([HAVE_SSE4_1], [false]) +m4_define([DEFAULT_X86_SSE_CFLAGS], [-msse]) m4_define([DEFAULT_X86_SSE2_CFLAGS], [-msse2]) m4_define([DEFAULT_X86_SSE4_1_CFLAGS], [-msse4.1]) m4_define([DEFAULT_ARM_NEON_INTR_CFLAGS], [-mfpu=neon]) @@ -366,10 +368,12 @@ AS_CASE([$host], [arm*eabi*], [AS_VAR_SET([RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS], "DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS")], [AS_VAR_SET([RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS], "DEFAULT_ARM_NEON_INTR_CFLAGS")]) +AC_ARG_VAR([X86_SSE_CFLAGS], [C compiler flags to compile SSE intrinsics @<:@default=]DEFAULT_X86_SSE_CFLAGS[@:>@]) AC_ARG_VAR([X86_SSE2_CFLAGS], [C compiler flags to compile SSE2 intrinsics @<:@default=]DEFAULT_X86_SSE2_CFLAGS[@:>@]) AC_ARG_VAR([X86_SSE4_1_CFLAGS], [C compiler flags to compile SSE4.1 intrinsics @<:@default=]DEFAULT_X86_SSE4_1_CFLAGS[@:>@]) AC_ARG_VAR([ARM_NEON_INTR_CFLAGS], [C compiler flags to compile ARM NEON intrinsics @<:@default=]DEFAULT_ARM_NEON_INTR_CFLAGS / DEFAULT_ARM_NEON_SOFTFP_INTR_CFLAGS[@:>@]) +AS_VAR_SET_IF([X86_SSE_CFLAGS], [], [AS_VAR_SET([X86_SSE_CFLAGS], "DEFAULT_X86_SSE_CFLAGS")]) AS_VAR_SET_IF([X86_SSE2_CFLAGS], [], [AS_VAR_SET([X86_SSE2_CFLAGS], "DEFAULT_X86_SSE2_CFLAGS")]) AS_VAR_SET_IF([X86_SSE4_1_CFLAGS], [], [AS_VAR_SET([X86_SSE4_1_CFLAGS], "DEFAULT_X86_SSE4_1_CFLAGS")]) AS_VAR_SET_IF([ARM_NEON_INTR_CFLAGS], [], [AS_VAR_SET([ARM_NEON_INTR_CFLAGS], ["$RESOLVED_DEFAULT_ARM_NEON_INTR_CFLAGS"])]) @@ -432,6 +436,24 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ [i?86|x86_64], [ OPUS_CHECK_INTRINSICS( + [SSE], + [$X86_SSE_CFLAGS], + [OPUS_X86_MAY_HAVE_SSE], + [OPUS_X86_PRESUME_SSE], + [[#include <xmmintrin.h> + ]], + [[ + static __m128 mtest; + mtest = _mm_setzero_ps(); + ]] + ) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1" && test x"$OPUS_X86_PRESUME_SSE" != x"1"], + [ + OPUS_X86_SSE_CFLAGS="$X86_SSE_CFLAGS" + AC_SUBST([OPUS_X86_SSE_CFLAGS]) + ] + ) + OPUS_CHECK_INTRINSICS( [SSE2], [$X86_SSE2_CFLAGS], [OPUS_X86_MAY_HAVE_SSE2], @@ -473,6 +495,19 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ AS_IF([test x"$enable_float" = x"no"], [ AS_IF([test x"$rtcd_support" = x"no"], [rtcd_support=""]) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"], + [ + AC_DEFINE([OPUS_X86_MAY_HAVE_SSE], 1, [Compiler supports X86 SSE Intrinsics]) + intrinsics_support="$intrinsics_support SSE" + + AS_IF([test x"$OPUS_X86_PRESUME_SSE" = x"1"], + [AC_DEFINE([OPUS_X86_PRESUME_SSE], 1, [Define if binary requires SSE intrinsics support])], + [rtcd_support="$rtcd_support SSE"]) + ], + [ + AC_MSG_WARN([Compiler does not support SSE intrinsics]) + ]) + AS_IF([test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"], [ AC_DEFINE([OPUS_X86_MAY_HAVE_SSE2], 1, [Compiler supports X86 SSE2 Intrinsics]) @@ -561,6 +596,8 @@ AS_IF([test x"$enable_intrinsics" = x"yes"],[ AM_CONDITIONAL([CPU_ARM], [test "$cpu_arm" = "yes"]) AM_CONDITIONAL([OPUS_ARM_NEON_INTR], [test x"$OPUS_ARM_MAY_HAVE_NEON_INTR" = x"1"]) +AM_CONDITIONAL([HAVE_SSE], + [test x"$OPUS_X86_MAY_HAVE_SSE" = x"1"]) AM_CONDITIONAL([HAVE_SSE2], [test x"$OPUS_X86_MAY_HAVE_SSE2" = x"1"]) AM_CONDITIONAL([HAVE_SSE4_1], |