diff options
Diffstat (limited to 'libavcodec/x86')
-rw-r--r-- | libavcodec/x86/opus_pvq_search.asm | 37 |
1 files changed, 18 insertions, 19 deletions
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm index 8cf040465d..5c1e6d6174 100644 --- a/libavcodec/x86/opus_pvq_search.asm +++ b/libavcodec/x86/opus_pvq_search.asm @@ -82,7 +82,7 @@ SECTION .text %endif %endmacro -%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation +%macro PULSES_SEARCH 1 ; m6 Syy_norm ; m7 Sxy_norm addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2 @@ -96,17 +96,17 @@ align 16 movaps m4, [tmpY + r4] ; y[i] movaps m5, [tmpX + r4] ; X[i] -%if %2 + %if USE_APPROXIMATION == 1 xorps m0, m0 cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0) -%endif + %endif addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm -%if %2 + %if USE_APPROXIMATION == 1 andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding. -%endif + %endif %else movaps m5, [tmpY + r4] ; m5 = y[i] @@ -119,7 +119,7 @@ align 16 andps m5, m0 ; (0<y)?m5:0 %endif -%if %2 +%if USE_APPROXIMATION == 1 rsqrtps m4, m4 mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) ) %else @@ -211,13 +211,8 @@ align 16 ; uint32 K - Number of pulses to have after quantizations. ; uint32 N - Number of vector elements. Must be 0 < N < 256 ; -%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation -%if %1 -cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N -%else -cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N -%endif - +%macro PVQ_FAST_SEARCH 1 +cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N %define tmpX rsp %define tmpY outYq @@ -260,7 +255,7 @@ align 16 jz %%zero_input ; if (Sx==0) goto zero_input cvtsi2ss xm0, dword Kd ; m0 = K -%if %1 +%if USE_APPROXIMATION == 1 rcpss xm1, xm1 ; m1 = approx(1/Sx) mulss xm0, xm1 ; m0 = K*(1/Sx) %else @@ -313,7 +308,7 @@ align 16 align 16 ; K - pulses > 0 %%add_pulses_loop: - PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm sub Kd, 1 jnz %%add_pulses_loop @@ -325,7 +320,7 @@ align 16 ; K - pulses > 0 align 16 %%remove_pulses_loop: - PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm + PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm add Kd, 1 jnz %%remove_pulses_loop @@ -376,11 +371,15 @@ align 16 ; On Skylake & Ryzen the division is much faster (around 11c/3), ; that makes the full precision code about 2% slower. ; Opus also does use rsqrt approximation in their intrinsics code. +%define USE_APPROXIMATION 1 + INIT_XMM sse2 -PVQ_FAST_SEARCH 1 +PVQ_FAST_SEARCH _approx INIT_XMM sse4 -PVQ_FAST_SEARCH 1 +PVQ_FAST_SEARCH _approx + +%define USE_APPROXIMATION 0 INIT_XMM avx -PVQ_FAST_SEARCH 0 +PVQ_FAST_SEARCH _exact |