summaryrefslogtreecommitdiff
path: root/libavcodec/x86
diff options
context:
space:
mode:
Diffstat (limited to 'libavcodec/x86')
-rw-r--r--libavcodec/x86/opus_pvq_search.asm37
1 files changed, 18 insertions, 19 deletions
diff --git a/libavcodec/x86/opus_pvq_search.asm b/libavcodec/x86/opus_pvq_search.asm
index 8cf040465d..5c1e6d6174 100644
--- a/libavcodec/x86/opus_pvq_search.asm
+++ b/libavcodec/x86/opus_pvq_search.asm
@@ -82,7 +82,7 @@ SECTION .text
%endif
%endmacro
-%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
+%macro PULSES_SEARCH 1
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
@@ -96,17 +96,17 @@ align 16
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
-%if %2
+ %if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
-%endif
+ %endif
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
-%if %2
+ %if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
-%endif
+ %endif
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
@@ -119,7 +119,7 @@ align 16
andps m5, m0 ; (0<y)?m5:0
%endif
-%if %2
+%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
@@ -211,13 +211,8 @@ align 16
; uint32 K - Number of pulses to have after quantization.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
-%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
-%if %1
-cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
-%else
-cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
-%endif
-
+%macro PVQ_FAST_SEARCH 1
+cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
@@ -260,7 +255,7 @@ align 16
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
-%if %1
+%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
@@ -313,7 +308,7 @@ align 16
align 16 ; K - pulses > 0
%%add_pulses_loop:
- PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm
+ PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
@@ -325,7 +320,7 @@ align 16 ; K - pulses > 0
align 16
%%remove_pulses_loop:
- PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm
+ PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
@@ -376,11 +371,15 @@ align 16
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full-precision code about 2% slower.
; Opus also uses the rsqrt approximation in its intrinsics code.
+%define USE_APPROXIMATION 1
+
INIT_XMM sse2
-PVQ_FAST_SEARCH 1
+PVQ_FAST_SEARCH _approx
INIT_XMM sse4
-PVQ_FAST_SEARCH 1
+PVQ_FAST_SEARCH _approx
+
+%define USE_APPROXIMATION 0
INIT_XMM avx
-PVQ_FAST_SEARCH 0
+PVQ_FAST_SEARCH _exact