summary | refs | log | tree | commit | diff
path: root/libavutil/x86/tx_float.asm
diff options
context:
space:
mode:
author: Lynne <dev@lynne.ee> 2022-09-03 03:34:57 +0200
committer: Lynne <dev@lynne.ee> 2022-09-06 04:21:41 +0200
commit: 2425d5cd7e37387305f85bef63f7441c8b1cc147 (patch)
tree: 83c67a7dd2a300607a6cca3df4928ab842c6bb81 /libavutil/x86/tx_float.asm
parent: b881d2db8892e88a625dfa4ac2d5b8ca53ab9595 (diff)
download: ffmpeg-2425d5cd7e37387305f85bef63f7441c8b1cc147.tar.gz
x86/tx_float: add support for calling assembly functions from assembly
Needed for the next patch. We get this for the extremely small cost of a branch on _ns functions, which wouldn't be used anyway with assembly.
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r-- libavutil/x86/tx_float.asm 160
1 file changed, 113 insertions, 47 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 191af7d68f..791ad7b322 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -707,20 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
FFT4 fwd, 0
FFT4 inv, 1
-%macro FFT8_SSE_FN 2
+%macro FFT8_SSE_FN 1
INIT_XMM sse3
-cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+ movaps m2, [inq + 2*mmsize]
+ movaps m3, [inq + 3*mmsize]
+%else
+cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq
LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq
LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq
-%else
- movaps m0, [inq + 0*mmsize]
- movaps m1, [inq + 1*mmsize]
- movaps m2, [inq + 2*mmsize]
- movaps m3, [inq + 3*mmsize]
%endif
FFT8 m0, m1, m2, m3, m4, m5
@@ -735,22 +736,33 @@ cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp
movups [outq + 2*mmsize], m5
movups [outq + 3*mmsize], m1
+%if %1
+ ret
+%else
+ RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 6, ctx, out, in, tmp
+ call ff_tx_fft8_asm_float_sse3
RET
+%endif
%endmacro
-FFT8_SSE_FN float, 1
-FFT8_SSE_FN ns_float, 0
+FFT8_SSE_FN 0
+FFT8_SSE_FN 1
-%macro FFT8_AVX_FN 2
+%macro FFT8_AVX_FN 1
INIT_YMM avx
-cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
-%if %2
+%if %1
+cglobal fft8_asm_float, 0, 0, 0, ctx, out, in, tmp
+ movaps m0, [inq + 0*mmsize]
+ movaps m1, [inq + 1*mmsize]
+%else
+cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3
-%else
- movaps m0, [inq + 0*mmsize]
- movaps m1, [inq + 1*mmsize]
%endif
FFT8_AVX m0, m1, m2, m3
@@ -764,21 +776,32 @@ cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp
vextractf128 [outq + 16*2], m2, 1
vextractf128 [outq + 16*3], m0, 1
+%if %1
+ ret
+%else
+ RET
+%endif
+
+%if %1
+cglobal fft8_ns_float, 4, 4, 4, ctx, out, in, tmp
+ call ff_tx_fft8_asm_float_avx
RET
+%endif
%endmacro
-FFT8_AVX_FN float, 1
-FFT8_AVX_FN ns_float, 0
+FFT8_AVX_FN 0
+FFT8_AVX_FN 1
-%macro FFT16_FN 3
+%macro FFT16_FN 2
INIT_YMM %1
-cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft16_asm_float, 0, 0, 0, ctx, out, in, tmp
movaps m0, [inq + 0*mmsize]
movaps m1, [inq + 1*mmsize]
movaps m2, [inq + 2*mmsize]
movaps m3, [inq + 3*mmsize]
%else
+cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4
LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5
@@ -802,23 +825,34 @@ cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp
vextractf128 [outq + 16*6], m5, 1
vextractf128 [outq + 16*7], m1, 1
+%if %2
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft16_ns_float, 4, 4, 8, ctx, out, in, tmp
+ call ff_tx_fft16_asm_float_ %+ %1
RET
+%endif
%endmacro
-FFT16_FN avx, float, 0
-FFT16_FN avx, ns_float, 1
-FFT16_FN fma3, float, 0
-FFT16_FN fma3, ns_float, 1
+FFT16_FN avx, 0
+FFT16_FN avx, 1
+FFT16_FN fma3, 0
+FFT16_FN fma3, 1
-%macro FFT32_FN 3
+%macro FFT32_FN 2
INIT_YMM %1
-cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
-%if %3
+%if %2
+cglobal fft32_asm_float, 0, 0, 0, ctx, out, in, tmp
movaps m4, [inq + 4*mmsize]
movaps m5, [inq + 5*mmsize]
movaps m6, [inq + 6*mmsize]
movaps m7, [inq + 7*mmsize]
%else
+cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
mov ctxq, [ctxq + AVTXContext.map]
LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m12
LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m9, m13
@@ -828,7 +862,7 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
FFT8 m4, m5, m6, m7, m8, m9
-%if %3
+%if %2
movaps m0, [inq + 0*mmsize]
movaps m1, [inq + 1*mmsize]
movaps m2, [inq + 2*mmsize]
@@ -875,14 +909,24 @@ cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp
vextractf128 [outq + 16*14], m10, 1
vextractf128 [outq + 16*15], m5, 1
+%if %2
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft32_ns_float, 4, 4, 16, ctx, out, in, tmp
+ call ff_tx_fft32_asm_float_ %+ %1
RET
+%endif
%endmacro
%if ARCH_X86_64
-FFT32_FN avx, float, 0
-FFT32_FN avx, ns_float, 1
-FFT32_FN fma3, float, 0
-FFT32_FN fma3, ns_float, 1
+FFT32_FN avx, 0
+FFT32_FN avx, 1
+FFT32_FN fma3, 0
+FFT32_FN fma3, 1
%endif
%macro FFT_SPLIT_RADIX_DEF 1-2
@@ -923,17 +967,21 @@ ALIGN 16
%endif
%endmacro
-%macro FFT_SPLIT_RADIX_FN 3
+%macro FFT_SPLIT_RADIX_FN 2
INIT_YMM %1
-cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt
- movsxd lenq, dword [lutq + AVTXContext.len]
- mov lutq, [lutq + AVTXContext.map]
+%if %2
+cglobal fft_sr_asm_float, 0, 0, 0, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+%else
+cglobal fft_sr_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
mov tgtq, lenq
+%endif
; Bottom-most/32-point transform ===============================================
ALIGN 16
.32pt:
-%if %3
+%if %2
movaps m4, [inq + 4*mmsize]
movaps m5, [inq + 5*mmsize]
movaps m6, [inq + 6*mmsize]
@@ -947,7 +995,7 @@ ALIGN 16
FFT8 m4, m5, m6, m7, m8, m9
-%if %3
+%if %2
movaps m0, [inq + 0*mmsize]
movaps m1, [inq + 1*mmsize]
movaps m2, [inq + 2*mmsize]
@@ -972,7 +1020,7 @@ ALIGN 16
movaps [outq + 5*mmsize], m5
movaps [outq + 7*mmsize], m7
-%if %3
+%if %2
add inq, 8*mmsize
%else
add lutq, (mmsize/2)*8
@@ -1007,7 +1055,7 @@ ALIGN 16
SWAP m4, m1
SWAP m6, m3
-%if %3
+%if %2
movaps tx1_e0, [inq + 0*mmsize]
movaps tx1_e1, [inq + 1*mmsize]
movaps tx1_o0, [inq + 2*mmsize]
@@ -1021,7 +1069,7 @@ ALIGN 16
FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1
-%if %3
+%if %2
movaps tx2_e0, [inq + 4*mmsize]
movaps tx2_e1, [inq + 5*mmsize]
movaps tx2_o0, [inq + 6*mmsize]
@@ -1038,7 +1086,7 @@ ALIGN 16
movaps tw_e, [tab_64_float]
vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23
-%if %3
+%if %2
add inq, 8*mmsize
%else
add lutq, (mmsize/2)*8
@@ -1201,7 +1249,11 @@ FFT_SPLIT_RADIX_DEF 131072
sub lenq, 4*mmsize
jg .synth_deinterleave
+%if %2
+ ret
+%else
RET
+%endif
; 64-point deinterleave which only has to load 4 registers =====================
.64pt_deint:
@@ -1278,14 +1330,28 @@ FFT_SPLIT_RADIX_DEF 131072
vextractf128 [outq + 15*mmsize + 0], tw_o, 1
vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
+%if %2
+ ret
+%else
RET
+%endif
+
+%if %2
+cglobal fft_sr_ns_float, 4, 9, 16, 272, ctx, out, in, tmp, len, lut, itab, rtab, tgt
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+ mov tgtq, lenq
+
+ call ff_tx_fft_sr_asm_float_ %+ %1
+ RET
+%endif
%endmacro
%if ARCH_X86_64
-FFT_SPLIT_RADIX_FN fma3, float, 0
-FFT_SPLIT_RADIX_FN fma3, ns_float, 1
+FFT_SPLIT_RADIX_FN fma3, 0
+FFT_SPLIT_RADIX_FN fma3, 1
%if HAVE_AVX2_EXTERNAL
-FFT_SPLIT_RADIX_FN avx2, float, 0
-FFT_SPLIT_RADIX_FN avx2, ns_float, 1
+FFT_SPLIT_RADIX_FN avx2, 0
+FFT_SPLIT_RADIX_FN avx2, 1
%endif
%endif