diff options
author | Lynne <dev@lynne.ee> | 2022-01-25 22:39:09 +0100 |
---|---|---|
committer | Lynne <dev@lynne.ee> | 2022-01-26 04:13:58 +0100 |
commit | 28bff6ae5435b0a6574067de8c129cbe7225c729 (patch) | |
tree | b881c2de09b4d0d76af3d6d9bcdee32e5d1ef005 /libavutil/x86/tx_float.asm | |
parent | 350142560bd44d02ed532ae93a544d5d069620bc (diff) | |
download | ffmpeg-28bff6ae5435b0a6574067de8c129cbe7225c729.tar.gz |
x86/tx_float: add permute-free FFT versions
These are used in the PFA transforms and MDCTs.
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r-- | libavutil/x86/tx_float.asm | 115 |
1 file changed, 99 insertions, 16 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm index e3b48d7c1f..8698e858d0 100644 --- a/libavutil/x86/tx_float.asm +++ b/libavutil/x86/tx_float.asm @@ -707,13 +707,21 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride FFT4 fwd, 0 FFT4 inv, 1 +%macro FFT8_FN 2 INIT_XMM sse3 -cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp +cglobal fft8_ %+ %1, 4, 4, 6, ctx, out, in, tmp +%if %2 mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq +%else + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%endif FFT8 m0, m1, m2, m3, m4, m5 @@ -728,12 +736,22 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp movups [outq + 3*mmsize], m1 RET +%endmacro + +FFT8_FN float, 1 +FFT8_FN ns_float, 0 +%macro FFT16_FN 2 INIT_YMM avx -cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp +cglobal fft8_ %+ %1, 4, 4, 4, ctx, out, in, tmp +%if %2 mov ctxq, [ctxq + AVTXContext.map] LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m2 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m3 +%else + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] +%endif FFT8_AVX m0, m1, m2, m3 @@ -747,16 +765,26 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp vextractf128 [outq + 16*3], m0, 1 RET +%endmacro -%macro FFT16_FN 1 +FFT16_FN float, 1 +FFT16_FN ns_float, 0 + +%macro FFT16_FN 3 INIT_YMM %1 -cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp +cglobal fft16_ %+ %2, 4, 4, 8, ctx, out, in, tmp +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else mov ctxq, [ctxq + AVTXContext.map] - LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m4 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m5 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m6 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m7 
+%endif FFT16 m0, m1, m2, m3, m4, m5, m6, m7 @@ -777,25 +805,40 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp RET %endmacro -FFT16_FN avx -FFT16_FN fma3 +FFT16_FN avx, float, 0 +FFT16_FN avx, ns_float, 1 +FFT16_FN fma3, float, 0 +FFT16_FN fma3, ns_float, 1 -%macro FFT32_FN 1 +%macro FFT32_FN 3 INIT_YMM %1 -cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp +cglobal fft32_ %+ %2, 4, 4, 16, ctx, out, in, tmp +%if %3 + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else mov ctxq, [ctxq + AVTXContext.map] - LOAD64_LUT m4, inq, ctxq, (mmsize/2)*4, tmpq, m8, m9 LOAD64_LUT m5, inq, ctxq, (mmsize/2)*5, tmpq, m10, m11 LOAD64_LUT m6, inq, ctxq, (mmsize/2)*6, tmpq, m12, m13 LOAD64_LUT m7, inq, ctxq, (mmsize/2)*7, tmpq, m14, m15 +%endif FFT8 m4, m5, m6, m7, m8, m9 +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else LOAD64_LUT m0, inq, ctxq, (mmsize/2)*0, tmpq, m8, m9 LOAD64_LUT m1, inq, ctxq, (mmsize/2)*1, tmpq, m10, m11 LOAD64_LUT m2, inq, ctxq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, ctxq, (mmsize/2)*3, tmpq, m14, m15 +%endif movaps m8, [tab_32_float] vperm2f128 m9, m9, [tab_32_float + 4*8 - 4*7], 0x23 @@ -836,8 +879,10 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp %endmacro %if ARCH_X86_64 -FFT32_FN avx -FFT32_FN fma3 +FFT32_FN avx, float, 0 +FFT32_FN avx, ns_float, 1 +FFT32_FN fma3, float, 0 +FFT32_FN fma3, ns_float, 1 %endif %macro FFT_SPLIT_RADIX_DEF 1-2 @@ -878,9 +923,9 @@ ALIGN 16 %endif %endmacro -%macro FFT_SPLIT_RADIX_FN 1 +%macro FFT_SPLIT_RADIX_FN 3 INIT_YMM %1 -cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt +cglobal fft_sr_ %+ %2, 4, 8, 16, 272, lut, out, in, len, tmp, itab, rtab, tgt movsxd lenq, dword [lutq + AVTXContext.len] mov lutq, [lutq + AVTXContext.map] mov tgtq, lenq @@ -888,17 +933,31 @@ cglobal fft_sr_float, 4, 8, 16, 272, lut, out, in, len, tmp, 
itab, rtab, tgt ; Bottom-most/32-point transform =============================================== ALIGN 16 .32pt: +%if %3 + movaps m4, [inq + 4*mmsize] + movaps m5, [inq + 5*mmsize] + movaps m6, [inq + 6*mmsize] + movaps m7, [inq + 7*mmsize] +%else LOAD64_LUT m4, inq, lutq, (mmsize/2)*4, tmpq, m8, m9 LOAD64_LUT m5, inq, lutq, (mmsize/2)*5, tmpq, m10, m11 LOAD64_LUT m6, inq, lutq, (mmsize/2)*6, tmpq, m12, m13 LOAD64_LUT m7, inq, lutq, (mmsize/2)*7, tmpq, m14, m15 +%endif FFT8 m4, m5, m6, m7, m8, m9 +%if %3 + movaps m0, [inq + 0*mmsize] + movaps m1, [inq + 1*mmsize] + movaps m2, [inq + 2*mmsize] + movaps m3, [inq + 3*mmsize] +%else LOAD64_LUT m0, inq, lutq, (mmsize/2)*0, tmpq, m8, m9 LOAD64_LUT m1, inq, lutq, (mmsize/2)*1, tmpq, m10, m11 LOAD64_LUT m2, inq, lutq, (mmsize/2)*2, tmpq, m12, m13 LOAD64_LUT m3, inq, lutq, (mmsize/2)*3, tmpq, m14, m15 +%endif movaps m8, [tab_32_float] vperm2f128 m9, m9, [tab_32_float + 32 - 4*7], 0x23 @@ -913,7 +972,11 @@ ALIGN 16 movaps [outq + 5*mmsize], m5 movaps [outq + 7*mmsize], m7 +%if %3 + add inq, 8*mmsize +%else add lutq, (mmsize/2)*8 +%endif cmp lenq, 32 jg .64pt @@ -944,24 +1007,42 @@ ALIGN 16 SWAP m4, m1 SWAP m6, m3 +%if %3 + movaps tx1_e0, [inq + 0*mmsize] + movaps tx1_e1, [inq + 1*mmsize] + movaps tx1_o0, [inq + 2*mmsize] + movaps tx1_o1, [inq + 3*mmsize] +%else LOAD64_LUT tx1_e0, inq, lutq, (mmsize/2)*0, tmpq, tw_e, tw_o LOAD64_LUT tx1_e1, inq, lutq, (mmsize/2)*1, tmpq, tmp1, tmp2 LOAD64_LUT tx1_o0, inq, lutq, (mmsize/2)*2, tmpq, tw_e, tw_o LOAD64_LUT tx1_o1, inq, lutq, (mmsize/2)*3, tmpq, tmp1, tmp2 +%endif FFT16 tx1_e0, tx1_e1, tx1_o0, tx1_o1, tw_e, tw_o, tx2_o0, tx2_o1 +%if %3 + movaps tx2_e0, [inq + 4*mmsize] + movaps tx2_e1, [inq + 5*mmsize] + movaps tx2_o0, [inq + 6*mmsize] + movaps tx2_o1, [inq + 7*mmsize] +%else LOAD64_LUT tx2_e0, inq, lutq, (mmsize/2)*4, tmpq, tmp1, tmp2 LOAD64_LUT tx2_e1, inq, lutq, (mmsize/2)*5, tmpq, tw_e, tw_o LOAD64_LUT tx2_o0, inq, lutq, (mmsize/2)*6, tmpq, tmp1, tmp2 LOAD64_LUT tx2_o1, inq, 
lutq, (mmsize/2)*7, tmpq, tw_e, tw_o +%endif FFT16 tx2_e0, tx2_e1, tx2_o0, tx2_o1, tmp1, tmp2, tw_e, tw_o movaps tw_e, [tab_64_float] vperm2f128 tw_o, tw_o, [tab_64_float + 64 - 4*7], 0x23 +%if %3 + add inq, 8*mmsize +%else add lutq, (mmsize/2)*8 +%endif cmp tgtq, 64 je .deinterleave @@ -1204,8 +1285,10 @@ FFT_SPLIT_RADIX_DEF 131072 %endmacro %if ARCH_X86_64 -FFT_SPLIT_RADIX_FN avx +FFT_SPLIT_RADIX_FN avx, float, 0 +FFT_SPLIT_RADIX_FN avx, ns_float, 1 %if HAVE_AVX2_EXTERNAL -FFT_SPLIT_RADIX_FN avx2 +FFT_SPLIT_RADIX_FN avx2, float, 0 +FFT_SPLIT_RADIX_FN avx2, ns_float, 1 %endif %endif |