author     Lynne <dev@lynne.ee>  2022-09-19 05:53:01 +0200
committer  Lynne <dev@lynne.ee>  2022-09-23 12:35:27 +0200
commit     ace42cf581f8c06872bfb58cf575d9e8bd398c0a (patch)
tree       217d6653d5664d47f95c327fdb09d63e01dffcb3 /libavutil/x86/tx_float.asm
parent     3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (diff)
download   ffmpeg-ace42cf581f8c06872bfb58cf575d9e8bd398c0a.tar.gz
x86/tx_float: add 15xN PFA FFT AVX SIMD
~4x faster than the C version. The shuffles in the 15pt dim1 are seriously expensive. Not happy with it, but I'm content for now. It can easily be converted to pure AVX by removing all vpermpd/vpermps instructions.
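
For reference, the prime-factor split the 15-point code relies on can be sketched in plain C (an illustrative sketch only, not FFmpeg's implementation; the dft() helper and the fft15_pfa() name are made up for the example): since 15 = 3*5 and gcd(3, 5) = 1, a 15-point DFT factors into five 3-point DFTs followed by three 5-point DFTs, with no inter-stage twiddles, only the CRT-based input map n = (5*n1 + 3*n2) mod 15 and output map k = (10*k1 + 6*k2) mod 15.

    #include <complex.h>

    static void dft(float complex *out, const float complex *in, int n)
    {
        for (int k = 0; k < n; k++) {
            out[k] = 0.0f;
            for (int j = 0; j < n; j++)
                out[k] += in[j] * cexpf(-2.0f * 3.14159265358979f * I *
                                        (float)(j * k) / (float)n);
        }
    }

    /* 15-point DFT via the prime-factor algorithm:
     * input map  n = (5*n1 + 3*n2) % 15
     * output map k = (10*k1 + 6*k2) % 15  (10 = 5*inv(5,3), 6 = 3*inv(3,5)) */
    static void fft15_pfa(float complex *out, const float complex *in)
    {
        float complex col[5][3], tmp3[3], tmp5[5], row5[5];

        for (int n2 = 0; n2 < 5; n2++) {        /* five 3-point DFTs */
            for (int n1 = 0; n1 < 3; n1++)
                tmp3[n1] = in[(5 * n1 + 3 * n2) % 15];
            dft(col[n2], tmp3, 3);
        }
        for (int k1 = 0; k1 < 3; k1++) {        /* three 5-point DFTs */
            for (int n2 = 0; n2 < 5; n2++)
                tmp5[n2] = col[n2][k1];
            dft(row5, tmp5, 5);
            for (int k2 = 0; k2 < 5; k2++)
                out[(10 * k1 + 6 * k2) % 15] = row5[k2];
        }
    }

The SIMD code below fuses the two stages; roughly, the equivalent reordering ends up in s15_perm and the strided output stores.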
Diffstat (limited to 'libavutil/x86/tx_float.asm')
-rw-r--r--   libavutil/x86/tx_float.asm   280
1 file changed, 280 insertions, 0 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index 5e0c438b9c..67f363fc01 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -51,6 +51,8 @@ cextern tab_ %+ i %+ _float ; ff_tab_i_float...
%assign i (i << 1)
%endrep
+cextern tab_53_float
+
struc AVTXContext
.len: resd 1 ; Length
.inv resd 1 ; Inverse flag
@@ -87,6 +89,9 @@ s16_mult_odd1: dd COS16_1, COS16_1, COS16_3, COS16_3, COS16_1, -COS16_1, CO
s16_mult_odd2: dd COS16_3, -COS16_3, COS16_1, -COS16_1, -COS16_3, -COS16_3, -COS16_1, -COS16_1
s16_perm: dd 0, 1, 2, 3, 1, 0, 3, 2
+s15_perm: dd 0, 6, 5, 3, 2, 4, 7, 1
+
+mask_mmmmmmpp: dd NEG, NEG, NEG, NEG, NEG, NEG, POS, POS
mask_mmmmpppm: dd NEG, NEG, NEG, NEG, POS, POS, POS, NEG
mask_ppmpmmpm: dd POS, POS, NEG, POS, NEG, NEG, POS, NEG
mask_mppmmpmp: dd NEG, POS, POS, NEG, NEG, POS, NEG, POS
@@ -1565,3 +1570,278 @@ cglobal mdct_sr_inv_float, 4, 13, 16, 272, ctx, out, in, stride, len, lut, exp,
%if ARCH_X86_64 && HAVE_AVX2_EXTERNAL
IMDCT_FN avx2
%endif
+
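+; PFA_15_FN %1, %2: %1 is the SIMD target passed to INIT_YMM; %2 = 0 emits the
+; standalone fft_pfa_15xM_float entry point, %2 = 1 emits the asm-callable
+; fft_pfa_15xM_asm_float variant (manual PUSH/POP, plain ret) together with
+; its fft_pfa_15xM_ns_float wrapper.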
+%macro PFA_15_FN 2
+INIT_YMM %1
+%if %2
+cglobal fft_pfa_15xM_asm_float, 0, 0, 0, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+%else
+cglobal fft_pfa_15xM_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+%endif
+
+%if %2
+ PUSH inq
+ PUSH tgt5q
+ PUSH stride3q
+ PUSH stride5q
+ PUSH btmpq
+%endif
+
+ mov btmpq, outq
+
+ mov outq, [ctxq + AVTXContext.tmp]
+%if !%2
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+%endif
+
+ ; Load stride (second transform's length) and second transform's LUT
+ mov tmpq, [ctxq + AVTXContext.sub]
+ movsxd strideq, dword [tmpq + AVTXContext.len]
+ mov mapq, [tmpq + AVTXContext.map]
+
+ shl strideq, 3
+ imul stride3q, strideq, 3
+ imul stride5q, strideq, 5
+
+ movaps m13, [mask_mmmmmmpp] ; mmmmmmpp
+ vpermpd m12, m13, q0033 ; ppppmmmm
+ vextractf128 xm11, m13, 1 ; mmpp
+ movaps m10, [ff_tx_tab_53_float] ; tab5
+ movaps xm9, [ff_tx_tab_53_float + 32] ; tab3
+ movaps m8, [s15_perm]
+
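+ ; First dimension: one 15-point transform per iteration, len/15 iterations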
+.dim1:
+ mov tmpd, [mapq]
+ lea tgtq, [outq + tmpq*8]
+
+%if %2
+ movups xm0, [inq]
+%else
+ LOAD64_LUT xm0, inq, lutq, 0, tmpq, m14, xm15 ; in[0,1].reim
+%endif
+
+ shufps xm1, xm0, xm0, q3223 ; in[1].imrereim
+ shufps xm0, xm0, xm0, q1001 ; in[0].imrereim
+
+ xorps xm1, xm11
+ addps xm1, xm0 ; pc[0,1].imre
+
+%if %2
+ movddup xm14, [inq + 16] ; in[2].reimreim
+%else
+ mov tmpd, [lutq + 8]
+ movddup xm14, [inq + tmpq*8] ; in[2].reimreim
+%endif
+ shufps xm0, xm1, xm1, q3232 ; pc[1].reimreim
+ addps xm0, xm14 ; dc[0].reimreim
+
+ mulps xm1, xm9 ; tab[0123]*pc[01]
+
+ shufpd xm5, xm1, xm1, 01b ; pc[1,0].reim
+ xorps xm1, xm11
+ addps xm1, xm1, xm5
+ addsubps xm1, xm14, xm1 ; dc[1,2].reim
+
+%if %2
+ movups m2, [inq + mmsize*0 + 24]
+ movups m3, [inq + mmsize*1 + 24]
+%else
+ LOAD64_LUT m2, inq, lutq, (mmsize/2)*0 + 12, tmpq, m14, m15
+ LOAD64_LUT m3, inq, lutq, (mmsize/2)*1 + 12, tmpq, m14, m15
+%endif
+
+ subps m7, m2, m3 ; q[0-3].imre
+ addps m6, m2, m3 ; q[4-7]
+ shufps m7, m7, m7, q2301 ; q[0-3].reim
+
+%if %2
+ movups m4, [inq + mmsize*2 + 24]
+%else
+ LOAD64_LUT m4, inq, lutq, (mmsize/2)*2 + 12, tmpq, m14, m15
+%endif
+
+ addps m5, m4, m6 ; y[0-3]
+
+ vpermpd m14, m9, q1111 ; tab[23232323]
+ vbroadcastsd m15, xm9 ; tab[01010101]
+
+ mulps m6, m14
+ mulps m7, m15
+
+ subps m2, m6, m7 ; k[0-3]
+ addps m3, m6, m7 ; k[4-7]
+
+ addsubps m6, m4, m2 ; k[0-3]
+ addsubps m7, m4, m3 ; k[4-7]
+
+ ; 15pt from here on
+ vpermpd m2, m5, q0123 ; y[3-0]
+ vpermpd m3, m6, q0123 ; k[3-0]
+ vpermpd m4, m7, q0123 ; k[7-4]
+
+ xorps m5, m12
+ xorps m6, m12
+ xorps m7, m12
+
+ addps m2, m5 ; t[0-3]
+ addps m3, m6 ; t[4-7]
+ addps m4, m7 ; t[8-11]
+
+ movlhps xm14, xm2 ; out[0]
+ unpcklpd xm7, xm3, xm4 ; out[10,5]
+ unpckhpd xm5, xm3, xm4 ; out[10,5]
+
+ addps xm14, xm2 ; out[0]
+ addps xm7, xm5 ; out[10,5]
+ addps xm14, xm0 ; out[0]
+ addps xm7, xm1 ; out[10,5]
+
+ movhps [tgtq], xm14 ; out[0]
+ movhps [tgtq + stride5q*1], xm7 ; out[5]
+ movlps [tgtq + stride5q*2], xm7 ; out[10]
+
+ shufps m14, m10, m10, q3232 ; tab5 4 5 4 5 8 9 8 9
+ shufps m15, m10, m10, q1010 ; tab5 6 7 6 7 10 11 10 11
+
+ mulps m5, m2, m14 ; t[0-3]
+ mulps m6, m3, m14 ; t[4-7]
+ mulps m7, m4, m14 ; t[8-11]
+
+ mulps m2, m15 ; r[0-3]
+ mulps m3, m15 ; r[4-7]
+ mulps m4, m15 ; r[8-11]
+
+ shufps m5, m5, m5, q1032 ; t[1,0,3,2].reim
+ shufps m6, m6, m6, q1032 ; t[5,4,7,6].reim
+ shufps m7, m7, m7, q1032 ; t[9,8,11,10].reim
+
+ lea tgt5q, [tgtq + stride5q]
+ lea tmpq, [tgtq + stride5q*2]
+
+ xorps m5, m13
+ xorps m6, m13
+ xorps m7, m13
+
+ addps m2, m5 ; r[0,1,2,3]
+ addps m3, m6 ; r[4,5,6,7]
+ addps m4, m7 ; r[8,9,10,11]
+
+ shufps m5, m2, m2, q2301
+ shufps m6, m3, m3, q2301
+ shufps m7, m4, m4, q2301
+
+ xorps m2, m12
+ xorps m3, m12
+ xorps m4, m12
+
+ vpermpd m5, m5, q0123
+ vpermpd m6, m6, q0123
+ vpermpd m7, m7, q0123
+
+ addps m5, m2
+ addps m6, m3
+ addps m7, m4
+
+ vpermps m5, m8, m5
+ vpermps m6, m8, m6
+ vpermps m7, m8, m7
+
+ vbroadcastsd m0, xm0 ; dc[0]
+ vpermpd m2, m1, q1111 ; dc[2]
+ vbroadcastsd m1, xm1 ; dc[1]
+
+ addps m0, m5
+ addps m1, m6
+ addps m2, m7
+
+ vextractf128 xm3, m0, 1
+ vextractf128 xm4, m1, 1
+ vextractf128 xm5, m2, 1
+
+ movlps [tgtq + strideq*1], xm1
+ movhps [tgtq + strideq*2], xm2
+ movlps [tgtq + stride3q*1], xm3
+ movhps [tgtq + strideq*4], xm4
+ movlps [tgtq + stride3q*2], xm0
+ movlps [tgtq + strideq*8], xm5
+ movhps [tgtq + stride3q*4], xm0
+ movhps [tgt5q + strideq*2], xm1
+ movhps [tgt5q + strideq*4], xm3
+ movlps [tmpq + strideq*1], xm2
+ movlps [tmpq + stride3q*1], xm4
+ movhps [tmpq + strideq*4], xm5
+
+%if %2
+ add inq, mmsize*3 + 24
+%else
+ add lutq, (mmsize/2)*3 + 12
+%endif
+ add mapq, 4
+ sub lenq, 15
+ jg .dim1
+
+ ; Second transform setup
+ mov stride5q, ctxq ; backup original context
+ movsxd stride3q, dword [ctxq + AVTXContext.len] ; full length
+ mov tgt5q, [ctxq + AVTXContext.fn] ; subtransform's jump point
+
+ mov inq, outq ; in-place transform
+ mov ctxq, [ctxq + AVTXContext.sub] ; load subtransform's context
+ mov lutq, [ctxq + AVTXContext.map] ; load subtransform's map
+ movsxd lenq, dword [ctxq + AVTXContext.len] ; load subtransform's length
+
+.dim2:
+ call tgt5q ; call the FFT
+ lea inq, [inq + lenq*8]
+ lea outq, [outq + lenq*8]
+ sub stride3q, lenq
+ jg .dim2
+
+ mov ctxq, stride5q ; restore original context
+ mov lutq, [ctxq + AVTXContext.map]
+ mov inq, [ctxq + AVTXContext.tmp]
+ movsxd lenq, dword [ctxq + AVTXContext.len] ; full length
+
+ lea stride3q, [lutq + lenq*4] ; second part of the LUT
+ mov stride5q, lenq
+ mov tgt5q, btmpq
+
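+ ; Gather from the temporary buffer through the second half of the map and
+ ; write the final output contiguously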
+.post:
+ LOAD64_LUT m0, inq, stride3q, 0, tmpq, m8, m9
+ movups [tgt5q], m0
+
+ add tgt5q, mmsize
+ add stride3q, mmsize/2
+ sub stride5q, mmsize/8
+ jg .post
+
+%if %2
+ mov outq, btmpq
+ POP btmpq
+ POP stride5q
+ POP stride3q
+ POP tgt5q
+ POP inq
+ ret
+%else
+ RET
+%endif
+
+%if %2
+cglobal fft_pfa_15xM_ns_float, 4, 14, 16, 320, ctx, out, in, stride, len, lut, buf, map, tgt, tmp, \
+ tgt5, stride3, stride5, btmp
+ movsxd lenq, dword [ctxq + AVTXContext.len]
+ mov lutq, [ctxq + AVTXContext.map]
+
+ call mangle(fft_pfa_15xM_asm_float)
+ RET
+%endif
+%endmacro
+
+%if ARCH_X86_64
+PFA_15_FN avx2, 0
+PFA_15_FN avx2, 1
+%endif