x86/tx_float: adjust internal ASM call ABI again

There are many ways to define the internal ASM call ABI, and this one seems optimal for both MDCTs and PFA FFTs without requiring excessive instructions or stack usage.
author: Lynne <dev@lynne.ee> 2022-09-23 10:34:08 +0200
committer: Lynne <dev@lynne.ee> 2022-09-23 12:33:35 +0200
commit: 3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (patch)
tree: 3e3b555c251837688b7664c50ef0148693d49cf7 /libavutil/x86/tx_float.asm
parent: 7e7baf8ab86c4ae715f12d2c0babf831a5b18c39 (diff)
download: ffmpeg-3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023.tar.gz
1 files changed, 8 insertions, 20 deletions
diff --git a/libavutil/x86/tx_float.asm b/libavutil/x86/tx_float.asm
index b3a85a7cb9..5e0c438b9c 100644
--- a/libavutil/x86/tx_float.asm
+++ b/libavutil/x86/tx_float.asm
@@ -22,11 +22,10 @@
 ; based upon and compare.
 
 ; Intra-asm call convention:
-;       272 bytes of stack available
-;       First 10 GPRs available
+;       320 bytes of stack available
+;       14 GPRs available (last 4 must not be clobbered)
+;       Additionally, don't clobber ctx, in, out, len, lut
 ;       All vector regs available
-;       Don't clobber ctx, len, lut
-;       in and out must point to the end
 
 ; TODO:
 ;       carry over registers from smaller transforms to save on ~8 loads/stores
@@ -686,8 +685,6 @@ cglobal fft2_asm_float, 0, 0, 0, ctx, out, in, stride
     movaps m0, [inq]
     FFT2 m0, m1
     movaps [outq], m0
-    add inq, mmsize*1
-    add outq, mmsize*1
     ret
 
 cglobal fft2_float, 4, 4, 2, ctx, out, in, stride
@@ -721,8 +718,6 @@ cglobal fft4_ %+ %1 %+ _float, 4, 4, 3, ctx, out, in, stride
     movaps [outq + 1*mmsize], m0
 
 %if %3
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -764,8 +759,6 @@ cglobal fft8_float, 4, 4, 6, ctx, out, in, tmp
     movups [outq + 3*mmsize], m1
 
 %if %1
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -806,8 +799,6 @@ cglobal fft8_float, 4, 4, 4, ctx, out, in, tmp
     vextractf128 [outq + 16*3], m0, 1
 
 %if %1
-    add inq, mmsize*2
-    add outq, mmsize*2
     ret
 %else
     RET
@@ -857,8 +848,6 @@ cglobal fft16_float, 4, 4, 8, ctx, out, in, tmp
     vextractf128 [outq + 16*7], m1, 1
 
 %if %2
-    add inq, mmsize*4
-    add outq, mmsize*4
     ret
 %else
     RET
@@ -943,8 +932,6 @@ cglobal fft32_float, 4, 4, 16, ctx, out, in, tmp
     vextractf128 [outq + 16*15],  m5, 1
 
 %if %2
-    add inq, mmsize*8
-    add outq, mmsize*8
     ret
 %else
     RET
@@ -1282,12 +1269,13 @@ FFT_SPLIT_RADIX_DEF 131072
     add outq, 8*mmsize
     add rtabq, 4*mmsize
     sub itabq, 4*mmsize
-    sub lenq, 4*mmsize
+    sub tgtq, 4*mmsize
     jg .synth_deinterleave
 
 %if %2
-    mov lenq, tgtq
-    add outq, offq
+    sub outq, tmpq
+    neg tmpq
+    lea inq, [inq + tmpq*4]
     ret
 %else
     RET
@@ -1369,7 +1357,7 @@ FFT_SPLIT_RADIX_DEF 131072
     vextractf128 [outq + 15*mmsize + 16], tx2_e1, 1
 
 %if %2
-    add outq, 16*mmsize
+    sub inq, 16*mmsize
     ret
 %else
     RET
author	Lynne <dev@lynne.ee>	2022-09-23 10:34:08 +0200
committer	Lynne <dev@lynne.ee>	2022-09-23 12:33:35 +0200
commit	3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023 (patch)
tree	3e3b555c251837688b7664c50ef0148693d49cf7 /libavutil/x86/tx_float.asm
parent	7e7baf8ab86c4ae715f12d2c0babf831a5b18c39 (diff)
download	ffmpeg-3241e9225c7adfb2d8d24cfd05a7a8db8ddbd023.tar.gz