summary refs log tree commit diff
path: root/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
diff options
context:
space:
mode:
Diffstat (limited to 'chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S')
-rw-r--r-- chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S | 320
1 files changed, 252 insertions, 68 deletions
diff --git a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
index 0a0c7768b13..eee3a9636de 100644
--- a/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
+++ b/chromium/third_party/dav1d/libdav1d/src/arm/64/itx16.S
@@ -124,6 +124,13 @@ endconst
.endif
.endm
+.macro smin_4s r0, r1, r2
+ smin \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+.macro smax_4s r0, r1, r2
+ smax \r0\().4s, \r1\().4s, \r2\().4s
+.endm
+
.macro load_add_store load, shift, addsrc, adddst, min, store, dst, src, shiftbits=4
.ifnb \load
ld1 {\load}, [\src], x1
@@ -599,12 +606,21 @@ def_fn_4x4 identity, flipadst
.macro idct_8 r0, r1, r2, r3, r4, r5, r6, r7
idct_4 \r0, \r2, \r4, \r6
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+.irp r, \r0, \r2, \r4, \r6
+ smin_4s \r, \r, v5
+.endr
+.irp r, \r0, \r2, \r4, \r6
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, \r1, \r7, v1.s[0], v1.s[1] // -> t4a
- mul_mla v4, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
+ mul_mla v3, \r1, \r7, v1.s[1], v1.s[0] // -> t7a
mul_mls v6, \r5, \r3, v1.s[2], v1.s[3] // -> t5a
mul_mla v7, \r5, \r3, v1.s[3], v1.s[2] // -> t6a
srshr \r1\().4s, v2.4s, #12 // t4a
- srshr \r7\().4s, v4.4s, #12 // t7a
+ srshr \r7\().4s, v3.4s, #12 // t7a
srshr \r3\().4s, v6.4s, #12 // t5a
srshr \r5\().4s, v7.4s, #12 // t6a
@@ -613,17 +629,24 @@ def_fn_4x4 identity, flipadst
sqadd v3.4s, \r7\().4s, \r5\().4s // t7
sqsub \r3\().4s, \r7\().4s, \r5\().4s // t6a
- mul_mls v4, \r3, \r1, v0.s[0], v0.s[0] // -> t5
+.irp r, v2, \r1, v3, \r3
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, \r1, v3, \r3
+ smax_4s \r, \r, v4
+.endr
+
+ mul_mls v7, \r3, \r1, v0.s[0], v0.s[0] // -> t5
mul_mla v6, \r3, \r1, v0.s[0], v0.s[0] // -> t6
- srshr v4.4s, v4.4s, #12 // t5
- srshr v5.4s, v6.4s, #12 // t6
+ srshr v7.4s, v7.4s, #12 // t5
+ srshr v6.4s, v6.4s, #12 // t6
sqsub \r7\().4s, \r0\().4s, v3.4s // out7
sqadd \r0\().4s, \r0\().4s, v3.4s // out0
- sqadd \r1\().4s, \r2\().4s, v5.4s // out1
- sqsub v6.4s, \r2\().4s, v5.4s // out6
- sqadd \r2\().4s, \r4\().4s, v4.4s // out2
- sqsub \r5\().4s, \r4\().4s, v4.4s // out5
+ sqadd \r1\().4s, \r2\().4s, v6.4s // out1
+ sqsub v6.4s, \r2\().4s, v6.4s // out6
+ sqadd \r2\().4s, \r4\().4s, v7.4s // out2
+ sqsub \r5\().4s, \r4\().4s, v7.4s // out5
sqadd \r3\().4s, \r6\().4s, v2.4s // out3
sqsub \r4\().4s, \r6\().4s, v2.4s // out4
mov \r6\().16b, v6.16b // out6
@@ -660,8 +683,11 @@ endfunc
ld1 {v0.4s}, [x16]
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+
sqadd v2.4s, v16.4s, v20.4s // t0
sqsub v3.4s, v16.4s, v20.4s // t4
+ mvni v20.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd v4.4s, v23.4s, v19.4s // t1
sqsub v5.4s, v23.4s, v19.4s // t5
sqadd v6.4s, v18.4s, v22.4s // t2
@@ -669,6 +695,13 @@ endfunc
sqadd v18.4s, v21.4s, v17.4s // t3
sqsub v19.4s, v21.4s, v17.4s // t7
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v3, v4, v5, v6, v7, v18, v19
+ smax_4s \r, \r, v20
+.endr
+
mul_mla v16, v3, v5, v0.s[3], v0.s[2]
mul_mls v20, v3, v5, v0.s[2], v0.s[3]
mul_mls v22, v19, v7, v0.s[3], v0.s[2]
@@ -685,12 +718,24 @@ endfunc
sqsub v2.4s, v2.4s, v6.4s // t2
sqadd \o7\().4s, v4.4s, v18.4s // out7
sqsub v4.4s, v4.4s, v18.4s // t3
- sqneg \o7\().4s, \o7\().4s // out7
+
+ mvni v18.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
sqadd \o1\().4s, v3.4s, v7.4s // out1
sqsub v3.4s, v3.4s, v7.4s // t6
sqadd \o6\().4s, v5.4s, v19.4s // out6
sqsub v5.4s, v5.4s, v19.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v4, v3, v5
+ smin_4s \r, \r, v1
+.endr
+.irp r, v2, v4, v3, v5
+ smax_4s \r, \r, v18
+.endr
+
+ sqneg \o7\().4s, \o7\().4s // out7
sqneg \o1\().4s, \o1\().4s // out1
mul_mla v18, v2, v4, v0.s[0], v0.s[0] // -> out3 (v19 or v20)
@@ -959,25 +1004,33 @@ function inv_dct_4s_x16_neon
idct_8 v16, v18, v20, v22, v24, v26, v28, v30
+ // idct_8 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v16, v18, v20, v22, v24, v26, v28, v30
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
ld1 {v0.4s, v1.4s}, [x16]
sub x16, x16, #32
mul_mls v2, v17, v31, v0.s[0], v0.s[1] // -> t8a
- mul_mla v4, v17, v31, v0.s[1], v0.s[0] // -> t15a
+ mul_mla v3, v17, v31, v0.s[1], v0.s[0] // -> t15a
mul_mls v6, v25, v23, v0.s[2], v0.s[3] // -> t9a
srshr v17.4s, v2.4s, #12 // t8a
- srshr v31.4s, v4.4s, #12 // t15a
+ srshr v31.4s, v3.4s, #12 // t15a
mul_mla v2, v25, v23, v0.s[3], v0.s[2] // -> t14a
- mul_mls v4, v21, v27, v1.s[0], v1.s[1] // -> t10a
+ mul_mls v3, v21, v27, v1.s[0], v1.s[1] // -> t10a
srshr v23.4s, v6.4s, #12 // t9a
srshr v25.4s, v2.4s, #12 // t14a
mul_mla v6, v21, v27, v1.s[1], v1.s[0] // -> t13a
mul_mls v2, v29, v19, v1.s[2], v1.s[3] // -> t11a
- srshr v21.4s, v4.4s, #12 // t10a
+ srshr v21.4s, v3.4s, #12 // t10a
srshr v27.4s, v6.4s, #12 // t13a
- mul_mla v4, v29, v19, v1.s[3], v1.s[2] // -> t12a
+ mul_mla v3, v29, v19, v1.s[3], v1.s[2] // -> t12a
srshr v19.4s, v2.4s, #12 // t11a
- srshr v29.4s, v4.4s, #12 // t12a
+ srshr v29.4s, v3.4s, #12 // t12a
ld1 {v0.4s}, [x16]
@@ -990,14 +1043,21 @@ function inv_dct_4s_x16_neon
sqadd v25.4s, v29.4s, v27.4s // t12
sqsub v29.4s, v29.4s, v27.4s // t13
- mul_mls v4, v3, v2, v0.s[2], v0.s[3] // -> t9a
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v23, v19, v25, v29
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[2], v0.s[3] // -> t9a
mul_mla v6, v3, v2, v0.s[3], v0.s[2] // -> t14a
- srshr v21.4s, v4.4s, #12 // t9a
+ srshr v21.4s, v7.4s, #12 // t9a
srshr v27.4s, v6.4s, #12 // t14a
- mul_mls v4, v29, v23, v0.s[2], v0.s[3] // -> t13a
+ mul_mls v7, v29, v23, v0.s[2], v0.s[3] // -> t13a
mul_mla v6, v29, v23, v0.s[3], v0.s[2] // -> t10a
- srshr v29.4s, v4.4s, #12 // t13a
+ srshr v29.4s, v7.4s, #12 // t13a
neg v6.4s, v6.4s
srshr v23.4s, v6.4s, #12 // t10a
@@ -1010,34 +1070,41 @@ function inv_dct_4s_x16_neon
sqsub v25.4s, v27.4s, v29.4s // t13
sqadd v27.4s, v27.4s, v29.4s // t14
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t11
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v17, v3, v31, v19, v21, v25, v27
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t11
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t12
mul_mls v2, v25, v21, v0.s[0], v0.s[0] // -> t10a
- srshr v4.4s, v4.4s, #12 // t11
- srshr v5.4s, v6.4s, #12 // t12
- mul_mla v6, v25, v21, v0.s[0], v0.s[0] // -> t13a
+ srshr v7.4s, v7.4s, #12 // t11
+ srshr v6.4s, v6.4s, #12 // t12
+ mul_mla v3, v25, v21, v0.s[0], v0.s[0] // -> t13a
srshr v2.4s, v2.4s, #12 // t10a
- srshr v3.4s, v6.4s, #12 // t13a
+ srshr v3.4s, v3.4s, #12 // t13a
- sqadd v6.4s, v16.4s, v31.4s // out0
+ sqadd v1.4s, v16.4s, v31.4s // out0
sqsub v31.4s, v16.4s, v31.4s // out15
- mov v16.16b, v6.16b
+ mov v16.16b, v1.16b
sqadd v23.4s, v30.4s, v17.4s // out7
- sqsub v7.4s, v30.4s, v17.4s // out8
+ sqsub v1.4s, v30.4s, v17.4s // out8
sqadd v17.4s, v18.4s, v27.4s // out1
sqsub v30.4s, v18.4s, v27.4s // out14
sqadd v18.4s, v20.4s, v3.4s // out2
sqsub v29.4s, v20.4s, v3.4s // out13
sqadd v3.4s, v28.4s, v19.4s // out6
sqsub v25.4s, v28.4s, v19.4s // out9
- sqadd v19.4s, v22.4s, v5.4s // out3
- sqsub v28.4s, v22.4s, v5.4s // out12
- sqadd v20.4s, v24.4s, v4.4s // out4
- sqsub v27.4s, v24.4s, v4.4s // out11
+ sqadd v19.4s, v22.4s, v6.4s // out3
+ sqsub v28.4s, v22.4s, v6.4s // out12
+ sqadd v20.4s, v24.4s, v7.4s // out4
+ sqsub v27.4s, v24.4s, v7.4s // out11
sqadd v21.4s, v26.4s, v2.4s // out5
sqsub v26.4s, v26.4s, v2.4s // out10
- mov v24.16b, v7.16b
+ mov v24.16b, v1.16b
mov v22.16b, v3.16b
ret
@@ -1084,6 +1151,9 @@ endfunc
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v7.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v23.4s // t8a
sqadd v16.4s, v16.4s, v23.4s // t0a
sqsub v3.4s, v31.4s, v24.4s // t9a
@@ -1101,6 +1171,13 @@ endfunc
sqadd v28.4s, v25.4s, v30.4s // t7a
sqsub v25.4s, v25.4s, v30.4s // t15a
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v23, v18, v24, v29, v21, v20, v26, v27, v19, v22, v28, v25
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v1.s[1], v1.s[0] // -> t8
mul_mls v6, v2, v3, v1.s[0], v1.s[1] // -> t9
mul_mla v2, v18, v29, v1.s[3], v1.s[2] // -> t10
@@ -1135,6 +1212,13 @@ endfunc
sqadd v20.4s, v29.4s, v22.4s // t11a
sqsub v29.4s, v29.4s, v22.4s // t15a
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v16, v3, v31, v21, v23, v26, v24, v19, v17, v28, v30, v27, v18, v20, v29
+ smax_4s \r, \r, v7
+.endr
+
mul_mla v4, v2, v3, v0.s[3], v0.s[2] // -> t4a
mul_mls v6, v2, v3, v0.s[2], v0.s[3] // -> t5a
mul_mls v2, v24, v23, v0.s[3], v0.s[2] // -> t6a
@@ -1163,24 +1247,34 @@ endfunc
sqadd \o15\().4s, v31.4s, v26.4s // out15
mov \o0\().16b, v4.16b
.endif
- sqneg \o15\().4s, \o15\().4s // out15
sqsub v3.4s, v29.4s, v18.4s // t15a
sqadd \o13\().4s, v29.4s, v18.4s // out13
sqadd \o2\().4s, v17.4s, v30.4s // out2
sqsub v26.4s, v17.4s, v30.4s // t14a
- sqneg \o13\().4s, \o13\().4s // out13
sqadd \o1\().4s, v19.4s, v27.4s // out1
sqsub v27.4s, v19.4s, v27.4s // t10
sqadd \o14\().4s, v28.4s, v20.4s // out14
sqsub v20.4s, v28.4s, v20.4s // t11
- sqneg \o1\().4s, \o1\().4s // out1
sqadd \o3\().4s, v22.4s, v24.4s // out3
sqsub v22.4s, v22.4s, v24.4s // t6
sqadd \o12\().4s, v25.4s, v23.4s // out12
sqsub v23.4s, v25.4s, v23.4s // t7
+
+ // Not clipping the output registers, as they will be downshifted and
+ // narrowed afterwards anyway.
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smin_4s \r, \r, v5
+.endr
+.irp r, v2, v21, v3, v26, v27, v20, v22, v23
+ smax_4s \r, \r, v7
+.endr
+
+ sqneg \o15\().4s, \o15\().4s // out15
+ sqneg \o13\().4s, \o13\().4s // out13
+ sqneg \o1\().4s, \o1\().4s // out1
sqneg \o3\().4s, \o3\().4s // out3
mul_mls v24, v2, v21, v0.s[0], v0.s[0] // -> out8 (v24 or v23)
@@ -1956,6 +2050,9 @@ function inv_dct32_odd_4s_x16_neon
ld1 {v0.4s, v1.4s}, [x16]
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
sqsub v2.4s, v16.4s, v24.4s // t17
sqadd v16.4s, v16.4s, v24.4s // t16
sqsub v3.4s, v31.4s, v23.4s // t30
@@ -1973,23 +2070,30 @@ function inv_dct32_odd_4s_x16_neon
sqadd v25.4s, v19.4s, v27.4s // t28
sqsub v19.4s, v19.4s, v27.4s // t29
- mul_mls v4, v3, v2, v1.s[0], v1.s[1] // -> t17a
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v24, v28, v23, v18, v20, v30, v26, v17, v22, v29, v25, v19
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v3, v2, v1.s[0], v1.s[1] // -> t17a
mul_mla v6, v3, v2, v1.s[1], v1.s[0] // -> t30a
mul_mla v2, v19, v24, v1.s[1], v1.s[0] // -> t18a
- srshr v21.4s, v4.4s, #12 // t17a
+ srshr v21.4s, v7.4s, #12 // t17a
srshr v27.4s, v6.4s, #12 // t30a
neg v2.4s, v2.4s // -> t18a
- mul_mls v4, v19, v24, v1.s[0], v1.s[1] // -> t29a
+ mul_mls v7, v19, v24, v1.s[0], v1.s[1] // -> t29a
mul_mls v6, v22, v18, v1.s[2], v1.s[3] // -> t21a
srshr v19.4s, v2.4s, #12 // t18a
- srshr v24.4s, v4.4s, #12 // t29a
+ srshr v24.4s, v7.4s, #12 // t29a
mul_mla v2, v22, v18, v1.s[3], v1.s[2] // -> t26a
- mul_mla v4, v17, v20, v1.s[3], v1.s[2] // -> t22a
+ mul_mla v7, v17, v20, v1.s[3], v1.s[2] // -> t22a
srshr v22.4s, v6.4s, #12 // t21a
srshr v18.4s, v2.4s, #12 // t26a
- neg v4.4s, v4.4s // -> t22a
+ neg v7.4s, v7.4s // -> t22a
mul_mls v6, v17, v20, v1.s[2], v1.s[3] // -> t25a
- srshr v17.4s, v4.4s, #12 // t22a
+ srshr v17.4s, v7.4s, #12 // t22a
srshr v20.4s, v6.4s, #12 // t25a
sqsub v2.4s, v27.4s, v24.4s // t29
@@ -2009,23 +2113,30 @@ function inv_dct32_odd_4s_x16_neon
sqsub v29.4s, v31.4s, v25.4s // t28a
sqadd v31.4s, v31.4s, v25.4s // t31a
- mul_mls v4, v2, v3, v0.s[2], v0.s[3] // -> t18a
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v27, v3, v21, v24, v16, v19, v30, v28, v17, v23, v26, v22, v20, v29, v31
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v2, v3, v0.s[2], v0.s[3] // -> t18a
mul_mla v6, v2, v3, v0.s[3], v0.s[2] // -> t29a
mul_mls v2, v29, v24, v0.s[2], v0.s[3] // -> t19
- srshr v18.4s, v4.4s, #12 // t18a
+ srshr v18.4s, v7.4s, #12 // t18a
srshr v25.4s, v6.4s, #12 // t29a
- mul_mla v4, v29, v24, v0.s[3], v0.s[2] // -> t28
+ mul_mla v7, v29, v24, v0.s[3], v0.s[2] // -> t28
mul_mla v6, v26, v19, v0.s[3], v0.s[2] // -> t20
srshr v29.4s, v2.4s, #12 // t19
- srshr v24.4s, v4.4s, #12 // t28
+ srshr v24.4s, v7.4s, #12 // t28
neg v6.4s, v6.4s // -> t20
mul_mls v2, v26, v19, v0.s[2], v0.s[3] // -> t27
- mul_mla v4, v20, v28, v0.s[3], v0.s[2] // -> t21a
+ mul_mla v7, v20, v28, v0.s[3], v0.s[2] // -> t21a
srshr v26.4s, v6.4s, #12 // t20
srshr v19.4s, v2.4s, #12 // t27
- neg v4.4s, v4.4s // -> t21a
+ neg v7.4s, v7.4s // -> t21a
mul_mls v6, v20, v28, v0.s[2], v0.s[3] // -> t26a
- srshr v20.4s, v4.4s, #12 // t21a
+ srshr v20.4s, v7.4s, #12 // t21a
srshr v28.4s, v6.4s, #12 // t26a
sqsub v2.4s, v16.4s, v30.4s // t23
@@ -2038,33 +2149,40 @@ function inv_dct32_odd_4s_x16_neon
sqsub v21.4s, v27.4s, v22.4s // t25a
sqsub v27.4s, v18.4s, v20.4s // t21
sqadd v18.4s, v18.4s, v20.4s // t18 = out18
- sqadd v4.4s, v29.4s, v26.4s // t19a = out19
+ sqadd v7.4s, v29.4s, v26.4s // t19a = out19
sqsub v26.4s, v29.4s, v26.4s // t20a
sqadd v29.4s, v25.4s, v28.4s // t29 = out29
sqsub v25.4s, v25.4s, v28.4s // t26
sqadd v28.4s, v24.4s, v19.4s // t28a = out28
sqsub v24.4s, v24.4s, v19.4s // t27a
- mov v19.16b, v4.16b // out19
+ mov v19.16b, v7.16b // out19
- mul_mls v4, v24, v26, v0.s[0], v0.s[0] // -> t20
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smin \r\().4s, \r\().4s, v5.4s
+.endr
+.irp r, v2, v16, v3, v31, v23, v17, v30, v21, v27, v18, v19, v26, v29, v25, v28, v24
+ smax \r\().4s, \r\().4s, v4.4s
+.endr
+
+ mul_mls v7, v24, v26, v0.s[0], v0.s[0] // -> t20
mul_mla v6, v24, v26, v0.s[0], v0.s[0] // -> t27
- srshr v20.4s, v4.4s, #12 // t20
+ srshr v20.4s, v7.4s, #12 // t20
srshr v22.4s, v6.4s, #12 // t27
- mul_mla v4, v25, v27, v0.s[0], v0.s[0] // -> t26a
+ mul_mla v7, v25, v27, v0.s[0], v0.s[0] // -> t26a
mul_mls v6, v25, v27, v0.s[0], v0.s[0] // -> t21a
mov v27.16b, v22.16b // t27
- srshr v26.4s, v4.4s, #12 // t26a
+ srshr v26.4s, v7.4s, #12 // t26a
mul_mls v24, v21, v23, v0.s[0], v0.s[0] // -> t22
- mul_mla v4, v21, v23, v0.s[0], v0.s[0] // -> t25
+ mul_mla v7, v21, v23, v0.s[0], v0.s[0] // -> t25
srshr v21.4s, v6.4s, #12 // t21a
srshr v22.4s, v24.4s, #12 // t22
- srshr v25.4s, v4.4s, #12 // t25
+ srshr v25.4s, v7.4s, #12 // t25
- mul_mls v4, v3, v2, v0.s[0], v0.s[0] // -> t23a
+ mul_mls v7, v3, v2, v0.s[0], v0.s[0] // -> t23a
mul_mla v6, v3, v2, v0.s[0], v0.s[0] // -> t24a
- srshr v23.4s, v4.4s, #12 // t23a
+ srshr v23.4s, v7.4s, #12 // t23a
srshr v24.4s, v6.4s, #12 // t24a
ret
@@ -2091,6 +2209,15 @@ function inv_txfm_horz\suffix\()_dct_32x4_neon
scale_input .4s, v0.s[0], v24, v25, v26, v27, v28, v29, v30, v31
.endif
bl inv_dct_4s_x16_neon
+
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
transpose_4x4s v16, v17, v18, v19, v2, v3, v4, v5
transpose_4x4s v20, v21, v22, v23, v2, v3, v4, v5
transpose_4x4s v24, v25, v26, v27, v2, v3, v4, v5
@@ -2786,13 +2913,20 @@ function inv_dct64_step1_neon
sqsub v30.4s, v23.4s, v22.4s // t62
sqadd v31.4s, v23.4s, v22.4s // t63
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v29, v26, v0.s[0], v0.s[1] // -> t34a
- mul_mls v4, v29, v26, v0.s[1], v0.s[0] // -> t61a
+ mul_mls v7, v29, v26, v0.s[1], v0.s[0] // -> t61a
neg v2.4s, v2.4s // t34a
mul_mls v6, v30, v25, v0.s[1], v0.s[0] // -> t33a
srshr v26.4s, v2.4s, #12 // t34a
mul_mla v2, v30, v25, v0.s[0], v0.s[1] // -> t62a
- srshr v29.4s, v4.4s, #12 // t61a
+ srshr v29.4s, v7.4s, #12 // t61a
srshr v25.4s, v6.4s, #12 // t33a
srshr v30.4s, v2.4s, #12 // t62a
@@ -2805,11 +2939,18 @@ function inv_dct64_step1_neon
sqsub v21.4s, v30.4s, v29.4s // t61
sqadd v22.4s, v30.4s, v29.4s // t62
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v21, v18, v0.s[2], v0.s[3] // -> t61a
- mul_mls v4, v21, v18, v0.s[3], v0.s[2] // -> t34a
+ mul_mls v7, v21, v18, v0.s[3], v0.s[2] // -> t34a
mul_mla v6, v20, v19, v0.s[2], v0.s[3] // -> t60
srshr v21.4s, v2.4s, #12 // t61a
- srshr v18.4s, v4.4s, #12 // t34a
+ srshr v18.4s, v7.4s, #12 // t34a
mul_mls v2, v20, v19, v0.s[3], v0.s[2] // -> t35
srshr v20.4s, v6.4s, #12 // t60
srshr v19.4s, v2.4s, #12 // t35
@@ -2846,11 +2987,18 @@ function inv_dct64_step2_neon
sqadd v30.4s, v23.4s, v22.4s // t48
sqsub v31.4s, v23.4s, v22.4s // t55
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
mul_mla v2, v27, v25, v0.s[3], v0.s[2] // -> t56a
- mul_mls v4, v27, v25, v0.s[2], v0.s[3] // -> t39a
+ mul_mls v7, v27, v25, v0.s[2], v0.s[3] // -> t39a
mul_mla v6, v31, v28, v0.s[3], v0.s[2] // -> t40a
srshr v25.4s, v2.4s, #12 // t56a
- srshr v27.4s, v4.4s, #12 // t39a
+ srshr v27.4s, v7.4s, #12 // t39a
neg v6.4s, v6.4s // t40a
mul_mls v2, v31, v28, v0.s[2], v0.s[3] // -> t55a
srshr v31.4s, v6.4s, #12 // t40a
@@ -2865,11 +3013,18 @@ function inv_dct64_step2_neon
sqsub v21.4s, v25.4s, v28.4s // t55
sqadd v22.4s, v25.4s, v28.4s // t56
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v19, v17, v18, v20, v23, v21, v22
+ smax_4s \r, \r, v4
+.endr
+
mul_mls v2, v21, v18, v0.s[0], v0.s[0] // -> t40a
- mul_mla v4, v21, v18, v0.s[0], v0.s[0] // -> t55a
+ mul_mla v7, v21, v18, v0.s[0], v0.s[0] // -> t55a
mul_mls v6, v20, v19, v0.s[0], v0.s[0] // -> t47
srshr v18.4s, v2.4s, #12 // t40a
- srshr v21.4s, v4.4s, #12 // t55a
+ srshr v21.4s, v7.4s, #12 // t55a
mul_mla v2, v20, v19, v0.s[0], v0.s[0] // -> t48
srshr v19.4s, v6.4s, #12 // t47
srshr v20.4s, v2.4s, #12 // t48
@@ -2966,6 +3121,14 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
bl inv_dct_4s_x16_neon
+ // idct_16 leaves the row_clip_max/min constants in v5 and v4
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smin_4s \r, \r, v5
+.endr
+.irp r, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ smax_4s \r, \r, v4
+.endr
+
store16 x6
movz16dup_if v0.2s, w16, #2896*8, \scale
@@ -2984,6 +3147,9 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
mov x9, #-16
+ movi v1.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v0.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
+
.macro store_addsub r0, r1, r2, r3
ld1 {v2.4s}, [x6], #16
ld1 {v3.4s}, [x6], #16
@@ -2992,16 +3158,32 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
ld1 {v4.4s}, [x6], #16
sqadd v7.4s, v3.4s, \r1
sqsub \r1, v3.4s, \r1
+ smin v6.4s, v6.4s, v1.4s
+ smin \r0, \r0, v1.4s
ld1 {v5.4s}, [x6], #16
sqadd v2.4s, v4.4s, \r2
sub x6, x6, #16*4
+ smax v6.4s, v6.4s, v0.4s
+ smax \r0, \r0, v0.4s
sqsub \r2, v4.4s, \r2
+ smin v7.4s, v7.4s, v1.4s
+ smin \r1, \r1, v1.4s
st1 {v6.4s}, [x6], #16
st1 {\r0}, [x10], x9
+ smin v2.4s, v2.4s, v1.4s
+ smin \r2, \r2, v1.4s
+ smax v7.4s, v7.4s, v0.4s
+ smax \r1, \r1, v0.4s
sqadd v3.4s, v5.4s, \r3
sqsub \r3, v5.4s, \r3
+ smax v2.4s, v2.4s, v0.4s
+ smax \r2, \r2, v0.4s
+ smin v3.4s, v3.4s, v1.4s
+ smin \r3, \r3, v1.4s
st1 {v7.4s}, [x6], #16
st1 {\r1}, [x10], x9
+ smax v3.4s, v3.4s, v0.4s
+ smax \r3, \r3, v0.4s
st1 {v2.4s}, [x6], #16
st1 {\r2}, [x10], x9
st1 {v3.4s}, [x6], #16
@@ -3016,6 +3198,8 @@ function inv_txfm_dct\suffix\()_4s_x64_neon
add x6, x6, #4*4*16
movrel x17, idct64_coeffs
+ movi v5.4s, #1, msl #16 // row_clip_max = ~(~bdmax << 7), 0x1ffff
+ mvni v4.4s, #1, msl #16 // row_clip_min = (~bdmax << 7), 0xfffe0000
movz16dup_if v0.2s, w16, #2896*8, \scale
movi_if v7.4s, #0, \clear
add x9, x7, x8, lsl #4 // offset 16