summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2022-06-13 17:45:37 +0000
committerNiels Möller <nisse@lysator.liu.se>2022-06-13 17:45:37 +0000
commitc5804293ef315b5b38e730a55718b920b35bb00f (patch)
treee070e0eba471f09e36228d384b9cedff1c2b807a
parent168f826e018c695b89131c178991702a20f616cb (diff)
parentd4c7597e4236f746434c9a1a24f6191f7ff870cd (diff)
downloadnettle-c5804293ef315b5b38e730a55718b920b35bb00f.tar.gz
Merge branch 'chacha_m4_fix' into 'master'
Fix a POSIX violation of m4 argument expansion. See merge request nettle/nettle!49
-rw-r--r--arm64/chacha-4core.asm129
-rw-r--r--powerpc64/p7/chacha-4core.asm113
-rw-r--r--s390x/vf/chacha-4core.asm113
3 files changed, 188 insertions, 167 deletions
diff --git a/arm64/chacha-4core.asm b/arm64/chacha-4core.asm
index b4306ca9..12213126 100644
--- a/arm64/chacha-4core.asm
+++ b/arm64/chacha-4core.asm
@@ -53,67 +53,74 @@ define(`TMP3', `v7')
define(`ROT24', `v8')
+C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
+C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
+define(`P1',
+`ifelse($1, 0, v16, $1, 1, v17, $1, 2, v18, $1, 3, v19, $1, 4, v20, $1, 5, v21, $1, 6, v22, $1, 7, v23, $1, 8, v24, $1, 9, v25, $1, 10, v26, $1, 11, v27, $1, 12, v28, $1, 13, v29, $1, 14, v30, $1, 15, v31)')
+define(`P2',
+`ifelse($1, 0, v16, $1, 1, v21, $1, 2, v26, $1, 3, v31, $1, 4, v20, $1, 5, v25, $1, 6, v30, $1, 7, v19, $1, 8, v24, $1, 9, v29, $1, 10, v18, $1, 11, v23, $1, 12, v28, $1, 13, v17, $1, 14, v22, $1, 15, v27)')
+
C Main loop for round
define(`QR',`
- add $1.4s, $1.4s, $2.4s
- add $5.4s, $5.4s, $6.4s
- add $9.4s, $9.4s, $10.4s
- add $13.4s, $13.4s, $14.4s
- eor $4.16b, $4.16b, $1.16b
- eor $8.16b, $8.16b, $5.16b
- eor $12.16b, $12.16b, $9.16b
- eor $16.16b, $16.16b, $13.16b
- rev32 $4.8h, $4.8h
- rev32 $8.8h, $8.8h
- rev32 $12.8h, $12.8h
- rev32 $16.8h, $16.8h
-
- add $3.4s, $3.4s, $4.4s
- add $7.4s, $7.4s, $8.4s
- add $11.4s, $11.4s, $12.4s
- add $15.4s, $15.4s, $16.4s
- eor TMP0.16b, $2.16b, $3.16b
- eor TMP1.16b, $6.16b, $7.16b
- eor TMP2.16b, $10.16b, $11.16b
- eor TMP3.16b, $14.16b, $15.16b
- ushr $2.4s, TMP0.4s, #20
- ushr $6.4s, TMP1.4s, #20
- ushr $10.4s, TMP2.4s, #20
- ushr $14.4s, TMP3.4s, #20
- sli $2.4s, TMP0.4s, #12
- sli $6.4s, TMP1.4s, #12
- sli $10.4s, TMP2.4s, #12
- sli $14.4s, TMP3.4s, #12
-
- add $1.4s, $1.4s, $2.4s
- add $5.4s, $5.4s, $6.4s
- add $9.4s, $9.4s, $10.4s
- add $13.4s, $13.4s, $14.4s
- eor $4.16b, $4.16b, $1.16b
- eor $8.16b, $8.16b, $5.16b
- eor $12.16b, $12.16b, $9.16b
- eor $16.16b, $16.16b, $13.16b
- tbl $4.16b, {$4.16b}, ROT24.16b
- tbl $8.16b, {$8.16b}, ROT24.16b
- tbl $12.16b, {$12.16b}, ROT24.16b
- tbl $16.16b, {$16.16b}, ROT24.16b
-
- add $3.4s, $3.4s, $4.4s
- add $7.4s, $7.4s, $8.4s
- add $11.4s, $11.4s, $12.4s
- add $15.4s, $15.4s, $16.4s
- eor TMP0.16b, $2.16b, $3.16b
- eor TMP1.16b, $6.16b, $7.16b
- eor TMP2.16b, $10.16b, $11.16b
- eor TMP3.16b, $14.16b, $15.16b
- ushr $2.4s, TMP0.4s, #25
- ushr $6.4s, TMP1.4s, #25
- ushr $10.4s, TMP2.4s, #25
- ushr $14.4s, TMP3.4s, #25
- sli $2.4s, TMP0.4s, #7
- sli $6.4s, TMP1.4s, #7
- sli $10.4s, TMP2.4s, #7
- sli $14.4s, TMP3.4s, #7
+ add $1(0).4s, $1(0).4s, $1(1).4s
+ add $1(4).4s, $1(4).4s, $1(5).4s
+ add $1(8).4s, $1(8).4s, $1(9).4s
+ add $1(12).4s, $1(12).4s, $1(13).4s
+ eor $1(3).16b, $1(3).16b, $1(0).16b
+ eor $1(7).16b, $1(7).16b, $1(4).16b
+ eor $1(11).16b, $1(11).16b, $1(8).16b
+ eor $1(15).16b, $1(15).16b, $1(12).16b
+ rev32 $1(3).8h, $1(3).8h
+ rev32 $1(7).8h, $1(7).8h
+ rev32 $1(11).8h, $1(11).8h
+ rev32 $1(15).8h, $1(15).8h
+
+ add $1(2).4s, $1(2).4s, $1(3).4s
+ add $1(6).4s, $1(6).4s, $1(7).4s
+ add $1(10).4s, $1(10).4s, $1(11).4s
+ add $1(14).4s, $1(14).4s, $1(15).4s
+ eor TMP0.16b, $1(1).16b, $1(2).16b
+ eor TMP1.16b, $1(5).16b, $1(6).16b
+ eor TMP2.16b, $1(9).16b, $1(10).16b
+ eor TMP3.16b, $1(13).16b, $1(14).16b
+ ushr $1(1).4s, TMP0.4s, #20
+ ushr $1(5).4s, TMP1.4s, #20
+ ushr $1(9).4s, TMP2.4s, #20
+ ushr $1(13).4s, TMP3.4s, #20
+ sli $1(1).4s, TMP0.4s, #12
+ sli $1(5).4s, TMP1.4s, #12
+ sli $1(9).4s, TMP2.4s, #12
+ sli $1(13).4s, TMP3.4s, #12
+
+ add $1(0).4s, $1(0).4s, $1(1).4s
+ add $1(4).4s, $1(4).4s, $1(5).4s
+ add $1(8).4s, $1(8).4s, $1(9).4s
+ add $1(12).4s, $1(12).4s, $1(13).4s
+ eor $1(3).16b, $1(3).16b, $1(0).16b
+ eor $1(7).16b, $1(7).16b, $1(4).16b
+ eor $1(11).16b, $1(11).16b, $1(8).16b
+ eor $1(15).16b, $1(15).16b, $1(12).16b
+ tbl $1(3).16b, {$1(3).16b}, ROT24.16b
+ tbl $1(7).16b, {$1(7).16b}, ROT24.16b
+ tbl $1(11).16b, {$1(11).16b}, ROT24.16b
+ tbl $1(15).16b, {$1(15).16b}, ROT24.16b
+
+ add $1(2).4s, $1(2).4s, $1(3).4s
+ add $1(6).4s, $1(6).4s, $1(7).4s
+ add $1(10).4s, $1(10).4s, $1(11).4s
+ add $1(14).4s, $1(14).4s, $1(15).4s
+ eor TMP0.16b, $1(1).16b, $1(2).16b
+ eor TMP1.16b, $1(5).16b, $1(6).16b
+ eor TMP2.16b, $1(9).16b, $1(10).16b
+ eor TMP3.16b, $1(13).16b, $1(14).16b
+ ushr $1(1).4s, TMP0.4s, #25
+ ushr $1(5).4s, TMP1.4s, #25
+ ushr $1(9).4s, TMP2.4s, #25
+ ushr $1(13).4s, TMP3.4s, #25
+ sli $1(1).4s, TMP0.4s, #7
+ sli $1(5).4s, TMP1.4s, #7
+ sli $1(9).4s, TMP2.4s, #7
+ sli $1(13).4s, TMP3.4s, #7
')
define(`TRANSPOSE',`
@@ -174,8 +181,8 @@ C Load state and splat
mov T3.16b, v31.16b
.Loop:
- QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
- QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27)
+ QR(`P1')
+ QR(`P2')
subs ROUNDS, ROUNDS, #2
b.ne .Loop
diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm
index e3870063..deede8ad 100644
--- a/powerpc64/p7/chacha-4core.asm
+++ b/powerpc64/p7/chacha-4core.asm
@@ -53,59 +53,66 @@ define(`T1', `v21')
define(`T2', `v22')
define(`T3', `v23')
+C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
+C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
+define(`P1',
+`ifelse($1, 0, v0, $1, 1, v4, $1, 2, v8, $1, 3, v12, $1, 4, v1, $1, 5, v5, $1, 6, v9, $1, 7, v13, $1, 8, v2, $1, 9, v6, $1, 10, v10, $1, 11, v14, $1, 12, v3, $1, 13, v7, $1, 14, v11, $1, 15, v15)')
+define(`P2',
+`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v1, $1, 5, v6, $1, 6, v11, $1, 7, v12, $1, 8, v2, $1, 9, v7, $1, 10, v8, $1, 11, v13, $1, 12, v3, $1, 13, v4, $1, 14, v9, $1, 15, v14)')
+
C Main loop for round
define(`QR',`
- vadduwm $1, $1, $2
- vadduwm $5, $5, $6
- vadduwm $9, $9, $10
- vadduwm $13, $13, $14
- vxor $4, $4, $1
- vxor $8, $8, $5
- vxor $12, $12, $9
- vxor $16, $16, $13
- vrlw $4, $4, ROT16
- vrlw $8, $8, ROT16
- vrlw $12, $12, ROT16
- vrlw $16, $16, ROT16
-
- vadduwm $3, $3, $4
- vadduwm $7, $7, $8
- vadduwm $11, $11, $12
- vadduwm $15, $15, $16
- vxor $2, $2, $3
- vxor $6, $6, $7
- vxor $10, $10, $11
- vxor $14, $14, $15
- vrlw $2, $2, ROT12
- vrlw $6, $6, ROT12
- vrlw $10, $10, ROT12
- vrlw $14, $14, ROT12
-
- vadduwm $1, $1, $2
- vadduwm $5, $5, $6
- vadduwm $9, $9, $10
- vadduwm $13, $13, $14
- vxor $4, $4, $1
- vxor $8, $8, $5
- vxor $12, $12, $9
- vxor $16, $16, $13
- vrlw $4, $4, ROT8
- vrlw $8, $8, ROT8
- vrlw $12, $12, ROT8
- vrlw $16, $16, ROT8
-
- vadduwm $3, $3, $4
- vadduwm $7, $7, $8
- vadduwm $11, $11, $12
- vadduwm $15, $15, $16
- vxor $2, $2, $3
- vxor $6, $6, $7
- vxor $10, $10, $11
- vxor $14, $14, $15
- vrlw $2, $2, ROT7
- vrlw $6, $6, ROT7
- vrlw $10, $10, ROT7
- vrlw $14, $14, ROT7
+ vadduwm $1(0), $1(0), $1(1)
+ vadduwm $1(4), $1(4), $1(5)
+ vadduwm $1(8), $1(8), $1(9)
+ vadduwm $1(12), $1(12), $1(13)
+ vxor $1(3), $1(3), $1(0)
+ vxor $1(7), $1(7), $1(4)
+ vxor $1(11), $1(11), $1(8)
+ vxor $1(15), $1(15), $1(12)
+ vrlw $1(3), $1(3), ROT16
+ vrlw $1(7), $1(7), ROT16
+ vrlw $1(11), $1(11), ROT16
+ vrlw $1(15), $1(15), ROT16
+
+ vadduwm $1(2), $1(2), $1(3)
+ vadduwm $1(6), $1(6), $1(7)
+ vadduwm $1(10), $1(10), $1(11)
+ vadduwm $1(14), $1(14), $1(15)
+ vxor $1(1), $1(1), $1(2)
+ vxor $1(5), $1(5), $1(6)
+ vxor $1(9), $1(9), $1(10)
+ vxor $1(13), $1(13), $1(14)
+ vrlw $1(1), $1(1), ROT12
+ vrlw $1(5), $1(5), ROT12
+ vrlw $1(9), $1(9), ROT12
+ vrlw $1(13), $1(13), ROT12
+
+ vadduwm $1(0), $1(0), $1(1)
+ vadduwm $1(4), $1(4), $1(5)
+ vadduwm $1(8), $1(8), $1(9)
+ vadduwm $1(12), $1(12), $1(13)
+ vxor $1(3), $1(3), $1(0)
+ vxor $1(7), $1(7), $1(4)
+ vxor $1(11), $1(11), $1(8)
+ vxor $1(15), $1(15), $1(12)
+ vrlw $1(3), $1(3), ROT8
+ vrlw $1(7), $1(7), ROT8
+ vrlw $1(11), $1(11), ROT8
+ vrlw $1(15), $1(15), ROT8
+
+ vadduwm $1(2), $1(2), $1(3)
+ vadduwm $1(6), $1(6), $1(7)
+ vadduwm $1(10), $1(10), $1(11)
+ vadduwm $1(14), $1(14), $1(15)
+ vxor $1(1), $1(1), $1(2)
+ vxor $1(5), $1(5), $1(6)
+ vxor $1(9), $1(9), $1(10)
+ vxor $1(13), $1(13), $1(14)
+ vrlw $1(1), $1(1), ROT7
+ vrlw $1(5), $1(5), ROT7
+ vrlw $1(9), $1(9), ROT7
+ vrlw $1(13), $1(13), ROT7
')
define(`TRANSPOSE',`
@@ -185,8 +192,8 @@ C Load state and splat
srdi ROUNDS, ROUNDS, 1
mtctr ROUNDS
.Loop:
- QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15)
- QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14)
+ QR(`P1')
+ QR(`P2')
bdnz .Loop
C Add in saved original words, including counters, before
diff --git a/s390x/vf/chacha-4core.asm b/s390x/vf/chacha-4core.asm
index 276de9f1..c10093a8 100644
--- a/s390x/vf/chacha-4core.asm
+++ b/s390x/vf/chacha-4core.asm
@@ -48,59 +48,66 @@ define(`T1', `%v26')
define(`T2', `%v27')
define(`T3', `%v28')
+C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX
+C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html
+define(`P1',
+`ifelse($1, 0, v0, $1, 1, v1, $1, 2, v2, $1, 3, v3, $1, 4, v4, $1, 5, v5, $1, 6, v6, $1, 7, v7, $1, 8, v8, $1, 9, v9, $1, 10, v10, $1, 11, v11, $1, 12, v12, $1, 13, v13, $1, 14, v14, $1, 15, v15)')
+define(`P2',
+`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v4, $1, 5, v9, $1, 6, v14, $1, 7, v3, $1, 8, v8, $1, 9, v13, $1, 10, v2, $1, 11, v7, $1, 12, v12, $1, 13, v1, $1, 14, v6, $1, 15, v11)')
+
C Main loop for round
define(`QR',`
- vaf $1, $1, $2
- vaf $5, $5, $6
- vaf $9, $9, $10
- vaf $13, $13, $14
- vx $4, $4, $1
- vx $8, $8, $5
- vx $12, $12, $9
- vx $16, $16, $13
- verllf $4, $4, 16
- verllf $8, $8, 16
- verllf $12, $12, 16
- verllf $16, $16, 16
-
- vaf $3, $3, $4
- vaf $7, $7, $8
- vaf $11, $11, $12
- vaf $15, $15, $16
- vx $2, $2, $3
- vx $6, $6, $7
- vx $10, $10, $11
- vx $14, $14, $15
- verllf $2, $2, 12
- verllf $6, $6, 12
- verllf $10, $10, 12
- verllf $14, $14, 12
-
- vaf $1, $1, $2
- vaf $5, $5, $6
- vaf $9, $9, $10
- vaf $13, $13, $14
- vx $4, $4, $1
- vx $8, $8, $5
- vx $12, $12, $9
- vx $16, $16, $13
- verllf $4, $4, 8
- verllf $8, $8, 8
- verllf $12, $12, 8
- verllf $16, $16, 8
-
- vaf $3, $3, $4
- vaf $7, $7, $8
- vaf $11, $11, $12
- vaf $15, $15, $16
- vx $2, $2, $3
- vx $6, $6, $7
- vx $10, $10, $11
- vx $14, $14, $15
- verllf $2, $2, 7
- verllf $6, $6, 7
- verllf $10, $10, 7
- verllf $14, $14, 7
+ vaf $1(0), $1(0), $1(1)
+ vaf $1(4), $1(4), $1(5)
+ vaf $1(8), $1(8), $1(9)
+ vaf $1(12), $1(12), $1(13)
+ vx $1(3), $1(3), $1(0)
+ vx $1(7), $1(7), $1(4)
+ vx $1(11), $1(11), $1(8)
+ vx $1(15), $1(15), $1(12)
+ verllf $1(3), $1(3), 16
+ verllf $1(7), $1(7), 16
+ verllf $1(11), $1(11), 16
+ verllf $1(15), $1(15), 16
+
+ vaf $1(2), $1(2), $1(3)
+ vaf $1(6), $1(6), $1(7)
+ vaf $1(10), $1(10), $1(11)
+ vaf $1(14), $1(14), $1(15)
+ vx $1(1), $1(1), $1(2)
+ vx $1(5), $1(5), $1(6)
+ vx $1(9), $1(9), $1(10)
+ vx $1(13), $1(13), $1(14)
+ verllf $1(1), $1(1), 12
+ verllf $1(5), $1(5), 12
+ verllf $1(9), $1(9), 12
+ verllf $1(13), $1(13), 12
+
+ vaf $1(0), $1(0), $1(1)
+ vaf $1(4), $1(4), $1(5)
+ vaf $1(8), $1(8), $1(9)
+ vaf $1(12), $1(12), $1(13)
+ vx $1(3), $1(3), $1(0)
+ vx $1(7), $1(7), $1(4)
+ vx $1(11), $1(11), $1(8)
+ vx $1(15), $1(15), $1(12)
+ verllf $1(3), $1(3), 8
+ verllf $1(7), $1(7), 8
+ verllf $1(11), $1(11), 8
+ verllf $1(15), $1(15), 8
+
+ vaf $1(2), $1(2), $1(3)
+ vaf $1(6), $1(6), $1(7)
+ vaf $1(10), $1(10), $1(11)
+ vaf $1(14), $1(14), $1(15)
+ vx $1(1), $1(1), $1(2)
+ vx $1(5), $1(5), $1(6)
+ vx $1(9), $1(9), $1(10)
+ vx $1(13), $1(13), $1(14)
+ verllf $1(1), $1(1), 7
+ verllf $1(5), $1(5), 7
+ verllf $1(9), $1(9), 7
+ verllf $1(13), $1(13), 7
')
define(`TRANSPOSE',`
@@ -176,8 +183,8 @@ C Load state and splat
srlg ROUNDS, ROUNDS, 1
.Loop:
- QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15)
- QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11)
+ QR(`P1')
+ QR(`P2')
brctg ROUNDS, .Loop
C Add in saved original words, including counters, before