diff options
author | Niels Möller <nisse@lysator.liu.se> | 2022-06-13 17:45:37 +0000 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-06-13 17:45:37 +0000 |
commit | c5804293ef315b5b38e730a55718b920b35bb00f (patch) | |
tree | e070e0eba471f09e36228d384b9cedff1c2b807a | |
parent | 168f826e018c695b89131c178991702a20f616cb (diff) | |
parent | d4c7597e4236f746434c9a1a24f6191f7ff870cd (diff) | |
download | nettle-c5804293ef315b5b38e730a55718b920b35bb00f.tar.gz |
Merge branch 'chacha_m4_fix' into 'master'
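Fix a POSIX violation in m4 argument expansion: POSIX m4 only defines the argument references $1 through $9, so the multi-digit references ($10 through $16) used by the QR macros rely on a GNU m4 extension. QR now takes a single lookup macro (P1 or P2) that maps an index 0-15 to a register.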
See merge request nettle/nettle!49
-rw-r--r-- | arm64/chacha-4core.asm | 129 | ||||
-rw-r--r-- | powerpc64/p7/chacha-4core.asm | 113 | ||||
-rw-r--r-- | s390x/vf/chacha-4core.asm | 113 |
3 files changed, 188 insertions, 167 deletions
diff --git a/arm64/chacha-4core.asm b/arm64/chacha-4core.asm index b4306ca9..12213126 100644 --- a/arm64/chacha-4core.asm +++ b/arm64/chacha-4core.asm @@ -53,67 +53,74 @@ define(`TMP3', `v7') define(`ROT24', `v8') +C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX +C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html +define(`P1', +`ifelse($1, 0, v16, $1, 1, v17, $1, 2, v18, $1, 3, v19, $1, 4, v20, $1, 5, v21, $1, 6, v22, $1, 7, v23, $1, 8, v24, $1, 9, v25, $1, 10, v26, $1, 11, v27, $1, 12, v28, $1, 13, v29, $1, 14, v30, $1, 15, v31)') +define(`P2', +`ifelse($1, 0, v16, $1, 1, v21, $1, 2, v26, $1, 3, v31, $1, 4, v20, $1, 5, v25, $1, 6, v30, $1, 7, v19, $1, 8, v24, $1, 9, v29, $1, 10, v18, $1, 11, v23, $1, 12, v28, $1, 13, v17, $1, 14, v22, $1, 15, v27)') + C Main loop for round define(`QR',` - add $1.4s, $1.4s, $2.4s - add $5.4s, $5.4s, $6.4s - add $9.4s, $9.4s, $10.4s - add $13.4s, $13.4s, $14.4s - eor $4.16b, $4.16b, $1.16b - eor $8.16b, $8.16b, $5.16b - eor $12.16b, $12.16b, $9.16b - eor $16.16b, $16.16b, $13.16b - rev32 $4.8h, $4.8h - rev32 $8.8h, $8.8h - rev32 $12.8h, $12.8h - rev32 $16.8h, $16.8h - - add $3.4s, $3.4s, $4.4s - add $7.4s, $7.4s, $8.4s - add $11.4s, $11.4s, $12.4s - add $15.4s, $15.4s, $16.4s - eor TMP0.16b, $2.16b, $3.16b - eor TMP1.16b, $6.16b, $7.16b - eor TMP2.16b, $10.16b, $11.16b - eor TMP3.16b, $14.16b, $15.16b - ushr $2.4s, TMP0.4s, #20 - ushr $6.4s, TMP1.4s, #20 - ushr $10.4s, TMP2.4s, #20 - ushr $14.4s, TMP3.4s, #20 - sli $2.4s, TMP0.4s, #12 - sli $6.4s, TMP1.4s, #12 - sli $10.4s, TMP2.4s, #12 - sli $14.4s, TMP3.4s, #12 - - add $1.4s, $1.4s, $2.4s - add $5.4s, $5.4s, $6.4s - add $9.4s, $9.4s, $10.4s - add $13.4s, $13.4s, $14.4s - eor $4.16b, $4.16b, $1.16b - eor $8.16b, $8.16b, $5.16b - eor $12.16b, $12.16b, $9.16b - eor $16.16b, $16.16b, $13.16b - tbl $4.16b, {$4.16b}, ROT24.16b - tbl $8.16b, {$8.16b}, ROT24.16b - tbl $12.16b, {$12.16b}, ROT24.16b - tbl 
$16.16b, {$16.16b}, ROT24.16b - - add $3.4s, $3.4s, $4.4s - add $7.4s, $7.4s, $8.4s - add $11.4s, $11.4s, $12.4s - add $15.4s, $15.4s, $16.4s - eor TMP0.16b, $2.16b, $3.16b - eor TMP1.16b, $6.16b, $7.16b - eor TMP2.16b, $10.16b, $11.16b - eor TMP3.16b, $14.16b, $15.16b - ushr $2.4s, TMP0.4s, #25 - ushr $6.4s, TMP1.4s, #25 - ushr $10.4s, TMP2.4s, #25 - ushr $14.4s, TMP3.4s, #25 - sli $2.4s, TMP0.4s, #7 - sli $6.4s, TMP1.4s, #7 - sli $10.4s, TMP2.4s, #7 - sli $14.4s, TMP3.4s, #7 + add $1(0).4s, $1(0).4s, $1(1).4s + add $1(4).4s, $1(4).4s, $1(5).4s + add $1(8).4s, $1(8).4s, $1(9).4s + add $1(12).4s, $1(12).4s, $1(13).4s + eor $1(3).16b, $1(3).16b, $1(0).16b + eor $1(7).16b, $1(7).16b, $1(4).16b + eor $1(11).16b, $1(11).16b, $1(8).16b + eor $1(15).16b, $1(15).16b, $1(12).16b + rev32 $1(3).8h, $1(3).8h + rev32 $1(7).8h, $1(7).8h + rev32 $1(11).8h, $1(11).8h + rev32 $1(15).8h, $1(15).8h + + add $1(2).4s, $1(2).4s, $1(3).4s + add $1(6).4s, $1(6).4s, $1(7).4s + add $1(10).4s, $1(10).4s, $1(11).4s + add $1(14).4s, $1(14).4s, $1(15).4s + eor TMP0.16b, $1(1).16b, $1(2).16b + eor TMP1.16b, $1(5).16b, $1(6).16b + eor TMP2.16b, $1(9).16b, $1(10).16b + eor TMP3.16b, $1(13).16b, $1(14).16b + ushr $1(1).4s, TMP0.4s, #20 + ushr $1(5).4s, TMP1.4s, #20 + ushr $1(9).4s, TMP2.4s, #20 + ushr $1(13).4s, TMP3.4s, #20 + sli $1(1).4s, TMP0.4s, #12 + sli $1(5).4s, TMP1.4s, #12 + sli $1(9).4s, TMP2.4s, #12 + sli $1(13).4s, TMP3.4s, #12 + + add $1(0).4s, $1(0).4s, $1(1).4s + add $1(4).4s, $1(4).4s, $1(5).4s + add $1(8).4s, $1(8).4s, $1(9).4s + add $1(12).4s, $1(12).4s, $1(13).4s + eor $1(3).16b, $1(3).16b, $1(0).16b + eor $1(7).16b, $1(7).16b, $1(4).16b + eor $1(11).16b, $1(11).16b, $1(8).16b + eor $1(15).16b, $1(15).16b, $1(12).16b + tbl $1(3).16b, {$1(3).16b}, ROT24.16b + tbl $1(7).16b, {$1(7).16b}, ROT24.16b + tbl $1(11).16b, {$1(11).16b}, ROT24.16b + tbl $1(15).16b, {$1(15).16b}, ROT24.16b + + add $1(2).4s, $1(2).4s, $1(3).4s + add $1(6).4s, $1(6).4s, $1(7).4s + add $1(10).4s, $1(10).4s, 
$1(11).4s + add $1(14).4s, $1(14).4s, $1(15).4s + eor TMP0.16b, $1(1).16b, $1(2).16b + eor TMP1.16b, $1(5).16b, $1(6).16b + eor TMP2.16b, $1(9).16b, $1(10).16b + eor TMP3.16b, $1(13).16b, $1(14).16b + ushr $1(1).4s, TMP0.4s, #25 + ushr $1(5).4s, TMP1.4s, #25 + ushr $1(9).4s, TMP2.4s, #25 + ushr $1(13).4s, TMP3.4s, #25 + sli $1(1).4s, TMP0.4s, #7 + sli $1(5).4s, TMP1.4s, #7 + sli $1(9).4s, TMP2.4s, #7 + sli $1(13).4s, TMP3.4s, #7 ') define(`TRANSPOSE',` @@ -174,8 +181,8 @@ C Load state and splat mov T3.16b, v31.16b .Loop: - QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31) - QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27) + QR(`P1') + QR(`P2') subs ROUNDS, ROUNDS, #2 b.ne .Loop diff --git a/powerpc64/p7/chacha-4core.asm b/powerpc64/p7/chacha-4core.asm index e3870063..deede8ad 100644 --- a/powerpc64/p7/chacha-4core.asm +++ b/powerpc64/p7/chacha-4core.asm @@ -53,59 +53,66 @@ define(`T1', `v21') define(`T2', `v22') define(`T3', `v23') +C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX +C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html +define(`P1', +`ifelse($1, 0, v0, $1, 1, v4, $1, 2, v8, $1, 3, v12, $1, 4, v1, $1, 5, v5, $1, 6, v9, $1, 7, v13, $1, 8, v2, $1, 9, v6, $1, 10, v10, $1, 11, v14, $1, 12, v3, $1, 13, v7, $1, 14, v11, $1, 15, v15)') +define(`P2', +`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v1, $1, 5, v6, $1, 6, v11, $1, 7, v12, $1, 8, v2, $1, 9, v7, $1, 10, v8, $1, 11, v13, $1, 12, v3, $1, 13, v4, $1, 14, v9, $1, 15, v14)') + C Main loop for round define(`QR',` - vadduwm $1, $1, $2 - vadduwm $5, $5, $6 - vadduwm $9, $9, $10 - vadduwm $13, $13, $14 - vxor $4, $4, $1 - vxor $8, $8, $5 - vxor $12, $12, $9 - vxor $16, $16, $13 - vrlw $4, $4, ROT16 - vrlw $8, $8, ROT16 - vrlw $12, $12, ROT16 - vrlw $16, $16, ROT16 - - vadduwm $3, $3, $4 - vadduwm $7, $7, $8 - vadduwm $11, $11, $12 - vadduwm $15, 
$15, $16 - vxor $2, $2, $3 - vxor $6, $6, $7 - vxor $10, $10, $11 - vxor $14, $14, $15 - vrlw $2, $2, ROT12 - vrlw $6, $6, ROT12 - vrlw $10, $10, ROT12 - vrlw $14, $14, ROT12 - - vadduwm $1, $1, $2 - vadduwm $5, $5, $6 - vadduwm $9, $9, $10 - vadduwm $13, $13, $14 - vxor $4, $4, $1 - vxor $8, $8, $5 - vxor $12, $12, $9 - vxor $16, $16, $13 - vrlw $4, $4, ROT8 - vrlw $8, $8, ROT8 - vrlw $12, $12, ROT8 - vrlw $16, $16, ROT8 - - vadduwm $3, $3, $4 - vadduwm $7, $7, $8 - vadduwm $11, $11, $12 - vadduwm $15, $15, $16 - vxor $2, $2, $3 - vxor $6, $6, $7 - vxor $10, $10, $11 - vxor $14, $14, $15 - vrlw $2, $2, ROT7 - vrlw $6, $6, ROT7 - vrlw $10, $10, ROT7 - vrlw $14, $14, ROT7 + vadduwm $1(0), $1(0), $1(1) + vadduwm $1(4), $1(4), $1(5) + vadduwm $1(8), $1(8), $1(9) + vadduwm $1(12), $1(12), $1(13) + vxor $1(3), $1(3), $1(0) + vxor $1(7), $1(7), $1(4) + vxor $1(11), $1(11), $1(8) + vxor $1(15), $1(15), $1(12) + vrlw $1(3), $1(3), ROT16 + vrlw $1(7), $1(7), ROT16 + vrlw $1(11), $1(11), ROT16 + vrlw $1(15), $1(15), ROT16 + + vadduwm $1(2), $1(2), $1(3) + vadduwm $1(6), $1(6), $1(7) + vadduwm $1(10), $1(10), $1(11) + vadduwm $1(14), $1(14), $1(15) + vxor $1(1), $1(1), $1(2) + vxor $1(5), $1(5), $1(6) + vxor $1(9), $1(9), $1(10) + vxor $1(13), $1(13), $1(14) + vrlw $1(1), $1(1), ROT12 + vrlw $1(5), $1(5), ROT12 + vrlw $1(9), $1(9), ROT12 + vrlw $1(13), $1(13), ROT12 + + vadduwm $1(0), $1(0), $1(1) + vadduwm $1(4), $1(4), $1(5) + vadduwm $1(8), $1(8), $1(9) + vadduwm $1(12), $1(12), $1(13) + vxor $1(3), $1(3), $1(0) + vxor $1(7), $1(7), $1(4) + vxor $1(11), $1(11), $1(8) + vxor $1(15), $1(15), $1(12) + vrlw $1(3), $1(3), ROT8 + vrlw $1(7), $1(7), ROT8 + vrlw $1(11), $1(11), ROT8 + vrlw $1(15), $1(15), ROT8 + + vadduwm $1(2), $1(2), $1(3) + vadduwm $1(6), $1(6), $1(7) + vadduwm $1(10), $1(10), $1(11) + vadduwm $1(14), $1(14), $1(15) + vxor $1(1), $1(1), $1(2) + vxor $1(5), $1(5), $1(6) + vxor $1(9), $1(9), $1(10) + vxor $1(13), $1(13), $1(14) + vrlw $1(1), $1(1), ROT7 + vrlw 
$1(5), $1(5), ROT7 + vrlw $1(9), $1(9), ROT7 + vrlw $1(13), $1(13), ROT7 ') define(`TRANSPOSE',` @@ -185,8 +192,8 @@ C Load state and splat srdi ROUNDS, ROUNDS, 1 mtctr ROUNDS .Loop: - QR(v0, v4, v8, v12, v1, v5, v9, v13, v2, v6, v10, v14, v3, v7, v11, v15) - QR(v0, v5, v10, v15, v1, v6, v11, v12, v2, v7, v8, v13, v3, v4, v9, v14) + QR(`P1') + QR(`P2') bdnz .Loop C Add in saved original words, including counters, before diff --git a/s390x/vf/chacha-4core.asm b/s390x/vf/chacha-4core.asm index 276de9f1..c10093a8 100644 --- a/s390x/vf/chacha-4core.asm +++ b/s390x/vf/chacha-4core.asm @@ -48,59 +48,66 @@ define(`T1', `%v26') define(`T2', `%v27') define(`T3', `%v28') +C A workaround for expanding multiple digits of argument references to QR macro which is incompatible with POSIX +C See https://www.gnu.org/software/m4/manual/html_node/Arguments.html +define(`P1', +`ifelse($1, 0, v0, $1, 1, v1, $1, 2, v2, $1, 3, v3, $1, 4, v4, $1, 5, v5, $1, 6, v6, $1, 7, v7, $1, 8, v8, $1, 9, v9, $1, 10, v10, $1, 11, v11, $1, 12, v12, $1, 13, v13, $1, 14, v14, $1, 15, v15)') +define(`P2', +`ifelse($1, 0, v0, $1, 1, v5, $1, 2, v10, $1, 3, v15, $1, 4, v4, $1, 5, v9, $1, 6, v14, $1, 7, v3, $1, 8, v8, $1, 9, v13, $1, 10, v2, $1, 11, v7, $1, 12, v12, $1, 13, v1, $1, 14, v6, $1, 15, v11)') + C Main loop for round define(`QR',` - vaf $1, $1, $2 - vaf $5, $5, $6 - vaf $9, $9, $10 - vaf $13, $13, $14 - vx $4, $4, $1 - vx $8, $8, $5 - vx $12, $12, $9 - vx $16, $16, $13 - verllf $4, $4, 16 - verllf $8, $8, 16 - verllf $12, $12, 16 - verllf $16, $16, 16 - - vaf $3, $3, $4 - vaf $7, $7, $8 - vaf $11, $11, $12 - vaf $15, $15, $16 - vx $2, $2, $3 - vx $6, $6, $7 - vx $10, $10, $11 - vx $14, $14, $15 - verllf $2, $2, 12 - verllf $6, $6, 12 - verllf $10, $10, 12 - verllf $14, $14, 12 - - vaf $1, $1, $2 - vaf $5, $5, $6 - vaf $9, $9, $10 - vaf $13, $13, $14 - vx $4, $4, $1 - vx $8, $8, $5 - vx $12, $12, $9 - vx $16, $16, $13 - verllf $4, $4, 8 - verllf $8, $8, 8 - verllf $12, $12, 8 - verllf $16, $16, 8 - 
- vaf $3, $3, $4 - vaf $7, $7, $8 - vaf $11, $11, $12 - vaf $15, $15, $16 - vx $2, $2, $3 - vx $6, $6, $7 - vx $10, $10, $11 - vx $14, $14, $15 - verllf $2, $2, 7 - verllf $6, $6, 7 - verllf $10, $10, 7 - verllf $14, $14, 7 + vaf $1(0), $1(0), $1(1) + vaf $1(4), $1(4), $1(5) + vaf $1(8), $1(8), $1(9) + vaf $1(12), $1(12), $1(13) + vx $1(3), $1(3), $1(0) + vx $1(7), $1(7), $1(4) + vx $1(11), $1(11), $1(8) + vx $1(15), $1(15), $1(12) + verllf $1(3), $1(3), 16 + verllf $1(7), $1(7), 16 + verllf $1(11), $1(11), 16 + verllf $1(15), $1(15), 16 + + vaf $1(2), $1(2), $1(3) + vaf $1(6), $1(6), $1(7) + vaf $1(10), $1(10), $1(11) + vaf $1(14), $1(14), $1(15) + vx $1(1), $1(1), $1(2) + vx $1(5), $1(5), $1(6) + vx $1(9), $1(9), $1(10) + vx $1(13), $1(13), $1(14) + verllf $1(1), $1(1), 12 + verllf $1(5), $1(5), 12 + verllf $1(9), $1(9), 12 + verllf $1(13), $1(13), 12 + + vaf $1(0), $1(0), $1(1) + vaf $1(4), $1(4), $1(5) + vaf $1(8), $1(8), $1(9) + vaf $1(12), $1(12), $1(13) + vx $1(3), $1(3), $1(0) + vx $1(7), $1(7), $1(4) + vx $1(11), $1(11), $1(8) + vx $1(15), $1(15), $1(12) + verllf $1(3), $1(3), 8 + verllf $1(7), $1(7), 8 + verllf $1(11), $1(11), 8 + verllf $1(15), $1(15), 8 + + vaf $1(2), $1(2), $1(3) + vaf $1(6), $1(6), $1(7) + vaf $1(10), $1(10), $1(11) + vaf $1(14), $1(14), $1(15) + vx $1(1), $1(1), $1(2) + vx $1(5), $1(5), $1(6) + vx $1(9), $1(9), $1(10) + vx $1(13), $1(13), $1(14) + verllf $1(1), $1(1), 7 + verllf $1(5), $1(5), 7 + verllf $1(9), $1(9), 7 + verllf $1(13), $1(13), 7 ') define(`TRANSPOSE',` @@ -176,8 +183,8 @@ C Load state and splat srlg ROUNDS, ROUNDS, 1 .Loop: - QR(%v0, %v1, %v2, %v3, %v4, %v5, %v6, %v7, %v8, %v9, %v10, %v11, %v12, %v13, %v14, %v15) - QR(%v0, %v5, %v10, %v15, %v4, %v9, %v14, %v3, %v8, %v13, %v2, %v7, %v12, %v1, %v6, %v11) + QR(`P1') + QR(`P2') brctg ROUNDS, .Loop C Add in saved original words, including counters, before |