author    Mamone Tarsha <maamoun.tk@googlemail.com>  2022-01-18 19:27:32 +0200
committer Mamone Tarsha <maamoun.tk@googlemail.com>  2022-01-18 19:27:32 +0200
commit    173e25cd6c9472bf8f6238a6833cae90d48d86da (patch)
tree      4814709c5be64da25e5f4831ae062c5ef123ed48
parent    94228f87fac465bcc3cb36efb8a43ef27554f7e5 (diff)
download  nettle-arm64-chacha.tar.gz
[Arm64] Optimize Chacha20 (branch: nettle-arm64-chacha)
 Makefile.in                          |   2
 arm64/asimd/chacha-2core.asm         | 231
 arm64/asimd/chacha-4core.asm         | 228
 arm64/asimd/chacha-core-internal.asm | 126
 arm64/fat/chacha-2core.asm           |  36
 arm64/fat/chacha-4core.asm           |  36
 arm64/fat/chacha-core-internal-2.asm |  37
 configure.ac                         |   9
 fat-arm64.c                          |  57
 9 files changed, 757 insertions, 5 deletions
diff --git a/Makefile.in b/Makefile.in
index 0590c370..7c87ca65 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -606,7 +606,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- arm64 arm64/crypto arm64/fat \
+ arm64 arm64/asimd arm64/crypto arm64/fat \
powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/chacha-2core.asm b/arm64/asimd/chacha-2core.asm
new file mode 100644
index 00000000..792d2c48
--- /dev/null
+++ b/arm64/asimd/chacha-2core.asm
@@ -0,0 +1,231 @@
+C arm64/asimd/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+define(`ROT24', `v0')
+
+define(`T0', `v16')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `v17')
+define(`X1', `v18')
+define(`X2', `v19')
+define(`X3', `v20')
+define(`Y0', `v21')
+define(`Y1', `v22')
+define(`Y2', `v23')
+define(`Y3', `v24')
+
+C Original input state
+define(`S0', `v25')
+define(`S1', `v26')
+define(`S2', `v27')
+define(`S3', `v28')
+define(`S3p1', `v29')
+
+define(`TMP0', `v30')
+define(`TMP1', `v31')
+
+ C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_2core)
+
+ eor X1.16b, X1.16b, X1.16b
+ mov w3, #1
+ mov X1.s[0], w3
+
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+
+ add Y3.4s, X3.4s, X1.4s
+ cmhi Y3.4s, X3.4s, Y3.4s
+ ext Y3.16b, Y3.16b, Y3.16b, #12
+ orr Y3.16b, Y3.16b, X1.16b
+
+.Lshared_entry:
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ add Y3.4s, Y3.4s, X3.4s
+
+C Load state
+ ld1 {X0.4s,X1.4s,X2.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+ mov S3p1.16b, Y3.16b
+
+ trn2 Y0.4s, X0.4s, X0.4s C 1 1 3 3
+ trn1 X0.4s, X0.4s, X0.4s C 0 0 2 2
+ trn2 Y1.4s, X1.4s, X1.4s C 5 5 7 7
+ trn1 X1.4s, X1.4s, X1.4s C 4 4 6 6
+ trn2 Y2.4s, X2.4s, X2.4s C 9 9 11 11
+ trn1 X2.4s, X2.4s, X2.4s C 8 8 10 10
+ trn2 Y3.4s, X3.4s, S3p1.4s C 13 13 15 15
+ trn1 X3.4s, X3.4s, S3p1.4s C 12 12 14 14
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
+C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
+C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ rev32 X3.8h, X3.8h
+ rev32 Y3.8h, Y3.8h
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #20
+ ushr Y1.4s, TMP1.4s, #20
+ sli X1.4s, TMP0.4s, #12
+ sli Y1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #25
+ ushr Y1.4s, TMP1.4s, #25
+ sli X1.4s, TMP0.4s, #7
+ sli Y1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+C Register layout:
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped)
+C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped)
+C Y3: A15 B15 A13 B13 X3: A12 B12 A14 B14 (Y3 swapped)
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ rev32 Y3.8h, Y3.8h
+ rev32 X3.8h, X3.8h
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #20
+ ushr X1.4s, TMP1.4s, #20
+ sli Y1.4s, TMP0.4s, #12
+ sli X1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #25
+ ushr X1.4s, TMP1.4s, #25
+ sli Y1.4s, TMP0.4s, #7
+ sli X1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ trn1 T0.4s, X0.4s, Y0.4s
+ trn2 Y0.4s, X0.4s, Y0.4s
+
+ trn1 X0.4s, X1.4s, Y1.4s
+ trn2 Y1.4s, X1.4s, Y1.4s
+
+ trn1 X1.4s, X2.4s, Y2.4s
+ trn2 Y2.4s, X2.4s, Y2.4s
+
+ trn1 X2.4s, X3.4s, Y3.4s
+ trn2 Y3.4s, X3.4s, Y3.4s
+
+ add T0.4s, T0.4s, S0.4s
+ add Y0.4s, Y0.4s, S0.4s
+ add X0.4s, X0.4s, S1.4s
+ add Y1.4s, Y1.4s, S1.4s
+ add X1.4s, X1.4s, S2.4s
+ add Y2.4s, Y2.4s, S2.4s
+ add X2.4s, X2.4s, S3.4s
+ add Y3.4s, Y3.4s, S3p1.4s
+
+ st1 {T0.16b,X0.16b,X1.16b,X2.16b}, [DST], #64
+ st1 {Y0.16b,Y1.16b,Y2.16b,Y3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+ eor Y3.16b, Y3.16b, Y3.16b C {0,0,...,0}
+ mov w3, #1
+ mov Y3.s[0], w3 C {1,0,...,0}
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
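
For readers following the even/odd interleaving above: block A lives in the even lanes and block B in the odd lanes, and the trn1/trn2 pass at the end de-interleaves them before the final additions. Below is a plain C model of the intended result of the two entry points: two consecutive ChaCha blocks, where the second block reuses the input state with its block counter incremented by one (the 64-bit counter in words 12-13 for _nettle_chacha_2core, only word 12 for _nettle_chacha_2core32). This is a hedged reference sketch, not Nettle API; the helper names are illustrative.

#include <stdint.h>
#include <string.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

#define QR(a, b, c, d) do {               \
    a += b; d ^= a; d = ROTL32(d, 16);    \
    c += d; b ^= c; b = ROTL32(b, 12);    \
    a += b; d ^= a; d = ROTL32(d, 8);     \
    c += d; b ^= c; b = ROTL32(b, 7);     \
  } while (0)

/* One-block reference: dst = permute(src) + src, as in chacha-core-internal. */
static void
chacha_core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t x[16];
  unsigned i;
  memcpy (x, src, sizeof (x));
  for (; rounds >= 2; rounds -= 2)
    {
      /* Column round */
      QR (x[0], x[4], x[8],  x[12]);  QR (x[1], x[5], x[9],  x[13]);
      QR (x[2], x[6], x[10], x[14]);  QR (x[3], x[7], x[11], x[15]);
      /* Diagonal round */
      QR (x[0], x[5], x[10], x[15]);  QR (x[1], x[6], x[11], x[12]);
      QR (x[2], x[7], x[8],  x[13]);  QR (x[3], x[4], x[9],  x[14]);
    }
  for (i = 0; i < 16; i++)
    dst[i] = x[i] + src[i];
}

/* Two consecutive blocks; the second uses the state with its 64-bit
   block counter incremented, carrying from word 12 into word 13. */
static void
chacha_2core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t src2[16];
  memcpy (src2, src, sizeof (src2));
  if (++src2[12] == 0)
    src2[13]++;
  chacha_core_ref (dst, src, rounds);
  chacha_core_ref (dst + 16, src2, rounds);
}

/* 32-bit counter variant: only word 12 is incremented, no carry. */
static void
chacha_2core32_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t src2[16];
  memcpy (src2, src, sizeof (src2));
  src2[12]++;
  chacha_core_ref (dst, src, rounds);
  chacha_core_ref (dst + 16, src2, rounds);
}
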
diff --git a/arm64/asimd/chacha-4core.asm b/arm64/asimd/chacha-4core.asm
new file mode 100644
index 00000000..5690e546
--- /dev/null
+++ b/arm64/asimd/chacha-4core.asm
@@ -0,0 +1,228 @@
+C arm64/asimd/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+C During the loop, used to save the original values of the last 4 words
+C of each block. Also used as temporaries for the transpose.
+define(`T0', `v0')
+define(`T1', `v1')
+define(`T2', `v2')
+define(`T3', `v3')
+
+define(`TMP0', `v4')
+define(`TMP1', `v5')
+define(`TMP2', `v6')
+define(`TMP3', `v7')
+
+define(`ROT24', `v8')
+
+C Main loop for round
+define(`QR',`
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ rev32 $4.8h, $4.8h
+ rev32 $8.8h, $8.8h
+ rev32 $12.8h, $12.8h
+ rev32 $16.8h, $16.8h
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #20
+ ushr $6.4s, TMP1.4s, #20
+ ushr $10.4s, TMP2.4s, #20
+ ushr $14.4s, TMP3.4s, #20
+ sli $2.4s, TMP0.4s, #12
+ sli $6.4s, TMP1.4s, #12
+ sli $10.4s, TMP2.4s, #12
+ sli $14.4s, TMP3.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+ tbl $8.16b, {$8.16b}, ROT24.16b
+ tbl $12.16b, {$12.16b}, ROT24.16b
+ tbl $16.16b, {$16.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #25
+ ushr $6.4s, TMP1.4s, #25
+ ushr $10.4s, TMP2.4s, #25
+ ushr $14.4s, TMP3.4s, #25
+ sli $2.4s, TMP0.4s, #7
+ sli $6.4s, TMP1.4s, #7
+ sli $10.4s, TMP2.4s, #7
+ sli $14.4s, TMP3.4s, #7
+')
+
+define(`TRANSPOSE',`
+ zip1 T0.4s, $1.4s, $3.4s C A0 A2 B0 B2
+ zip1 T1.4s, $2.4s, $4.4s C A1 A3 B1 B3
+ zip2 T2.4s, $1.4s, $3.4s C C0 C2 D0 D2
+ zip2 T3.4s, $2.4s, $4.4s C C1 C3 D1 D3
+
+ zip1 $1.4s, T0.4s, T1.4s C A0 A1 A2 A3
+ zip2 $2.4s, T0.4s, T1.4s C B0 B1 B2 B3
+ zip1 $3.4s, T2.4s, T3.4s C C0 C1 C2 C3
+ zip2 $4.4s, T2.4s, T3.4s C D0 D1 D2 D3
+')
+
+ C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_4core)
+
+ mov w3, #1
+ dup TMP2.4s, w3 C Apply counter carries
+
+.Lshared_entry:
+
+ C Save callee-save registers
+ fmov x3, d8
+
+ adr x4, .Lcnts
+ ld1 {TMP3.4s,ROT24.4s},[x4]
+
+C Load state and splat
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [SRC]
+
+ dup v20.4s, v16.s[1]
+ dup v24.4s, v16.s[2]
+ dup v28.4s, v16.s[3]
+ dup v16.4s, v16.s[0]
+ dup v21.4s, v17.s[1]
+ dup v25.4s, v17.s[2]
+ dup v29.4s, v17.s[3]
+ dup v17.4s, v17.s[0]
+ dup v22.4s, v18.s[1]
+ dup v26.4s, v18.s[2]
+ dup v30.4s, v18.s[3]
+ dup v18.4s, v18.s[0]
+ dup v23.4s, v19.s[1]
+ dup v27.4s, v19.s[2]
+ dup v31.4s, v19.s[3]
+ dup v19.4s, v19.s[0]
+
+ add v19.4s, v19.4s, TMP3.4s C low adds
+ cmhi TMP1.4s, TMP3.4s, v19.4s C compute carry-out
+ and TMP1.16b, TMP1.16b, TMP2.16b C mask carries: all-ones -> 1 for the 64-bit counter, 0 for the 32-bit variant
+ add v23.4s, v23.4s, TMP1.4s C apply carries
+
+ C Save the last row (words 12-15) of all four blocks.
+ mov T0.16b, v19.16b
+ mov T1.16b, v23.16b
+ mov T2.16b, v27.16b
+ mov T3.16b, v31.16b
+
+.Loop:
+ QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+ QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27)
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ add v19.4s, v19.4s, T0.4s
+ add v23.4s, v23.4s, T1.4s
+ add v27.4s, v27.4s, T2.4s
+ add v31.4s, v31.4s, T3.4s
+
+ TRANSPOSE(v16, v20,v24, v28)
+ TRANSPOSE(v17, v21, v25, v29)
+ TRANSPOSE(v18, v22, v26, v30)
+ TRANSPOSE(v19, v23, v27, v31)
+
+ ld1 {T0.4s,T1.4s,T2.4s}, [SRC]
+
+ add v16.4s, v16.4s, T0.4s
+ add v20.4s, v20.4s, T0.4s
+ add v24.4s, v24.4s, T0.4s
+ add v28.4s, v28.4s, T0.4s
+
+ add v17.4s, v17.4s, T1.4s
+ add v21.4s, v21.4s, T1.4s
+ add v25.4s, v25.4s, T1.4s
+ add v29.4s, v29.4s, T1.4s
+
+ add v18.4s, v18.4s, T2.4s
+ add v22.4s, v22.4s, T2.4s
+ add v26.4s, v26.4s, T2.4s
+ add v30.4s, v30.4s, T2.4s
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [DST], #64
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b}, [DST], #64
+ st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [DST], #64
+ st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [DST]
+
+ C Restore callee-save registers
+ fmov d8, x3
+ ret
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+ eor TMP2.16b, TMP2.16b, TMP2.16b C Ignore counter carries
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align 4
+.Lcnts: .long 0,1,2,3 C increments
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
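
The counter setup above (the add/cmhi/and/add sequence) is the only place where the four blocks' inputs differ: word 12 is splatted, the offsets 0-3 from .Lcnts are added, and a lane-wise unsigned compare produces an all-ones mask wherever the 32-bit add wrapped, which the AND with TMP2 turns into +1 (64-bit counter) or 0 (32-bit variant) to add to word 13. A small, hedged C sketch of that step, with arrays standing in for the NEON lanes and an illustrative function name:

#include <stdint.h>

/* counter64 selects between _nettle_chacha_4core (carry into word 13)
   and _nettle_chacha_4core32 (word 12 wraps, no carry). */
static void
setup_4core_counters (uint32_t w12, uint32_t w13, int counter64,
                      uint32_t lane12[4], uint32_t lane13[4])
{
  static const uint32_t offset[4] = {0, 1, 2, 3};  /* .Lcnts */
  uint32_t keep = counter64 ? 1 : 0;               /* TMP2 */
  int i;

  for (i = 0; i < 4; i++)
    {
      lane12[i] = w12 + offset[i];                            /* low adds */
      uint32_t carry = (offset[i] > lane12[i]) ? 0xffffffffu  /* cmhi */
                                               : 0;
      lane13[i] = w13 + (carry & keep);                       /* apply carries */
    }
}
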
diff --git a/arm64/asimd/chacha-core-internal.asm b/arm64/asimd/chacha-core-internal.asm
new file mode 100644
index 00000000..da10ad13
--- /dev/null
+++ b/arm64/asimd/chacha-core-internal.asm
@@ -0,0 +1,126 @@
+C arm64/asimd/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+C Original input state
+define(`S0', `v4')
+define(`S1', `v5')
+define(`S2', `v6')
+define(`S3', `v7')
+
+define(`ROT24', `v16')
+
+define(`TMP', `v17')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ rev32 $4.8h, $4.8h
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #20
+ sli $2.4s, TMP.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #25
+ sli $2.4s, TMP.4s, #7
+')
+
+ .text
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_core)
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ ld1 {X0.4s,X1.4s,X2.4s,X3.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ ext X1.16b, X1.16b, X1.16b, #4
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #12
+
+ QROUND(X0, X1, X2, X3)
+
+ ext X1.16b, X1.16b, X1.16b, #12
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #4
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ add X0.4s, X0.4s, S0.4s
+ add X1.4s, X1.4s, S1.4s
+ add X2.4s, X2.4s, S2.4s
+ add X3.4s, X3.4s, S3.4s
+
+ st1 {X0.16b,X1.16b,X2.16b,X3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_core)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
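
The quarter round above implements the four ChaCha rotations three different ways: the 16-bit rotate as rev32 on half-words, the 12- and 7-bit rotates as ushr followed by sli, and the 8-bit rotate as a tbl byte shuffle through the .Lrot24 table. The following hedged, self-contained C snippet only checks that the byte indices encoded in .Lrot24 (3, 0, 1, 2 per little-endian lane) really perform a rotate-left-by-8 of a 32-bit word; it assumes a little-endian host, as on AArch64.

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t
rotl8_via_tbl (uint32_t x)
{
  /* Per-lane byte indices from .Lrot24, least significant byte first. */
  static const uint8_t idx[4] = {3, 0, 1, 2};
  uint8_t in[4], out[4];
  uint32_t r;
  int i;
  memcpy (in, &x, 4);          /* little-endian lane */
  for (i = 0; i < 4; i++)
    out[i] = in[idx[i]];       /* what tbl does, byte by byte */
  memcpy (&r, out, 4);
  return r;
}

int
main (void)
{
  uint32_t x = 0x12345678;
  assert (rotl8_via_tbl (x) == ((x << 8) | (x >> 24)));
  return 0;
}
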
diff --git a/arm64/fat/chacha-2core.asm b/arm64/fat/chacha-2core.asm
new file mode 100644
index 00000000..cb1b95d5
--- /dev/null
+++ b/arm64/fat/chacha-2core.asm
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-2core.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure
+
+include_src(`arm64/asimd/chacha-2core.asm')
diff --git a/arm64/fat/chacha-4core.asm b/arm64/fat/chacha-4core.asm
new file mode 100644
index 00000000..2d89e6a6
--- /dev/null
+++ b/arm64/fat/chacha-4core.asm
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-4core.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`arm64/asimd/chacha-4core.asm')
diff --git a/arm64/fat/chacha-core-internal-2.asm b/arm64/fat/chacha-core-internal-2.asm
new file mode 100644
index 00000000..dad4c694
--- /dev/null
+++ b/arm64/fat/chacha-core-internal-2.asm
@@ -0,0 +1,37 @@
+C arm64/fat/chacha-core-internal-2.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_arm64')
+include_src(`arm64/asimd/chacha-core-internal.asm')
diff --git a/configure.ac b/configure.ac
index da72f908..053535ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(arm64-asimd,
+ AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+ [enable_arm64_asimd=no])
+
AC_ARG_ENABLE(arm64-crypto,
AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
[enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="arm64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none aes pmull sha1 sha2"
+ FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
else
+ if test "$enable_arm64_asimd" = yes ; then
+ asm_path="arm64/asimd $asm_path"
+ fi
if test "$enable_arm64_crypto" = yes ; then
asm_path="arm64/crypto $asm_path"
fi
diff --git a/fat-arm64.c b/fat-arm64.c
index fcb2ece8..af3c98ed 100644
--- a/fat-arm64.c
+++ b/fat-arm64.c
@@ -74,6 +74,7 @@
struct arm64_features
{
+ int have_asimd;
int have_aes;
int have_pmull;
int have_sha1;
@@ -87,6 +88,7 @@ static void
get_arm64_features (struct arm64_features *features)
{
const char *s;
+ features->have_asimd = 0;
features->have_aes = 0;
features->have_pmull = 0;
features->have_sha1 = 0;
@@ -99,7 +101,9 @@ get_arm64_features (struct arm64_features *features)
const char *sep = strchr (s, ',');
size_t length = sep ? (size_t) (sep - s) : strlen(s);
- if (MATCH (s, length, "aes", 3))
+ if (MATCH (s, length, "asimd", 5))
+ features->have_asimd = 1;
+ else if (MATCH (s, length, "aes", 3))
features->have_aes = 1;
else if (MATCH (s, length, "pmull", 5))
features->have_pmull = 1;
@@ -115,6 +119,8 @@ get_arm64_features (struct arm64_features *features)
{
#if USE_GETAUXVAL
unsigned long hwcap = getauxval(AT_HWCAP);
+ features->have_asimd
+ = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
features->have_aes
= ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
features->have_pmull
@@ -166,6 +172,18 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, arm64);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -176,8 +194,9 @@ fat_init (void)
verbose = getenv (ENV_VERBOSE) != NULL;
if (verbose)
- fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
- features.have_aes ? " aes instructions" : "",
+ fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+ features.have_asimd ? " advanced simd" : "",
+ features.have_aes ? " aes instructions" : "",
features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
features.have_sha1 ? " sha1 instructions" : "",
features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +262,20 @@ fat_init (void)
{
_nettle_sha256_compress_vec = _nettle_sha256_compress_c;
}
+ if (features.have_asimd)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_arm64;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+ }
}
DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +323,21 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
(uint32_t *state, const uint8_t *input, const uint32_t *k),
(state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
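
As a standalone illustration of the runtime detection the fat dispatcher relies on above: on Linux/AArch64, Advanced SIMD support is reported through getauxval(AT_HWCAP) and the HWCAP_ASIMD bit, which is exactly the check added to get_arm64_features. A hedged sketch, assuming Linux-only headers; the messages are illustrative, not Nettle output.

#include <stdio.h>
#include <sys/auxv.h>     /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>    /* HWCAP_ASIMD on AArch64 */

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);

  if ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD)
    printf ("asimd reported: the fat library would select the 4-core ChaCha path\n");
  else
    printf ("no asimd: the fat library falls back to the 1-core C ChaCha path\n");
  return 0;
}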