author    Mamone Tarsha <maamoun.tk@googlemail.com>  2022-01-18 19:27:32 +0200
committer Mamone Tarsha <maamoun.tk@googlemail.com>  2022-01-18 19:27:32 +0200
commit    173e25cd6c9472bf8f6238a6833cae90d48d86da (patch)
tree      4814709c5be64da25e5f4831ae062c5ef123ed48
parent    94228f87fac465bcc3cb36efb8a43ef27554f7e5 (diff)
download  nettle-arm64-chacha.tar.gz
[Arm64] Optimize Chacha20 (branch: nettle-arm64-chacha)
 Makefile.in                          |   2
 arm64/asimd/chacha-2core.asm         | 231
 arm64/asimd/chacha-4core.asm         | 228
 arm64/asimd/chacha-core-internal.asm | 126
 arm64/fat/chacha-2core.asm           |  36
 arm64/fat/chacha-4core.asm           |  36
 arm64/fat/chacha-core-internal-2.asm |  37
 configure.ac                         |   9
 fat-arm64.c                          |  57
 9 files changed, 757 insertions, 5 deletions
diff --git a/Makefile.in b/Makefile.in
index 0590c370..7c87ca65 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -606,7 +606,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- arm64 arm64/crypto arm64/fat \
+ arm64 arm64/asimd arm64/crypto arm64/fat \
powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/chacha-2core.asm b/arm64/asimd/chacha-2core.asm
new file mode 100644
index 00000000..792d2c48
--- /dev/null
+++ b/arm64/asimd/chacha-2core.asm
@@ -0,0 +1,231 @@
+C arm64/asimd/chacha-2core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+define(`ROT24', `v0')
+
+define(`T0', `v16')
+
+C State, even elements in X, odd elements in Y
+define(`X0', `v17')
+define(`X1', `v18')
+define(`X2', `v19')
+define(`X3', `v20')
+define(`Y0', `v21')
+define(`Y1', `v22')
+define(`Y2', `v23')
+define(`Y3', `v24')
+
+C Original input state
+define(`S0', `v25')
+define(`S1', `v26')
+define(`S2', `v27')
+define(`S3', `v28')
+define(`S3p1', `v29')
+
+define(`TMP0', `v30')
+define(`TMP1', `v31')
+
+ C _chacha_2core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_2core)
+
+ eor X1.16b, X1.16b, X1.16b
+ mov w3, #1
+ mov X1.s[0], w3
+
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+
+ add Y3.4s, X3.4s, X1.4s
+ cmhi Y3.4s, X3.4s, Y3.4s
+ ext Y3.16b, Y3.16b, Y3.16b, #12
+ orr Y3.16b, Y3.16b, X1.16b
+
+.Lshared_entry:
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ add Y3.4s, Y3.4s, X3.4s
+
+C Load state
+ ld1 {X0.4s,X1.4s,X2.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+ mov S3p1.16b, Y3.16b
+
+ trn2 Y0.4s, X0.4s, X0.4s C 1 1 3 3
+ trn1 X0.4s, X0.4s, X0.4s C 0 0 2 2
+ trn2 Y1.4s, X1.4s, X1.4s C 5 5 7 7
+ trn1 X1.4s, X1.4s, X1.4s C 4 4 6 6
+ trn2 Y2.4s, X2.4s, X2.4s C 9 9 11 11
+ trn1 X2.4s, X2.4s, X2.4s C 8 8 10 10
+ trn2 Y3.4s, X3.4s, S3p1.4s C 13 13 15 15
+ trn1 X3.4s, X3.4s, S3p1.4s C 12 12 14 14
+
+.Loop:
+C Register layout (A is first block, B is second block)
+C
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C X1: A4 B4 A6 B6 Y1: A5 B5 A7 B7
+C X2: A8 B8 A10 B10 Y2: A9 B9 A11 B11
+C X3: A12 B12 A14 B14 Y3: A13 B13 A15 B15
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ rev32 X3.8h, X3.8h
+ rev32 Y3.8h, Y3.8h
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #20
+ ushr Y1.4s, TMP1.4s, #20
+ sli X1.4s, TMP0.4s, #12
+ sli Y1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, X1.4s
+ add Y0.4s, Y0.4s, Y1.4s
+ eor X3.16b, X3.16b, X0.16b
+ eor Y3.16b, Y3.16b, Y0.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, X3.4s
+ add Y2.4s, Y2.4s, Y3.4s
+ eor TMP0.16b, X1.16b, X2.16b
+ eor TMP1.16b, Y1.16b, Y2.16b
+ ushr X1.4s, TMP0.4s, #25
+ ushr Y1.4s, TMP1.4s, #25
+ sli X1.4s, TMP0.4s, #7
+ sli Y1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+C Register layout:
+C X0: A0 B0 A2 B2 Y0: A1 B1 A3 B3
+C Y1: A5 B5 A7 B7 X1: A6 B6 A4 B4 (X1 swapped)
+C X2: A10 B10 A8 B8 Y2: A11 B11 A9 B9 (X2, Y2 swapped)
+C Y3: A15 B15 A13 B13 X3: A12 B12 A14 B14 (Y3 swapped)
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ rev32 Y3.8h, Y3.8h
+ rev32 X3.8h, X3.8h
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #20
+ ushr X1.4s, TMP1.4s, #20
+ sli Y1.4s, TMP0.4s, #12
+ sli X1.4s, TMP1.4s, #12
+
+ add X0.4s, X0.4s, Y1.4s
+ add Y0.4s, Y0.4s, X1.4s
+ eor Y3.16b, Y3.16b, X0.16b
+ eor X3.16b, X3.16b, Y0.16b
+ tbl Y3.16b, {Y3.16b}, ROT24.16b
+ tbl X3.16b, {X3.16b}, ROT24.16b
+
+ add X2.4s, X2.4s, Y3.4s
+ add Y2.4s, Y2.4s, X3.4s
+ eor TMP0.16b, Y1.16b, X2.16b
+ eor TMP1.16b, X1.16b, Y2.16b
+ ushr Y1.4s, TMP0.4s, #25
+ ushr X1.4s, TMP1.4s, #25
+ sli Y1.4s, TMP0.4s, #7
+ sli X1.4s, TMP1.4s, #7
+
+ ext X1.16b, X1.16b, X1.16b, #8
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext Y2.16b, Y2.16b, Y2.16b, #8
+ ext Y3.16b, Y3.16b, Y3.16b, #8
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ trn1 T0.4s, X0.4s, Y0.4s
+ trn2 Y0.4s, X0.4s, Y0.4s
+
+ trn1 X0.4s, X1.4s, Y1.4s
+ trn2 Y1.4s, X1.4s, Y1.4s
+
+ trn1 X1.4s, X2.4s, Y2.4s
+ trn2 Y2.4s, X2.4s, Y2.4s
+
+ trn1 X2.4s, X3.4s, Y3.4s
+ trn2 Y3.4s, X3.4s, Y3.4s
+
+ add T0.4s, T0.4s, S0.4s
+ add Y0.4s, Y0.4s, S0.4s
+ add X0.4s, X0.4s, S1.4s
+ add Y1.4s, Y1.4s, S1.4s
+ add X1.4s, X1.4s, S2.4s
+ add Y2.4s, Y2.4s, S2.4s
+ add X2.4s, X2.4s, S3.4s
+ add Y3.4s, Y3.4s, S3p1.4s
+
+ st1 {T0.16b,X0.16b,X1.16b,X2.16b}, [DST], #64
+ st1 {Y0.16b,Y1.16b,Y2.16b,Y3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_2core)
+
+PROLOGUE(_nettle_chacha_2core32)
+ eor Y3.16b, Y3.16b, Y3.16b C {0,0,...,0}
+ mov w3, #1
+ mov Y3.s[0], w3 C {1,0,...,0}
+ add x3, SRC, #48
+ ld1 {X3.4s}, [x3]
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_2core32)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
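
For readers following the even/odd interleaving above: block A lives in the even lanes and block B in the odd lanes, and the trn1/trn2 pass at the end de-interleaves them before the final additions. Below is a plain C model of the intended result of the two entry points: two consecutive ChaCha blocks, where the second block reuses the input state with its block counter incremented by one (the 64-bit counter in words 12-13 for _nettle_chacha_2core, only word 12 for _nettle_chacha_2core32). This is a hedged reference sketch, not Nettle API; the helper names are illustrative.

#include <stdint.h>
#include <string.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

#define QR(a, b, c, d) do {               \
    a += b; d ^= a; d = ROTL32(d, 16);    \
    c += d; b ^= c; b = ROTL32(b, 12);    \
    a += b; d ^= a; d = ROTL32(d, 8);     \
    c += d; b ^= c; b = ROTL32(b, 7);     \
  } while (0)

/* One-block reference: dst = permute(src) + src, as in chacha-core-internal. */
static void
chacha_core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t x[16];
  unsigned i;
  memcpy (x, src, sizeof (x));
  for (; rounds >= 2; rounds -= 2)
    {
      /* Column round */
      QR (x[0], x[4], x[8],  x[12]);  QR (x[1], x[5], x[9],  x[13]);
      QR (x[2], x[6], x[10], x[14]);  QR (x[3], x[7], x[11], x[15]);
      /* Diagonal round */
      QR (x[0], x[5], x[10], x[15]);  QR (x[1], x[6], x[11], x[12]);
      QR (x[2], x[7], x[8],  x[13]);  QR (x[3], x[4], x[9],  x[14]);
    }
  for (i = 0; i < 16; i++)
    dst[i] = x[i] + src[i];
}

/* Two consecutive blocks; the second uses the state with its 64-bit
   block counter incremented, carrying from word 12 into word 13. */
static void
chacha_2core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t src2[16];
  memcpy (src2, src, sizeof (src2));
  if (++src2[12] == 0)
    src2[13]++;
  chacha_core_ref (dst, src, rounds);
  chacha_core_ref (dst + 16, src2, rounds);
}

/* 32-bit counter variant: only word 12 is incremented, no carry. */
static void
chacha_2core32_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
{
  uint32_t src2[16];
  memcpy (src2, src, sizeof (src2));
  src2[12]++;
  chacha_core_ref (dst, src, rounds);
  chacha_core_ref (dst + 16, src2, rounds);
}
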
diff --git a/arm64/asimd/chacha-4core.asm b/arm64/asimd/chacha-4core.asm
new file mode 100644
index 00000000..5690e546
--- /dev/null
+++ b/arm64/asimd/chacha-4core.asm
@@ -0,0 +1,228 @@
+C arm64/asimd/chacha-4core.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+
+C During the loop, used to save the original values of the last 4 words
+C of each block. Also used as temporaries for the transpose.
+define(`T0', `v0')
+define(`T1', `v1')
+define(`T2', `v2')
+define(`T3', `v3')
+
+define(`TMP0', `v4')
+define(`TMP1', `v5')
+define(`TMP2', `v6')
+define(`TMP3', `v7')
+
+define(`ROT24', `v8')
+
+C Main loop for round
+define(`QR',`
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ rev32 $4.8h, $4.8h
+ rev32 $8.8h, $8.8h
+ rev32 $12.8h, $12.8h
+ rev32 $16.8h, $16.8h
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #20
+ ushr $6.4s, TMP1.4s, #20
+ ushr $10.4s, TMP2.4s, #20
+ ushr $14.4s, TMP3.4s, #20
+ sli $2.4s, TMP0.4s, #12
+ sli $6.4s, TMP1.4s, #12
+ sli $10.4s, TMP2.4s, #12
+ sli $14.4s, TMP3.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ add $5.4s, $5.4s, $6.4s
+ add $9.4s, $9.4s, $10.4s
+ add $13.4s, $13.4s, $14.4s
+ eor $4.16b, $4.16b, $1.16b
+ eor $8.16b, $8.16b, $5.16b
+ eor $12.16b, $12.16b, $9.16b
+ eor $16.16b, $16.16b, $13.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+ tbl $8.16b, {$8.16b}, ROT24.16b
+ tbl $12.16b, {$12.16b}, ROT24.16b
+ tbl $16.16b, {$16.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ add $7.4s, $7.4s, $8.4s
+ add $11.4s, $11.4s, $12.4s
+ add $15.4s, $15.4s, $16.4s
+ eor TMP0.16b, $2.16b, $3.16b
+ eor TMP1.16b, $6.16b, $7.16b
+ eor TMP2.16b, $10.16b, $11.16b
+ eor TMP3.16b, $14.16b, $15.16b
+ ushr $2.4s, TMP0.4s, #25
+ ushr $6.4s, TMP1.4s, #25
+ ushr $10.4s, TMP2.4s, #25
+ ushr $14.4s, TMP3.4s, #25
+ sli $2.4s, TMP0.4s, #7
+ sli $6.4s, TMP1.4s, #7
+ sli $10.4s, TMP2.4s, #7
+ sli $14.4s, TMP3.4s, #7
+')
+
+define(`TRANSPOSE',`
+ zip1 T0.4s, $1.4s, $3.4s C A0 A2 B0 B2
+ zip1 T1.4s, $2.4s, $4.4s C A1 A3 B1 B3
+ zip2 T2.4s, $1.4s, $3.4s C C0 C2 D0 D2
+ zip2 T3.4s, $2.4s, $4.4s C C1 C3 D1 D3
+
+ zip1 $1.4s, T0.4s, T1.4s C A0 A1 A2 A3
+ zip2 $2.4s, T0.4s, T1.4s C B0 B1 B2 B3
+ zip1 $3.4s, T2.4s, T3.4s C C0 C1 C2 C3
+ zip2 $4.4s, T2.4s, T3.4s C D0 D1 D2 D3
+')
+
+ C _chacha_4core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_4core)
+
+ mov w3, #1
+ dup TMP2.4s, w3 C Apply counter carries
+
+.Lshared_entry:
+
+ C Save callee-save registers
+ fmov x3, d8
+
+ adr x4, .Lcnts
+ ld1 {TMP3.4s,ROT24.4s},[x4]
+
+C Load state and splat
+ ld1 {v16.4s,v17.4s,v18.4s,v19.4s}, [SRC]
+
+ dup v20.4s, v16.s[1]
+ dup v24.4s, v16.s[2]
+ dup v28.4s, v16.s[3]
+ dup v16.4s, v16.s[0]
+ dup v21.4s, v17.s[1]
+ dup v25.4s, v17.s[2]
+ dup v29.4s, v17.s[3]
+ dup v17.4s, v17.s[0]
+ dup v22.4s, v18.s[1]
+ dup v26.4s, v18.s[2]
+ dup v30.4s, v18.s[3]
+ dup v18.4s, v18.s[0]
+ dup v23.4s, v19.s[1]
+ dup v27.4s, v19.s[2]
+ dup v31.4s, v19.s[3]
+ dup v19.4s, v19.s[0]
+
+ add v19.4s, v19.4s, TMP3.4s C low adds
+ cmhi TMP1.4s, TMP3.4s, v19.4s C compute carry-out
+ and TMP1.16b, TMP1.16b, TMP2.16b C mask carries: all-ones -> 1 for the 64-bit counter, 0 for the 32-bit variant
+ add v23.4s, v23.4s, TMP1.4s C apply carries
+
+ C Save the last row (words 12-15) of all four blocks.
+ mov T0.16b, v19.16b
+ mov T1.16b, v23.16b
+ mov T2.16b, v27.16b
+ mov T3.16b, v31.16b
+
+.Loop:
+ QR(v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31)
+ QR(v16, v21, v26, v31, v20, v25, v30, v19, v24, v29, v18, v23, v28, v17, v22, v27)
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ C Add in saved original words, including counters, before
+ C transpose.
+ add v19.4s, v19.4s, T0.4s
+ add v23.4s, v23.4s, T1.4s
+ add v27.4s, v27.4s, T2.4s
+ add v31.4s, v31.4s, T3.4s
+
+ TRANSPOSE(v16, v20,v24, v28)
+ TRANSPOSE(v17, v21, v25, v29)
+ TRANSPOSE(v18, v22, v26, v30)
+ TRANSPOSE(v19, v23, v27, v31)
+
+ ld1 {T0.4s,T1.4s,T2.4s}, [SRC]
+
+ add v16.4s, v16.4s, T0.4s
+ add v20.4s, v20.4s, T0.4s
+ add v24.4s, v24.4s, T0.4s
+ add v28.4s, v28.4s, T0.4s
+
+ add v17.4s, v17.4s, T1.4s
+ add v21.4s, v21.4s, T1.4s
+ add v25.4s, v25.4s, T1.4s
+ add v29.4s, v29.4s, T1.4s
+
+ add v18.4s, v18.4s, T2.4s
+ add v22.4s, v22.4s, T2.4s
+ add v26.4s, v26.4s, T2.4s
+ add v30.4s, v30.4s, T2.4s
+
+ st1 {v16.16b,v17.16b,v18.16b,v19.16b}, [DST], #64
+ st1 {v20.16b,v21.16b,v22.16b,v23.16b}, [DST], #64
+ st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [DST], #64
+ st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [DST]
+
+ C Restore callee-save registers
+ fmov d8, x3
+ ret
+EPILOGUE(_nettle_chacha_4core)
+
+PROLOGUE(_nettle_chacha_4core32)
+ eor TMP2.16b, TMP2.16b, TMP2.16b C Ignore counter carries
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_4core32)
+
+.align 4
+.Lcnts: .long 0,1,2,3 C increments
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
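
The counter setup above (the add/cmhi/and/add sequence) is the only place where the four blocks' inputs differ: word 12 is splatted, the offsets 0-3 from .Lcnts are added, and a lane-wise unsigned compare produces an all-ones mask wherever the 32-bit add wrapped, which the AND with TMP2 turns into +1 (64-bit counter) or 0 (32-bit variant) to add to word 13. A small, hedged C sketch of that step, with arrays standing in for the NEON lanes and an illustrative function name:

#include <stdint.h>

/* counter64 selects between _nettle_chacha_4core (carry into word 13)
   and _nettle_chacha_4core32 (word 12 wraps, no carry). */
static void
setup_4core_counters (uint32_t w12, uint32_t w13, int counter64,
                      uint32_t lane12[4], uint32_t lane13[4])
{
  static const uint32_t offset[4] = {0, 1, 2, 3};  /* .Lcnts */
  uint32_t keep = counter64 ? 1 : 0;               /* TMP2 */
  int i;

  for (i = 0; i < 4; i++)
    {
      lane12[i] = w12 + offset[i];                            /* low adds */
      uint32_t carry = (offset[i] > lane12[i]) ? 0xffffffffu  /* cmhi */
                                               : 0;
      lane13[i] = w13 + (carry & keep);                       /* apply carries */
    }
}
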
diff --git a/arm64/asimd/chacha-core-internal.asm b/arm64/asimd/chacha-core-internal.asm
new file mode 100644
index 00000000..da10ad13
--- /dev/null
+++ b/arm64/asimd/chacha-core-internal.asm
@@ -0,0 +1,126 @@
+C arm64/asimd/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `x0')
+define(`SRC', `x1')
+define(`ROUNDS', `x2')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+C Original input state
+define(`S0', `v4')
+define(`S1', `v5')
+define(`S2', `v6')
+define(`S3', `v7')
+
+define(`ROT24', `v16')
+
+define(`TMP', `v17')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ rev32 $4.8h, $4.8h
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #20
+ sli $2.4s, TMP.4s, #12
+
+ add $1.4s, $1.4s, $2.4s
+ eor $4.16b, $4.16b, $1.16b
+ tbl $4.16b, {$4.16b}, ROT24.16b
+
+ add $3.4s, $3.4s, $4.4s
+ eor TMP.16b, $2.16b, $3.16b
+ ushr $2.4s, TMP.4s, #25
+ sli $2.4s, TMP.4s, #7
+')
+
+ .text
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+PROLOGUE(_nettle_chacha_core)
+ adr x3, .Lrot24
+ ld1 {ROT24.4s},[x3]
+
+ ld1 {X0.4s,X1.4s,X2.4s,X3.4s}, [SRC]
+
+ mov S0.16b, X0.16b
+ mov S1.16b, X1.16b
+ mov S2.16b, X2.16b
+ mov S3.16b, X3.16b
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ ext X1.16b, X1.16b, X1.16b, #4
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #12
+
+ QROUND(X0, X1, X2, X3)
+
+ ext X1.16b, X1.16b, X1.16b, #12
+ ext X2.16b, X2.16b, X2.16b, #8
+ ext X3.16b, X3.16b, X3.16b, #4
+
+ subs ROUNDS, ROUNDS, #2
+ b.ne .Loop
+
+ add X0.4s, X0.4s, S0.4s
+ add X1.4s, X1.4s, S1.4s
+ add X2.4s, X2.4s, S2.4s
+ add X3.4s, X3.4s, S3.4s
+
+ st1 {X0.16b,X1.16b,X2.16b,X3.16b}, [DST]
+ ret
+EPILOGUE(_nettle_chacha_core)
+
+.align 4
+.Lrot24: .long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
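
The quarter round above implements the four ChaCha rotations three different ways: the 16-bit rotate as rev32 on half-words, the 12- and 7-bit rotates as ushr followed by sli, and the 8-bit rotate as a tbl byte shuffle through the .Lrot24 table. The following hedged, self-contained C snippet only checks that the byte indices encoded in .Lrot24 (3, 0, 1, 2 per little-endian lane) really perform a rotate-left-by-8 of a 32-bit word; it assumes a little-endian host, as on AArch64.

#include <assert.h>
#include <stdint.h>
#include <string.h>

static uint32_t
rotl8_via_tbl (uint32_t x)
{
  /* Per-lane byte indices from .Lrot24, least significant byte first. */
  static const uint8_t idx[4] = {3, 0, 1, 2};
  uint8_t in[4], out[4];
  uint32_t r;
  int i;
  memcpy (in, &x, 4);          /* little-endian lane */
  for (i = 0; i < 4; i++)
    out[i] = in[idx[i]];       /* what tbl does, byte by byte */
  memcpy (&r, out, 4);
  return r;
}

int
main (void)
{
  uint32_t x = 0x12345678;
  assert (rotl8_via_tbl (x) == ((x << 8) | (x >> 24)));
  return 0;
}
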
diff --git a/arm64/fat/chacha-2core.asm b/arm64/fat/chacha-2core.asm
new file mode 100644
index 00000000..cb1b95d5
--- /dev/null
+++ b/arm64/fat/chacha-2core.asm
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-2core.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_2core) picked up by configure
+
+include_src(`arm64/asimd/chacha-2core.asm')
diff --git a/arm64/fat/chacha-4core.asm b/arm64/fat/chacha-4core.asm
new file mode 100644
index 00000000..2d89e6a6
--- /dev/null
+++ b/arm64/fat/chacha-4core.asm
@@ -0,0 +1,36 @@
+C arm64/fat/chacha-4core.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_chacha_4core) picked up by configure
+
+include_src(`arm64/asimd/chacha-4core.asm')
diff --git a/arm64/fat/chacha-core-internal-2.asm b/arm64/fat/chacha-core-internal-2.asm
new file mode 100644
index 00000000..dad4c694
--- /dev/null
+++ b/arm64/fat/chacha-core-internal-2.asm
@@ -0,0 +1,37 @@
+C arm64/fat/chacha-core-internal-2.asm
+
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_arm64')
+include_src(`arm64/asimd/chacha-core-internal.asm')
diff --git a/configure.ac b/configure.ac
index da72f908..053535ba 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(arm64-asimd,
+ AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+ [enable_arm64_asimd=no])
+
AC_ARG_ENABLE(arm64-crypto,
AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
[enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="arm64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none aes pmull sha1 sha2"
+ FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
else
+ if test "$enable_arm64_asimd" = yes ; then
+ asm_path="arm64/asimd $asm_path"
+ fi
if test "$enable_arm64_crypto" = yes ; then
asm_path="arm64/crypto $asm_path"
fi
diff --git a/fat-arm64.c b/fat-arm64.c
index fcb2ece8..af3c98ed 100644
--- a/fat-arm64.c
+++ b/fat-arm64.c
@@ -74,6 +74,7 @@
struct arm64_features
{
+ int have_asimd;
int have_aes;
int have_pmull;
int have_sha1;
@@ -87,6 +88,7 @@ static void
get_arm64_features (struct arm64_features *features)
{
const char *s;
+ features->have_asimd = 0;
features->have_aes = 0;
features->have_pmull = 0;
features->have_sha1 = 0;
@@ -99,7 +101,9 @@ get_arm64_features (struct arm64_features *features)
const char *sep = strchr (s, ',');
size_t length = sep ? (size_t) (sep - s) : strlen(s);
- if (MATCH (s, length, "aes", 3))
+ if (MATCH (s, length, "asimd", 5))
+ features->have_asimd = 1;
+ else if (MATCH (s, length, "aes", 3))
features->have_aes = 1;
else if (MATCH (s, length, "pmull", 5))
features->have_pmull = 1;
@@ -115,6 +119,8 @@ get_arm64_features (struct arm64_features *features)
{
#if USE_GETAUXVAL
unsigned long hwcap = getauxval(AT_HWCAP);
+ features->have_asimd
+ = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
features->have_aes
= ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
features->have_pmull
@@ -166,6 +172,18 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, arm64);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -176,8 +194,9 @@ fat_init (void)
verbose = getenv (ENV_VERBOSE) != NULL;
if (verbose)
- fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
- features.have_aes ? " aes instructions" : "",
+ fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+ features.have_asimd ? " advanced simd" : "",
+ features.have_aes ? " aes instructions" : "",
features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
features.have_sha1 ? " sha1 instructions" : "",
features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +262,20 @@ fat_init (void)
{
_nettle_sha256_compress_vec = _nettle_sha256_compress_c;
}
+ if (features.have_asimd)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_arm64;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+ }
}
DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +323,21 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
(uint32_t *state, const uint8_t *input, const uint32_t *k),
(state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
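
As a standalone illustration of the runtime detection the fat dispatcher relies on above: on Linux/AArch64, Advanced SIMD support is reported through getauxval(AT_HWCAP) and the HWCAP_ASIMD bit, which is exactly the check added to get_arm64_features. A hedged sketch, assuming Linux-only headers; the messages are illustrative, not Nettle output.

#include <stdio.h>
#include <sys/auxv.h>     /* getauxval, AT_HWCAP */
#include <asm/hwcap.h>    /* HWCAP_ASIMD on AArch64 */

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);

  if ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD)
    printf ("asimd reported: the fat library would select the 4-core ChaCha path\n");
  else
    printf ("no asimd: the fat library falls back to the 1-core C ChaCha path\n");
  return 0;
}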