summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author	Mamone Tarsha <maamoun.tk@googlemail.com>	2022-01-18 19:29:32 +0200
committer	Mamone Tarsha <maamoun.tk@googlemail.com>	2022-01-18 19:29:32 +0200
commit	fb89ef522a58cbb8d116e99ac33fd971ff9bf825 (patch)
tree	0d86c2658f44e7eef13e2e273f174c52416a1dfb
parent	94228f87fac465bcc3cb36efb8a43ef27554f7e5 (diff)
download	nettle-arm64-poly1305.tar.gz
[Arm64] Optimize Poly1305 (branch: arm64-poly1305)
-rw-r--r--	Makefile.in	3
-rw-r--r--	arm64/asimd/poly1305-2core.asm	351
-rw-r--r--	arm64/fat/poly1305-2core.asm	35
-rw-r--r--	chacha-poly1305.c	5
-rw-r--r--	configure.ac	12
-rw-r--r--	fat-arm64.c	72
-rw-r--r--	fat-setup.h	4
-rw-r--r--	poly1305-aes.c	4
-rw-r--r--	poly1305-internal.h	13
-rw-r--r--	poly1305-update.c	114
10 files changed, 601 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in
index 0590c370..4fd02bf6 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -102,6 +102,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \
cnd-memcpy.c \
chacha-crypt.c chacha-core-internal.c \
+ poly1305-update.c \
chacha-poly1305.c chacha-poly1305-meta.c \
chacha-set-key.c chacha-set-nonce.c \
ctr.c ctr16.c des.c des3.c \
@@ -606,7 +607,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- arm64 arm64/crypto arm64/fat \
+ arm64 arm64/asimd arm64/crypto arm64/fat \
powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/poly1305-2core.asm b/arm64/asimd/poly1305-2core.asm
new file mode 100644
index 00000000..d624cded
--- /dev/null
+++ b/arm64/asimd/poly1305-2core.asm
@@ -0,0 +1,351 @@
+C arm64/asimd/poly1305-2core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`CTX', `x0')
+define(`DATA', `x1')
+define(`LEN', `x2')
+define(`T4', `w3')
+
+C Working state
+define(`H0', `v1')
+define(`H1', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v0')
+
+define(`R0', `v16')
+define(`R1', `v17')
+define(`R2', `v18')
+define(`R3', `v19')
+define(`R4', `v20')
+
+define(`S1', `v21')
+define(`S2', `v22')
+define(`S3', `v23')
+define(`S4', `v24')
+
+define(`C0', `v25')
+define(`C1', `v26')
+define(`C2', `v27')
+define(`C3', `v28')
+define(`C4', `v29')
+
+define(`T4W', `v5')
+define(`MASK26', `v6')
+define(`H2TBL', `v7')
+
+C Multiply state by key of two horizontal parts and reduce both products
+define(`MUL_REDC', `
+ umull C0.2d, H0.2s, R0.2s
+ umull C1.2d, H1.2s, R0.2s
+ umull C2.2d, H2.2s, R0.2s
+ umull C3.2d, H3.2s, R0.2s
+ umull C4.2d, H4.2s, R0.2s
+
+ umlal C0.2d, H4.2s, S1.2s
+ umlal C1.2d, H0.2s, R1.2s
+ umlal C2.2d, H1.2s, R1.2s
+ umlal C3.2d, H2.2s, R1.2s
+ umlal C4.2d, H3.2s, R1.2s
+
+ umlal C0.2d, H3.2s, S2.2s
+ umlal C1.2d, H4.2s, S2.2s
+ umlal C2.2d, H0.2s, R2.2s
+ umlal C3.2d, H1.2s, R2.2s
+ umlal C4.2d, H2.2s, R2.2s
+
+ umlal C0.2d, H2.2s, S3.2s
+ umlal C1.2d, H3.2s, S3.2s
+ umlal C2.2d, H4.2s, S3.2s
+ umlal C3.2d, H0.2s, R3.2s
+ umlal C4.2d, H1.2s, R3.2s
+
+ umlal C0.2d, H1.2s, S4.2s
+ umlal C1.2d, H2.2s, S4.2s
+ umlal C2.2d, H3.2s, S4.2s
+ umlal C3.2d, H4.2s, S4.2s
+ umlal C4.2d, H0.2s, R4.2s
+
+ C -- Reduction phase --
+
+ C carry h0 -> h1
+ C carry h3 -> h4
+ ushr H1.2d, C0.2d, #26
+ ushr H4.2d, C3.2d, #26
+ add H1.2d, H1.2d, C1.2d
+ add H4.2d, H4.2d, C4.2d
+ and H0.16b, C0.16b, MASK26.16b
+ and H3.16b, C3.16b, MASK26.16b
+
+ C carry h1 -> h2
+ C carry h4 -> h0
+ ushr C1.2d, H1.2d, #26
+ ushr C4.2d, H4.2d, #26
+ add H2.2d, C2.2d, C1.2d
+ add H0.2d, H0.2d, C4.2d
+ and H1.16b, H1.16b, MASK26.16b
+ and H4.16b, H4.16b, MASK26.16b
+
+ C carry h4*4 -> h0
+ C carry h2 -> h3
+ shl C4.2d, C4.2d, #2
+ ushr C2.2d, H2.2d, #26
+ add H0.2d, H0.2d, C4.2d
+ add H3.2d, H3.2d, C2.2d
+ and H2.16b, H2.16b, MASK26.16b
+
+ C carry h0 -> h1
+ C carry h3 -> h4
+ ushr C0.2d, H0.2d, #26
+ ushr C3.2d, H3.2d, #26
+ add H1.2d, H1.2d, C0.2d
+ add H4.2d, H4.2d, C3.2d
+ and H0.16b, H0.16b, MASK26.16b
+ and H3.16b, H3.16b, MASK26.16b
+ ')
+
+ .text
+ C void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4)
+
+PROLOGUE(_nettle_poly1305_2core)
+ adr x4, .mask26
+ ld1 {MASK26.2d}, [x4]
+ adr x4, .h2tbl
+ ld1 {H2TBL.16b}, [x4]
+
+ C Shift and replicate T4 across vector
+ lsl T4, T4, #24
+ dup T4W.4s, T4
+
+ C In case the buffer has only two blocks, process them separately
+ cmp LEN, #32
+ b.eq L2B
+
+ C This procedure processes two blocks horizontally over vector
+ C registers. In order to keep two separate parts of state, we
+ C store the state in the first parts of the vector registers and
+ C initialize the second parts with zeros. For each iteration, two
+ C blocks are added to both parts and the state parts are multiplied
+ C by r^2, except for the last iteration where we multiply the first
+ C part of the state by r^2 and the second part by r. In this way we
+ C maintain the correct sequence of multiples for each multiplication
+ C of consecutive blocks.
+
+ C Load key and cached multiples
+ ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+ ld1 {R4.s}[0], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+
+ C -- Calculate r^2 = r*r ---
+
+ ins H0.s[0], R0.s[0]
+ ins H1.s[0], R1.s[0]
+ ins H2.s[0], R2.s[0]
+ ins H3.s[0], R3.s[0]
+ ins H4.s[0], R4.s[0]
+
+ MUL_REDC()
+
+ C Horizontally assign both parts of the key vectors to r^2
+ dup R0.4s, H0.s[0]
+ dup R1.4s, H1.s[0]
+ dup R2.4s, H2.s[0]
+ dup R3.4s, H3.s[0]
+ dup R4.4s, H4.s[0]
+
+ C Calculate S = R*5
+ shl S1.4s, R1.4s, #2
+ shl S2.4s, R2.4s, #2
+ shl S3.4s, R3.4s, #2
+ shl S4.4s, R4.4s, #2
+ add S1.4s, S1.4s, R1.4s
+ add S2.4s, S2.4s, R2.4s
+ add S3.4s, S3.4s, R3.4s
+ add S4.4s, S4.4s, R4.4s
+
+ C Initialize the second parts of the state with zeros
+ eor H0.16b, H0.16b, H0.16b
+ eor H1.16b, H1.16b, H1.16b
+ eor H2.16b, H2.16b, H2.16b
+ eor H3.16b, H3.16b, H3.16b
+ eor H4.16b, H4.16b, H4.16b
+
+ C Load state
+ ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ ld1 {H3.s}[0], [CTX]
+
+ C Iterate over every pair of blocks and exclude the final one.
+ sub LEN, LEN, #32
+L2B_loop:
+ C Load two blocks
+ ld1 {C3.16b, C4.16b}, [DATA], #32
+
+ C Permute the two blocks and line them horizontally
+ zip1 C0.2d, C3.2d, C4.2d
+ tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+ zip2 C4.2d, C3.2d, C4.2d
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ subs LEN, LEN, #32
+ b.ne L2B_loop
+
+ C Set the first part of key to r^2 and the second part to r
+ sub CTX, CTX, #52
+ ld4 {R0.s, R1.s, R2.s, R3.s}[1], [CTX], #16
+ ld1 {R4.s}[1], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[1], [CTX], #16
+
+ ld1 {C3.16b, C4.16b}, [DATA]
+
+ zip1 C0.2d, C3.2d, C4.2d
+ tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+ zip2 C4.2d, C3.2d, C4.2d
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ C Combine both state parts
+ dup C0.2d, H0.d[1]
+ dup C1.2d, H1.d[1]
+ dup C2.2d, H2.d[1]
+ dup C3.2d, H3.d[1]
+ dup C4.2d, H4.d[1]
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ b Ldone
+
+ C Process two blocks separately
+L2B:
+ ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+ ld1 {R4.s}[0], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+ ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ ld1 {H3.s}[0], [CTX]
+ sub CTX, CTX, #16
+L1B_loop:
+ ld1 {C0.16b}, [DATA], #16
+
+ tbl C2.16b, { C0.16b }, H2TBL.16b
+ ext C4.16b, C0.16b, C0.16b, #8
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ subs LEN, LEN, #16
+ b.ne L1B_loop
+
+Ldone:
+ C Store state
+ st4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ st1 {H3.s}[0], [CTX]
+
+ ret
+EPILOGUE(_nettle_poly1305_2core)
+
+.align 4
+.mask26: .quad 0x0000000003FFFFFF,0x0000000003FFFFFF
+.h2tbl: .byte 0x06,0x07,0x08,0x09,0x00,0x00,0x00,0x00,0x16,0x17,0x18,0x19,0x00,0x00,0x00,0x00
diff --git a/arm64/fat/poly1305-2core.asm b/arm64/fat/poly1305-2core.asm
new file mode 100644
index 00000000..f5486302
--- /dev/null
+++ b/arm64/fat/poly1305-2core.asm
@@ -0,0 +1,35 @@
+C arm64/fat/poly1305-2core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_poly1305_2core) picked up by configure
+
+include_src(`arm64/asimd/poly1305-2core.asm')
diff --git a/chacha-poly1305.c b/chacha-poly1305.c
index 7a423e1e..521b441a 100644
--- a/chacha-poly1305.c
+++ b/chacha-poly1305.c
@@ -90,14 +90,11 @@ chacha_poly1305_set_nonce (struct chacha_poly1305_ctx *ctx,
ctx->auth_size = ctx->data_size = ctx->index = 0;
}
-/* FIXME: Duplicated in poly1305-aes128.c */
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->poly1305, (data), 1)
-
static void
poly1305_update (struct chacha_poly1305_ctx *ctx,
size_t length, const uint8_t *data)
{
- MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+ ctx->index = _nettle_poly1305_update(&ctx->poly1305, ctx->block, ctx->index, length, data);
}
static void
diff --git a/configure.ac b/configure.ac
index da72f908..0b4a358c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(arm64-asimd,
+ AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+ [enable_arm64_asimd=no])
+
AC_ARG_ENABLE(arm64-crypto,
AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
[enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="arm64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none aes pmull sha1 sha2"
+ FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
else
+ if test "$enable_arm64_asimd" = yes ; then
+ asm_path="arm64/asimd $asm_path"
+ fi
if test "$enable_arm64_crypto" = yes ; then
asm_path="arm64/crypto $asm_path"
fi
@@ -597,6 +604,7 @@ asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm cpu-facility.asm
aes256-encrypt-2.asm aes256-decrypt-2.asm \
cbc-aes128-encrypt-2.asm cbc-aes192-encrypt-2.asm cbc-aes256-encrypt-2.asm \
chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+ poly1305-2core.asm \
salsa20-2core.asm salsa20-core-internal-2.asm \
sha1-compress-2.asm sha256-compress-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
@@ -730,6 +738,8 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_fat_chacha_2core
#undef HAVE_NATIVE_fat_chacha_3core
#undef HAVE_NATIVE_fat_chacha_4core
+#undef HAVE_NATIVE_poly1305_2core
+#undef HAVE_NATIVE_fat_poly1305_2core
#undef HAVE_NATIVE_ecc_curve25519_modp
#undef HAVE_NATIVE_ecc_curve448_modp
#undef HAVE_NATIVE_ecc_secp192r1_modp
diff --git a/fat-arm64.c b/fat-arm64.c
index fcb2ece8..be3d0b1e 100644
--- a/fat-arm64.c
+++ b/fat-arm64.c
@@ -53,6 +53,7 @@
#include "aes.h"
#include "gcm.h"
#include "gcm-internal.h"
+#include "poly1305.h"
#include "fat-setup.h"
/* Defines from arch/arm64/include/uapi/asm/hwcap.h in Linux kernel */
@@ -74,6 +75,7 @@
struct arm64_features
{
+ int have_asimd;
int have_aes;
int have_pmull;
int have_sha1;
@@ -87,6 +89,7 @@ static void
get_arm64_features (struct arm64_features *features)
{
const char *s;
+ features->have_asimd = 0;
features->have_aes = 0;
features->have_pmull = 0;
features->have_sha1 = 0;
@@ -99,7 +102,9 @@ get_arm64_features (struct arm64_features *features)
const char *sep = strchr (s, ',');
size_t length = sep ? (size_t) (sep - s) : strlen(s);
- if (MATCH (s, length, "aes", 3))
+ if (MATCH (s, length, "asimd", 5))
+ features->have_asimd = 1;
+ else if (MATCH (s, length, "aes", 3))
features->have_aes = 1;
else if (MATCH (s, length, "pmull", 5))
features->have_pmull = 1;
@@ -115,6 +120,8 @@ get_arm64_features (struct arm64_features *features)
{
#if USE_GETAUXVAL
unsigned long hwcap = getauxval(AT_HWCAP);
+ features->have_asimd
+ = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
features->have_aes
= ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
features->have_pmull
@@ -166,6 +173,22 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, asimd);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(_nettle_poly1305_update, poly1305_update_func)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 1core)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 2core)
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -176,8 +199,9 @@ fat_init (void)
verbose = getenv (ENV_VERBOSE) != NULL;
if (verbose)
- fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
- features.have_aes ? " aes instructions" : "",
+ fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+ features.have_asimd ? " advanced simd" : "",
+ features.have_aes ? " aes instructions" : "",
features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
features.have_sha1 ? " sha1 instructions" : "",
features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +267,22 @@ fat_init (void)
{
_nettle_sha256_compress_vec = _nettle_sha256_compress_c;
}
+ if (features.have_asimd)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_asimd;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+ _nettle_poly1305_update_vec = _nettle_poly1305_update_2core;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+ _nettle_poly1305_update_vec = _nettle_poly1305_update_1core;
+ }
}
DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +330,29 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
(uint32_t *state, const uint8_t *input, const uint32_t *k),
(state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_poly1305_update, unsigned,
+ (struct poly1305_ctx *ctx,
+ uint8_t *block,
+ unsigned pos,
+ size_t length,
+ const uint8_t *data),
+ (ctx, block, pos, length, data))
diff --git a/fat-setup.h b/fat-setup.h
index 64b27244..677824ce 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -196,6 +196,10 @@ typedef void chacha_crypt_func(struct chacha_ctx *ctx,
uint8_t *dst,
const uint8_t *src);
+struct poly1305_ctx;
+typedef unsigned poly1305_update_func(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
+
struct aes128_ctx;
typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key);
typedef void aes128_invert_key_func (struct aes128_ctx *dst, const struct aes128_ctx *src);
diff --git a/poly1305-aes.c b/poly1305-aes.c
index a4050254..935ea638 100644
--- a/poly1305-aes.c
+++ b/poly1305-aes.c
@@ -56,13 +56,11 @@ poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE);
}
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->pctx, (data), 1)
-
void
poly1305_aes_update (struct poly1305_aes_ctx *ctx,
size_t length, const uint8_t *data)
{
- MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+ ctx->index = _nettle_poly1305_update(&ctx->pctx, ctx->block, ctx->index, length, data);
}
void
diff --git a/poly1305-internal.h b/poly1305-internal.h
index 9932d524..b55f19fe 100644
--- a/poly1305-internal.h
+++ b/poly1305-internal.h
@@ -53,6 +53,19 @@ void _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
/* Process one block. */
void _nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
unsigned high);
+unsigned _nettle_poly1305_update(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos, size_t length,
+ const uint8_t *data);
+
+/* Functions available only in some configurations */
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
+
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
#ifdef __cplusplus
}
diff --git a/poly1305-update.c b/poly1305-update.c
new file mode 100644
index 00000000..180aa169
--- /dev/null
+++ b/poly1305-update.c
@@ -0,0 +1,114 @@
+/* poly1305-update.c
+
+ Copyright (C) 2021 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <string.h>
+
+#include "poly1305.h"
+#include "poly1305-internal.h"
+
+#if HAVE_NATIVE_poly1305_2core
+#define _nettle_poly1305_update_2core _nettle_poly1305_update
+#elif !HAVE_NATIVE_fat_poly1305_2core
+#define _nettle_poly1305_update_1core _nettle_poly1305_update
+#endif
+
+#if HAVE_NATIVE_poly1305_2core || HAVE_NATIVE_fat_poly1305_2core
+void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4);
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data)
+{
+ if (pos)
+ {
+ if (pos + length < POLY1305_BLOCK_SIZE)
+ {
+ memcpy (block + pos, data, length);
+ return pos + length;
+ }
+ else
+ {
+ unsigned left = POLY1305_BLOCK_SIZE - pos;
+ memcpy (block + pos, data, left);
+ data += left;
+ length -= left;
+ _nettle_poly1305_block(ctx, block, 1);
+ }
+ }
+ if (length >= 2*POLY1305_BLOCK_SIZE)
+ {
+ size_t rlen = length & -(2*POLY1305_BLOCK_SIZE);
+ _nettle_poly1305_2core(ctx, data, rlen, 1);
+ data += rlen;
+ length -= rlen;
+ }
+ if (length >= POLY1305_BLOCK_SIZE)
+ {
+ _nettle_poly1305_block(ctx, data, 1);
+ data += POLY1305_BLOCK_SIZE;
+ length -= POLY1305_BLOCK_SIZE;
+ }
+ memcpy (block, data, length);
+ return length;
+}
+#endif
+#if !HAVE_NATIVE_poly1305_2core
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data)
+{
+ if (pos)
+ {
+ if (pos + length < POLY1305_BLOCK_SIZE)
+ {
+ memcpy (block + pos, data, length);
+ return pos + length;
+ }
+ else
+ {
+ unsigned left = POLY1305_BLOCK_SIZE - pos;
+ memcpy (block + pos, data, left);
+ data += left;
+ length -= left;
+ _nettle_poly1305_block(ctx, block, 1);
+ }
+ }
+ for (; length >= POLY1305_BLOCK_SIZE; length -= POLY1305_BLOCK_SIZE, data += POLY1305_BLOCK_SIZE)
+ _nettle_poly1305_block(ctx, data, 1);
+ memcpy (block, data, length);
+ return length;
+}
+#endif