summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author	Mamone Tarsha <maamoun.tk@googlemail.com>	2022-01-18 19:29:32 +0200
committer	Mamone Tarsha <maamoun.tk@googlemail.com>	2022-01-18 19:29:32 +0200
commit	fb89ef522a58cbb8d116e99ac33fd971ff9bf825 (patch)
tree	0d86c2658f44e7eef13e2e273f174c52416a1dfb
parent	94228f87fac465bcc3cb36efb8a43ef27554f7e5 (diff)
download	nettle-arm64-poly1305.tar.gz
[Arm64] Optimize Poly1305 (branch: arm64-poly1305)
-rw-r--r--	Makefile.in	3
-rw-r--r--	arm64/asimd/poly1305-2core.asm	351
-rw-r--r--	arm64/fat/poly1305-2core.asm	35
-rw-r--r--	chacha-poly1305.c	5
-rw-r--r--	configure.ac	12
-rw-r--r--	fat-arm64.c	72
-rw-r--r--	fat-setup.h	4
-rw-r--r--	poly1305-aes.c	4
-rw-r--r--	poly1305-internal.h	13
-rw-r--r--	poly1305-update.c	114
10 files changed, 601 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in
index 0590c370..4fd02bf6 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -102,6 +102,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \
siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \
cnd-memcpy.c \
chacha-crypt.c chacha-core-internal.c \
+ poly1305-update.c \
chacha-poly1305.c chacha-poly1305-meta.c \
chacha-set-key.c chacha-set-nonce.c \
ctr.c ctr16.c des.c des3.c \
@@ -606,7 +607,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- arm64 arm64/crypto arm64/fat \
+ arm64 arm64/asimd arm64/crypto arm64/fat \
powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \
s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \
mkdir "$(distdir)/$$d" ; \
diff --git a/arm64/asimd/poly1305-2core.asm b/arm64/asimd/poly1305-2core.asm
new file mode 100644
index 00000000..d624cded
--- /dev/null
+++ b/arm64/asimd/poly1305-2core.asm
@@ -0,0 +1,351 @@
+C arm64/asimd/poly1305-2core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`CTX', `x0')
+define(`DATA', `x1')
+define(`LEN', `x2')
+define(`T4', `w3')
+
+C Working state
+define(`H0', `v1')
+define(`H1', `v2')
+define(`H2', `v3')
+define(`H3', `v4')
+define(`H4', `v0')
+
+define(`R0', `v16')
+define(`R1', `v17')
+define(`R2', `v18')
+define(`R3', `v19')
+define(`R4', `v20')
+
+define(`S1', `v21')
+define(`S2', `v22')
+define(`S3', `v23')
+define(`S4', `v24')
+
+define(`C0', `v25')
+define(`C1', `v26')
+define(`C2', `v27')
+define(`C3', `v28')
+define(`C4', `v29')
+
+define(`T4W', `v5')
+define(`MASK26', `v6')
+define(`H2TBL', `v7')
+
+C Multiply state by key of two horizontal parts and reduce both products
+define(`MUL_REDC', `
+ umull C0.2d, H0.2s, R0.2s
+ umull C1.2d, H1.2s, R0.2s
+ umull C2.2d, H2.2s, R0.2s
+ umull C3.2d, H3.2s, R0.2s
+ umull C4.2d, H4.2s, R0.2s
+
+ umlal C0.2d, H4.2s, S1.2s
+ umlal C1.2d, H0.2s, R1.2s
+ umlal C2.2d, H1.2s, R1.2s
+ umlal C3.2d, H2.2s, R1.2s
+ umlal C4.2d, H3.2s, R1.2s
+
+ umlal C0.2d, H3.2s, S2.2s
+ umlal C1.2d, H4.2s, S2.2s
+ umlal C2.2d, H0.2s, R2.2s
+ umlal C3.2d, H1.2s, R2.2s
+ umlal C4.2d, H2.2s, R2.2s
+
+ umlal C0.2d, H2.2s, S3.2s
+ umlal C1.2d, H3.2s, S3.2s
+ umlal C2.2d, H4.2s, S3.2s
+ umlal C3.2d, H0.2s, R3.2s
+ umlal C4.2d, H1.2s, R3.2s
+
+ umlal C0.2d, H1.2s, S4.2s
+ umlal C1.2d, H2.2s, S4.2s
+ umlal C2.2d, H3.2s, S4.2s
+ umlal C3.2d, H4.2s, S4.2s
+ umlal C4.2d, H0.2s, R4.2s
+
+ C -- Reduction phase --
+
+ C carry h0 -> h1
+ C carry h3 -> h4
+ ushr H1.2d, C0.2d, #26
+ ushr H4.2d, C3.2d, #26
+ add H1.2d, H1.2d, C1.2d
+ add H4.2d, H4.2d, C4.2d
+ and H0.16b, C0.16b, MASK26.16b
+ and H3.16b, C3.16b, MASK26.16b
+
+ C carry h1 -> h2
+ C carry h4 -> h0
+ ushr C1.2d, H1.2d, #26
+ ushr C4.2d, H4.2d, #26
+ add H2.2d, C2.2d, C1.2d
+ add H0.2d, H0.2d, C4.2d
+ and H1.16b, H1.16b, MASK26.16b
+ and H4.16b, H4.16b, MASK26.16b
+
+ C carry h4*4 -> h0
+ C carry h2 -> h3
+ shl C4.2d, C4.2d, #2
+ ushr C2.2d, H2.2d, #26
+ add H0.2d, H0.2d, C4.2d
+ add H3.2d, H3.2d, C2.2d
+ and H2.16b, H2.16b, MASK26.16b
+
+ C carry h0 -> h1
+ C carry h3 -> h4
+ ushr C0.2d, H0.2d, #26
+ ushr C3.2d, H3.2d, #26
+ add H1.2d, H1.2d, C0.2d
+ add H4.2d, H4.2d, C3.2d
+ and H0.16b, H0.16b, MASK26.16b
+ and H3.16b, H3.16b, MASK26.16b
+ ')
+
+ .text
+ C void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4)
+
+PROLOGUE(_nettle_poly1305_2core)
+ adr x4, .mask26
+ ld1 {MASK26.2d}, [x4]
+ adr x4, .h2tbl
+ ld1 {H2TBL.16b}, [x4]
+
+ C Shift and replicate T4 across vector
+ lsl T4, T4, #24
+ dup T4W.4s, T4
+
+ C In case the buffer has only two blocks, process them separately
+ cmp LEN, #32
+ b.eq L2B
+
+ C This procedure processes two blocks horizontally over vector
+ C registers. In order to keep two separate parts of state, we
+ C store the state in the first parts of the vector registers and
+ C initialize the second parts with zeros. For each iteration, two
+ C blocks are added to both parts and the state parts are multiplied
+ C by r^2, except for the last iteration where we multiply the first
+ C part of the state by r^2 and the second part by r. In this way we
+ C maintain the correct sequence of multiples for each multiplication
+ C of consecutive blocks.
+
+ C Load key and cached multiples
+ ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+ ld1 {R4.s}[0], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+
+ C -- Calculate r^2 = r*r ---
+
+ ins H0.s[0], R0.s[0]
+ ins H1.s[0], R1.s[0]
+ ins H2.s[0], R2.s[0]
+ ins H3.s[0], R3.s[0]
+ ins H4.s[0], R4.s[0]
+
+ MUL_REDC()
+
+ C Horizontally assign both parts of the key vectors to r^2
+ dup R0.4s, H0.s[0]
+ dup R1.4s, H1.s[0]
+ dup R2.4s, H2.s[0]
+ dup R3.4s, H3.s[0]
+ dup R4.4s, H4.s[0]
+
+ C Calculate S = R*5
+ shl S1.4s, R1.4s, #2
+ shl S2.4s, R2.4s, #2
+ shl S3.4s, R3.4s, #2
+ shl S4.4s, R4.4s, #2
+ add S1.4s, S1.4s, R1.4s
+ add S2.4s, S2.4s, R2.4s
+ add S3.4s, S3.4s, R3.4s
+ add S4.4s, S4.4s, R4.4s
+
+ C Initialize the second parts of the state with zeros
+ eor H0.16b, H0.16b, H0.16b
+ eor H1.16b, H1.16b, H1.16b
+ eor H2.16b, H2.16b, H2.16b
+ eor H3.16b, H3.16b, H3.16b
+ eor H4.16b, H4.16b, H4.16b
+
+ C Load state
+ ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ ld1 {H3.s}[0], [CTX]
+
+ C Iterate over every pair of blocks and exclude the final one.
+ sub LEN, LEN, #32
+L2B_loop:
+ C Load two blocks
+ ld1 {C3.16b, C4.16b}, [DATA], #32
+
+ C Permute the two blocks and line them horizontally
+ zip1 C0.2d, C3.2d, C4.2d
+ tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+ zip2 C4.2d, C3.2d, C4.2d
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ subs LEN, LEN, #32
+ b.ne L2B_loop
+
+ C Set the first part of key to r^2 and the second part to r
+ sub CTX, CTX, #52
+ ld4 {R0.s, R1.s, R2.s, R3.s}[1], [CTX], #16
+ ld1 {R4.s}[1], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[1], [CTX], #16
+
+ ld1 {C3.16b, C4.16b}, [DATA]
+
+ zip1 C0.2d, C3.2d, C4.2d
+ tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b
+ zip2 C4.2d, C3.2d, C4.2d
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ C Combine both state parts
+ dup C0.2d, H0.d[1]
+ dup C1.2d, H1.d[1]
+ dup C2.2d, H2.d[1]
+ dup C3.2d, H3.d[1]
+ dup C4.2d, H4.d[1]
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ b Ldone
+
+ C Process two blocks separately
+L2B:
+ ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16
+ ld1 {R4.s}[0], [CTX], #4
+ ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16
+ ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ ld1 {H3.s}[0], [CTX]
+ sub CTX, CTX, #16
+L1B_loop:
+ ld1 {C0.16b}, [DATA], #16
+
+ tbl C2.16b, { C0.16b }, H2TBL.16b
+ ext C4.16b, C0.16b, C0.16b, #8
+
+ ushr C1.2d, C0.2d, #26
+ ushr C2.2d, C2.2d, #4
+ ushr C3.2d, C4.2d, #14
+ ushr C4.2d, C4.2d, #40
+
+ and C0.16b, C0.16b, MASK26.16b
+ and C1.16b, C1.16b, MASK26.16b
+ and C2.16b, C2.16b, MASK26.16b
+ and C3.16b, C3.16b, MASK26.16b
+ orr C4.16b, C4.16b, T4W.16b
+
+ add H0.2d, H0.2d, C0.2d
+ add H1.2d, H1.2d, C1.2d
+ add H2.2d, H2.2d, C2.2d
+ add H3.2d, H3.2d, C3.2d
+ add H4.2d, H4.2d, C4.2d
+
+ xtn H0.2s, H0.2d
+ xtn H1.2s, H1.2d
+ xtn H2.2s, H2.2d
+ xtn H3.2s, H3.2d
+ xtn H4.2s, H4.2d
+
+ MUL_REDC()
+
+ subs LEN, LEN, #16
+ b.ne L1B_loop
+
+Ldone:
+ C Store state
+ st4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16
+ st1 {H3.s}[0], [CTX]
+
+ ret
+EPILOGUE(_nettle_poly1305_2core)
+
+.align 4
+.mask26: .quad 0x0000000003FFFFFF,0x0000000003FFFFFF
+.h2tbl: .byte 0x06,0x07,0x08,0x09,0x00,0x00,0x00,0x00,0x16,0x17,0x18,0x19,0x00,0x00,0x00,0x00
diff --git a/arm64/fat/poly1305-2core.asm b/arm64/fat/poly1305-2core.asm
new file mode 100644
index 00000000..f5486302
--- /dev/null
+++ b/arm64/fat/poly1305-2core.asm
@@ -0,0 +1,35 @@
+C arm64/fat/poly1305-2core.asm
+
+ifelse(`
+ Copyright (C) 2022 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_fat_poly1305_2core) picked up by configure
+
+include_src(`arm64/asimd/poly1305-2core.asm')
diff --git a/chacha-poly1305.c b/chacha-poly1305.c
index 7a423e1e..521b441a 100644
--- a/chacha-poly1305.c
+++ b/chacha-poly1305.c
@@ -90,14 +90,11 @@ chacha_poly1305_set_nonce (struct chacha_poly1305_ctx *ctx,
ctx->auth_size = ctx->data_size = ctx->index = 0;
}
-/* FIXME: Duplicated in poly1305-aes128.c */
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->poly1305, (data), 1)
-
static void
poly1305_update (struct chacha_poly1305_ctx *ctx,
size_t length, const uint8_t *data)
{
- MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+ ctx->index = _nettle_poly1305_update(&ctx->poly1305, ctx->block, ctx->index, length, data);
}
static void
diff --git a/configure.ac b/configure.ac
index da72f908..0b4a358c 100644
--- a/configure.ac
+++ b/configure.ac
@@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon,
AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),,
[enable_arm_neon=auto])
+AC_ARG_ENABLE(arm64-asimd,
+ AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),,
+ [enable_arm64_asimd=no])
+
AC_ARG_ENABLE(arm64-crypto,
AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. (default=no)]),,
[enable_arm64_crypto=no])
@@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="arm64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none aes pmull sha1 sha2"
+ FAT_TEST_LIST="none asimd aes pmull sha1 sha2"
else
+ if test "$enable_arm64_asimd" = yes ; then
+ asm_path="arm64/asimd $asm_path"
+ fi
if test "$enable_arm64_crypto" = yes ; then
asm_path="arm64/crypto $asm_path"
fi
@@ -597,6 +604,7 @@ asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm cpu-facility.asm
aes256-encrypt-2.asm aes256-decrypt-2.asm \
cbc-aes128-encrypt-2.asm cbc-aes192-encrypt-2.asm cbc-aes256-encrypt-2.asm \
chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \
+ poly1305-2core.asm \
salsa20-2core.asm salsa20-core-internal-2.asm \
sha1-compress-2.asm sha256-compress-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
@@ -730,6 +738,8 @@ AH_VERBATIM([HAVE_NATIVE],
#undef HAVE_NATIVE_fat_chacha_2core
#undef HAVE_NATIVE_fat_chacha_3core
#undef HAVE_NATIVE_fat_chacha_4core
+#undef HAVE_NATIVE_poly1305_2core
+#undef HAVE_NATIVE_fat_poly1305_2core
#undef HAVE_NATIVE_ecc_curve25519_modp
#undef HAVE_NATIVE_ecc_curve448_modp
#undef HAVE_NATIVE_ecc_secp192r1_modp
diff --git a/fat-arm64.c b/fat-arm64.c
index fcb2ece8..be3d0b1e 100644
--- a/fat-arm64.c
+++ b/fat-arm64.c
@@ -53,6 +53,7 @@
#include "aes.h"
#include "gcm.h"
#include "gcm-internal.h"
+#include "poly1305.h"
#include "fat-setup.h"
/* Defines from arch/arm64/include/uapi/asm/hwcap.h in Linux kernel */
@@ -74,6 +75,7 @@
struct arm64_features
{
+ int have_asimd;
int have_aes;
int have_pmull;
int have_sha1;
@@ -87,6 +89,7 @@ static void
get_arm64_features (struct arm64_features *features)
{
const char *s;
+ features->have_asimd = 0;
features->have_aes = 0;
features->have_pmull = 0;
features->have_sha1 = 0;
@@ -99,7 +102,9 @@ get_arm64_features (struct arm64_features *features)
const char *sep = strchr (s, ',');
size_t length = sep ? (size_t) (sep - s) : strlen(s);
- if (MATCH (s, length, "aes", 3))
+ if (MATCH (s, length, "asimd", 5))
+ features->have_asimd = 1;
+ else if (MATCH (s, length, "aes", 3))
features->have_aes = 1;
else if (MATCH (s, length, "pmull", 5))
features->have_pmull = 1;
@@ -115,6 +120,8 @@ get_arm64_features (struct arm64_features *features)
{
#if USE_GETAUXVAL
unsigned long hwcap = getauxval(AT_HWCAP);
+ features->have_asimd
+ = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD);
features->have_aes
= ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES));
features->have_pmull
@@ -166,6 +173,22 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c)
DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, asimd);
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core)
+DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core)
+
+DECLARE_FAT_FUNC(_nettle_poly1305_update, poly1305_update_func)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 1core)
+DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 2core)
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -176,8 +199,9 @@ fat_init (void)
verbose = getenv (ENV_VERBOSE) != NULL;
if (verbose)
- fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n",
- features.have_aes ? " aes instructions" : "",
+ fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n",
+ features.have_asimd ? " advanced simd" : "",
+ features.have_aes ? " aes instructions" : "",
features.have_pmull ? " polynomial multiply long instructions (PMULL/PMULL2)" : "",
features.have_sha1 ? " sha1 instructions" : "",
features.have_sha2 ? " sha2 instructions" : "");
@@ -243,6 +267,22 @@ fat_init (void)
{
_nettle_sha256_compress_vec = _nettle_sha256_compress_c;
}
+ if (features.have_asimd)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling advanced simd code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_asimd;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core;
+ _nettle_poly1305_update_vec = _nettle_poly1305_update_2core;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core;
+ nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core;
+ _nettle_poly1305_update_vec = _nettle_poly1305_update_1core;
+ }
}
DEFINE_FAT_FUNC(nettle_aes128_encrypt, void,
@@ -290,3 +330,29 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void,
DEFINE_FAT_FUNC(_nettle_sha256_compress, void,
(uint32_t *state, const uint8_t *input, const uint32_t *k),
(state, input, k))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(nettle_chacha_crypt32, void,
+ (struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src),
+ (ctx, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_poly1305_update, unsigned,
+ (struct poly1305_ctx *ctx,
+ uint8_t *block,
+ unsigned pos,
+ size_t length,
+ const uint8_t *data),
+ (ctx, block, pos, length, data))
diff --git a/fat-setup.h b/fat-setup.h
index 64b27244..677824ce 100644
--- a/fat-setup.h
+++ b/fat-setup.h
@@ -196,6 +196,10 @@ typedef void chacha_crypt_func(struct chacha_ctx *ctx,
uint8_t *dst,
const uint8_t *src);
+struct poly1305_ctx;
+typedef unsigned poly1305_update_func(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
+
struct aes128_ctx;
typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key);
typedef void aes128_invert_key_func (struct aes128_ctx *dst, const struct aes128_ctx *src);
diff --git a/poly1305-aes.c b/poly1305-aes.c
index a4050254..935ea638 100644
--- a/poly1305-aes.c
+++ b/poly1305-aes.c
@@ -56,13 +56,11 @@ poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx,
memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE);
}
-#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->pctx, (data), 1)
-
void
poly1305_aes_update (struct poly1305_aes_ctx *ctx,
size_t length, const uint8_t *data)
{
- MD_UPDATE (ctx, length, data, COMPRESS, (void) 0);
+ ctx->index = _nettle_poly1305_update(&ctx->pctx, ctx->block, ctx->index, length, data);
}
void
diff --git a/poly1305-internal.h b/poly1305-internal.h
index 9932d524..b55f19fe 100644
--- a/poly1305-internal.h
+++ b/poly1305-internal.h
@@ -53,6 +53,19 @@ void _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s)
/* Process one block. */
void _nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m,
unsigned high);
+unsigned _nettle_poly1305_update(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos, size_t length,
+ const uint8_t *data);
+
+/* Functions available only in some configurations */
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
+
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data);
#ifdef __cplusplus
}
diff --git a/poly1305-update.c b/poly1305-update.c
new file mode 100644
index 00000000..180aa169
--- /dev/null
+++ b/poly1305-update.c
@@ -0,0 +1,114 @@
+/* poly1305-update.c
+
+ Copyright (C) 2021 Mamone Tarsha
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+*/
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <string.h>
+
+#include "poly1305.h"
+#include "poly1305-internal.h"
+
+#if HAVE_NATIVE_poly1305_2core
+#define _nettle_poly1305_update_2core _nettle_poly1305_update
+#elif !HAVE_NATIVE_fat_poly1305_2core
+#define _nettle_poly1305_update_1core _nettle_poly1305_update
+#endif
+
+#if HAVE_NATIVE_poly1305_2core || HAVE_NATIVE_fat_poly1305_2core
+void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4);
+unsigned
+_nettle_poly1305_update_2core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data)
+{
+ if (pos)
+ {
+ if (pos + length < POLY1305_BLOCK_SIZE)
+ {
+ memcpy (block + pos, data, length);
+ return pos + length;
+ }
+ else
+ {
+ unsigned left = POLY1305_BLOCK_SIZE - pos;
+ memcpy (block + pos, data, left);
+ data += left;
+ length -= left;
+ _nettle_poly1305_block(ctx, block, 1);
+ }
+ }
+ if (length >= 2*POLY1305_BLOCK_SIZE)
+ {
+ size_t rlen = length & -(2*POLY1305_BLOCK_SIZE);
+ _nettle_poly1305_2core(ctx, data, rlen, 1);
+ data += rlen;
+ length -= rlen;
+ }
+ if (length >= POLY1305_BLOCK_SIZE)
+ {
+ _nettle_poly1305_block(ctx, data, 1);
+ data += POLY1305_BLOCK_SIZE;
+ length -= POLY1305_BLOCK_SIZE;
+ }
+ memcpy (block, data, length);
+ return length;
+}
+#endif
+#if !HAVE_NATIVE_poly1305_2core
+unsigned
+_nettle_poly1305_update_1core(struct poly1305_ctx *ctx,
+ uint8_t *block, unsigned pos,
+ size_t length, const uint8_t *data)
+{
+ if (pos)
+ {
+ if (pos + length < POLY1305_BLOCK_SIZE)
+ {
+ memcpy (block + pos, data, length);
+ return pos + length;
+ }
+ else
+ {
+ unsigned left = POLY1305_BLOCK_SIZE - pos;
+ memcpy (block + pos, data, left);
+ data += left;
+ length -= left;
+ _nettle_poly1305_block(ctx, block, 1);
+ }
+ }
+ for (; length >= POLY1305_BLOCK_SIZE; length -= POLY1305_BLOCK_SIZE, data += POLY1305_BLOCK_SIZE)
+ _nettle_poly1305_block(ctx, data, 1);
+ memcpy (block, data, length);
+ return length;
+}
+#endif