diff options
author | Mamone Tarsha <maamoun.tk@googlemail.com> | 2022-01-18 19:29:32 +0200 |
---|---|---|
committer | Mamone Tarsha <maamoun.tk@googlemail.com> | 2022-01-18 19:29:32 +0200 |
commit | fb89ef522a58cbb8d116e99ac33fd971ff9bf825 (patch) | |
tree | 0d86c2658f44e7eef13e2e273f174c52416a1dfb | |
parent | 94228f87fac465bcc3cb36efb8a43ef27554f7e5 (diff) | |
download | nettle-arm64-poly1305.tar.gz |
[Arm64] Optimize Poly1305 (branch: arm64-poly1305)
-rw-r--r-- | Makefile.in | 3 | ||||
-rw-r--r-- | arm64/asimd/poly1305-2core.asm | 351 | ||||
-rw-r--r-- | arm64/fat/poly1305-2core.asm | 35 | ||||
-rw-r--r-- | chacha-poly1305.c | 5 | ||||
-rw-r--r-- | configure.ac | 12 | ||||
-rw-r--r-- | fat-arm64.c | 72 | ||||
-rw-r--r-- | fat-setup.h | 4 | ||||
-rw-r--r-- | poly1305-aes.c | 4 | ||||
-rw-r--r-- | poly1305-internal.h | 13 | ||||
-rw-r--r-- | poly1305-update.c | 114 |
10 files changed, 601 insertions, 12 deletions
diff --git a/Makefile.in b/Makefile.in index 0590c370..4fd02bf6 100644 --- a/Makefile.in +++ b/Makefile.in @@ -102,6 +102,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \ siv-cmac.c siv-cmac-aes128.c siv-cmac-aes256.c \ cnd-memcpy.c \ chacha-crypt.c chacha-core-internal.c \ + poly1305-update.c \ chacha-poly1305.c chacha-poly1305-meta.c \ chacha-set-key.c chacha-set-nonce.c \ ctr.c ctr16.c des.c des3.c \ @@ -606,7 +607,7 @@ distdir: $(DISTFILES) set -e; for d in sparc32 sparc64 x86 \ x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \ arm arm/neon arm/v6 arm/fat \ - arm64 arm64/crypto arm64/fat \ + arm64 arm64/asimd arm64/crypto arm64/fat \ powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat \ s390x s390x/vf s390x/msa s390x/msa_x1 s390x/msa_x2 s390x/msa_x4 s390x/fat ; do \ mkdir "$(distdir)/$$d" ; \ diff --git a/arm64/asimd/poly1305-2core.asm b/arm64/asimd/poly1305-2core.asm new file mode 100644 index 00000000..d624cded --- /dev/null +++ b/arm64/asimd/poly1305-2core.asm @@ -0,0 +1,351 @@ +C arm64/asimd/poly1305-2core.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. 
If + not, see http://www.gnu.org/licenses/. +') + +C Register usage: + +C Argments +define(`CTX', `x0') +define(`DATA', `x1') +define(`LEN', `x2') +define(`T4', `w3') + +C Working state +define(`H0', `v1') +define(`H1', `v2') +define(`H2', `v3') +define(`H3', `v4') +define(`H4', `v0') + +define(`R0', `v16') +define(`R1', `v17') +define(`R2', `v18') +define(`R3', `v19') +define(`R4', `v20') + +define(`S1', `v21') +define(`S2', `v22') +define(`S3', `v23') +define(`S4', `v24') + +define(`C0', `v25') +define(`C1', `v26') +define(`C2', `v27') +define(`C3', `v28') +define(`C4', `v29') + +define(`T4W', `v5') +define(`MASK26', `v6') +define(`H2TBL', `v7') + +C Multiply state by key of two horizontal parts and reduce both products +define(`MUL_REDC', ` + umull C0.2d, H0.2s, R0.2s + umull C1.2d, H1.2s, R0.2s + umull C2.2d, H2.2s, R0.2s + umull C3.2d, H3.2s, R0.2s + umull C4.2d, H4.2s, R0.2s + + umlal C0.2d, H4.2s, S1.2s + umlal C1.2d, H0.2s, R1.2s + umlal C2.2d, H1.2s, R1.2s + umlal C3.2d, H2.2s, R1.2s + umlal C4.2d, H3.2s, R1.2s + + umlal C0.2d, H3.2s, S2.2s + umlal C1.2d, H4.2s, S2.2s + umlal C2.2d, H0.2s, R2.2s + umlal C3.2d, H1.2s, R2.2s + umlal C4.2d, H2.2s, R2.2s + + umlal C0.2d, H2.2s, S3.2s + umlal C1.2d, H3.2s, S3.2s + umlal C2.2d, H4.2s, S3.2s + umlal C3.2d, H0.2s, R3.2s + umlal C4.2d, H1.2s, R3.2s + + umlal C0.2d, H1.2s, S4.2s + umlal C1.2d, H2.2s, S4.2s + umlal C2.2d, H3.2s, S4.2s + umlal C3.2d, H4.2s, S4.2s + umlal C4.2d, H0.2s, R4.2s + + C -- Reduction phase -- + + C carry h0 -> h1 + C carry h3 -> h4 + ushr H1.2d, C0.2d, #26 + ushr H4.2d, C3.2d, #26 + add H1.2d, H1.2d, C1.2d + add H4.2d, H4.2d, C4.2d + and H0.16b, C0.16b, MASK26.16b + and H3.16b, C3.16b, MASK26.16b + + C carry h1 -> h2 + C carry h4 -> h0 + ushr C1.2d, H1.2d, #26 + ushr C4.2d, H4.2d, #26 + add H2.2d, C2.2d, C1.2d + add H0.2d, H0.2d, C4.2d + and H1.16b, H1.16b, MASK26.16b + and H4.16b, H4.16b, MASK26.16b + + C carry h4*4 -> h0 + C carry h2 -> h3 + shl C4.2d, C4.2d, #2 + ushr C2.2d, H2.2d, #26 + 
add H0.2d, H0.2d, C4.2d + add H3.2d, H3.2d, C2.2d + and H2.16b, H2.16b, MASK26.16b + + C carry h0 -> h1 + C carry h3 -> h4 + ushr C0.2d, H0.2d, #26 + ushr C3.2d, H3.2d, #26 + add H1.2d, H1.2d, C0.2d + add H4.2d, H4.2d, C3.2d + and H0.16b, H0.16b, MASK26.16b + and H3.16b, H3.16b, MASK26.16b + ') + + .text + C void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4) + +PROLOGUE(_nettle_poly1305_2core) + adr x4, .mask26 + ld1 {MASK26.2d}, [x4] + adr x4, .h2tbl + ld1 {H2TBL.16b}, [x4] + + C Shift and replicate T4 across vector + lsl T4, T4, #24 + dup T4W.4s, T4 + + C In case the buffer has only two blocks, process them separately + cmp LEN, #32 + b.eq L2B + + C This procedure processes two blocks horizontally over vector + C registers. In order to keep two separated parts of state, we + C store the state in the first parts of vector reigters and + C initialize the second part with zeros. For each iteration, two + C blocks would be added to both parts and multiply the state parts + C by r^2 except for the last iteration we multiply the first part + C of state by r^2 and the second part by r. In this way we can + C maintain the correct sequence of multiples for each mutiplication + C of consecutive blocks. 
+ + C Load key and cached multiples + ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], #16 + ld1 {R4.s}[0], [CTX], #4 + ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16 + + C -- Calculate r^2 = r*r --- + + ins H0.s[0], R0.s[0] + ins H1.s[0], R1.s[0] + ins H2.s[0], R2.s[0] + ins H3.s[0], R3.s[0] + ins H4.s[0], R4.s[0] + + MUL_REDC() + + C Horizontally asssign two parts of key vectors to r^2 + dup R0.4s, H0.s[0] + dup R1.4s, H1.s[0] + dup R2.4s, H2.s[0] + dup R3.4s, H3.s[0] + dup R4.4s, H4.s[0] + + C Calculate S = R*5 + shl S1.4s, R1.4s, #2 + shl S2.4s, R2.4s, #2 + shl S3.4s, R3.4s, #2 + shl S4.4s, R4.4s, #2 + add S1.4s, S1.4s, R1.4s + add S2.4s, S2.4s, R2.4s + add S3.4s, S3.4s, R3.4s + add S4.4s, S4.4s, R4.4s + + C initialize the second parts of state with zeros + eor H0.16b, H0.16b, H0.16b + eor H1.16b, H1.16b, H1.16b + eor H2.16b, H2.16b, H2.16b + eor H3.16b, H3.16b, H3.16b + eor H4.16b, H4.16b, H4.16b + + C Load state + ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16 + ld1 {H3.s}[0], [CTX] + + C Iterate over every pair of blocks and exclude the final one. 
+ sub LEN, LEN, #32 +L2B_loop: + C Load two blocks + ld1 {C3.16b, C4.16b}, [DATA], #32 + + C Permute the two blocks and line them horizontally + zip1 C0.2d, C3.2d, C4.2d + tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b + zip2 C4.2d, C3.2d, C4.2d + + ushr C1.2d, C0.2d, #26 + ushr C2.2d, C2.2d, #4 + ushr C3.2d, C4.2d, #14 + ushr C4.2d, C4.2d, #40 + + and C0.16b, C0.16b, MASK26.16b + and C1.16b, C1.16b, MASK26.16b + and C2.16b, C2.16b, MASK26.16b + and C3.16b, C3.16b, MASK26.16b + orr C4.16b, C4.16b, T4W.16b + + add H0.2d, H0.2d, C0.2d + add H1.2d, H1.2d, C1.2d + add H2.2d, H2.2d, C2.2d + add H3.2d, H3.2d, C3.2d + add H4.2d, H4.2d, C4.2d + + xtn H0.2s, H0.2d + xtn H1.2s, H1.2d + xtn H2.2s, H2.2d + xtn H3.2s, H3.2d + xtn H4.2s, H4.2d + + MUL_REDC() + + subs LEN, LEN, #32 + b.ne L2B_loop + + C Set the first part of key to r^2 and the second part to r + sub CTX, CTX, #52 + ld4 {R0.s, R1.s, R2.s, R3.s}[1], [CTX], #16 + ld1 {R4.s}[1], [CTX], #4 + ld4 {S1.s, S2.s, S3.s, S4.s}[1], [CTX], #16 + + ld1 {C3.16b, C4.16b}, [DATA] + + zip1 C0.2d, C3.2d, C4.2d + tbl C2.16b, { C3.16b, C4.16b }, H2TBL.16b + zip2 C4.2d, C3.2d, C4.2d + + ushr C1.2d, C0.2d, #26 + ushr C2.2d, C2.2d, #4 + ushr C3.2d, C4.2d, #14 + ushr C4.2d, C4.2d, #40 + + and C0.16b, C0.16b, MASK26.16b + and C1.16b, C1.16b, MASK26.16b + and C2.16b, C2.16b, MASK26.16b + and C3.16b, C3.16b, MASK26.16b + orr C4.16b, C4.16b, T4W.16b + + add H0.2d, H0.2d, C0.2d + add H1.2d, H1.2d, C1.2d + add H2.2d, H2.2d, C2.2d + add H3.2d, H3.2d, C3.2d + add H4.2d, H4.2d, C4.2d + + xtn H0.2s, H0.2d + xtn H1.2s, H1.2d + xtn H2.2s, H2.2d + xtn H3.2s, H3.2d + xtn H4.2s, H4.2d + + MUL_REDC() + + C Combine both state parts + dup C0.2d, H0.d[1] + dup C1.2d, H1.d[1] + dup C2.2d, H2.d[1] + dup C3.2d, H3.d[1] + dup C4.2d, H4.d[1] + + add H0.2d, H0.2d, C0.2d + add H1.2d, H1.2d, C1.2d + add H2.2d, H2.2d, C2.2d + add H3.2d, H3.2d, C3.2d + add H4.2d, H4.2d, C4.2d + + b Ldone + + C Process two blocks separately +L2B: + ld4 {R0.s, R1.s, R2.s, R3.s}[0], [CTX], 
#16 + ld1 {R4.s}[0], [CTX], #4 + ld4 {S1.s, S2.s, S3.s, S4.s}[0], [CTX], #16 + ld4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16 + ld1 {H3.s}[0], [CTX] + sub CTX, CTX, #16 +L1B_loop: + ld1 {C0.16b}, [DATA], #16 + + tbl C2.16b, { C0.16b }, H2TBL.16b + ext C4.16b, C0.16b, C0.16b, #8 + + ushr C1.2d, C0.2d, #26 + ushr C2.2d, C2.2d, #4 + ushr C3.2d, C4.2d, #14 + ushr C4.2d, C4.2d, #40 + + and C0.16b, C0.16b, MASK26.16b + and C1.16b, C1.16b, MASK26.16b + and C2.16b, C2.16b, MASK26.16b + and C3.16b, C3.16b, MASK26.16b + orr C4.16b, C4.16b, T4W.16b + + add H0.2d, H0.2d, C0.2d + add H1.2d, H1.2d, C1.2d + add H2.2d, H2.2d, C2.2d + add H3.2d, H3.2d, C3.2d + add H4.2d, H4.2d, C4.2d + + xtn H0.2s, H0.2d + xtn H1.2s, H1.2d + xtn H2.2s, H2.2d + xtn H3.2s, H3.2d + xtn H4.2s, H4.2d + + MUL_REDC() + + subs LEN, LEN, #16 + b.ne L1B_loop + +Ldone: + C Store state + st4 {H4.s, H0.s, H1.s, H2.s}[0], [CTX], #16 + st1 {H3.s}[0], [CTX] + + ret +EPILOGUE(_nettle_poly1305_2core) + +.align 4 +.mask26: .quad 0x0000000003FFFFFF,0x0000000003FFFFFF +.h2tbl: .byte 0x06,0x07,0x08,0x09,0x00,0x00,0x00,0x00,0x16,0x17,0x18,0x19,0x00,0x00,0x00,0x00 diff --git a/arm64/fat/poly1305-2core.asm b/arm64/fat/poly1305-2core.asm new file mode 100644 index 00000000..f5486302 --- /dev/null +++ b/arm64/fat/poly1305-2core.asm @@ -0,0 +1,35 @@ +C arm64/fat/poly1305-2core.asm + +ifelse(` + Copyright (C) 2022 Mamone Tarsha + + This file is part of GNU Nettle. + + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. 
+ + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +') + +dnl PROLOGUE(_nettle_fat_poly1305_2core) picked up by configure + +include_src(`arm64/asimd/poly1305-2core.asm') diff --git a/chacha-poly1305.c b/chacha-poly1305.c index 7a423e1e..521b441a 100644 --- a/chacha-poly1305.c +++ b/chacha-poly1305.c @@ -90,14 +90,11 @@ chacha_poly1305_set_nonce (struct chacha_poly1305_ctx *ctx, ctx->auth_size = ctx->data_size = ctx->index = 0; } -/* FIXME: Duplicated in poly1305-aes128.c */ -#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->poly1305, (data), 1) - static void poly1305_update (struct chacha_poly1305_ctx *ctx, size_t length, const uint8_t *data) { - MD_UPDATE (ctx, length, data, COMPRESS, (void) 0); + ctx->index = _nettle_poly1305_update(&ctx->poly1305, ctx->block, ctx->index, length, data); } static void diff --git a/configure.ac b/configure.ac index da72f908..0b4a358c 100644 --- a/configure.ac +++ b/configure.ac @@ -81,6 +81,10 @@ AC_ARG_ENABLE(arm-neon, AC_HELP_STRING([--enable-arm-neon], [Enable ARM Neon assembly. (default=auto)]),, [enable_arm_neon=auto]) +AC_ARG_ENABLE(arm64-asimd, + AC_HELP_STRING([--enable-arm64-asimd], [Enable Arm64 advanced SIMD. (default=no)]),, + [enable_arm64_asimd=no]) + AC_ARG_ENABLE(arm64-crypto, AC_HELP_STRING([--enable-arm64-crypto], [Enable Arm64 crypto extension. 
(default=no)]),, [enable_arm64_crypto=no]) @@ -511,8 +515,11 @@ if test "x$enable_assembler" = xyes ; then if test "x$enable_fat" = xyes ; then asm_path="arm64/fat $asm_path" OPT_NETTLE_SOURCES="fat-arm64.c $OPT_NETTLE_SOURCES" - FAT_TEST_LIST="none aes pmull sha1 sha2" + FAT_TEST_LIST="none asimd aes pmull sha1 sha2" else + if test "$enable_arm64_asimd" = yes ; then + asm_path="arm64/asimd $asm_path" + fi if test "$enable_arm64_crypto" = yes ; then asm_path="arm64/crypto $asm_path" fi @@ -597,6 +604,7 @@ asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm cpu-facility.asm aes256-encrypt-2.asm aes256-decrypt-2.asm \ cbc-aes128-encrypt-2.asm cbc-aes192-encrypt-2.asm cbc-aes256-encrypt-2.asm \ chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \ + poly1305-2core.asm \ salsa20-2core.asm salsa20-core-internal-2.asm \ sha1-compress-2.asm sha256-compress-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ @@ -730,6 +738,8 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_fat_chacha_2core #undef HAVE_NATIVE_fat_chacha_3core #undef HAVE_NATIVE_fat_chacha_4core +#undef HAVE_NATIVE_poly1305_2core +#undef HAVE_NATIVE_fat_poly1305_2core #undef HAVE_NATIVE_ecc_curve25519_modp #undef HAVE_NATIVE_ecc_curve448_modp #undef HAVE_NATIVE_ecc_secp192r1_modp diff --git a/fat-arm64.c b/fat-arm64.c index fcb2ece8..be3d0b1e 100644 --- a/fat-arm64.c +++ b/fat-arm64.c @@ -53,6 +53,7 @@ #include "aes.h" #include "gcm.h" #include "gcm-internal.h" +#include "poly1305.h" #include "fat-setup.h" /* Defines from arch/arm64/include/uapi/asm/hwcap.h in Linux kernel */ @@ -74,6 +75,7 @@ struct arm64_features { + int have_asimd; int have_aes; int have_pmull; int have_sha1; @@ -87,6 +89,7 @@ static void get_arm64_features (struct arm64_features *features) { const char *s; + features->have_asimd = 0; features->have_aes = 0; features->have_pmull = 0; features->have_sha1 = 0; @@ -99,7 +102,9 @@ get_arm64_features (struct arm64_features *features) const char *sep = strchr 
(s, ','); size_t length = sep ? (size_t) (sep - s) : strlen(s); - if (MATCH (s, length, "aes", 3)) + if (MATCH (s, length, "asimd", 5)) + features->have_asimd = 1; + else if (MATCH (s, length, "aes", 3)) features->have_aes = 1; else if (MATCH (s, length, "pmull", 5)) features->have_pmull = 1; @@ -115,6 +120,8 @@ get_arm64_features (struct arm64_features *features) { #if USE_GETAUXVAL unsigned long hwcap = getauxval(AT_HWCAP); + features->have_asimd + = ((hwcap & HWCAP_ASIMD) == HWCAP_ASIMD); features->have_aes = ((hwcap & (HWCAP_ASIMD | HWCAP_AES)) == (HWCAP_ASIMD | HWCAP_AES)); features->have_pmull @@ -166,6 +173,22 @@ DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func) DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, c) DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, arm64) +DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func) +DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c); +DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, asimd); + +DECLARE_FAT_FUNC(nettle_chacha_crypt, chacha_crypt_func) +DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 1core) +DECLARE_FAT_FUNC_VAR(chacha_crypt, chacha_crypt_func, 4core) + +DECLARE_FAT_FUNC(nettle_chacha_crypt32, chacha_crypt_func) +DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 1core) +DECLARE_FAT_FUNC_VAR(chacha_crypt32, chacha_crypt_func, 4core) + +DECLARE_FAT_FUNC(_nettle_poly1305_update, poly1305_update_func) +DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 1core) +DECLARE_FAT_FUNC_VAR(poly1305_update, poly1305_update_func, 2core) + static void CONSTRUCTOR fat_init (void) { @@ -176,8 +199,9 @@ fat_init (void) verbose = getenv (ENV_VERBOSE) != NULL; if (verbose) - fprintf (stderr, "libnettle: cpu features:%s%s%s%s\n", - features.have_aes ? " aes instructions" : "", + fprintf (stderr, "libnettle: cpu features:%s%s%s%s%s\n", + features.have_asimd ? " advanced simd" : "", + features.have_aes ? " aes instructions" : "", features.have_pmull ? 
" polynomial multiply long instructions (PMULL/PMULL2)" : "", features.have_sha1 ? " sha1 instructions" : "", features.have_sha2 ? " sha2 instructions" : ""); @@ -243,6 +267,22 @@ fat_init (void) { _nettle_sha256_compress_vec = _nettle_sha256_compress_c; } + if (features.have_asimd) + { + if (verbose) + fprintf (stderr, "libnettle: enabling advanced simd code.\n"); + _nettle_chacha_core_vec = _nettle_chacha_core_asimd; + nettle_chacha_crypt_vec = _nettle_chacha_crypt_4core; + nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_4core; + _nettle_poly1305_update_vec = _nettle_poly1305_update_2core; + } + else + { + _nettle_chacha_core_vec = _nettle_chacha_core_c; + nettle_chacha_crypt_vec = _nettle_chacha_crypt_1core; + nettle_chacha_crypt32_vec = _nettle_chacha_crypt32_1core; + _nettle_poly1305_update_vec = _nettle_poly1305_update_1core; + } } DEFINE_FAT_FUNC(nettle_aes128_encrypt, void, @@ -290,3 +330,29 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void, DEFINE_FAT_FUNC(_nettle_sha256_compress, void, (uint32_t *state, const uint8_t *input, const uint32_t *k), (state, input, k)) + +DEFINE_FAT_FUNC(_nettle_chacha_core, void, + (uint32_t *dst, const uint32_t *src, unsigned rounds), + (dst, src, rounds)) + +DEFINE_FAT_FUNC(nettle_chacha_crypt, void, + (struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src), + (ctx, length, dst, src)) + +DEFINE_FAT_FUNC(nettle_chacha_crypt32, void, + (struct chacha_ctx *ctx, + size_t length, + uint8_t *dst, + const uint8_t *src), + (ctx, length, dst, src)) + +DEFINE_FAT_FUNC(_nettle_poly1305_update, unsigned, + (struct poly1305_ctx *ctx, + uint8_t *block, + unsigned pos, + size_t length, + const uint8_t *data), + (ctx, block, pos, length, data)) diff --git a/fat-setup.h b/fat-setup.h index 64b27244..677824ce 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -196,6 +196,10 @@ typedef void chacha_crypt_func(struct chacha_ctx *ctx, uint8_t *dst, const uint8_t *src); +struct poly1305_ctx; +typedef unsigned 
poly1305_update_func(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos, + size_t length, const uint8_t *data); + struct aes128_ctx; typedef void aes128_set_key_func (struct aes128_ctx *ctx, const uint8_t *key); typedef void aes128_invert_key_func (struct aes128_ctx *dst, const struct aes128_ctx *src); diff --git a/poly1305-aes.c b/poly1305-aes.c index a4050254..935ea638 100644 --- a/poly1305-aes.c +++ b/poly1305-aes.c @@ -56,13 +56,11 @@ poly1305_aes_set_nonce (struct poly1305_aes_ctx *ctx, memcpy (ctx->nonce, nonce, POLY1305_AES_NONCE_SIZE); } -#define COMPRESS(ctx, data) _nettle_poly1305_block(&(ctx)->pctx, (data), 1) - void poly1305_aes_update (struct poly1305_aes_ctx *ctx, size_t length, const uint8_t *data) { - MD_UPDATE (ctx, length, data, COMPRESS, (void) 0); + ctx->index = _nettle_poly1305_update(&ctx->pctx, ctx->block, ctx->index, length, data); } void diff --git a/poly1305-internal.h b/poly1305-internal.h index 9932d524..b55f19fe 100644 --- a/poly1305-internal.h +++ b/poly1305-internal.h @@ -53,6 +53,19 @@ void _nettle_poly1305_digest (struct poly1305_ctx *ctx, union nettle_block16 *s) /* Process one block. */ void _nettle_poly1305_block (struct poly1305_ctx *ctx, const uint8_t *m, unsigned high); +unsigned _nettle_poly1305_update(struct poly1305_ctx *ctx, uint8_t *block, unsigned pos, size_t length, + const uint8_t *data); + +/* Functions available only in some configurations */ +unsigned +_nettle_poly1305_update_2core(struct poly1305_ctx *ctx, + uint8_t *block, unsigned pos, + size_t length, const uint8_t *data); + +unsigned +_nettle_poly1305_update_1core(struct poly1305_ctx *ctx, + uint8_t *block, unsigned pos, + size_t length, const uint8_t *data); #ifdef __cplusplus } diff --git a/poly1305-update.c b/poly1305-update.c new file mode 100644 index 00000000..180aa169 --- /dev/null +++ b/poly1305-update.c @@ -0,0 +1,114 @@ +/* poly1305-update.c + + Copyright (C) 2021 Mamone Tarsha + + This file is part of GNU Nettle. 
+ + GNU Nettle is free software: you can redistribute it and/or + modify it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + + or + + * the GNU General Public License as published by the Free + Software Foundation; either version 2 of the License, or (at your + option) any later version. + + or both in parallel, as here. + + GNU Nettle is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received copies of the GNU General Public License and + the GNU Lesser General Public License along with this program. If + not, see http://www.gnu.org/licenses/. +*/ + +#if HAVE_CONFIG_H +# include "config.h" +#endif + +#include <string.h> + +#include "poly1305.h" +#include "poly1305-internal.h" + +#if HAVE_NATIVE_poly1305_2core +#define _nettle_poly1305_update_2core _nettle_poly1305_update +#elif !HAVE_NATIVE_fat_poly1305_2core +#define _nettle_poly1305_update_1core _nettle_poly1305_update +#endif + +#if HAVE_NATIVE_poly1305_2core || HAVE_NATIVE_fat_poly1305_2core +void _nettle_poly1305_2core(struct poly1305_ctx *ctx, const uint8_t *m, size_t len, unsigned t4); +unsigned +_nettle_poly1305_update_2core(struct poly1305_ctx *ctx, + uint8_t *block, unsigned pos, + size_t length, const uint8_t *data) +{ + if (pos) + { + if (pos + length < POLY1305_BLOCK_SIZE) + { + memcpy (block + pos, data, length); + return pos + length; + } + else + { + unsigned left = POLY1305_BLOCK_SIZE - pos; + memcpy (block + pos, data, left); + data += left; + length -= left; + _nettle_poly1305_block(ctx, block, 1); + } + } + if (length >= 2*POLY1305_BLOCK_SIZE) + { + size_t rlen = length & -(2*POLY1305_BLOCK_SIZE); + _nettle_poly1305_2core(ctx, data, rlen, 1); + data += rlen; 
+ length -= rlen; + } + if (length >= POLY1305_BLOCK_SIZE) + { + _nettle_poly1305_block(ctx, data, 1); + data += POLY1305_BLOCK_SIZE; + length -= POLY1305_BLOCK_SIZE; + } + memcpy (block, data, length); + return length; +} +#endif +#if !HAVE_NATIVE_poly1305_2core +unsigned +_nettle_poly1305_update_1core(struct poly1305_ctx *ctx, + uint8_t *block, unsigned pos, + size_t length, const uint8_t *data) +{ + if (pos) + { + if (pos + length < POLY1305_BLOCK_SIZE) + { + memcpy (block + pos, data, length); + return pos + length; + } + else + { + unsigned left = POLY1305_BLOCK_SIZE - pos; + memcpy (block + pos, data, left); + data += left; + length -= left; + _nettle_poly1305_block(ctx, block, 1); + } + } + for (; length >= POLY1305_BLOCK_SIZE; length -= POLY1305_BLOCK_SIZE, data += POLY1305_BLOCK_SIZE) + _nettle_poly1305_block(ctx, data, 1); + memcpy (block, data, length); + return length; +} +#endif |