author     Niels Möller <nisse@lysator.liu.se>  2020-07-13 21:43:37 +0200
committer  Niels Möller <nisse@lysator.liu.se>  2020-07-13 21:43:37 +0200
commit     7a9d3f59ae3cf0690135d951580516cdafc4db5d (patch)
tree       af367038bd0536d60f59c4a849c34b633c20168f
parent     097497ec6b93dc9b2b66afa8bd9321fb87dd6439 (diff)
download   nettle-7a9d3f59ae3cf0690135d951580516cdafc4db5d.tar.gz
Three-way interleaving of chacha on Neon
-rw-r--r--  ChangeLog                    5
-rw-r--r--  arm/neon/chacha-3core.asm  242
-rw-r--r--  chacha-crypt.c              40
-rw-r--r--  chacha-internal.h            5
-rw-r--r--  configure.ac                 3
5 files changed, 294 insertions(+), 1 deletion(-)
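
What the new entry point computes, in portable terms: _chacha_3core expands one 16-word chacha state into three consecutive output blocks, incrementing the 64-bit block counter in words 12-13 for the second and third blocks, and the assembly runs the three block computations in lockstep so that independent Neon instructions can overlap in the pipeline. A rough C sketch of that contract follows (a reference model only, not code from this patch; it assumes _CHACHA_STATE_LENGTH is 16, and QR is the standard chacha quarter-round):

  #include <stdint.h>
  #include <string.h>

  #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

  /* Standard chacha quarter-round. */
  #define QR(a, b, c, d) do {              \
      a += b; d ^= a; d = ROTL32(d, 16);   \
      c += d; b ^= c; b = ROTL32(b, 12);   \
      a += b; d ^= a; d = ROTL32(d, 8);    \
      c += d; b ^= c; b = ROTL32(b, 7);    \
    } while (0)

  /* Reference model of _chacha_3core: three blocks from one state.
     The assembly interleaves the three copies; here they run one
     after another for clarity. */
  static void
  chacha_3core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
  {
    unsigned b, i;
    for (b = 0; b < 3; b++, dst += 16)
      {
        uint32_t s[16], x[16];
        memcpy (s, src, sizeof (s));
        s[12] += b;             /* 64-bit block counter, low word */
        s[13] += (s[12] < b);   /* carry into the high word */
        memcpy (x, s, sizeof (x));
        for (i = 0; i < rounds; i += 2)
          {
            /* Column round */
            QR (x[0], x[4], x[8],  x[12]);
            QR (x[1], x[5], x[9],  x[13]);
            QR (x[2], x[6], x[10], x[14]);
            QR (x[3], x[7], x[11], x[15]);
            /* Diagonal round */
            QR (x[0], x[5], x[10], x[15]);
            QR (x[1], x[6], x[11], x[12]);
            QR (x[2], x[7], x[8],  x[13]);
            QR (x[3], x[4], x[9],  x[14]);
          }
        for (i = 0; i < 16; i++)
          dst[i] = x[i] + s[i];
      }
  }

In the assembly, each quarter-round is applied not to four scalars but to whole q registers, one register per state row, which is why each two-round pass needs the vext lane rotations to move between column and diagonal form.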
diff --git a/ChangeLog b/ChangeLog
index 40104add..8b496f1a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2020-07-13 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/neon/chacha-3core.asm: New file, 3-way interleaving of
+ chacha.
+
2020-07-11 Niels Möller <nisse@lysator.liu.se>
* testsuite/chacha-test.c (test_main): Delete obsolete tests for
diff --git a/arm/neon/chacha-3core.asm b/arm/neon/chacha-3core.asm
new file mode 100644
index 00000000..b73df2f1
--- /dev/null
+++ b/arm/neon/chacha-3core.asm
@@ -0,0 +1,242 @@
+C arm/neon/chacha-3core.asm
+
+ifelse(<
+ Copyright (C) 2020 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+ .file "chacha-3core.asm"
+ .fpu neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+C State, X, Y and Z representing consecutive blocks
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<Y0>, <q8>)
+define(<Y1>, <q9>)
+define(<Y2>, <q10>)
+define(<Y3>, <q11>)
+define(<Z0>, <q12>)
+define(<Z1>, <q13>)
+define(<Z2>, <q14>)
+define(<Z3>, <q15>)
+
+define(<T0>, <q4>)
+define(<T1>, <q5>)
+define(<T2>, <q6>)
+define(<T3>, <q7>)
+
+ .text
+ .align 4
+.Lcount1:
+ .int 1,0,0,0
+
+ C _chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_3core)
+ vldm SRC, {X0,X1,X2,X3}
+ vpush {q4,q5,q6,q7}
+ adr r12, .Lcount1
+ vld1.64 {Z3}, [r12]
+
+ vadd.i64 Y3, X3, Z3 C Increment 64-bit counter
+ vadd.i64 Z3, Y3, Z3
+
+.Lshared_entry:
+ vmov Y0, X0
+ vmov Z0, X0
+ vmov Y1, X1
+ vmov Z1, X1
+ vmov Y2, X2
+ vmov Z2, X2
+ vpush {Z3}
+ vpush {Y3}
+
+.Loop:
+ C Interleave three blocks. Note that with this scheduling,
+ C only two temporaries, T0 and T1, are needed.
+ vadd.i32 X0, X0, X1
+ veor X3, X3, X0
+ vrev32.16 X3, X3 C lrot 16
+ vadd.i32 Y0, Y0, Y1
+ vadd.i32 X2, X2, X3
+ veor Y3, Y3, Y0
+ veor T0, X1, X2
+ vrev32.16 Y3, Y3 C lrot 16
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #12
+ vadd.i32 Y2, Y2, Y3
+ veor Z3, Z3, Z0
+ vsri.u32 X1, T0, #20
+ veor T0, Y1, Y2
+ vrev32.16 Z3, Z3 C lrot 16
+ vadd.i32 X0, X0, X1
+ vshl.i32 Y1, T0, #12
+ vadd.i32 Z2, Z2, Z3
+ veor T1, X3, X0
+ vsri.u32 Y1, T0, #20
+ veor T0, Z1, Z2
+ vshl.i32 X3, T1, #8
+ vsri.u32 X3, T1, #24
+ vadd.i32 Y0, Y0, Y1
+ vshl.i32 Z1, T0, #12
+ vadd.i32 X2, X2, X3
+ veor T1, Y3, Y0
+ vsri.u32 Z1, T0, #20
+ veor T0, X1, X2
+ vshl.i32 Y3, T1, #8
+ vsri.u32 Y3, T1, #24
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #7
+ vadd.i32 Y2, Y2, Y3
+ veor T1, Z3, Z0
+ vsri.u32 X1, T0, #25
+ veor T0, Y1, Y2
+ vshl.i32 Z3, T1, #8
+ vsri.u32 Z3, T1, #24
+ vshl.i32 Y1, T0, #7
+ vadd.i32 Z2, Z2, Z3
+ vsri.u32 Y1, T0, #25
+ veor T0, Z1, Z2
+ vshl.i32 Z1, T0, #7
+ vsri.u32 Z1, T0, #25
+
+ vext.32 X1, X1, X1, #1
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #3
+
+ vext.32 Y1, Y1, Y1, #1
+ vext.32 Y2, Y2, Y2, #2
+ vext.32 Y3, Y3, Y3, #3
+
+ vext.32 Z1, Z1, Z1, #1
+ vext.32 Z2, Z2, Z2, #2
+ vext.32 Z3, Z3, Z3, #3
+
+ vadd.i32 X0, X0, X1
+ veor X3, X3, X0
+ vrev32.16 X3, X3 C lrot 16
+ vadd.i32 Y0, Y0, Y1
+ vadd.i32 X2, X2, X3
+ veor Y3, Y3, Y0
+ veor T0, X1, X2
+ vrev32.16 Y3, Y3 C lrot 16
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #12
+ vadd.i32 Y2, Y2, Y3
+ veor Z3, Z3, Z0
+ vsri.u32 X1, T0, #20
+ veor T0, Y1, Y2
+ vrev32.16 Z3, Z3 C lrot 16
+ vadd.i32 X0, X0, X1
+ vshl.i32 Y1, T0, #12
+ vadd.i32 Z2, Z2, Z3
+ veor T1, X3, X0
+ vsri.u32 Y1, T0, #20
+ veor T0, Z1, Z2
+ vshl.i32 X3, T1, #8
+ vsri.u32 X3, T1, #24
+ vadd.i32 Y0, Y0, Y1
+ vshl.i32 Z1, T0, #12
+ vadd.i32 X2, X2, X3
+ veor T1, Y3, Y0
+ vsri.u32 Z1, T0, #20
+ veor T0, X1, X2
+ vshl.i32 Y3, T1, #8
+ vsri.u32 Y3, T1, #24
+ vadd.i32 Z0, Z0, Z1
+ vshl.i32 X1, T0, #7
+ vadd.i32 Y2, Y2, Y3
+ veor T1, Z3, Z0
+ vsri.u32 X1, T0, #25
+ veor T0, Y1, Y2
+ vshl.i32 Z3, T1, #8
+ vsri.u32 Z3, T1, #24
+ vshl.i32 Y1, T0, #7
+ vadd.i32 Z2, Z2, Z3
+ vsri.u32 Y1, T0, #25
+ veor T0, Z1, Z2
+ vshl.i32 Z1, T0, #7
+ vsri.u32 Z1, T0, #25
+
+ subs ROUNDS, ROUNDS, #2
+
+ vext.32 X1, X1, X1, #3
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #1
+
+ vext.32 Y1, Y1, Y1, #3
+ vext.32 Y2, Y2, Y2, #2
+ vext.32 Y3, Y3, Y3, #1
+
+ vext.32 Z1, Z1, Z1, #3
+ vext.32 Z2, Z2, Z2, #2
+ vext.32 Z3, Z3, Z3, #1
+
+ bhi .Loop
+
+ vldm SRC, {T0,T1,T2,T3}
+ vadd.i32 X0, X0, T0
+ vadd.i32 Y0, Y0, T0
+ vadd.i32 Z0, Z0, T0
+ vadd.i32 X1, X1, T1
+ vadd.i32 Y1, Y1, T1
+ vadd.i32 Z1, Z1, T1
+ vadd.i32 X2, X2, T2
+ vadd.i32 Y2, Y2, T2
+ vadd.i32 Z2, Z2, T2
+
+ vpop {T0, T1} C updated counters
+ vadd.i32 X3, X3, T3
+ vadd.i32 Y3, Y3, T0
+ vadd.i32 Z3, Z3, T1
+
+ vpop {q4,q5,q6,q7}
+
+ vstmia DST!, {X0,X1,X2,X3}
+ vstmia DST!, {Y0,Y1,Y2,Y3}
+ vstm DST, {Z0,Z1,Z2,Z3}
+ bx lr
+EPILOGUE(_nettle_chacha_3core)
+
+PROLOGUE(_nettle_chacha_3core32)
+ vldm SRC, {X0,X1,X2,X3}
+ vpush {q4,q5,q6,q7}
+ adr r12, .Lcount1
+ vld1.64 {Z3}, [r12]
+
+ vadd.i32 Y3, X3, Z3 C Increment 32-bit counter
+ vadd.i32 Z3, Y3, Z3
+ b .Lshared_entry
+EPILOGUE(_nettle_chacha_3core32)
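
A note on the idioms in this file, since Neon has no vector rotate instruction: a left-rotate by n is synthesized as vshl into a fresh register followed by vsri (shift right and insert) of the remaining high bits, the rotate by 16 collapses to vrev32.16 (a halfword swap within each word), and vext rotates the four lanes of rows 1-3 to diagonalize the state. Hedged intrinsics equivalents of the three idioms (names from <arm_neon.h>; the .asm itself uses the bare instructions):

  #include <arm_neon.h>

  /* vshl.i32 + vsri.u32: rotate each 32-bit lane left by 12. */
  static inline uint32x4_t
  rotl12 (uint32x4_t x)
  {
    uint32x4_t t = vshlq_n_u32 (x, 12);   /* low part: x << 12 */
    return vsriq_n_u32 (t, x, 20);        /* insert high part: x >> 20 */
  }

  /* vrev32.16: rotate left by 16 is just a halfword swap. */
  static inline uint32x4_t
  rotl16 (uint32x4_t x)
  {
    return vreinterpretq_u32_u16 (vrev32q_u16 (vreinterpretq_u16_u32 (x)));
  }

  /* vext.32 X1,X1,X1,#1: rotate the four lanes left one position,
     used to move row 1 into diagonal form (and #3 to move it back). */
  static inline uint32x4_t
  rotate_lanes_1 (uint32x4_t x)
  {
    return vextq_u32 (x, x, 1);
  }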
diff --git a/chacha-crypt.c b/chacha-crypt.c
index 1797bd02..59d808d1 100644
--- a/chacha-crypt.c
+++ b/chacha-crypt.c
@@ -54,6 +54,45 @@
#define CHACHA_ROUNDS 20
+#if HAVE_NATIVE_chacha_3core
+void
+chacha_crypt(struct chacha_ctx *ctx,
+ size_t length,
+ uint8_t *dst,
+ const uint8_t *src)
+{
+ uint32_t x[3*_CHACHA_STATE_LENGTH];
+
+ if (!length)
+ return;
+
+ while (length > 2*CHACHA_BLOCK_SIZE)
+ {
+ _chacha_3core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[12] += 3;
+ ctx->state[13] += (ctx->state[12] < 3);
+ if (length <= 3*CHACHA_BLOCK_SIZE)
+ {
+ memxor3 (dst, src, x, length);
+ return;
+ }
+ memxor3 (dst, src, x, 3*CHACHA_BLOCK_SIZE);
+
+ length -= 3*CHACHA_BLOCK_SIZE;
+ dst += 3*CHACHA_BLOCK_SIZE;
+ src += 3*CHACHA_BLOCK_SIZE;
+ }
+ _chacha_core (x, ctx->state, CHACHA_ROUNDS);
+ ctx->state[13] += (++ctx->state[12] == 0);
+
+ if (length > CHACHA_BLOCK_SIZE)
+ {
+ _chacha_core (x + _CHACHA_STATE_LENGTH, ctx->state, CHACHA_ROUNDS);
+ ctx->state[13] += (++ctx->state[12] == 0);
+ }
+ memxor3 (dst, src, x, length);
+}
+#else
void
chacha_crypt(struct chacha_ctx *ctx,
size_t length,
@@ -85,6 +124,7 @@ chacha_crypt(struct chacha_ctx *ctx,
m += CHACHA_BLOCK_SIZE;
}
}
+#endif
void
chacha_crypt32(struct chacha_ctx *ctx,
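
On the counter arithmetic in the new chacha_crypt: state[12] and state[13] form a 64-bit little-endian block counter, so the 3-way path adds 3 to the low word and propagates a carry with the unsigned-wraparound test (ctx->state[12] < 3), while the single-block tail uses the equivalent (++ctx->state[12] == 0). A minimal standalone illustration of the batch case (counter_add3 is a hypothetical name, not part of the patch):

  #include <stdint.h>

  /* Add 3 to a 64-bit counter split across two 32-bit words, as the
     3-way path does with ctx->state[12..13] after each batch. */
  static void
  counter_add3 (uint32_t *lo, uint32_t *hi)
  {
    *lo += 3;
    *hi += (*lo < 3);   /* the sum wrapped iff it is now below the addend */
  }

chacha_crypt32, whose definition follows, keeps a pure 32-bit counter instead, which is why the assembly provides the separate _nettle_chacha_3core32 entry point that increments word 12 modulo 2^32 with no carry into word 13.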
diff --git a/chacha-internal.h b/chacha-internal.h
index 1bca8e74..cc90b132 100644
--- a/chacha-internal.h
+++ b/chacha-internal.h
@@ -39,8 +39,13 @@
#include "nettle-types.h"
#define _chacha_core _nettle_chacha_core
+#define _chacha_3core _nettle_chacha_3core
void
_chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+/* Functions available only in some configurations */
+void
+_chacha_3core(uint32_t *dst, const uint32_t *src, unsigned rounds);
+
#endif /* NETTLE_CHACHA_INTERNAL_H_INCLUDED */
diff --git a/configure.ac b/configure.ac
index a01eb7d3..3136c1a3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -455,7 +455,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \
# Assembler files which generate additional object files if they are used.
asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
- chacha-core-internal-2.asm salsa20-2core.asm \
+ chacha-3core.asm chacha-core-internal-2.asm salsa20-2core.asm \
salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
sha3-permute-2.asm sha512-compress-2.asm \
umac-nh-n-2.asm umac-nh-2.asm"
@@ -559,6 +559,7 @@ AH_VERBATIM([HAVE_NATIVE],
[/* Define to 1 each of the following for which a native (ie. CPU specific)
implementation of the corresponding routine exists. */
#undef HAVE_NATIVE_chacha_core
+#undef HAVE_NATIVE_chacha_3core
#undef HAVE_NATIVE_ecc_curve25519_modp
#undef HAVE_NATIVE_ecc_curve448_modp
#undef HAVE_NATIVE_ecc_secp192r1_modp
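
With chacha-3core.asm added to asm_nettle_optional_list, configure picks the file up when an enabled assembly directory (here arm/neon) provides it and defines the corresponding HAVE_NATIVE_ symbol, which is what selects the 3-way chacha_crypt above. The expected effect in the generated config.h, sketched under that assumption:

  /* In a config.h generated by a configure run that selected
     arm/neon/chacha-3core.asm (a sketch, not verbatim build output): */
  #define HAVE_NATIVE_chacha_3core 1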