summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNiels Möller <nisse@lysator.liu.se>2015-06-03 14:50:47 +0200
committerNiels Möller <nisse@lysator.liu.se>2015-06-03 14:50:47 +0200
commit21b5984ede719ee98899b1f28b72183a211dc2af (patch)
treee5de5fb7c52618a82a9c298b072424c2e8fc63b3
parentb7dd9283f0fe3b2b6e8b562dc8803f2b84354f61 (diff)
downloadnettle-21b5984ede719ee98899b1f28b72183a211dc2af.tar.gz
ARM Neon implementation of chacha.
-rw-r--r--ChangeLog5
-rw-r--r--arm/neon/chacha-core-internal.asm136
2 files changed, 141 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index f4ae7933..fd005c76 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,8 @@
+2015-06-03 Niels Möller <nisse@lysator.liu.se>
+
+ * arm/neon/chacha-core-internal.asm: New file. 55% speedup over C
+ version on Cortex-A9.
+
2015-05-19 Niels Möller <nisse@lysator.liu.se>
* configure.ac: ABI detection (n32 or n64) on Irix, and
diff --git a/arm/neon/chacha-core-internal.asm b/arm/neon/chacha-core-internal.asm
new file mode 100644
index 00000000..6f623106
--- /dev/null
+++ b/arm/neon/chacha-core-internal.asm
@@ -0,0 +1,136 @@
+C arm/neon/chacha-core-internal.asm
+
+ifelse(<
+ Copyright (C) 2013, 2015 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+>)
+
+ .file "chacha-core-internal.asm"
+ .fpu neon
+
+define(<DST>, <r0>)
+define(<SRC>, <r1>)
+define(<ROUNDS>, <r2>)
+
+define(<X0>, <q0>)
+define(<X1>, <q1>)
+define(<X2>, <q2>)
+define(<X3>, <q3>)
+define(<T0>, <q8>)
+define(<S0>, <q12>)
+define(<S1>, <q13>)
+define(<S2>, <q14>)
+define(<S3>, <q15>)
+
+define(<QROUND>, <
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ vadd.i32 $1, $1, $2
+ veor $4, $4, $1
+ vshl.i32 T0, $4, #16
+ vshr.u32 $4, $4, #16
+ veor $4, $4, T0
+
+ vadd.i32 $3, $3, $4
+ veor $2, $2, $3
+ vshl.i32 T0, $2, #12
+ vshr.u32 $2, $2, #20
+ veor $2, $2, T0
+
+ vadd.i32 $1, $1, $2
+ veor $4, $4, $1
+ vshl.i32 T0, $4, #8
+ vshr.u32 $4, $4, #24
+ veor $4, $4, T0
+
+ vadd.i32 $3, $3, $4
+ veor $2, $2, $3
+ vshl.i32 T0, $2, #7
+ vshr.u32 $2, $2, #25
+ veor $2, $2, T0
+>)
+
+ .text
+ .align 4
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_core)
+ vldm SRC, {X0,X1,X2,X3}
+
+ vmov S0, X0
+ vmov S1, X1
+ vmov S2, X2
+ vmov S3, X3
+
+ C Input rows:
+ C 0 1 2 3 X0
+ C 4 5 6 7 X1
+ C 8 9 10 11 X2
+ C 12 13 14 15 X3
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 >>> 3
+ C 10 11 8 9 >>> 2
+ C 15 12 13 14 >>> 1
+ vext.32 X1, X1, X1, #1
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #3
+
+ QROUND(X0, X1, X2, X3)
+
+ subs ROUNDS, ROUNDS, #2
+ C Inverse rotation
+ vext.32 X1, X1, X1, #3
+ vext.32 X2, X2, X2, #2
+ vext.32 X3, X3, X3, #1
+
+ bhi .Loop
+
+ vadd.u32 X0, X0, S0
+ vadd.u32 X1, X1, S1
+ vadd.u32 X2, X2, S2
+ vadd.u32 X3, X3, S3
+
+ vstm DST, {X0,X1,X2,X3}
+ bx lr
+EPILOGUE(_nettle_chacha_core)
+
+divert(-1)
+define chachastate
+p/x $q0.u32
+p/x $q1.u32
+p/x $q2.u32
+p/x $q3.u32
+end