diff options
-rw-r--r--  ChangeLog                              |   3
-rw-r--r--  Makefile.in                            |   2
-rw-r--r--  powerpc64/p7/chacha-core-internal.asm  | 140
3 files changed, 144 insertions(+), 1 deletion(-)
@@ -1,5 +1,8 @@
 2020-09-25  Niels Möller  <nisse@lysator.liu.se>
 
+	* powerpc64/p7/chacha-core-internal.asm: New file.
+	* Makefile.in (distdir): Add powerpc64/p7.
+
 	* gcm.c (gcm_fill): Added separate implementations for big- and
 	little-endian, to use uint64_t stores and less overhead.
diff --git a/Makefile.in b/Makefile.in
index c10f3e9d..d955774d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,7 +616,7 @@ distdir: $(DISTFILES)
 	set -e; for d in sparc32 sparc64 x86 \
 		x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
 		arm arm/neon arm/v6 arm/fat \
-		powerpc64 powerpc64/p8 powerpc64/fat ; do \
+		powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
 	  mkdir "$(distdir)/$$d" ; \
 	  find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
 	    -exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
new file mode 100644
index 00000000..33c721c1
--- /dev/null
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -0,0 +1,140 @@
+C powerpc64/p7/chacha-core-internal.asm
+
+ifelse(`
+   Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.
If
   not, see http://www.gnu.org/licenses/.
')

C Register usage:

C Arguments
define(`DST', `r3')
define(`SRC', `r4')
define(`ROUNDS', `r5')

C Working state: one 4x32-bit row of the ChaCha 4x4 state matrix per register
define(`X0', `v0')
define(`X1', `v1')
define(`X2', `v2')
define(`X3', `v3')

C Per-word rotate counts for the four quarter-round rotations
define(`ROT16', `v4')
define(`ROT12', `v5')
define(`ROT8', `v6')
define(`ROT7', `v7')

C Original input state, added back to the working state after all rounds
define(`S0', `v8')
define(`S1', `v9')
define(`S2', `v10')
define(`S3', `v11')

C QROUND(X0, X1, X2, X3)
C One ChaCha quarter-round, applied to all four columns in parallel;
C each macro argument holds one row of the state.
define(`QROUND', `
	C x0 += x1, x3 ^= x0, x3 lrot 16
	C x2 += x3, x1 ^= x2, x1 lrot 12
	C x0 += x1, x3 ^= x0, x3 lrot 8
	C x2 += x3, x1 ^= x2, x1 lrot 7

	vadduwm	$1, $1, $2
	vxor	$4, $4, $1
	vrlw	$4, $4, ROT16

	vadduwm	$3, $3, $4
	vxor	$2, $2, $3
	vrlw	$2, $2, ROT12

	vadduwm	$1, $1, $2
	vxor	$4, $4, $1
	vrlw	$4, $4, ROT8

	vadduwm	$3, $3, $4
	vxor	$2, $2, $3
	vrlw	$2, $2, ROT7
')

	.text
	.align 4
	C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
	C In:  DST = r3, SRC = r4, ROUNDS = r5 (even; halved for the loop below)
	C Out: 64-byte output block stored at DST
	C NOTE(review): clobbers r6-r8 and v0-v11, all caller-saved under the
	C ELFv2 ABI -- confirm for every ABI this file targets.

PROLOGUE(_nettle_chacha_core)

	li	r6, 0x10		C byte offsets of state rows 1, 2, 3
	li	r7, 0x20		C (row 0 is at offset 0)
	li	r8, 0x30

	vspltisw ROT16, -16		C vspltisw takes a 5-bit immediate
					C (-16..15), so 16 is not encodable;
					C vrlw uses only the low 5 bits of each
					C rotate count, so -16 behaves as 16.
	vspltisw ROT12, 12
	vspltisw ROT8, 8
	vspltisw ROT7, 7

	C Load the input state, one 16-byte row per register
	lxvw4x	VSR(X0), 0, SRC
	lxvw4x	VSR(X1), r6, SRC
	lxvw4x	VSR(X2), r7, SRC
	lxvw4x	VSR(X3), r8, SRC

	C Save the original state (vor as a vector move) for the final addition
	vor	S0, X0, X0
	vor	S1, X1, X1
	vor	S2, X2, X2
	vor	S3, X3, X3

	srdi	ROUNDS, ROUNDS, 1	C two rounds per loop iteration
	mtctr	ROUNDS

.Loop:
	QROUND(X0, X1, X2, X3)
	C Rotate rows, to get
	C  0  1  2  3
	C  5  6  7  4  <<< 1
	C 10 11  8  9  <<< 2
	C 15 12 13 14  <<< 3
	C so that the next QROUND operates on the diagonals.

	vsldoi	X1, X1, X1, 4
	vsldoi	X2, X2, X2, 8
	vsldoi	X3, X3, X3, 12

	QROUND(X0, X1, X2, X3)

	C Inverse rotation, back to the row-aligned layout
	vsldoi	X1, X1, X1, 12
	vsldoi	X2, X2, X2, 8
	vsldoi	X3, X3, X3, 4

	bdnz	.Loop

	C Feed-forward: add the original input back into the state
	vadduwm	X0, X0, S0
	vadduwm	X1, X1, S1
	vadduwm	X2, X2, S2
	vadduwm	X3, X3, S3

	C Store the 64-byte output block
	stxvw4x	VSR(X0), 0, DST
	stxvw4x	VSR(X1), r6, DST
	stxvw4x	VSR(X2), r7, DST
	stxvw4x	VSR(X3), r8, DST

	blr
EPILOGUE(_nettle_chacha_core)