-rw-r--r--  ChangeLog                                3
-rw-r--r--  Makefile.in                              2
-rw-r--r--  powerpc64/p7/chacha-core-internal.asm  140
3 files changed, 144 insertions, 1 deletion
diff --git a/ChangeLog b/ChangeLog
index 51bc1263..5e994dbc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
2020-09-25 Niels Möller <nisse@lysator.liu.se>
+ * powerpc64/p7/chacha-core-internal.asm: New file.
+ * Makefile.in (distdir): Add powerpc64/p7.
+
* gcm.c (gcm_fill): Added separate implementations for big- and
little-endian, to use uint64_t stores and less overhead.
diff --git a/Makefile.in b/Makefile.in
index c10f3e9d..d955774d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,7 +616,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- powerpc64 powerpc64/p8 powerpc64/fat ; do \
+ powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
mkdir "$(distdir)/$$d" ; \
find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
-exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
new file mode 100644
index 00000000..33c721c1
--- /dev/null
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -0,0 +1,140 @@
+C powerpc64/p7/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+define(`ROT16', `v4')
+define(`ROT12', `v5')
+define(`ROT8', `v6')
+define(`ROT7', `v7')
+
+C Original input state
+define(`S0', `v8')
+define(`S1', `v9')
+define(`S2', `v10')
+define(`S3', `v11')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT16
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT12
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT8
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT7
+')
+
+ .text
+ .align 4
+	C _nettle_chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+PROLOGUE(_nettle_chacha_core)
+
+	li	r6, 0x10	C byte offsets of the second,
+	li	r7, 0x20	C third and fourth rows
+	li	r8, 0x30	C of the 4x4 state matrix
+
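+	C Splat each rotate count into all four 32-bit lanes of a vector.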
+	vspltisw ROT16, -16	C the immediate is 5-bit signed, so 16 is out of
+				C range; -16 works because vrlw uses only the low
+				C 5 bits of each rotate count (-16 = 16 mod 32)
+	vspltisw ROT12, 12
+	vspltisw ROT8, 8
+	vspltisw ROT7, 7
+
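+	C Load the 16-word ChaCha state as four row vectors of four 32-bit words.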
+ lxvw4x VSR(X0), 0, SRC
+ lxvw4x VSR(X1), r6, SRC
+ lxvw4x VSR(X2), r7, SRC
+ lxvw4x VSR(X3), r8, SRC
+
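+	C Save the input state (vor a,b,b is a vector register copy);
+	C it is added back after the rounds.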
+ vor S0, X0, X0
+ vor S1, X1, X1
+ vor S2, X2, X2
+ vor S3, X3, X3
+
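+	C Each loop iteration does a double round (a column round and a
+	C diagonal round), hence ROUNDS/2 iterations via the count register.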
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ vsldoi X1, X1, X1, 4
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 12
+
+ QROUND(X0, X1, X2, X3)
+
+	C Undo the row rotations, restoring column order for the next double round
+ vsldoi X1, X1, X1, 12
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 4
+
+ bdnz .Loop
+
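+	C Add back the original input state to form the output block.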
+ vadduwm X0, X0, S0
+ vadduwm X1, X1, S1
+ vadduwm X2, X2, S2
+ vadduwm X3, X3, S3
+
+ stxvw4x VSR(X0), 0, DST
+ stxvw4x VSR(X1), r6, DST
+ stxvw4x VSR(X2), r7, DST
+ stxvw4x VSR(X3), r8, DST
+
+ blr
+EPILOGUE(_nettle_chacha_core)
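
For reference, a minimal scalar C sketch of the quarter round that the
QROUND macro above vectorizes (each vadduwm/vxor/vrlw applies the same
step to four 32-bit lanes at once). The names rotl32 and quarter_round
are illustrative only, not part of this commit or of Nettle's API:

    #include <stdint.h>

    /* Rotate a 32-bit word left by n bits, 0 < n < 32. */
    static inline uint32_t rotl32(uint32_t x, unsigned n)
    {
      return (x << n) | (x >> (32 - n));
    }

    /* One ChaCha quarter round on (a, b, c, d). QROUND(X0, X1, X2, X3)
       performs this with a=X0, b=X1, c=X2, d=X3, on four columns (or,
       after the vsldoi row rotations, four diagonals) in parallel. */
    static void quarter_round(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
    {
      *a += *b; *d ^= *a; *d = rotl32(*d, 16);
      *c += *d; *b ^= *c; *b = rotl32(*b, 12);
      *a += *b; *d ^= *a; *d = rotl32(*d, 8);
      *c += *d; *b ^= *c; *b = rotl32(*b, 7);
    }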