author     Niels Möller <nisse@lysator.liu.se>  2020-11-07 11:16:35 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2020-11-07 11:16:35 +0100
commit     611abe02e3a1fde17697ab70e7c2805b6cfc0eee (patch)
tree       1f01e194c68915c1e49ab18438dadae8ebbbfb05
parent     4c8b0cdd97ffec3ae3f8d995afdfccbc261b3c79 (diff)
parent     19ba206d46d8558bc3af6ab14f7a770a94df57b9 (diff)
download   nettle-611abe02e3a1fde17697ab70e7c2805b6cfc0eee.tar.gz
Merge branch 'ppc-chacha-core'
-rw-r--r--  ChangeLog                                  |  15
-rw-r--r--  Makefile.in                                |   2
-rw-r--r--  configure.ac                               |  15
-rw-r--r--  fat-ppc.c                                  |  33
-rw-r--r--  powerpc64/fat/chacha-core-internal-2.asm   |  37
-rw-r--r--  powerpc64/p7/chacha-core-internal.asm      | 160
6 files changed, 256 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 6626c6ea..218fa394 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2020-11-07 Niels Möller <nisse@lysator.liu.se>
+
+ Merged initial powerpc64 implementation of chacha.
+ * configure.ac: New command line option --enable-power-altivec.
+ Update asm_path logic, and add altivec to FAT_TEST_LIST.
+ * fat-ppc.c (get_ppc_features): Add logic to check for altivec and
+ vsx support, and select either C or altivec implementation of
+ chacha_core.
+ * powerpc64/fat/chacha-core-internal-2.asm: New file.
+
+2020-09-25 Niels Möller <nisse@lysator.liu.se>
+
+ * powerpc64/p7/chacha-core-internal.asm: New file.
+ * Makefile.in (distdir): Add powerpc64/p7.
+
2020-10-29 Niels Möller <nisse@lysator.liu.se>

* blowfish.c (blowfish_set_key): Add casts to uint32_t. Avoids
diff --git a/Makefile.in b/Makefile.in
index c10f3e9d..d955774d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,7 +616,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- powerpc64 powerpc64/p8 powerpc64/fat ; do \
+ powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
mkdir "$(distdir)/$$d" ; \
find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
-exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/configure.ac b/configure.ac
index e9983697..2a47f940 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,6 +93,10 @@ AC_ARG_ENABLE(power-crypto-ext,
AC_HELP_STRING([--enable-power-crypto-ext], [Enable POWER crypto extensions. (default=no)]),,
[enable_power_crypto_ext=no])
+AC_ARG_ENABLE(power-altivec,
+ AC_HELP_STRING([--enable-power-altivec], [Enable POWER altivec and vsx extensions. (default=no)]),,
+ [enable_power_altivec=no])
+
AC_ARG_ENABLE(mini-gmp,
AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
[enable_mini_gmp=no])
@@ -462,9 +466,14 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="powerpc64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-ppc.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none crypto_ext"
- elif test "x$enable_power_crypto_ext" = xyes ; then
- asm_path="powerpc64/p8 $asm_path"
+ FAT_TEST_LIST="none crypto_ext altivec"
+ else
+ if test "$enable_power_crypto_ext" = yes ; then
+ asm_path="powerpc64/p8 $asm_path"
+ fi
+ if test "$enable_power_altivec" = yes ; then
+ asm_path="powerpc64/p7 $asm_path"
+ fi
fi
fi
;;
diff --git a/fat-ppc.c b/fat-ppc.c
index 2bc50481..2bfd649f 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -66,6 +66,7 @@
struct ppc_features
{
int have_crypto_ext;
+ int have_altivec;
};
#define MATCH(s, slen, literal, llen) \
@@ -76,6 +77,7 @@ get_ppc_features (struct ppc_features *features)
{
const char *s;
features->have_crypto_ext = 0;
+ features->have_altivec = 0;
s = secure_getenv (ENV_OVERRIDE);
if (s)
@@ -86,6 +88,8 @@ get_ppc_features (struct ppc_features *features)
if (MATCH (s, length, "crypto_ext", 10))
features->have_crypto_ext = 1;
+ else if (MATCH(s, length, "altivec", 7))
+ features->have_altivec = 1;
if (!sep)
break;
s = sep + 1;
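
The hunk above extends the NETTLE_FAT_OVERRIDE parsing loop: the override
string is split at commas and each token is compared against a known feature
name. Below is a minimal standalone C sketch of that loop; the MATCH macro
body lies outside this hunk's context, so its definition here is a plausible
reconstruction, and parse_override with its flag arguments is a hypothetical
name used only for illustration.

    #include <stddef.h>
    #include <string.h>

    /* Token of length slen equals the literal of length llen
       (reconstructed; the real macro is defined earlier in fat-ppc.c). */
    #define MATCH(s, slen, literal, llen) \
      ((slen) == (llen) && memcmp ((s), (literal), (llen)) == 0)

    /* Hypothetical helper: scan a list such as "crypto_ext,altivec". */
    static void
    parse_override (const char *s, int *crypto_ext, int *altivec)
    {
      for (;;)
        {
          const char *sep = strchr (s, ',');
          size_t length = sep ? (size_t) (sep - s) : strlen (s);

          if (MATCH (s, length, "crypto_ext", 10))
            *crypto_ext = 1;
          else if (MATCH (s, length, "altivec", 7))
            *altivec = 1;
          if (!sep)
            break;
          s = sep + 1;
        }
    }

With NETTLE_FAT_OVERRIDE=altivec, for example, only the altivec flag ends up
set and the hardware probe is skipped.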
@@ -95,8 +99,10 @@ get_ppc_features (struct ppc_features *features)
#if defined(_AIX) && defined(__power_8_andup)
features->have_crypto_ext = __power_8_andup() != 0 ? 1 : 0;
#else
+ unsigned long hwcap = 0;
unsigned long hwcap2 = 0;
# if defined(__linux__)
+ hwcap = getauxval(AT_HWCAP);
hwcap2 = getauxval(AT_HWCAP2);
# elif defined(__FreeBSD__)
# if __FreeBSD__ >= 12
@@ -106,8 +112,13 @@ get_ppc_features (struct ppc_features *features)
sysctlbyname("hw.cpu_features2", &hwcap2, &len, NULL, 0);
# endif
# endif
- features->have_crypto_ext =
- (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
+ features->have_crypto_ext
+ = ((hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO);
+
+ /* We also need VSX instructions, mainly for load and store. */
+ features->have_altivec
+ = ((hwcap & (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX))
+ == (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX));
#endif
}
}
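
On Linux, the hardware probe added above reduces to two bits of AT_HWCAP:
both altivec and VSX must be present, since the new assembly uses the VSX
lxvw4x/stxvw4x instructions for loads and stores. A minimal sketch of the
same check, assuming glibc's getauxval(); the PPC_FEATURE_* values are those
of the Linux powerpc uapi header <asm/cputable.h>, repeated here only so the
snippet is self-contained.

    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */

    /* Bit values from the Linux powerpc uapi header <asm/cputable.h>. */
    #ifndef PPC_FEATURE_HAS_ALTIVEC
    # define PPC_FEATURE_HAS_ALTIVEC 0x10000000
    #endif
    #ifndef PPC_FEATURE_HAS_VSX
    # define PPC_FEATURE_HAS_VSX     0x00000080
    #endif

    int
    main (void)
    {
      unsigned long hwcap = getauxval (AT_HWCAP);
      /* Require both bits, as in get_ppc_features above. */
      int have_altivec =
        (hwcap & (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX))
        == (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX);
      printf ("altivec+vsx: %s\n", have_altivec ? "yes" : "no");
      return 0;
    }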
@@ -120,6 +131,10 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec);
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -145,6 +160,16 @@ fat_init (void)
_nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
_nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
}
+ if (features.have_altivec)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling altivec code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_altivec;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ }
}
DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
@@ -160,3 +185,7 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
size_t length, uint8_t *dst,
const uint8_t *src),
(rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
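
The DECLARE_FAT_FUNC/DEFINE_FAT_FUNC macros used above come down to calling
through a function pointer that fat_init() retargets; since fat_init() is
marked CONSTRUCTOR, the selection normally happens once, at load time. A
simplified C sketch of that pattern for chacha_core, with placeholder bodies
rather than Nettle's actual macro expansion:

    #include <stdint.h>

    typedef void chacha_core_func (uint32_t *dst, const uint32_t *src,
                                   unsigned rounds);

    /* Placeholder implementations; the real ones are the portable C code
       and the new altivec assembly. */
    static void
    chacha_core_c (uint32_t *dst, const uint32_t *src, unsigned rounds)
    { (void) dst; (void) src; (void) rounds; }

    static void
    chacha_core_altivec (uint32_t *dst, const uint32_t *src, unsigned rounds)
    { (void) dst; (void) src; (void) rounds; }

    /* Pointer the wrapper calls through; selected once at startup. */
    static chacha_core_func *chacha_core_vec = chacha_core_c;

    void
    fat_init_sketch (int have_altivec)
    {
      chacha_core_vec = have_altivec ? chacha_core_altivec : chacha_core_c;
    }

    /* Roughly what DEFINE_FAT_FUNC(_nettle_chacha_core, ...) expands to. */
    void
    _nettle_chacha_core (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      chacha_core_vec (dst, src, rounds);
    }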
diff --git a/powerpc64/fat/chacha-core-internal-2.asm b/powerpc64/fat/chacha-core-internal-2.asm
new file mode 100644
index 00000000..35c059ea
--- /dev/null
+++ b/powerpc64/fat/chacha-core-internal-2.asm
@@ -0,0 +1,37 @@
+C powerpc64/fat/chacha-core-internal-2.asm
+
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_altivec')
+include_src(`powerpc64/p7/chacha-core-internal.asm')
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
new file mode 100644
index 00000000..6eb1066f
--- /dev/null
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -0,0 +1,160 @@
+C powerpc64/p7/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+define(`ROT16', `v4')
+define(`ROT12', `v5')
+define(`ROT8', `v6')
+define(`ROT7', `v7')
+
+C Original input state
+define(`S0', `v8')
+define(`S1', `v9')
+define(`S2', `v10')
+define(`S3', `v11')
+
+C Byte-swap mask and temporary, used only on big-endian systems
+define(`LE_MASK', `v12')
+define(`LE_TEMP', `v13')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT16
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT12
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT8
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT7
+')
+
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+ vperm X0, X0, X0, LE_MASK
+ vperm X1, X1, X1, LE_MASK
+ vperm X2, X2, X2, LE_MASK
+ vperm X3, X3, X3, LE_MASK
+')')
+
+ .text
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_core)
+
+ li r6, 0x10 C set up some...
+ li r7, 0x20 C ...useful...
+ li r8, 0x30 C ...offsets
+
+ vspltisw ROT16, -16 C -16 instead of 16 actually works!
+ vspltisw ROT12, 12
+ vspltisw ROT8, 8
+ vspltisw ROT7, 7
+IF_BE(`
+ li r9, 0
+ lvsl LE_MASK, r9, r9 C 00 01 02 03 ... 0c 0d 0e 0f
+ vspltisb LE_TEMP, 0x03 C 03 03 03 03 ... 03 03 03 03
+ vxor LE_MASK, LE_MASK, LE_TEMP C 03 02 01 00 ... 0f 0e 0d 0c
+')
+
+ lxvw4x VSR(X0), 0, SRC
+ lxvw4x VSR(X1), r6, SRC
+ lxvw4x VSR(X2), r7, SRC
+ lxvw4x VSR(X3), r8, SRC
+
+ vor S0, X0, X0
+ vor S1, X1, X1
+ vor S2, X2, X2
+ vor S3, X3, X3
+
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ vsldoi X1, X1, X1, 4
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 12
+
+ QROUND(X0, X1, X2, X3)
+
+ C Inverse rotation
+ vsldoi X1, X1, X1, 12
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 4
+
+ bdnz .Loop
+
+ vadduwm X0, X0, S0
+ vadduwm X1, X1, S1
+ vadduwm X2, X2, S2
+ vadduwm X3, X3, S3
+
+ LE_SWAP32(X0, X1, X2, X3)
+
+ stxvw4x VSR(X0), 0, DST
+ stxvw4x VSR(X1), r6, DST
+ stxvw4x VSR(X2), r7, DST
+ stxvw4x VSR(X3), r8, DST
+
+ blr
+EPILOGUE(_nettle_chacha_core)
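
For comparison, the whole routine in scalar C: QROUND above performs this
quarter round on four columns at once (one per 32-bit vector lane), and the
vsldoi rotations between the two QROUND calls are what turn the column round
into the diagonal round and back. A reference sketch; the byte swapping of
the output words on big-endian hosts (LE_SWAP32 above) is omitted.

    #include <stdint.h>
    #include <string.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha quarter round; QROUND runs four of these in parallel. */
    #define QR(a, b, c, d) do {                     \
        a += b; d ^= a; d = ROTL32 (d, 16);         \
        c += d; b ^= c; b = ROTL32 (b, 12);         \
        a += b; d ^= a; d = ROTL32 (d, 8);          \
        c += d; b ^= c; b = ROTL32 (b, 7);          \
      } while (0)

    static void
    chacha_core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      uint32_t x[16];
      unsigned i;

      memcpy (x, src, sizeof (x));
      for (i = 0; i < rounds; i += 2)
        {
          /* Column round: one quarter round per column of the 4x4 state. */
          QR (x[0], x[4], x[8],  x[12]);
          QR (x[1], x[5], x[9],  x[13]);
          QR (x[2], x[6], x[10], x[14]);
          QR (x[3], x[7], x[11], x[15]);
          /* Diagonal round: equivalent to rotating rows 1,2,3 left by
             1,2,3 words, doing a column round, and rotating back. */
          QR (x[0], x[5], x[10], x[15]);
          QR (x[1], x[6], x[11], x[12]);
          QR (x[2], x[7], x[8],  x[13]);
          QR (x[3], x[4], x[9],  x[14]);
        }
      /* Add the original input state back in. */
      for (i = 0; i < 16; i++)
        dst[i] = x[i] + src[i];
    }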