author     Niels Möller <nisse@lysator.liu.se>  2020-11-07 11:16:35 +0100
committer  Niels Möller <nisse@lysator.liu.se>  2020-11-07 11:16:35 +0100
commit     611abe02e3a1fde17697ab70e7c2805b6cfc0eee (patch)
tree       1f01e194c68915c1e49ab18438dadae8ebbbfb05
parent     4c8b0cdd97ffec3ae3f8d995afdfccbc261b3c79 (diff)
parent     19ba206d46d8558bc3af6ab14f7a770a94df57b9 (diff)
download   nettle-611abe02e3a1fde17697ab70e7c2805b6cfc0eee.tar.gz
Merge branch 'ppc-chacha-core'
-rw-r--r--  ChangeLog                                  |  15
-rw-r--r--  Makefile.in                                |   2
-rw-r--r--  configure.ac                               |  15
-rw-r--r--  fat-ppc.c                                  |  33
-rw-r--r--  powerpc64/fat/chacha-core-internal-2.asm   |  37
-rw-r--r--  powerpc64/p7/chacha-core-internal.asm      | 160
6 files changed, 256 insertions, 6 deletions
diff --git a/ChangeLog b/ChangeLog
index 6626c6ea..218fa394 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,18 @@
+2020-11-07 Niels Möller <nisse@lysator.liu.se>
+
+ Merged initial powerpc64 implementation of chacha.
+ * configure.ac: New command line option --enable-power-altivec.
+ Update asm_path logic, and add altivec to FAT_TEST_LIST.
+ * fat-ppc.c (get_ppc_features): Add logic to check for altivec and
+ vsx support, and select either C or altivec implementation of
+ chacha_core.
+ * powerpc64/fat/chacha-core-internal-2.asm: New file.
+
+2020-09-25 Niels Möller <nisse@lysator.liu.se>
+
+ * powerpc64/p7/chacha-core-internal.asm: New file.
+ * Makefile.in (distdir): Add powerpc64/p7.
+
2020-10-29 Niels Möller <nisse@lysator.liu.se>

* blowfish.c (blowfish_set_key): Add casts to uint32_t. Avoids
diff --git a/Makefile.in b/Makefile.in
index c10f3e9d..d955774d 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -616,7 +616,7 @@ distdir: $(DISTFILES)
set -e; for d in sparc32 sparc64 x86 \
x86_64 x86_64/aesni x86_64/sha_ni x86_64/fat \
arm arm/neon arm/v6 arm/fat \
- powerpc64 powerpc64/p8 powerpc64/fat ; do \
+ powerpc64 powerpc64/p7 powerpc64/p8 powerpc64/fat ; do \
mkdir "$(distdir)/$$d" ; \
find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
-exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/configure.ac b/configure.ac
index e9983697..2a47f940 100644
--- a/configure.ac
+++ b/configure.ac
@@ -93,6 +93,10 @@ AC_ARG_ENABLE(power-crypto-ext,
AC_HELP_STRING([--enable-power-crypto-ext], [Enable POWER crypto extensions. (default=no)]),,
[enable_power_crypto_ext=no])
+AC_ARG_ENABLE(power-altivec,
+ AC_HELP_STRING([--enable-power-altivec], [Enable POWER altivec and vsx extensions. (default=no)]),,
+ [enable_power_altivec=no])
+
AC_ARG_ENABLE(mini-gmp,
AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
[enable_mini_gmp=no])
@@ -462,9 +466,14 @@ if test "x$enable_assembler" = xyes ; then
if test "x$enable_fat" = xyes ; then
asm_path="powerpc64/fat $asm_path"
OPT_NETTLE_SOURCES="fat-ppc.c $OPT_NETTLE_SOURCES"
- FAT_TEST_LIST="none crypto_ext"
- elif test "x$enable_power_crypto_ext" = xyes ; then
- asm_path="powerpc64/p8 $asm_path"
+ FAT_TEST_LIST="none crypto_ext altivec"
+ else
+ if test "$enable_power_crypto_ext" = yes ; then
+ asm_path="powerpc64/p8 $asm_path"
+ fi
+ if test "$enable_power_altivec" = yes ; then
+ asm_path="powerpc64/p7 $asm_path"
+ fi
fi
fi
;;
diff --git a/fat-ppc.c b/fat-ppc.c
index 2bc50481..2bfd649f 100644
--- a/fat-ppc.c
+++ b/fat-ppc.c
@@ -66,6 +66,7 @@
struct ppc_features
{
int have_crypto_ext;
+ int have_altivec;
};
#define MATCH(s, slen, literal, llen) \
@@ -76,6 +77,7 @@ get_ppc_features (struct ppc_features *features)
{
const char *s;
features->have_crypto_ext = 0;
+ features->have_altivec = 0;
s = secure_getenv (ENV_OVERRIDE);
if (s)
@@ -86,6 +88,8 @@ get_ppc_features (struct ppc_features *features)
if (MATCH (s, length, "crypto_ext", 10))
features->have_crypto_ext = 1;
+ else if (MATCH(s, length, "altivec", 7))
+ features->have_altivec = 1;
if (!sep)
break;
s = sep + 1;
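
The hunk above extends the NETTLE_FAT_OVERRIDE parsing loop: the override
string is split at commas and each token is compared against a known feature
name. Below is a minimal standalone C sketch of that loop; the MATCH macro
body lies outside this hunk's context, so its definition here is a plausible
reconstruction, and parse_override with its flag arguments is a hypothetical
name used only for illustration.

    #include <stddef.h>
    #include <string.h>

    /* Token of length slen equals the literal of length llen
       (reconstructed; the real macro is defined earlier in fat-ppc.c). */
    #define MATCH(s, slen, literal, llen) \
      ((slen) == (llen) && memcmp ((s), (literal), (llen)) == 0)

    /* Hypothetical helper: scan a list such as "crypto_ext,altivec". */
    static void
    parse_override (const char *s, int *crypto_ext, int *altivec)
    {
      for (;;)
        {
          const char *sep = strchr (s, ',');
          size_t length = sep ? (size_t) (sep - s) : strlen (s);

          if (MATCH (s, length, "crypto_ext", 10))
            *crypto_ext = 1;
          else if (MATCH (s, length, "altivec", 7))
            *altivec = 1;
          if (!sep)
            break;
          s = sep + 1;
        }
    }

With NETTLE_FAT_OVERRIDE=altivec, for example, only the altivec flag ends up
set and the hardware probe is skipped.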
@@ -95,8 +99,10 @@ get_ppc_features (struct ppc_features *features)
#if defined(_AIX) && defined(__power_8_andup)
features->have_crypto_ext = __power_8_andup() != 0 ? 1 : 0;
#else
+ unsigned long hwcap = 0;
unsigned long hwcap2 = 0;
# if defined(__linux__)
+ hwcap = getauxval(AT_HWCAP);
hwcap2 = getauxval(AT_HWCAP2);
# elif defined(__FreeBSD__)
# if __FreeBSD__ >= 12
@@ -106,8 +112,13 @@ get_ppc_features (struct ppc_features *features)
sysctlbyname("hw.cpu_features2", &hwcap2, &len, NULL, 0);
# endif
# endif
- features->have_crypto_ext =
- (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
+ features->have_crypto_ext
+ = ((hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO);
+
+ /* We also need VSX instructions, mainly for load and store. */
+ features->have_altivec
+ = ((hwcap & (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX))
+ == (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX));
#endif
}
}
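
On Linux, the hardware probe added above reduces to two bits of AT_HWCAP:
both altivec and VSX must be present, since the new assembly uses the VSX
lxvw4x/stxvw4x instructions for loads and stores. A minimal sketch of the
same check, assuming glibc's getauxval(); the PPC_FEATURE_* values are those
of the Linux powerpc uapi header <asm/cputable.h>, repeated here only so the
snippet is self-contained.

    #include <stdio.h>
    #include <sys/auxv.h>   /* getauxval, AT_HWCAP */

    /* Bit values from the Linux powerpc uapi header <asm/cputable.h>. */
    #ifndef PPC_FEATURE_HAS_ALTIVEC
    # define PPC_FEATURE_HAS_ALTIVEC 0x10000000
    #endif
    #ifndef PPC_FEATURE_HAS_VSX
    # define PPC_FEATURE_HAS_VSX     0x00000080
    #endif

    int
    main (void)
    {
      unsigned long hwcap = getauxval (AT_HWCAP);
      /* Require both bits, as in get_ppc_features above. */
      int have_altivec =
        (hwcap & (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX))
        == (PPC_FEATURE_HAS_ALTIVEC | PPC_FEATURE_HAS_VSX);
      printf ("altivec+vsx: %s\n", have_altivec ? "yes" : "no");
      return 0;
    }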
@@ -120,6 +131,10 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+DECLARE_FAT_FUNC(_nettle_chacha_core, chacha_core_func)
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, c);
+DECLARE_FAT_FUNC_VAR(chacha_core, chacha_core_func, altivec);
+
static void CONSTRUCTOR
fat_init (void)
{
@@ -145,6 +160,16 @@ fat_init (void)
_nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
_nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
}
+ if (features.have_altivec)
+ {
+ if (verbose)
+ fprintf (stderr, "libnettle: enabling altivec code.\n");
+ _nettle_chacha_core_vec = _nettle_chacha_core_altivec;
+ }
+ else
+ {
+ _nettle_chacha_core_vec = _nettle_chacha_core_c;
+ }
}
DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
@@ -160,3 +185,7 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
size_t length, uint8_t *dst,
const uint8_t *src),
(rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_chacha_core, void,
+ (uint32_t *dst, const uint32_t *src, unsigned rounds),
+ (dst, src, rounds))
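
The DECLARE_FAT_FUNC/DEFINE_FAT_FUNC macros used above come down to calling
through a function pointer that fat_init() retargets; since fat_init() is
marked CONSTRUCTOR, the selection normally happens once, at load time. A
simplified C sketch of that pattern for chacha_core, with placeholder bodies
rather than Nettle's actual macro expansion:

    #include <stdint.h>

    typedef void chacha_core_func (uint32_t *dst, const uint32_t *src,
                                   unsigned rounds);

    /* Placeholder implementations; the real ones are the portable C code
       and the new altivec assembly. */
    static void
    chacha_core_c (uint32_t *dst, const uint32_t *src, unsigned rounds)
    { (void) dst; (void) src; (void) rounds; }

    static void
    chacha_core_altivec (uint32_t *dst, const uint32_t *src, unsigned rounds)
    { (void) dst; (void) src; (void) rounds; }

    /* Pointer the wrapper calls through; selected once at startup. */
    static chacha_core_func *chacha_core_vec = chacha_core_c;

    void
    fat_init_sketch (int have_altivec)
    {
      chacha_core_vec = have_altivec ? chacha_core_altivec : chacha_core_c;
    }

    /* Roughly what DEFINE_FAT_FUNC(_nettle_chacha_core, ...) expands to. */
    void
    _nettle_chacha_core (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      chacha_core_vec (dst, src, rounds);
    }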
diff --git a/powerpc64/fat/chacha-core-internal-2.asm b/powerpc64/fat/chacha-core-internal-2.asm
new file mode 100644
index 00000000..35c059ea
--- /dev/null
+++ b/powerpc64/fat/chacha-core-internal-2.asm
@@ -0,0 +1,37 @@
+C powerpc64/fat/chacha-core-internal-2.asm
+
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller
+
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+dnl PROLOGUE(_nettle_chacha_core) picked up by configure
+
+define(`fat_transform', `$1_altivec')
+include_src(`powerpc64/p7/chacha-core-internal.asm')
diff --git a/powerpc64/p7/chacha-core-internal.asm b/powerpc64/p7/chacha-core-internal.asm
new file mode 100644
index 00000000..6eb1066f
--- /dev/null
+++ b/powerpc64/p7/chacha-core-internal.asm
@@ -0,0 +1,160 @@
+C powerpc64/p7/chacha-core-internal.asm
+
+ifelse(`
+ Copyright (C) 2020 Niels Möller and Torbjörn Granlund
+ This file is part of GNU Nettle.
+
+ GNU Nettle is free software: you can redistribute it and/or
+ modify it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+ or
+
+ * the GNU General Public License as published by the Free
+ Software Foundation; either version 2 of the License, or (at your
+ option) any later version.
+
+ or both in parallel, as here.
+
+ GNU Nettle is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ General Public License for more details.
+
+ You should have received copies of the GNU General Public License and
+ the GNU Lesser General Public License along with this program. If
+ not, see http://www.gnu.org/licenses/.
+')
+
+C Register usage:
+
+C Arguments
+define(`DST', `r3')
+define(`SRC', `r4')
+define(`ROUNDS', `r5')
+
+C Working state
+define(`X0', `v0')
+define(`X1', `v1')
+define(`X2', `v2')
+define(`X3', `v3')
+
+define(`ROT16', `v4')
+define(`ROT12', `v5')
+define(`ROT8', `v6')
+define(`ROT7', `v7')
+
+C Original input state
+define(`S0', `v8')
+define(`S1', `v9')
+define(`S2', `v10')
+define(`S3', `v11')
+
+C Byte-swap mask and temporary, used only on big-endian systems
+define(`LE_MASK', `v12')
+define(`LE_TEMP', `v13')
+
+C QROUND(X0, X1, X2, X3)
+define(`QROUND', `
+ C x0 += x1, x3 ^= x0, x3 lrot 16
+ C x2 += x3, x1 ^= x2, x1 lrot 12
+ C x0 += x1, x3 ^= x0, x3 lrot 8
+ C x2 += x3, x1 ^= x2, x1 lrot 7
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT16
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT12
+
+ vadduwm $1, $1, $2
+ vxor $4, $4, $1
+ vrlw $4, $4, ROT8
+
+ vadduwm $3, $3, $4
+ vxor $2, $2, $3
+ vrlw $2, $2, ROT7
+')
+
+C LE_SWAP32(X0, X1, X2, X3)
+define(`LE_SWAP32', `IF_BE(`
+ vperm X0, X0, X0, LE_MASK
+ vperm X1, X1, X1, LE_MASK
+ vperm X2, X2, X2, LE_MASK
+ vperm X3, X3, X3, LE_MASK
+')')
+
+ .text
+ C _chacha_core(uint32_t *dst, const uint32_t *src, unsigned rounds)
+
+define(`FUNC_ALIGN', `5')
+PROLOGUE(_nettle_chacha_core)
+
+ li r6, 0x10 C set up some...
+ li r7, 0x20 C ...useful...
+ li r8, 0x30 C ...offsets
+
+ vspltisw ROT16, -16 C -16 instead of 16 actually works!
+ vspltisw ROT12, 12
+ vspltisw ROT8, 8
+ vspltisw ROT7, 7
+IF_BE(`
+ li r9, 0
+ lvsl LE_MASK, r9, r9 C 00 01 02 03 ... 0c 0d 0e 0f
+ vspltisb LE_TEMP, 0x03 C 03 03 03 03 ... 03 03 03 03
+ vxor LE_MASK, LE_MASK, LE_TEMP C 03 02 01 00 ... 0f 0e 0d 0c
+')
+
+ lxvw4x VSR(X0), 0, SRC
+ lxvw4x VSR(X1), r6, SRC
+ lxvw4x VSR(X2), r7, SRC
+ lxvw4x VSR(X3), r8, SRC
+
+ vor S0, X0, X0
+ vor S1, X1, X1
+ vor S2, X2, X2
+ vor S3, X3, X3
+
+ srdi ROUNDS, ROUNDS, 1
+ mtctr ROUNDS
+
+.Loop:
+ QROUND(X0, X1, X2, X3)
+ C Rotate rows, to get
+ C 0 1 2 3
+ C 5 6 7 4 <<< 1
+ C 10 11 8 9 <<< 2
+ C 15 12 13 14 <<< 3
+
+ vsldoi X1, X1, X1, 4
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 12
+
+ QROUND(X0, X1, X2, X3)
+
+ C Inverse rotation
+ vsldoi X1, X1, X1, 12
+ vsldoi X2, X2, X2, 8
+ vsldoi X3, X3, X3, 4
+
+ bdnz .Loop
+
+ vadduwm X0, X0, S0
+ vadduwm X1, X1, S1
+ vadduwm X2, X2, S2
+ vadduwm X3, X3, S3
+
+ LE_SWAP32(X0, X1, X2, X3)
+
+ stxvw4x VSR(X0), 0, DST
+ stxvw4x VSR(X1), r6, DST
+ stxvw4x VSR(X2), r7, DST
+ stxvw4x VSR(X3), r8, DST
+
+ blr
+EPILOGUE(_nettle_chacha_core)
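
For comparison, the whole routine in scalar C: QROUND above performs this
quarter round on four columns at once (one per 32-bit vector lane), and the
vsldoi rotations between the two QROUND calls are what turn the column round
into the diagonal round and back. A reference sketch; the byte swapping of
the output words on big-endian hosts (LE_SWAP32 above) is omitted.

    #include <stdint.h>
    #include <string.h>

    #define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

    /* One ChaCha quarter round; QROUND runs four of these in parallel. */
    #define QR(a, b, c, d) do {                     \
        a += b; d ^= a; d = ROTL32 (d, 16);         \
        c += d; b ^= c; b = ROTL32 (b, 12);         \
        a += b; d ^= a; d = ROTL32 (d, 8);          \
        c += d; b ^= c; b = ROTL32 (b, 7);          \
      } while (0)

    static void
    chacha_core_ref (uint32_t *dst, const uint32_t *src, unsigned rounds)
    {
      uint32_t x[16];
      unsigned i;

      memcpy (x, src, sizeof (x));
      for (i = 0; i < rounds; i += 2)
        {
          /* Column round: one quarter round per column of the 4x4 state. */
          QR (x[0], x[4], x[8],  x[12]);
          QR (x[1], x[5], x[9],  x[13]);
          QR (x[2], x[6], x[10], x[14]);
          QR (x[3], x[7], x[11], x[15]);
          /* Diagonal round: equivalent to rotating rows 1,2,3 left by
             1,2,3 words, doing a column round, and rotating back. */
          QR (x[0], x[5], x[10], x[15]);
          QR (x[1], x[6], x[11], x[12]);
          QR (x[2], x[7], x[8],  x[13]);
          QR (x[3], x[4], x[9],  x[14]);
        }
      /* Add the original input state back in. */
      for (i = 0; i < 16; i++)
        dst[i] = x[i] + src[i];
    }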