summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJussi Kivilinna <jussi.kivilinna@iki.fi>2020-12-30 17:46:07 +0200
committerJussi Kivilinna <jussi.kivilinna@iki.fi>2020-12-30 17:46:07 +0200
commit1f75681cbba895ea2f7ea0637900721f4522e729 (patch)
tree19eb7a48b5513f9f5811b1e515a3d4c8e637641c
parent6a0bb9ab7f886087d7edb0725c90485086a1c0b4 (diff)
downloadlibgcrypt-cipher-s390x-optimizations.tar.gz
Add s390x/zSeries implementation of Poly1305cipher-s390x-optimizations
* cipher/Makefile.am: Add 'poly1305-s390x.S' and 'asm-poly1305-s390x.h'. * cipher/asm-poly1305-s390x.h: New * cipher/chacha20-s390x.S (_gcry_chacha20_poly1305_s390x_vx_blocks8) (_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New, stitched chacha20-poly1305 implementation. * cipher/chacha20.c (USE_S390X_VX_POLY1305): New. (_gcry_chacha20_poly1305_s390x_vx_blocks8) (_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1): New prototypes. (_gcry_chacha20_poly1305_encrypt, _gcry_chacha20_poly1305_decrypt): Add s390x/VX stitched chacha20-poly1305 code-path. * cipher/poly1305-s390x.S: New. * cipher/poly1305.c (USE_S390X_ASM, HAVE_ASM_POLY1305_BLOCKS): New. [USE_S390X_ASM] (_gcry_poly1305_s390x_blocks1, poly1305_blocks): New. * configure.ac (gcry_cv_gcc_inline_asm_s390x): Check for 'risbgn' and 'algrk' instructions. * tests/basic.c (_check_poly1305_cipher): Add large chacha20-poly1305 test vector. -- Patch adds Poly1305 and stitched ChaCha20-Poly1305 implementation for zSeries. Stitched implementation interleaves ChaCha20 and Poly1305 processing for higher instruction level parallelism and better utilization of execution units. Benchmark on z15 (4504 Mhz): Before: CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 1.16 ns/B 823.2 MiB/s 5.22 c/B POLY1305 dec | 1.16 ns/B 823.2 MiB/s 5.22 c/B POLY1305 auth | 0.736 ns/B 1295 MiB/s 3.32 c/B After (chacha20-poly1305 ~71% faster, poly1305 ~29% faster): CHACHA20 | nanosecs/byte mebibytes/sec cycles/byte POLY1305 enc | 0.677 ns/B 1409 MiB/s 3.05 c/B POLY1305 dec | 0.655 ns/B 1456 MiB/s 2.95 c/B POLY1305 auth | 0.569 ns/B 1675 MiB/s 2.56 c/B GnuPG-bug-id: 5202 Signed-off-by: Jussi Kivilinna <jussi.kivilinna@iki.fi>
-rw-r--r--cipher/Makefile.am2
-rw-r--r--cipher/asm-poly1305-s390x.h140
-rw-r--r--cipher/chacha20-s390x.S673
-rw-r--r--cipher/chacha20.c126
-rw-r--r--cipher/poly1305-s390x.S87
-rw-r--r--cipher/poly1305.c40
-rw-r--r--configure.ac8
-rw-r--r--tests/basic.c138
8 files changed, 1213 insertions, 1 deletions
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
index 3234bcb2..6727b8b1 100644
--- a/cipher/Makefile.am
+++ b/cipher/Makefile.am
@@ -60,6 +60,7 @@ libcipher_la_SOURCES = \
mac.c mac-internal.h \
mac-hmac.c mac-cmac.c mac-gmac.c mac-poly1305.c \
poly1305.c poly1305-internal.h \
+ poly1305-s390x.S \
kdf.c kdf-internal.h \
bithelp.h \
bufhelp.h \
@@ -75,6 +76,7 @@ EXTRA_libcipher_la_SOURCES = \
asm-inline-s390x.h \
asm-poly1305-aarch64.h \
asm-poly1305-amd64.h \
+ asm-poly1305-s390x.h \
arcfour.c arcfour-amd64.S \
blowfish.c blowfish-amd64.S blowfish-arm.S \
cast5.c cast5-amd64.S cast5-arm.S \
diff --git a/cipher/asm-poly1305-s390x.h b/cipher/asm-poly1305-s390x.h
new file mode 100644
index 00000000..113ab949
--- /dev/null
+++ b/cipher/asm-poly1305-s390x.h
@@ -0,0 +1,140 @@
+/* asm-poly1305-s390x.h - Poly1305 macros for zSeries assembly
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef GCRY_ASM_POLY1305_S390X_H
+#define GCRY_ASM_POLY1305_S390X_H
+
+#include "asm-common-s390x.h"
+
+/**********************************************************************
+ poly1305 for stitched chacha20-poly1305
+ **********************************************************************/
+
+#define POLY_RSTATE %r1
+#define POLY_RSRC %r14
+
+#define POLY_R_H0_TMP_HI %r6 // even-
+#define POLY_R_H0 %r7 // odd pair
+#define POLY_R_H1_TMP_HI %r8 // even-
+#define POLY_R_H1 %r9 // odd pair
+#define POLY_R_H2 %r10
+#define POLY_R_R0 %r11
+#define POLY_R_R1 %r12
+#define POLY_R_R1_MUL5 %r13
+#define POLY_R_X0_HI %r2 // even-
+#define POLY_R_X0_LO %r3 // odd pair
+#define POLY_R_X1_HI %r4 // even-
+#define POLY_R_X1_LO %r5 // odd pair
+
+#define POLY_S_R0 (4 * 4 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_R1 (4 * 4 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H0 (4 * 4 + 2 * 8 + 0 * 8)(POLY_RSTATE)
+#define POLY_S_H1 (4 * 4 + 2 * 8 + 1 * 8)(POLY_RSTATE)
+#define POLY_S_H2d (4 * 4 + 2 * 8 + 2 * 8)(POLY_RSTATE)
+
+#define INC_POLY1305_SRC(a) \
+ aghi POLY_RSRC, (a);
+
+#define POLY1305_LOAD_STATE() \
+ lg POLY_R_H0, POLY_S_H0; \
+ lg POLY_R_H1, POLY_S_H1; \
+ llgf POLY_R_H2, POLY_S_H2d; \
+ rllg POLY_R_H0, POLY_R_H0, 32; \
+ rllg POLY_R_H1, POLY_R_H1, 32; \
+ lg POLY_R_R0, POLY_S_R0; \
+ lg POLY_R_R1, POLY_S_R1; \
+ rllg POLY_R_R0, POLY_R_R0, 32; \
+ rllg POLY_R_R1, POLY_R_R1, 32; \
+ srlg POLY_R_R1_MUL5, POLY_R_R1, 2; \
+ algr POLY_R_R1_MUL5, POLY_R_R1;
+
+#define POLY1305_STORE_STATE() \
+ rllg POLY_R_H0, POLY_R_H0, 32; \
+ rllg POLY_R_H1, POLY_R_H1, 32; \
+ stg POLY_R_H0, POLY_S_H0; \
+ stg POLY_R_H1, POLY_S_H1; \
+ st POLY_R_H2, POLY_S_H2d;
+
+/* a = h + m */
+#define POLY1305_BLOCK_PART1_HB(src_offset, high_pad) \
+ lrvg POLY_R_X0_HI, ((src_offset) + 1 * 8)(POLY_RSRC); \
+ lrvg POLY_R_X0_LO, ((src_offset) + 0 * 8)(POLY_RSRC); \
+ lghi POLY_R_H1_TMP_HI, (high_pad);
+
+#define POLY1305_BLOCK_PART1(src_offset) \
+ POLY1305_BLOCK_PART1_HB(src_offset, 1);
+
+#define POLY1305_BLOCK_PART2() \
+ algr POLY_R_H0, POLY_R_X0_LO; \
+ alcgr POLY_R_H1, POLY_R_X0_HI; \
+ alcgr POLY_R_H2, POLY_R_H1_TMP_HI; \
+ lgr POLY_R_X1_LO, POLY_R_H0; \
+ lgr POLY_R_X0_LO, POLY_R_H0;
+
+#define POLY1305_BLOCK_PART3() \
+ /* h = a * r (partial mod 2^130-5): */ \
+ \
+ /* h0 * r1 */ \
+ mlgr POLY_R_X1_HI, POLY_R_R1; \
+ \
+ /* h1 * r0 */ \
+ lgr POLY_R_H0, POLY_R_H1; \
+ mlgr POLY_R_H0_TMP_HI, POLY_R_R0; \
+ \
+ /* h1 * r1 mod 2^130-5 */ \
+ mlgr POLY_R_H1_TMP_HI, POLY_R_R1_MUL5;
+
+#define POLY1305_BLOCK_PART4() \
+ \
+ /* h0 * r0 */ \
+ mlgr POLY_R_X0_HI, POLY_R_R0; \
+ \
+ algr POLY_R_X1_LO, POLY_R_H0; \
+ alcgr POLY_R_X1_HI, POLY_R_H0_TMP_HI; \
+ \
+ lgr POLY_R_H0_TMP_HI, POLY_R_H2; \
+ msgr POLY_R_H0_TMP_HI, POLY_R_R1_MUL5; /* h2 * r1 mod 2^130-5 */ \
+ msgr POLY_R_H2, POLY_R_R0; /* h2 * r0 */
+
+#define POLY1305_BLOCK_PART5() \
+ \
+ algr POLY_R_X0_LO, POLY_R_H1; \
+ alcgr POLY_R_X0_HI, POLY_R_H1_TMP_HI;
+
+#define POLY1305_BLOCK_PART6() \
+ \
+ algrk POLY_R_H1, POLY_R_H0_TMP_HI, POLY_R_X1_LO; \
+ alcgr POLY_R_H2, POLY_R_X1_HI;
+
+#define POLY1305_BLOCK_PART7() \
+ \
+ /* carry propagation */ \
+ srlg POLY_R_H0, POLY_R_H2, 2; \
+ risbgn POLY_R_X1_LO, POLY_R_H2, 0, 0x80 | 61, 0; \
+ lghi POLY_R_H1_TMP_HI, 0; \
+ agr POLY_R_H0, POLY_R_X1_LO; \
+ risbgn POLY_R_H2, POLY_R_H2, 62, 0x80 | 63, 0;
+
+#define POLY1305_BLOCK_PART8() \
+ algr POLY_R_H0, POLY_R_X0_LO; \
+ alcgr POLY_R_H1, POLY_R_X0_HI; \
+ alcgr POLY_R_H2, POLY_R_H1_TMP_HI;
+
+#endif /* GCRY_ASM_POLY1305_S390X_H */
diff --git a/cipher/chacha20-s390x.S b/cipher/chacha20-s390x.S
index 2cd38330..9b1d59c6 100644
--- a/cipher/chacha20-s390x.S
+++ b/cipher/chacha20-s390x.S
@@ -23,6 +23,7 @@
#if defined(HAVE_GCC_INLINE_ASM_S390X_VX)
#include "asm-common-s390x.h"
+#include "asm-poly1305-s390x.h"
.machine "z13+vx"
.text
@@ -574,6 +575,393 @@ ELF(.size _gcry_chacha20_s390x_vx_blocks4_2_1,
.-_gcry_chacha20_s390x_vx_blocks4_2_1;)
/**********************************************************************
+ 4-way && 2-way && 1-way stitched chacha20-poly1305 ("horizontal")
+ **********************************************************************/
+
+.balign 8
+.globl _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1
+ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,@function;)
+
+_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks
+ * %r6: poly1305 state
+ * 160(%r15): poly1305 src
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r14);
+ lgr NBLKS, %r5;
+
+ /* Load constants. */
+ larl %r8, .Lconsts;
+ vl TMP0, (.Lwordswap - .Lconsts)(%r8);
+ vl TMP1, (.Lone - .Lconsts)(%r8);
+ vl TMP2, (.Lbswap128 - .Lconsts)(%r8);
+
+ /* Load state. */
+ vlm S0, S3, 0(INPUT);
+ vperm S0, S0, S0, TMP0;
+ vperm S1, S1, S1, TMP0;
+ vperm S2, S2, S2, TMP0;
+ vperm S3, S3, S3, TMP0;
+
+ /* Store parameters to stack. */
+ stmg %r2, %r6, STACK_INPUT(%r15);
+
+ lgr POLY_RSTATE, %r6;
+ lgr NBLKS, %r5;
+
+ lg POLY_RSRC, 0(%r15);
+ lg POLY_RSRC, 160(POLY_RSRC);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ /* Load poly1305 state */
+ POLY1305_LOAD_STATE();
+
+ clgijl NBLKS, 4, .Lloop2_poly;
+
+.balign 4
+.Lloop4_poly:
+ /* Process four chacha20 blocks and 16 poly1305 blocks. */
+ vlr TMP3, S3;
+ lghi ROUND, (20 / 4);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vlr B3, TMP3;
+ vag TMP3, TMP3, TMP1;
+ vlr C0, S0;
+ vlr C1, S1;
+ vlr C2, S2;
+ vlr C3, TMP3;
+ vlr D0, S0;
+ vlr D1, S1;
+ vlr D2, S2;
+ vag D3, TMP3, TMP1;
+
+ slgfi NBLKS, 4;
+
+.balign 4
+.Lround4_4_poly:
+ /* Total 15 poly1305 blocks processed by this loop. */
+ QUARTERROUND4_4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6());
+ QUARTERROUND4_4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(1 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(2 * 16);
+ INC_POLY1305_SRC(3 * 16),
+ POLY1305_BLOCK_PART2());
+ QUARTERROUND4_4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround4_4_poly;
+
+ POLY1305_BLOCK_PART1(0 * 16);
+ INC_POLY1305_SRC(1 * 16);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ lg %r14, STACK_SRC(%r15);
+ vlm IO0, IO7, 0(%r14);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART2();
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART3();
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+ POLY1305_BLOCK_PART4();
+ PLUS(C0, S0);
+ PLUS(C1, S1);
+ PLUS(C2, S2);
+ PLUS(C3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(D0, S0);
+ PLUS(D1, S1);
+ PLUS(D2, S2);
+ PLUS(D3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ POLY1305_BLOCK_PART5();
+ vperm C0, C0, C0, TMP2;
+ vperm C1, C1, C1, TMP2;
+ vperm C2, C2, C2, TMP2;
+ vperm C3, C3, C3, TMP2;
+ vperm D0, D0, D0, TMP2;
+ vperm D1, D1, D1, TMP2;
+ vperm D2, D2, D2, TMP2;
+ vperm D3, D3, D3, TMP2;
+
+ POLY1305_BLOCK_PART6();
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vlm A0, B3, 128(%r14);
+ aghi %r14, 256;
+ stg %r14, STACK_SRC(%r15);
+
+ lg %r14, STACK_DST(%r15);
+ POLY1305_BLOCK_PART7();
+ vstm IO0, IO7, 0(%r14);
+ XOR(A0, C0);
+ XOR(A1, C1);
+ XOR(A2, C2);
+ XOR(A3, C3);
+ XOR(B0, D0);
+ XOR(B1, D1);
+ XOR(B2, D2);
+ XOR(B3, D3);
+ POLY1305_BLOCK_PART8();
+ vstm A0, B3, 128(%r14);
+ aghi %r14, 256;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 4, .Lloop4_poly;
+
+ CLEAR(C0);
+ CLEAR(C1);
+ CLEAR(C2);
+ CLEAR(C3);
+ CLEAR(D0);
+ CLEAR(D1);
+ CLEAR(D2);
+ CLEAR(D3);
+
+.balign 4
+.Lloop2_poly:
+ clgijl NBLKS, 2, .Lloop1_poly;
+
+ /* Process two chacha20 and eight poly1305 blocks. */
+ lghi ROUND, ((20 - 4) / 2);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+ vlr B0, S0;
+ vlr B1, S1;
+ vlr B2, S2;
+ vag B3, S3, TMP1;
+
+ slgfi NBLKS, 2;
+
+.balign 4
+.Lround4_2_poly:
+ /* Total eight poly1305 blocks processed by this loop. */
+ QUARTERROUND4_2_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ INC_POLY1305_SRC(1 * 16);
+ QUARTERROUND4_2_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround4_2_poly;
+
+ stg POLY_RSRC, STACK_POSRC(%r15);
+ lg %r14, STACK_SRC(%r15);
+
+ QUARTERROUND4_2(3, 2, 1);
+ QUARTERROUND4_2(1, 2, 3);
+ QUARTERROUND4_2(3, 2, 1);
+ QUARTERROUND4_2(1, 2, 3);
+
+ vlm IO0, IO7, 0(%r14);
+ aghi %r14, 128;
+ stg %r14, STACK_SRC(%r15);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ PLUS(B0, S0);
+ PLUS(B1, S1);
+ PLUS(B2, S2);
+ PLUS(B3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ vperm B0, B0, B0, TMP2;
+ vperm B1, B1, B1, TMP2;
+ vperm B2, B2, B2, TMP2;
+ vperm B3, B3, B3, TMP2;
+
+ lg %r14, STACK_DST(%r15);
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ XOR(IO4, B0);
+ XOR(IO5, B1);
+ XOR(IO6, B2);
+ XOR(IO7, B3);
+ vstm IO0, IO7, 0(%r14);
+ aghi %r14, 128;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 2, .Lloop2_poly;
+
+ CLEAR(B0);
+ CLEAR(B1);
+ CLEAR(B2);
+ CLEAR(B3);
+
+.balign 4
+.Lloop1_poly:
+ clgijl NBLKS, 1, .Ldone_poly;
+
+ /* Process one chacha20 block and four poly1305 blocks.*/
+ lghi ROUND, ((20 - 4) / 4);
+ vlr A0, S0;
+ vlr A1, S1;
+ vlr A2, S2;
+ vlr A3, S3;
+
+ slgfi NBLKS, 1;
+
+.balign 4
+.Lround4_1_poly:
+ /* Total four poly1305 blocks processed by this loop. */
+ QUARTERROUND4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2());
+ INC_POLY1305_SRC(1 * 16);
+ QUARTERROUND4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_POLY(3, 2, 1,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6());
+ QUARTERROUND4_POLY(1, 2, 3,
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brct ROUND, .Lround4_1_poly;
+
+ stg POLY_RSRC, STACK_POSRC(%r15);
+ lg %r14, STACK_SRC(%r15);
+
+ QUARTERROUND4(3, 2, 1);
+ QUARTERROUND4(1, 2, 3);
+ QUARTERROUND4(3, 2, 1);
+ QUARTERROUND4(1, 2, 3);
+
+ vlm IO0, IO3, 0(%r14);
+ aghi %r14, 64;
+ stg %r14, STACK_SRC(%r15);
+
+ PLUS(A0, S0);
+ PLUS(A1, S1);
+ PLUS(A2, S2);
+ PLUS(A3, S3);
+ vag S3, S3, TMP1; /* Update counter. */
+
+ lg %r14, STACK_DST(%r15);
+ vperm A0, A0, A0, TMP2;
+ vperm A1, A1, A1, TMP2;
+ vperm A2, A2, A2, TMP2;
+ vperm A3, A3, A3, TMP2;
+ XOR(IO0, A0);
+ XOR(IO1, A1);
+ XOR(IO2, A2);
+ XOR(IO3, A3);
+ vstm IO0, IO3, 0(%r14);
+ aghi %r14, 64;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgijhe NBLKS, 1, .Lloop1_poly;
+
+.balign 4
+.Ldone_poly:
+ /* Store poly1305 state */
+ lg POLY_RSTATE, STACK_POCTX(%r15);
+ POLY1305_STORE_STATE();
+
+ /* Store counter. */
+ lg INPUT, STACK_INPUT(%r15);
+ vperm S3, S3, S3, TMP0;
+ vst S3, (48)(INPUT);
+
+ /* Clear the used vector registers. */
+ CLEAR(A0);
+ CLEAR(A1);
+ CLEAR(A2);
+ CLEAR(A3);
+ CLEAR(IO0);
+ CLEAR(IO1);
+ CLEAR(IO2);
+ CLEAR(IO3);
+ CLEAR(IO4);
+ CLEAR(IO5);
+ CLEAR(IO6);
+ CLEAR(IO7);
+ CLEAR(TMP0);
+ CLEAR(TMP1);
+ CLEAR(TMP2);
+
+ END_STACK(%r14);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1,
+ .-_gcry_chacha20_poly1305_s390x_vx_blocks4_2_1;)
+
+/**********************************************************************
8-way chacha20 ("vertical")
**********************************************************************/
@@ -884,5 +1272,290 @@ _gcry_chacha20_s390x_vx_blocks8:
ELF(.size _gcry_chacha20_s390x_vx_blocks8,
.-_gcry_chacha20_s390x_vx_blocks8;)
+/**********************************************************************
+ 8-way stitched chacha20-poly1305 ("vertical")
+ **********************************************************************/
+
+.balign 8
+.globl _gcry_chacha20_poly1305_s390x_vx_blocks8
+ELF(.type _gcry_chacha20_poly1305_s390x_vx_blocks8,@function;)
+
+_gcry_chacha20_poly1305_s390x_vx_blocks8:
+ /* input:
+ * %r2: input
+ * %r3: dst
+ * %r4: src
+ * %r5: nblks (multiple of 8)
+ * %r6: poly1305 state
+ * 160(%r15): poly1305 src
+ */
+ CFI_STARTPROC();
+
+ START_STACK(%r14);
+
+ /* Store parameters to stack. */
+ stmg %r2, %r6, STACK_INPUT(%r15);
+
+ lgr POLY_RSTATE, %r6;
+ lgr NBLKS, %r5;
+
+ lg POLY_RSRC, 0(%r15);
+ lg POLY_RSRC, 160(POLY_RSRC);
+ stg POLY_RSRC, STACK_POSRC(%r15);
+
+ /* Load poly1305 state */
+ POLY1305_LOAD_STATE();
+
+.balign 4
+ /* Process eight chacha20 blocks and 32 poly1305 blocks per loop. */
+.Lloop8_poly:
+ lg INPUT, STACK_INPUT(%r15);
+ larl %r8, .Lconsts;
+
+ vlm Y0, Y3, 0(INPUT);
+
+ slgfi NBLKS, 8;
+ lghi ROUND, (20 / 2);
+
+ /* Construct counter vectors X12/X13 & Y12/Y13. */
+ vl X4, (.Ladd_counter_0123 - .Lconsts)(%r8);
+ vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r8);
+ lg %r8, (12 * 4)(INPUT); /* Update counter. */
+ vrepf Y12, Y3, 0;
+ vrepf Y13, Y3, 1;
+ vaccf X5, Y12, X4;
+ vaccf Y5, Y12, Y4;
+ vaf X12, Y12, X4;
+ vaf Y12, Y12, Y4;
+ vaf X13, Y13, X5;
+ vaf Y13, Y13, Y5;
+ rllg %r8, %r8, 32;
+
+ vrepf X0, Y0, 0;
+ vrepf X1, Y0, 1;
+ vrepf X2, Y0, 2;
+ vrepf X3, Y0, 3;
+ vrepf X4, Y1, 0;
+ vrepf X5, Y1, 1;
+ vrepf X6, Y1, 2;
+ vrepf X7, Y1, 3;
+ vrepf X8, Y2, 0;
+ vrepf X9, Y2, 1;
+ vrepf X10, Y2, 2;
+ vrepf X11, Y2, 3;
+ vrepf X14, Y3, 2;
+ vrepf X15, Y3, 3;
+ agfi %r8, 8;
+
+ /* Store counters for blocks 0-7. */
+ vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
+ vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
+ rllg %r8, %r8, 32;
+
+ vlr Y0, X0;
+ vlr Y1, X1;
+ vlr Y2, X2;
+ vlr Y3, X3;
+ vlr Y4, X4;
+ vlr Y5, X5;
+ vlr Y6, X6;
+ vlr Y7, X7;
+ vlr Y8, X8;
+ vlr Y9, X9;
+ vlr Y10, X10;
+ vlr Y11, X11;
+ vlr Y14, X14;
+ vlr Y15, X15;
+ stg %r8, (12 * 4)(INPUT);
+
+.balign 4
+.Lround2_8_poly:
+ /* Total 30 poly1305 blocks processed by this loop. */
+ QUARTERROUND4_V8_POLY(X0, X4, X8, X12, X1, X5, X9, X13,
+ X2, X6, X10, X14, X3, X7, X11, X15,
+ Y0, Y4, Y8, Y12, Y1, Y5, Y9, Y13,
+ Y2, Y6, Y10, Y14, Y3, Y7, Y11, Y15,
+ POLY1305_BLOCK_PART1(0 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(1 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4());
+ QUARTERROUND4_V8_POLY(X0, X5, X10, X15, X1, X6, X11, X12,
+ X2, X7, X8, X13, X3, X4, X9, X14,
+ Y0, Y5, Y10, Y15, Y1, Y6, Y11, Y12,
+ Y2, Y7, Y8, Y13, Y3, Y4, Y9, Y14,
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8(),
+ POLY1305_BLOCK_PART1(2 * 16);
+ INC_POLY1305_SRC(3 * 16),
+ POLY1305_BLOCK_PART2(),
+ POLY1305_BLOCK_PART3(),
+ POLY1305_BLOCK_PART4(),
+ POLY1305_BLOCK_PART5(),
+ POLY1305_BLOCK_PART6(),
+ POLY1305_BLOCK_PART7(),
+ POLY1305_BLOCK_PART8());
+ brctg ROUND, .Lround2_8_poly;
+
+ POLY1305_BLOCK_PART1(0 * 16);
+
+ /* Store blocks 4-7. */
+ vstm Y0, Y15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 0-3. */
+ vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
+
+ stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
+
+ lghi ROUND, 1;
+ j .Lfirst_output_4blks_8_poly;
+
+.balign 4
+.Lsecond_output_4blks_8_poly:
+
+ POLY1305_BLOCK_PART1(1 * 16);
+
+ /* Load blocks 4-7. */
+ vlm X0, X15, STACK_Y0_Y15(%r15);
+
+ /* Load counters for blocks 4-7. */
+ vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
+
+ INC_POLY1305_SRC(2 * 16);
+ stg POLY_RSRC, STACK_POSRC(%r15); /* %r14 used for INPUT/SRC/DST pointer. */
+
+ lghi ROUND, 0;
+
+.balign 4
+ /* Output four chacha20 blocks and one poly1305 block per loop. */
+.Lfirst_output_4blks_8_poly:
+ lg %r14, STACK_INPUT(%r15);
+ vlm Y12, Y15, 0(%r14);
+ POLY1305_BLOCK_PART2();
+ PLUS(X12, Y0);
+ PLUS(X13, Y1);
+ vrepf Y0, Y12, 0;
+ vrepf Y1, Y12, 1;
+ vrepf Y2, Y12, 2;
+ vrepf Y3, Y12, 3;
+ vrepf Y4, Y13, 0;
+ vrepf Y5, Y13, 1;
+ vrepf Y6, Y13, 2;
+ vrepf Y7, Y13, 3;
+ vrepf Y8, Y14, 0;
+ vrepf Y9, Y14, 1;
+ vrepf Y10, Y14, 2;
+ vrepf Y11, Y14, 3;
+ vrepf Y14, Y15, 2;
+ vrepf Y15, Y15, 3;
+ POLY1305_BLOCK_PART3();
+ PLUS(X0, Y0);
+ PLUS(X1, Y1);
+ PLUS(X2, Y2);
+ PLUS(X3, Y3);
+ PLUS(X4, Y4);
+ PLUS(X5, Y5);
+ PLUS(X6, Y6);
+ PLUS(X7, Y7);
+ PLUS(X8, Y8);
+ PLUS(X9, Y9);
+ PLUS(X10, Y10);
+ PLUS(X11, Y11);
+ PLUS(X14, Y14);
+ PLUS(X15, Y15);
+ POLY1305_BLOCK_PART4();
+
+ larl %r14, .Lconsts;
+ vl Y15, (.Lbswap32 - .Lconsts)(%r14);
+ TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+ lg %r14, STACK_SRC(%r15);
+ POLY1305_BLOCK_PART5();
+ TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
+ Y9, Y10, Y11, Y12, Y13, Y14);
+
+ vlm Y0, Y14, 0(%r14);
+ POLY1305_BLOCK_PART6();
+ vperm X0, X0, X0, Y15;
+ vperm X1, X1, X1, Y15;
+ vperm X2, X2, X2, Y15;
+ vperm X3, X3, X3, Y15;
+ vperm X4, X4, X4, Y15;
+ vperm X5, X5, X5, Y15;
+ vperm X6, X6, X6, Y15;
+ vperm X7, X7, X7, Y15;
+ vperm X8, X8, X8, Y15;
+ vperm X9, X9, X9, Y15;
+ vperm X10, X10, X10, Y15;
+ vperm X11, X11, X11, Y15;
+ vperm X12, X12, X12, Y15;
+ vperm X13, X13, X13, Y15;
+ vperm X14, X14, X14, Y15;
+ vperm X15, X15, X15, Y15;
+ vl Y15, (15 * 16)(%r14);
+ POLY1305_BLOCK_PART7();
+
+ aghi %r14, 256;
+ stg %r14, STACK_SRC(%r15);
+ lg %r14, STACK_DST(%r15);
+
+ XOR(Y0, X0);
+ XOR(Y1, X4);
+ XOR(Y2, X8);
+ XOR(Y3, X12);
+ XOR(Y4, X1);
+ XOR(Y5, X5);
+ XOR(Y6, X9);
+ XOR(Y7, X13);
+ XOR(Y8, X2);
+ XOR(Y9, X6);
+ XOR(Y10, X10);
+ XOR(Y11, X14);
+ XOR(Y12, X3);
+ XOR(Y13, X7);
+ XOR(Y14, X11);
+ XOR(Y15, X15);
+ POLY1305_BLOCK_PART8();
+ vstm Y0, Y15, 0(%r14);
+
+ aghi %r14, 256;
+ stg %r14, STACK_DST(%r15);
+
+ lg POLY_RSRC, STACK_POSRC(%r15);
+
+ clgije ROUND, 1, .Lsecond_output_4blks_8_poly;
+
+ clgijhe NBLKS, 8, .Lloop8_poly;
+
+ /* Store poly1305 state */
+ lg POLY_RSTATE, STACK_POCTX(%r15);
+ POLY1305_STORE_STATE();
+
+ /* Clear the used vector registers */
+ DST_8(CLEAR, 0, _);
+ DST_8(CLEAR, 1, _);
+ DST_8(CLEAR, 2, _);
+ DST_8(CLEAR, 3, _);
+
+ /* Clear sensitive data in stack. */
+ vlm Y0, Y15, STACK_Y0_Y15(%r15);
+ vlm Y0, Y3, STACK_CTR(%r15);
+
+ END_STACK(%r14);
+ xgr %r2, %r2;
+ br %r14;
+ CFI_ENDPROC();
+ELF(.size _gcry_chacha20_poly1305_s390x_vx_blocks8,
+ .-_gcry_chacha20_poly1305_s390x_vx_blocks8;)
+
#endif /*HAVE_GCC_INLINE_ASM_S390X_VX*/
#endif /*__s390x__*/
diff --git a/cipher/chacha20.c b/cipher/chacha20.c
index 7b283080..497594a0 100644
--- a/cipher/chacha20.c
+++ b/cipher/chacha20.c
@@ -189,6 +189,18 @@ unsigned int _gcry_chacha20_s390x_vx_blocks8(u32 *state, byte *dst,
unsigned int _gcry_chacha20_s390x_vx_blocks4_2_1(u32 *state, byte *dst,
const byte *src, size_t nblks);
+#undef USE_S390X_VX_POLY1305
+#if SIZEOF_UNSIGNED_LONG == 8
+#define USE_S390X_VX_POLY1305 1
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+
+unsigned int _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ u32 *state, byte *dst, const byte *src, size_t nblks,
+ POLY1305_STATE *st, const byte *poly1305_src);
+#endif /* SIZEOF_UNSIGNED_LONG == 8 */
+
#endif /* USE_S390X_VX */
#ifdef USE_ARMV7_NEON
@@ -759,6 +771,48 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
inbuf += 4 * CHACHA20_BLOCK_SIZE;
}
#endif
+#ifdef USE_S390X_VX_POLY1305
+ else if (ctx->use_s390x && length >= 2 * CHACHA20_BLOCK_SIZE * 8)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks8(ctx->input, outbuf, inbuf, 8);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 8 * CHACHA20_BLOCK_SIZE;
+ outbuf += 8 * CHACHA20_BLOCK_SIZE;
+ inbuf += 8 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 4)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 4);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 4 * CHACHA20_BLOCK_SIZE;
+ outbuf += 4 * CHACHA20_BLOCK_SIZE;
+ inbuf += 4 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE * 2)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 2);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 2 * CHACHA20_BLOCK_SIZE;
+ outbuf += 2 * CHACHA20_BLOCK_SIZE;
+ inbuf += 2 * CHACHA20_BLOCK_SIZE;
+ }
+ else if (ctx->use_s390x && length >= CHACHA20_BLOCK_SIZE)
+ {
+ nburn = _gcry_chacha20_s390x_vx_blocks4_2_1(ctx->input, outbuf, inbuf, 1);
+ burn = nburn > burn ? nburn : burn;
+
+ authptr = outbuf;
+ length -= 1 * CHACHA20_BLOCK_SIZE;
+ outbuf += 1 * CHACHA20_BLOCK_SIZE;
+ inbuf += 1 * CHACHA20_BLOCK_SIZE;
+ }
+#endif
if (authptr)
{
@@ -862,6 +916,44 @@ _gcry_chacha20_poly1305_encrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_S390X_VX_POLY1305
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE &&
+ authoffset >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE &&
+ authoffset >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, authptr);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ authptr += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
if (authoffset > 0)
{
_gcry_poly1305_update (&c->u_mode.poly1305.ctx, authptr, authoffset);
@@ -1026,6 +1118,40 @@ _gcry_chacha20_poly1305_decrypt(gcry_cipher_hd_t c, byte *outbuf,
}
#endif
+#ifdef USE_S390X_VX_POLY1305
+ if (ctx->use_s390x)
+ {
+ if (length >= 8 * CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+ nblocks -= nblocks % 8;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks8(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+
+ if (length >= CHACHA20_BLOCK_SIZE)
+ {
+ size_t nblocks = length / CHACHA20_BLOCK_SIZE;
+
+ nburn = _gcry_chacha20_poly1305_s390x_vx_blocks4_2_1(
+ ctx->input, outbuf, inbuf, nblocks,
+ &c->u_mode.poly1305.ctx.state, inbuf);
+ burn = nburn > burn ? nburn : burn;
+
+ length -= nblocks * CHACHA20_BLOCK_SIZE;
+ outbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ inbuf += nblocks * CHACHA20_BLOCK_SIZE;
+ }
+ }
+#endif
+
while (length)
{
size_t currlen = length;
diff --git a/cipher/poly1305-s390x.S b/cipher/poly1305-s390x.S
new file mode 100644
index 00000000..844245f6
--- /dev/null
+++ b/cipher/poly1305-s390x.S
@@ -0,0 +1,87 @@
+/* poly1305-s390x.S - zSeries implementation of Poly1305
+ *
+ * Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is part of Libgcrypt.
+ *
+ * Libgcrypt is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU Lesser General Public License as
+ * published by the Free Software Foundation; either version 2.1 of
+ * the License, or (at your option) any later version.
+ *
+ * Libgcrypt is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+#include <config.h>
+#if defined(HAVE_GCC_INLINE_ASM_S390X)
+
+#include "asm-poly1305-s390x.h"
+
+.text
+
+.balign 8
+.globl _gcry_poly1305_s390x_blocks1
+ELF(.type _gcry_poly1305_s390x_blocks1,@function;)
+
+_gcry_poly1305_s390x_blocks1:
+ /* input:
+ * %r2: poly1305-state
+ * %r3: src
+ * %r4: len
+ * %r5: high_pad
+ */
+ CFI_STARTPROC();
+
+ stmg %r6, %r14, 6 * 8(%r15);
+
+ lgr POLY_RSTATE, %r2;
+ lgr POLY_RSRC, %r3;
+ srlg %r0, %r4, 4;
+
+ cgije %r5, 0, .Lpoly_high0;
+
+ POLY1305_LOAD_STATE();
+
+.balign 4
+.Lpoly_loop_high1:
+ POLY1305_BLOCK_PART1(0 * 16);
+ INC_POLY1305_SRC(1 * 16);
+.Lpoly_block_part2:
+ POLY1305_BLOCK_PART2();
+ POLY1305_BLOCK_PART3();
+ POLY1305_BLOCK_PART4();
+ POLY1305_BLOCK_PART5();
+ POLY1305_BLOCK_PART6();
+ POLY1305_BLOCK_PART7();
+ POLY1305_BLOCK_PART8();
+
+ brctg %r0, .Lpoly_loop_high1;
+
+.balign 4
+.Lpoly_done:
+ POLY1305_STORE_STATE();
+
+ lmg %r6, %r14, 6 * 8(%r15);
+ xgr %r2, %r2;
+ br %r14;
+
+.balign 4
+.Lpoly_high0:
+ lghi %r0, 1;
+ POLY1305_LOAD_STATE();
+ POLY1305_BLOCK_PART1_HB(0 * 16, 0);
+ j .Lpoly_block_part2;
+
+ CFI_ENDPROC();
+ELF(.size _gcry_poly1305_s390x_blocks1,
+ .-_gcry_poly1305_s390x_blocks1;)
+
+#endif /*HAVE_GCC_INLINE_ASM_S390X*/
+#endif /*__s390x__*/
diff --git a/cipher/poly1305.c b/cipher/poly1305.c
index adcb6792..6cb4d2b7 100644
--- a/cipher/poly1305.c
+++ b/cipher/poly1305.c
@@ -35,6 +35,9 @@
static const char *selftest (void);
+#undef HAVE_ASM_POLY1305_BLOCKS
+
+
#undef USE_MPI_64BIT
#undef USE_MPI_32BIT
#if BYTES_PER_MPI_LIMB == 8 && defined(HAVE_TYPE_U64)
@@ -46,6 +49,35 @@ static const char *selftest (void);
#endif
+/* USE_S390X_ASM indicates whether to enable zSeries code. */
+#undef USE_S390X_ASM
+#if BYTES_PER_MPI_LIMB == 8
+# if defined (__s390x__) && __GNUC__ >= 4 && __ARCH__ >= 9
+# if defined(HAVE_GCC_INLINE_ASM_S390X)
+# define USE_S390X_ASM 1
+# endif /* USE_S390X_ASM */
+# endif
+#endif
+
+
+#ifdef USE_S390X_ASM
+
+#define HAVE_ASM_POLY1305_BLOCKS 1
+
+extern unsigned int _gcry_poly1305_s390x_blocks1(void *state,
+ const byte *buf, size_t len,
+ byte high_pad);
+
+static unsigned int
+poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
+ byte high_pad)
+{
+ return _gcry_poly1305_s390x_blocks1(&ctx->state, buf, len, high_pad);
+}
+
+#endif /* USE_S390X_ASM */
+
+
static void poly1305_init (poly1305_context_t *ctx,
const byte key[POLY1305_KEYLEN])
{
@@ -146,6 +178,8 @@ static void poly1305_init (poly1305_context_t *ctx,
ADD_1305_64(H2, H1, H0, (u64)0, x0_hi, x0_lo); \
} while (0)
+#ifndef HAVE_ASM_POLY1305_BLOCKS
+
static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
byte high_pad)
@@ -201,6 +235,8 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
return 6 * sizeof (void *) + 18 * sizeof (u64);
}
+#endif /* !HAVE_ASM_POLY1305_BLOCKS */
+
static unsigned int poly1305_final (poly1305_context_t *ctx,
byte mac[POLY1305_TAGLEN])
{
@@ -354,6 +390,8 @@ static unsigned int poly1305_final (poly1305_context_t *ctx,
ADD_1305_32(H4, H3, H2, H1, H0, 0, x3_lo, x2_lo, x1_lo, x0_lo); \
} while (0)
+#ifndef HAVE_ASM_POLY1305_BLOCKS
+
static unsigned int
poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
byte high_pad)
@@ -403,6 +441,8 @@ poly1305_blocks (poly1305_context_t *ctx, const byte *buf, size_t len,
return 6 * sizeof (void *) + 28 * sizeof (u32);
}
+#endif /* !HAVE_ASM_POLY1305_BLOCKS */
+
static unsigned int poly1305_final (poly1305_context_t *ctx,
byte mac[POLY1305_TAGLEN])
{
diff --git a/configure.ac b/configure.ac
index c97d050e..a121093d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -2042,6 +2042,14 @@ AC_CACHE_CHECK([whether GCC inline assembler supports zSeries instructions],
:
: "a" (fac)
: "memory");
+ asm volatile ("risbgn %%r11, %%r11, 0, 129, 0\n\t"
+ :
+ :
+ : "memory", "r11");
+ asm volatile ("algrk %%r14, %%r14, %%r14\n\t"
+ :
+ :
+ : "memory", "r14");
return (unsigned int)r1 ^ reg0;
}
]])],
diff --git a/tests/basic.c b/tests/basic.c
index 436c1da8..46e4c0f8 100644
--- a/tests/basic.c
+++ b/tests/basic.c
@@ -4857,10 +4857,146 @@ _check_poly1305_cipher (unsigned int step)
"\x3f\xf4\xde\xf0\x8e\x4b\x7a\x9d\xe5\x76\xd2\x65\x86\xce\xc6\x4b"
"\x61\x16",
"\x1a\xe1\x0b\x59\x4f\x09\xe2\x6a\x7e\x90\x2e\xcb\xd0\x60\x06\x91" },
+    /* generated with C implementation */
+ { GCRY_CIPHER_CHACHA20,
+ "\x1c\x92\x40\xa5\xeb\x55\xd3\x8a\xf3\x33\x88\x86\x04\xf6\xb5\xf0"
+ "\x47\x39\x17\xc1\x40\x2b\x80\x09\x9d\xca\x5c\xbc\x20\x70\x75\xc0",
+ "\x00\x00\x00\x00\x01\x02\x03\x04\x05\x06\x07\x08", 12,
+ "\xf3\x33\x88\x86\x00\x00\x00\x00\x00\x00\x4e\x91", 12,
+ "\xb0\x58\x83\x17\x3a\x8e\x69\xf2\x18\x9d\x71\xe4\x8a\x0b\x7a\xcd"
+ "\xe2\xd8\xb9\x8b\xdf\x99\xc2\x6d\x05\x4b\x44\x1e\x65\x5d\xda\xd5"
+ "\x79\xf0\x19\xab\x94\x50\xd0\xc5\x54\xfe\x76\xc8\xd9\xf3\x39\x33"
+ "\x9c\x0f\x27\x89\x85\x99\xe3\xed\x5c\x31\x04\xa6\x20\xab\xb3\x78"
+ "\xac\x31\xba\x21\x8c\xac\x70\xd1\xe2\x92\xd6\x50\x58\x69\xab\xd4"
+ "\x38\xdc\x9c\x71\x81\xf7\xf1\x68\x10\x50\x07\x09\x0e\x51\x49\xd2"
+ "\x10\x9a\x2e\x78\xfb\xc7\xd3\xc2\x84\xda\xf2\x52\x17\x2c\xa6\xe8"
+ "\x56\x60\x80\x46\xed\xfb\x9f\xab\xc2\x01\xf0\x06\x6b\x6e\xcc\xf6"
+ "\x55\x3e\x81\xc7\x71\x9f\x10\xf0\x8e\x5a\x4b\xf6\xae\x90\x75\x03"
+ "\x4f\xb3\xb4\xff\x66\xfa\xe3\xb6\x1c\xca\x0c\x75\x8a\x08\x3d\xce"
+ "\x58\x69\x9d\xa9\x19\x29\xda\x2f\xa1\xb2\xae\xa7\x83\xd5\x92\xc2"
+ "\x15\xdc\xef\x76\xd2\xd1\x9f\xb4\x7f\x3e\xb3\x7a\xa8\x3e\xba\xa3"
+ "\x9e\x2e\x73\xe3\x4d\xdc\x50\xba\x5b\xb0\x8b\x1a\x87\x21\x03\x93"
+ "\x74\x20\x01\xda\x38\x85\x1c\x3c\x57\x51\x09\x0e\xd8\xfc\x2b\xef"
+ "\x38\x8e\x11\xa4\x9e\x11\xcc\xc5\x9f\x4c\xc2\x0d\x3e\x5f\x73\x40"
+ "\x5a\xf4\x5b\x57\x84\x6e\xc7\xd0\x8e\xad\x1c\x1b\xae\x59\xba\xf5"
+ "\x77\xed\x44\x08\x9c\x9b\xfd\x88\xd9\x27\xe8\x43\xe8\xdd\x86\xfd"
+ "\x05\x3a\xc2\x11\x88\x98\x87\xcb\xa1\x72\xc2\x52\x5c\xd1\x1a\x40"
+ "\x80\xe2\x1e\xe8\x9b\x4e\x63\x9b\xfb\x58\x11\x44\x36\x35\x83\x9b"
+ "\x20\x9b\x4b\x58\xef\x1f\xfa\xe1\xb0\xe0\xb8\x60\x87\x0b\xdb\x83"
+ "\x6f\xeb\xc0\x80\x63\xa8\xc4\x22\x0f\x1d\xec\x9b\x44\xfa\xd3\x13"
+ "\x75\xb0\xfe\x74\x3c\xde\x9e\xb4\x91\x72\xc5\xf6\x36\x14\x18\x2d"
+ "\x15\x2e\x6b\x34\xcf\xed\x86\x4f\x1b\x56\xcf\x09\x8f\x3d\xd1\x8d"
+ "\x01\x7c\xba\x6a\xf4\x82\xdc\xf6\x9e\xc9\x79\xd4\x9e\x50\xc2\x9a"
+ "\x4f\x90\x10\x44\xd5\xcf\x6b\x1d\xb3\xce\x7c\xeb\x3f\x8f\xbc\xe6"
+ "\x76\xad\x78\x97\xee\xaf\x66\x73\xe4\x11\xb9\x6c\xf4\xc1\x1a\x76"
+ "\xd6\x54\x4c\x6c\x44\x58\xec\xd9\x8f\xf9\xc6\x7f\x71\x95\x04\xfe"
+ "\x6b\x42\xd6\x4f\xc6\xa8\xc1\xfa\x1e\x2c\xf2\x49\x6a\x5a\xe5\x28"
+ "\x34\x30\x05\xc1\x21\x3a\x5f\xfd\xaf\x61\x1f\xa0\x91\xd4\x17\xcf"
+ "\x65\x9d\xf5\xdb\x4b\xc2\x3d\x12\xed\xe1\x4e\xf1\x34\x50\x13\xa7"
+ "\x3f\xe6\x26\xcb\xc9\xb3\x64\x69\xa9\x82\x21\xec\x64\xa9\x2e\x83"
+ "\xa9\x9d\xa0\xbe\x20\xef\x5f\x71\x45\xe7\x9f\x75\xa3\x72\x16\xef"
+ "\x1b\xf7\x9a\x15\xe2\x75\x92\x39\xbb\xb1\x4f\x34\xf4\x88\x0d\xcf"
+ "\xbf\xd6\xfe\x5d\x61\x14\x45\x83\xf9\x6a\x3e\x81\x0f\x14\x78\xda"
+ "\x94\xe2\xce\x7d\x1c\x15\xd7\xe0\x95\x1d\xd8\x96\xc2\x11\xb1\x55"
+ "\xae\xc6\x95\x43\x38\x0a\x01\xc2\x30\xb8\x1b\x12\x39\x98\x58\x20"
+ "\xbd\x65\x50\x1d\x17\x13\x02\xb9\xe4\x88\x39\x72\xc8\x58\xa0\xa8"
+ "\x8f\xb9\xc2\x78\x82\x3a\x56\xe8\x0d\xf9\x1b\xbb\xfb\xf0\x5b\xc4"
+ "\x9a\x2d\xf0\xd5\x57\x6f\xce\x4b\xb6\x3e\x1b\xbf\x54\xb4\x3e\x4e"
+ "\x52\x5c\x2e\x6b\x5e\x01\xd1\xb3\xb5\x16\x67\xe4\x16\xad\x3c\x4d"
+ "\x1c\xb2\xc0\x54\xcc\xf9\xba\x11\x85\xdf\x43\x1a\xfb\x55\x9b\x88"
+ "\x27\x9e\x17\x29\x41\x7d\x2a\xb4\xf6\x61\x93\xa5\x1f\x5b\xb3\x06"
+ "\xbe\x86\x40\x11\xc6\xfc\x36\x44\xdb\xbf\x4c\x6b\x21\x15\xa9\x10"
+ "\x01\xdc\x53\x9c\x57\x27\xbe\x55\x19\x86\x17\x96\xfa\xdc\x4d\xf4"
+ "\xd9\x79\xbe\x6c\x29\x1b\xed\xbd\x09\x72\xb4\xbf\x88\xc7\x52\x39"
+ "\x5f\x62\x35\xad\x41\x87\xa6\xaa\x99\x20\xbc\x7d\x97\x67\x83\xa5"
+ "\xc3\x43\xc6\x7f\x31\xb9\x0c\xe1\x82\xa5\x66\x9a\x58\xe3\xaf\x6b"
+ "\x59\x09\x5b\xad\xed\xc2\x57\x66\x4e\x72\xb0\xaa\x0d\xeb\x9c\x48"
+ "\x3f\x0b\xaf\xc6\x46\x06\x54\x3a\x2a\x19\xb3\x9d\xde\xd9\xa0\xcf"
+ "\x71\x69\x33\xe8\x2c\xa8\x56\x8c\x0b\xae\x41\xc7\xb5\xfd\xca\xea"
+ "\x0f\xd1\xd7\xe0\x3e\xf6\xf5\xd1\xb2\x57\x21\x00\x32\xca\x02\x4d"
+ "\x18\xbe\x2c\x25\xe9\xbe\x0a\x34\x44\x92\xaa\x43\x09\xf7\xb4\x35"
+ "\xac\x65\xc3\xc1\x4c\x66\x74\x91\x9f\xae\xe2\x27\x37\x8a\xfe\x13"
+ "\x57\xf0\x39\x30\xf0\x06\xef\xa0\x5f\x90\xb7\xfa\xd9\x42\x3e\xcb"
+ "\xdc\x9c\x44\x36\x13\x8e\x66\xbc\x85\xe8\xfa\x2c\x73\xa5\x87\xbd"
+ "\x63\x98\x42\x56\x1a\xe9\xc4\x80\xa1\x0e\xd5\x9a\x27\xd2\x82\x20"
+ "\x08\xe5\x98\x60\x00\x6d\xd9\x53\x9b\xae\x67\xfb\x03\xff\x82\xf1"
+ "\xc6\x9b\x0b\xf1\x2c\x97\x89\x1c\x8e\x84\xd0\xb3\x2a\x44\xa3\xb2"
+ "\x77\x1d\xf2\x2e\x6a\xf7\x05\x67\x32\x21\xca\x39\x2e\x7f\x1a\x69"
+ "\x21\xdd\xaa\xfc\x19\xad\xc5\xf8\xfe\x6f\x17\x9e\x32\x64\xf8\xeb"
+ "\x98\x8a\x5e\x2e\x89\xea\xfb\xed\xd7\x09\x1a\x7f\xa5\xf6\xe3\xd4"
+ "\x33\x60\xbb\xc2\x2b\x1a\xd6\x4c\x03\xe1\xc3\xc6\x90\x0e\x7a\x89"
+ "\xe8\x50\x4b\x47\xc2\x91\x5d\x2a\x49\xf5\xb0\x5f\x69\xbb\x88\x51"
+ "\x0c\xa2\xc0\x88\x99\x91\xcd\x77\x11\x31\x3a\x8f\x99\x03\xd7\x5e",
+ 1024,
+ "\x9d\x96\x71\x67\x3d\x66\x16\x72\x55\x29\x61\x42\x77\x99\x4a\x50"
+ "\xdd\x2a\x80\x56\x8f\xb7\x50\x82\x80\x63\x47\x7b\xc1\x44\x3b\x02"
+ "\x5b\xe8\x96\x93\x97\x6c\xff\x42\x90\x40\xf9\xe9\x93\xfe\x7e\xa3"
+ "\x4c\xd9\xe8\xdc\xda\xf7\x8f\xcd\xe7\xa7\x1f\xaa\x7c\x8b\x07\xda"
+ "\xf0\x70\x4d\x47\x8e\x87\x86\x71\x1e\x7a\x13\x7b\x9c\x42\x5d\x30"
+ "\x0c\x04\xfb\x7b\xe0\x0e\xa7\xb1\x5c\x89\xf7\xdd\x81\x0a\xe0\xe4"
+ "\xe2\x69\xa2\x36\x60\x45\x1c\xcc\x27\x2f\xaf\x70\x59\x6d\xc5\xb4"
+ "\x40\x04\x69\x1d\xe8\xf3\xf5\x7e\x49\xd7\x81\x12\x5b\xd3\xc6\x77"
+ "\x82\x5c\x9e\x91\x6b\x6b\x7d\xd7\x45\xb8\x39\x94\x0a\x1a\xb4\xc4"
+ "\xff\xba\x05\x7b\x0b\xba\xe1\x81\x90\x29\xdd\xb5\x58\x0b\x1f\x82"
+ "\x9e\x4d\xdd\x1b\xc1\x62\x14\x1a\x8f\xc1\x8c\xf6\x46\x07\xb2\xcd"
+ "\x6a\xb5\xa1\x06\x4c\xc3\xa3\x3f\x02\x08\xe2\x29\x3c\x05\xbd\xcb"
+ "\xf0\xfa\x27\xf1\x7b\x48\x45\x46\x62\x88\x01\xb8\xd3\x0a\x29\xbc"
+ "\xd6\xbb\x20\xee\x75\x5f\x29\x0c\x47\x9e\x0f\x1d\xdf\x81\x39\x9a"
+ "\x1c\x48\x69\x09\xeb\x42\xae\x71\x11\x4c\x53\x9c\x69\xa6\x71\x50"
+ "\x45\x4d\x31\x71\xdd\xdb\xb1\x64\x37\xbf\x03\x76\xb2\x44\xf9\xbb"
+ "\xa3\x25\x6b\xcf\xb0\x9f\x1d\x78\xdf\x93\xde\x2d\x57\x23\x6f\xff"
+ "\x02\xf8\xc6\xf5\x5f\x4b\xd5\x8a\x15\xc2\x5f\x9d\x47\x3b\x2f\x8f"
+ "\x36\x93\x4a\x96\xae\x57\xaa\xd7\x6e\xea\x45\x94\xfb\xa2\xab\x56"
+ "\xae\x7e\xb3\xc5\x87\xa5\xd4\x2d\xf0\x99\x1e\x0a\x05\xb8\x33\xe4"
+ "\x89\x6c\x9e\x6d\x8c\xf1\xb4\xaa\x1f\xaa\xfb\x4b\x40\x90\xc0\x50"
+ "\xf3\x7d\x2a\x67\x68\x25\x0a\x9a\x89\x1f\x90\xfd\xb0\x9d\x7d\xaf"
+ "\x72\x22\xeb\x22\xb9\x63\x5f\x2c\x54\x49\xa3\x99\xc4\x74\xab\xc0"
+ "\x2c\x85\x31\x26\x84\x57\xfd\xce\x34\x10\x63\x57\x9f\x0c\x0a\xa3"
+ "\x02\xb0\x87\x36\xf5\xf8\x1e\x66\x81\x74\x2c\x3e\x90\xc0\x10\xf1"
+ "\x53\xd4\xc3\x45\x9b\xe2\x58\xcf\x86\x2e\xf4\xb3\x11\xff\xe6\xc8"
+ "\x5c\x74\x6e\xb4\xd9\x52\x2c\x52\x71\x5e\xb4\xf1\xca\xa7\x1c\x09"
+ "\x6a\x2d\xc0\x20\x38\xf5\x61\xdc\xd9\x8d\x42\x71\x65\xf8\xce\xa7"
+ "\xcb\x2c\x44\x09\x87\x5a\x02\xdd\x8c\xe1\xec\xd0\xe1\xeb\x4d\x25"
+ "\x70\x57\xbd\xc7\x1b\xee\xb5\xc0\x81\xc5\x75\x45\xb8\xb7\xad\xfd"
+ "\x33\xdc\xbe\x09\x71\xd0\xd4\xee\xf7\x37\x4e\x6f\x80\x5f\xec\x3f"
+ "\x35\x75\x39\xaa\x41\xe6\x62\x17\xc5\x8f\xa4\xa7\x31\xd6\xd5\xe9"
+ "\x56\xc2\xc7\x1d\xf1\x58\xf6\xad\x3b\xbc\xbe\x65\x12\xd4\xfb\xe2"
+ "\x0a\x5a\x64\x9e\xad\x70\x1d\x95\xbd\x24\x1a\xa9\x99\xc0\x70\x74"
+ "\xb1\x79\x01\x4f\xfd\x5d\x76\xa7\xd9\x53\x3d\x87\x2b\x51\xb4\xf3"
+ "\x17\xa5\x41\xe9\x8b\xba\xd3\x69\xcd\xe6\x44\x0f\x18\x8f\x59\x0d"
+ "\xb0\xb8\x2a\x7f\xbb\x16\x51\xf5\xe8\xad\xda\x66\xaa\x3a\xb6\x7d"
+ "\x10\x13\x8d\xd9\x7d\x15\x09\x80\x7b\x00\x67\x96\x90\x21\x3e\xd4"
+ "\x1a\xe8\x3b\x1c\x78\x31\x9b\x63\x64\xb9\x1b\x50\x11\x93\x48\x13"
+ "\x89\xcb\xba\x57\x23\xcd\x95\x95\xd5\xee\x8b\x0d\xb4\xdf\x0c\x8a"
+ "\xae\xae\x55\x3f\x93\xad\xc1\x3e\xe5\x31\x20\x73\x58\xb0\x0b\xba"
+ "\xf5\x03\x7b\x50\x39\xa3\x66\xa9\x82\x47\x65\x29\xa8\x49\xd7\x5c"
+ "\x51\x89\x97\x03\x31\x11\x75\x83\x6e\x4e\x80\x2d\x57\x93\x88\xec"
+ "\x0e\x22\xa8\xde\x50\x99\x2c\xaa\xaf\x60\x3a\x74\xa0\x31\x16\x37"
+ "\xcd\x8a\x4d\xda\x40\x1d\x0c\xf1\xc4\x7a\xd0\xaa\xf4\xa7\x55\xe3"
+ "\xa4\xe3\x9d\x27\x4f\x81\xc6\x07\x74\x13\x8e\x4b\xd9\x6c\x33\xba"
+ "\x28\x8d\xb7\x79\x36\x29\xfc\x98\x91\x29\x87\xe7\xf6\x92\xb8\x7c"
+ "\xe4\xca\xb7\x21\x49\x8c\x01\x59\xad\x65\x37\x62\x9b\xba\x40\xc1"
+ "\x79\x87\xe5\x48\x58\xe3\x0e\x3a\xda\x31\x03\x55\x36\x64\x00\xda"
+ "\x61\x8a\x0a\x93\xdc\x82\xcc\x63\x40\xb5\x46\xde\xf0\x8c\x3f\x6d"
+ "\x3e\x32\xf2\xe6\x1d\x37\xf0\xd1\x7e\x33\x52\xb6\x97\xc3\x80\x64"
+ "\xa4\x0d\x5f\x97\xa5\xd8\xa3\x47\x1a\x83\x1f\xd0\x52\x81\xb9\xd9"
+ "\x7a\x32\xe6\xf1\x3e\x7d\xdc\x01\x5d\xb8\x44\x12\xc0\x1f\x72\x72"
+ "\x8b\x0e\xfa\x05\x37\x73\xbd\xc4\x06\x67\x18\xd7\xd4\x80\x2c\x2c"
+ "\x13\x06\xfe\x82\x5b\x65\x88\xe3\x0b\x06\x3c\xe6\xe4\xd0\x8f\x24"
+ "\x6a\x6a\x4d\x21\x4c\x2d\x05\x76\x12\xf9\xee\xbf\xb5\x5e\xcd\x03"
+ "\xf0\x5b\x35\x82\xb7\x1d\x7b\xca\xa6\x14\x40\x68\xd2\xa5\x49\x34"
+ "\x69\xb7\x05\x48\xf9\xdb\x93\xd4\x0b\x45\x8d\xb3\x1e\xa3\xf9\x5d"
+ "\x8c\x18\xc5\x40\x14\x67\xc5\x40\xbe\x61\x53\x74\x52\x94\x6c\x5e"
+ "\xc6\xdf\xd0\xe7\xe5\xbd\x4b\xca\x89\xca\xf6\xf4\xc5\x6f\xf6\x87"
+ "\x9e\x3a\x11\x5a\xa8\xcd\x83\x70\x19\x63\x8a\xaf\x08\xb1\x33\xa9"
+ "\x2a\xcc\xde\x7f\xd2\x63\xfb\x85\x40\x77\x40\x8f\x9d\xa0\x7c\xed"
+ "\x8d\xe5\xe5\x31\x05\x75\xf2\x7e\xab\x22\x54\xbf\xfe\xd3\x1f\x45"
+ "\x95\x0d\x6d\x07\x6a\x90\x06\xd6\x45\x97\xc0\x82\x88\xfc\xd8\xd0",
+ "\xf1\xef\xf4\x8d\x9c\xfa\x92\x10\xd9\x4f\x22\x3f\x2f\x75\xe1\x8b" },
};
gcry_cipher_hd_t hde, hdd;
- unsigned char out[1024];
+ unsigned char out[2048];
unsigned char tag[16];
int i, keylen;
gcry_error_t err = 0;