Merge branch 'sha256-compress-n' into master-updates

author: Niels Möller <nisse@lysator.liu.se> 2022-08-15 09:27:36 +0200
committer: Niels Möller <nisse@lysator.liu.se> 2022-08-15 09:27:36 +0200
commit: 7328fb0df069c9ce42b1a58f6788be6ea3fc2419 (patch)
tree: c00c59d414c9161e49faea43ec4a0b21132b718b /x86_64/sha_ni/sha256-compress-n.asm
parent: d618864183ccfdcd0d1b5443111fbaf9a5934517 (diff)
parent: 6a384afa5f5b498384d629c3257b6c7f62f459eb (diff)
download: nettle-7328fb0df069c9ce42b1a58f6788be6ea3fc2419.tar.gz
1 files changed, 187 insertions, 0 deletions
diff --git a/x86_64/sha_ni/sha256-compress-n.asm b/x86_64/sha_ni/sha256-compress-n.asm
new file mode 100644
index 00000000..005909df
--- /dev/null
+++ b/x86_64/sha_ni/sha256-compress-n.asm
@@ -0,0 +1,187 @@
+C x86_64/sha_ni/sha256-compress-n.asm
+
+ifelse(`
+   Copyright (C) 2018, 2022 Niels Möller
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+')
+
+	.file "sha256-compress-n.asm"
+define(`STATE',     `%rdi')	C Argument 1 (SysV order): uint32_t *state
+define(`K',         `%rsi')	C Argument 2: const uint32_t *k
+define(`BLOCKS',    `%rdx')	C Argument 3: size_t blocks
+define(`INPUT',     `%rcx')	C Argument 4: const uint8_t *input
+
+define(`MSGK',      `%xmm0')	C Implicit operand of sha256rnds2
+define(`MSG0',      `%xmm1')	C Message words, first loaded from input bytes 0-15
+define(`MSG1',      `%xmm2')	C Message words, first loaded from input bytes 16-31
+define(`MSG2',      `%xmm3')	C Message words, first loaded from input bytes 32-47
+define(`MSG3',      `%xmm4')	C Message words, first loaded from input bytes 48-63
+define(`ABEF',      `%xmm5')	C One half of the working state
+define(`CDGH',      `%xmm6')	C The other half of the working state
+define(`ABEF_ORIG', `%xmm7')	C Copy of ABEF taken at the start of each block
+define(`CDGH_ORIG', `%xmm8')	C Copy of CDGH taken at the start of each block
+define(`SWAP_MASK', `%xmm9')	C pshufb control mask, loaded from .Lswap_mask
+define(`TMP',       `%xmm10')	C Scratch register
+
+C QROUND(M0, M1, M2, M3, R): four rounds from M0 and the constants at K + 4*R bytes; updates M1 and M3 (M2 is unused); clobbers MSGK, TMP
+define(`QROUND', `
+	movdqa	eval($5*4)(K), MSGK	C Aligned load of four round constants at byte offset 4*R
+	paddd	$1, MSGK		C Add the four message words held in M0
+	sha256rnds2 ABEF, CDGH		C Two rounds, result written to the CDGH register
+	pshufd	`$'0xe, MSGK, MSGK	C Bring the two high dwords down to the low half
+	sha256rnds2 CDGH, ABEF		C Two more rounds, result written to the ABEF register
+	movdqa	$1, TMP
+	palignr	`$'4, $4, TMP		C TMP = dwords 1-3 of M3 followed by dword 0 of M0
+	paddd	TMP, $2			C Add those words to M1
+	sha256msg2 $1, $2		C Final step of the message schedule update of M1
+	sha256msg1 $1, $4		C Intermediate step of the message schedule update of M3
+	')
+
+C FIXME: Do something more clever, taking the pshufd into account.
+C TRANSPOSE(ABCD, EFGH, scratch) --> untouched, ABEF, CDGH (letters list the dwords from most to least significant)
+define(`TRANSPOSE', `
+	movdqa	$2, $3		C scratch = copy of the second argument
+	punpckhqdq $1, $2	C second argument: low half = its old high qword, high half = high qword of the first
+	punpcklqdq $1, $3	C scratch: low half = old low qword of the second, high half = low qword of the first
+')
+
+	C const uint8_t *
+	C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k,
+	C                           size_t blocks, const uint8_t *input)
+	C Compresses BLOCKS 64-byte blocks read from INPUT into STATE; returns INPUT advanced past them.
+	.text
+	ALIGN(16)			C NOTE(review): presumably 16-byte alignment, needed by the movdqa load of the mask -- confirm
+.Lswap_mask:				C pshufb control bytes: reverse the byte order within each 32-bit word
+	.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12
+PROLOGUE(_nettle_sha256_compress_n)
+	W64_ENTRY(4, 11)		C 4 arguments, 11 xmm registers (xmm0-xmm10); NOTE(review): macro defined elsewhere, presumably Win64 ABI glue
+	test	BLOCKS, BLOCKS
+	jz	.Lend			C Zero blocks: STATE is not touched and INPUT is returned unchanged
+
+	movups	(STATE), TMP		C State words 0-3, unaligned load
+	movups	16(STATE), ABEF		C State words 4-7
+
+	pshufd	$0x1b, TMP, TMP		C 0x1b reverses the order of the four dwords
+	pshufd	$0x1b, ABEF, ABEF
+
+	TRANSPOSE(TMP, ABEF, CDGH)	C Regroup the two halves into the ABEF and CDGH registers
+
+	movdqa	.Lswap_mask(%rip), SWAP_MASK
+
+.Loop:					C One iteration per 64-byte block
+	movups	(INPUT), MSG0		C Input bytes 0-15, no alignment required
+	pshufb	SWAP_MASK, MSG0		C Byte-swap each 32-bit word
+
+	movdqa	ABEF, ABEF_ORIG		C Keep the state from before this block; added back after round 63
+	movdqa	CDGH, CDGH_ORIG
+	C Each sha256rnds2 does two rounds and writes its destination register; MSGK is its implicit operand.
+	movdqa	(K), MSGK		C NOTE(review): movdqa requires K to be 16-byte aligned -- confirm against caller
+	paddd	MSG0, MSGK		C Round constants plus message words
+	sha256rnds2 ABEF, CDGH		C Round 0-1
+	pshufd	$0xe, MSGK, MSGK	C Bring the two high dwords down to the low half
+	sha256rnds2 CDGH, ABEF		C Round 2-3
+
+	movups	16(INPUT), MSG1		C Input bytes 16-31
+	pshufb	SWAP_MASK, MSG1
+
+	movdqa	16(K), MSGK
+	paddd	MSG1, MSGK
+	sha256rnds2 ABEF, CDGH		C Round 4-5
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF		C Round 6-7
+	sha256msg1 MSG1, MSG0		C Intermediate step of the schedule update of MSG0
+
+	movups	32(INPUT), MSG2		C Input bytes 32-47
+	pshufb	SWAP_MASK, MSG2
+
+	movdqa	32(K), MSGK
+	paddd	MSG2, MSGK
+	sha256rnds2 ABEF, CDGH		C Round 8-9
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF		C Round 10-11
+	sha256msg1 MSG2, MSG1		C Intermediate step of the schedule update of MSG1
+
+	movups	48(INPUT), MSG3		C Input bytes 48-63
+	pshufb	SWAP_MASK, MSG3
+
+	QROUND(MSG3, MSG0, MSG1, MSG2, 12)	C Round 12-15
+	QROUND(MSG0, MSG1, MSG2, MSG3, 16)	C Round 16-19
+	QROUND(MSG1, MSG2, MSG3, MSG0, 20)	C Round 20-23
+	QROUND(MSG2, MSG3, MSG0, MSG1, 24)	C Round 24-27
+	QROUND(MSG3, MSG0, MSG1, MSG2, 28)	C Round 28-31
+	QROUND(MSG0, MSG1, MSG2, MSG3, 32)	C Round 32-35
+	QROUND(MSG1, MSG2, MSG3, MSG0, 36)	C Round 36-39
+	QROUND(MSG2, MSG3, MSG0, MSG1, 40)	C Round 40-43
+	QROUND(MSG3, MSG0, MSG1, MSG2, 44)	C Round 44-47
+	QROUND(MSG0, MSG1, MSG2, MSG3, 48)	C Round 48-51
+	C Last 12 rounds: the same steps as QROUND, without the sha256msg1 step and, for the final four, without any schedule update.
+	movdqa	208(K), MSGK
+	paddd	MSG1, MSGK
+	sha256rnds2 ABEF, CDGH		C Round 52-53
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF		C Round 54-55
+	movdqa	MSG1, TMP
+	palignr	$4, MSG0, TMP		C TMP = dwords 1-3 of MSG0 followed by dword 0 of MSG1
+	paddd	TMP, MSG2
+	sha256msg2 MSG1, MSG2		C Finish MSG2, consumed by rounds 56-59
+
+	movdqa	224(K), MSGK
+	paddd	MSG2, MSGK
+	sha256rnds2 ABEF, CDGH		C Round 56-57
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF		C Round 58-59
+	movdqa	MSG2, TMP
+	palignr	$4, MSG1, TMP		C TMP = dwords 1-3 of MSG1 followed by dword 0 of MSG2
+	paddd	TMP, MSG3
+	sha256msg2 MSG2, MSG3		C Finish MSG3, consumed by rounds 60-63
+
+	movdqa	240(K), MSGK
+	paddd	MSG3, MSGK
+	sha256rnds2 ABEF, CDGH		C Round 60-61
+	pshufd	$0xe, MSGK, MSGK
+	sha256rnds2 CDGH, ABEF		C Round 62-63
+
+	paddd ABEF_ORIG, ABEF		C Add the state saved at the top of the loop
+	paddd CDGH_ORIG, CDGH
+
+	add	$64, INPUT		C Advance to the next 64-byte block
+	dec	BLOCKS
+	jnz	.Loop			C Repeat until all blocks are consumed
+
+	TRANSPOSE(ABEF, CDGH, TMP)	C Regroup back: CDGH gets state words 0-3, TMP gets words 4-7, both in reversed dword order
+
+	pshufd	$0x1b, CDGH, CDGH	C Reverse the dwords back into memory order
+	pshufd	$0x1b, TMP, TMP
+	movups	CDGH, 0(STATE)		C Store state words 0-3
+	movups	TMP, 16(STATE)		C Store state words 4-7
+
+.Lend:
+	mov	INPUT, %rax		C Return value: INPUT, advanced by 64 bytes per processed block
+	W64_EXIT(4, 11)			C Same counts as W64_ENTRY above
+	ret
+EPILOGUE(_nettle_sha256_compress_n)
author	Niels Möller <nisse@lysator.liu.se>	2022-08-15 09:27:36 +0200
committer	Niels Möller <nisse@lysator.liu.se>	2022-08-15 09:27:36 +0200
commit	7328fb0df069c9ce42b1a58f6788be6ea3fc2419 (patch)
tree	c00c59d414c9161e49faea43ec4a0b21132b718b /x86_64/sha_ni/sha256-compress-n.asm
parent	d618864183ccfdcd0d1b5443111fbaf9a5934517 (diff)
parent	6a384afa5f5b498384d629c3257b6c7f62f459eb (diff)
download	nettle-7328fb0df069c9ce42b1a58f6788be6ea3fc2419.tar.gz