diff options
author | Niels Möller <nisse@lysator.liu.se> | 2022-07-05 19:47:39 +0200 |
---|---|---|
committer | Niels Möller <nisse@lysator.liu.se> | 2022-07-05 19:47:39 +0200 |
commit | fad1bf50a5087a5b88a5500965ba3959d11f997f (patch) | |
tree | 428299e99c30bb24ce2bf048e5a887ac2dbc8eca | |
parent | 12bb2223428be4326c580b1b1f8e9916a2839cb1 (diff) | |
download | nettle-fad1bf50a5087a5b88a5500965ba3959d11f997f.tar.gz |
Implement _nettle_sha256_compress_n, C and x86_64 asm
-rw-r--r-- | ChangeLog | 14 | ||||
-rw-r--r-- | Makefile.in | 2 | ||||
-rw-r--r-- | configure.ac | 6 | ||||
-rw-r--r-- | fat-setup.h | 4 | ||||
-rw-r--r-- | fat-x86_64.c | 17 | ||||
-rw-r--r-- | sha2-internal.h | 5 | ||||
-rw-r--r-- | sha256-compress-n.c (renamed from sha256-compress.c) | 123 | ||||
-rw-r--r-- | sha256.c | 30 | ||||
-rw-r--r-- | x86_64/fat/sha256-compress-n-2.asm (renamed from x86_64/fat/sha256-compress-2.asm) | 4 | ||||
-rw-r--r-- | x86_64/fat/sha256-compress-n.asm (renamed from x86_64/fat/sha256-compress.asm) | 4 | ||||
-rw-r--r-- | x86_64/sha256-compress-n.asm (renamed from x86_64/sha256-compress.asm) | 85 | ||||
-rw-r--r-- | x86_64/sha_ni/sha256-compress-n.asm (renamed from x86_64/sha_ni/sha256-compress.asm) | 42 |
12 files changed, 205 insertions, 131 deletions
@@ -1,6 +1,20 @@ 2022-07-05 Niels Möller <nisse@lysator.liu.se> * md-internal.h (MD_FILL_OR_RETURN): New file, new macro. + * sha256-compress-n.c (_nettle_sha256_compress_n): New file and + function, replacing... + * sha256-compress.c (_nettle_sha256_compress): ...deleted file and + function. + * sha2-internal.h (_nettle_sha256_compress_n): Declare new function. + * sha256.c (sha256_update, sha256_compress): Update to use + _nettle_sha256_compress_n and MD_FILL_OR_RETURN. + * x86_64/sha256-compress-n.asm: New file, replacing... + * x86_64/sha256-compress.asm: ...deleted file. + * x86_64/sha_ni/sha256-compress-n.asm: New file, replacing... + * x86_64/sha_ni/sha256-compress.asm: ...deleted file. + * fat-setup.h (sha256_compress_n_func): New typedef, replacing... + (sha256_compress_func): ...deleted typedef. + * fat-x86_64.c: Update fat setup. 2022-06-20 Niels Möller <nisse@lysator.liu.se> diff --git a/Makefile.in b/Makefile.in index ba536407..64027d4d 100644 --- a/Makefile.in +++ b/Makefile.in @@ -138,7 +138,7 @@ nettle_SOURCES = aes-decrypt-internal.c aes-decrypt.c aes-decrypt-table.c \ salsa20-set-nonce.c \ salsa20-128-set-key.c salsa20-256-set-key.c \ sha1.c sha1-compress.c sha1-meta.c \ - sha256.c sha256-compress.c sha224-meta.c sha256-meta.c \ + sha256.c sha256-compress-n.c sha224-meta.c sha256-meta.c \ sha512.c sha512-compress.c sha384-meta.c sha512-meta.c \ sha512-224-meta.c sha512-256-meta.c \ sha3.c sha3-permute.c \ diff --git a/configure.ac b/configure.ac index 73c6fc21..cb30dfb3 100644 --- a/configure.ac +++ b/configure.ac @@ -591,7 +591,7 @@ asm_replace_list="aes-encrypt-internal.asm aes-decrypt-internal.asm \ chacha-core-internal.asm \ salsa20-crypt.asm salsa20-core-internal.asm \ serpent-encrypt.asm serpent-decrypt.asm \ - sha1-compress.asm sha256-compress.asm sha512-compress.asm \ + sha1-compress.asm sha256-compress-n.asm sha512-compress.asm \ sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4" # Assembler files which generate additional object files if they are
used. @@ -607,7 +607,7 @@ asm_nettle_optional_list="cpuid.asm cpu-facility.asm \ chacha-2core.asm chacha-3core.asm chacha-4core.asm chacha-core-internal-2.asm \ ghash-set-key-2.asm ghash-update-2.asm \ salsa20-2core.asm salsa20-core-internal-2.asm \ - sha1-compress-2.asm sha256-compress-2.asm \ + sha1-compress-2.asm sha256-compress-n-2.asm \ sha3-permute-2.asm sha512-compress-2.asm \ umac-nh-n-2.asm umac-nh-2.asm" @@ -757,7 +757,7 @@ AH_VERBATIM([HAVE_NATIVE], #undef HAVE_NATIVE_salsa20_2core #undef HAVE_NATIVE_fat_salsa20_2core #undef HAVE_NATIVE_sha1_compress -#undef HAVE_NATIVE_sha256_compress +#undef HAVE_NATIVE_sha256_compress_n #undef HAVE_NATIVE_sha512_compress #undef HAVE_NATIVE_sha3_permute #undef HAVE_NATIVE_umac_nh diff --git a/fat-setup.h b/fat-setup.h index e77cce02..70bc2687 100644 --- a/fat-setup.h +++ b/fat-setup.h @@ -178,7 +178,9 @@ typedef void salsa20_crypt_func (struct salsa20_ctx *ctx, unsigned rounds, const uint8_t *src); typedef void sha1_compress_func(uint32_t *state, const uint8_t *input); -typedef void sha256_compress_func(uint32_t *state, const uint8_t *input, const uint32_t *k); +typedef const uint8_t * +sha256_compress_n_func(uint32_t *state, const uint32_t *k, + size_t blocks, const uint8_t *input); struct sha3_state; typedef void sha3_permute_func (struct sha3_state *state); diff --git a/fat-x86_64.c b/fat-x86_64.c index 47cf78ae..0a2fedf4 100644 --- a/fat-x86_64.c +++ b/fat-x86_64.c @@ -155,9 +155,9 @@ DECLARE_FAT_FUNC(nettle_sha1_compress, sha1_compress_func) DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, x86_64) DECLARE_FAT_FUNC_VAR(sha1_compress, sha1_compress_func, sha_ni) -DECLARE_FAT_FUNC(_nettle_sha256_compress, sha256_compress_func) -DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, x86_64) -DECLARE_FAT_FUNC_VAR(sha256_compress, sha256_compress_func, sha_ni) +DECLARE_FAT_FUNC(_nettle_sha256_compress_n, sha256_compress_n_func) +DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, x86_64) 
+DECLARE_FAT_FUNC_VAR(sha256_compress_n, sha256_compress_n_func, sha_ni) DECLARE_FAT_FUNC(_nettle_ghash_set_key, ghash_set_key_func) DECLARE_FAT_FUNC_VAR(ghash_set_key, ghash_set_key_func, c) @@ -228,14 +228,14 @@ fat_init (void) if (verbose) fprintf (stderr, "libnettle: using sha_ni instructions.\n"); nettle_sha1_compress_vec = _nettle_sha1_compress_sha_ni; - _nettle_sha256_compress_vec = _nettle_sha256_compress_sha_ni; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_sha_ni; } else { if (verbose) fprintf (stderr, "libnettle: not using sha_ni instructions.\n"); nettle_sha1_compress_vec = _nettle_sha1_compress_x86_64; - _nettle_sha256_compress_vec = _nettle_sha256_compress_x86_64; + _nettle_sha256_compress_n_vec = _nettle_sha256_compress_n_x86_64; } if (features.have_pclmul) @@ -315,9 +315,10 @@ DEFINE_FAT_FUNC(nettle_sha1_compress, void, (uint32_t *state, const uint8_t *input), (state, input)) -DEFINE_FAT_FUNC(_nettle_sha256_compress, void, - (uint32_t *state, const uint8_t *input, const uint32_t *k), - (state, input, k)) +DEFINE_FAT_FUNC(_nettle_sha256_compress_n, const uint8_t *, + (uint32_t *state, const uint32_t *k, + size_t blocks, const uint8_t *input), + (state, k, blocks, input)) DEFINE_FAT_FUNC(_nettle_ghash_set_key, void, (struct gcm_key *ctx, const union nettle_block16 *key), diff --git a/sha2-internal.h b/sha2-internal.h index 40f25a5f..93080bee 100644 --- a/sha2-internal.h +++ b/sha2-internal.h @@ -39,8 +39,9 @@ /* Internal compression function. STATE points to 8 uint32_t words, DATA points to 64 bytes of input data, possibly unaligned, and K points to the table of constants. */ -void -_nettle_sha256_compress(uint32_t *state, const uint8_t *data, const uint32_t *k); +const uint8_t * +_nettle_sha256_compress_n(uint32_t *state, const uint32_t *k, + size_t blocks, const uint8_t *data); /* Internal compression function. 
STATE points to 8 uint64_t words, DATA points to 128 bytes of input data, possibly unaligned, and K diff --git a/sha256-compress.c b/sha256-compress-n.c index cf17e3e1..1e40cb1d 100644 --- a/sha256-compress.c +++ b/sha256-compress-n.c @@ -1,8 +1,8 @@ -/* sha256-compress.c +/* sha256-compress-n.c The compression function of the sha256 hash function. - Copyright (C) 2001, 2010 Niels Möller + Copyright (C) 2001, 2010, 2022 Niels Möller This file is part of GNU Nettle. @@ -124,20 +124,12 @@ _nettle_sha256_compress_c(uint32_t *state, const uint8_t *input, const uint32_t #define _nettle_sha256_compress _nettle_sha256_compress_c #endif -void -_nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k) +const uint8_t * +_nettle_sha256_compress_n(uint32_t *state, const uint32_t *table, + size_t blocks, const uint8_t *input) { - uint32_t data[SHA256_DATA_LENGTH]; uint32_t A, B, C, D, E, F, G, H; /* Local vars */ - unsigned i; - uint32_t *d; - for (i = 0; i < SHA256_DATA_LENGTH; i++, input+= 4) - { - data[i] = READ_UINT32(input); - } - - /* Set up first buffer and local data buffer */ A = state[0]; B = state[1]; C = state[2]; @@ -146,55 +138,68 @@ _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k F = state[5]; G = state[6]; H = state[7]; - - /* Heavy mangling */ - /* First 16 subrounds that act on the original data */ - DEBUG(-1); - for (i = 0, d = data; i<16; i+=8, k += 8, d+= 8) + for (; blocks > 0; blocks--) { - ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i); - ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1); - ROUND(G, H, A, B, C, D, E, F, k[2], d[2]); - ROUND(F, G, H, A, B, C, D, E, k[3], d[3]); - ROUND(E, F, G, H, A, B, C, D, k[4], d[4]); - ROUND(D, E, F, G, H, A, B, C, k[5], d[5]); - ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6); - ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7); - } + uint32_t data[SHA256_DATA_LENGTH]; + unsigned i; + const uint32_t *k; + uint32_t *d; + for (i = 0; i < 
SHA256_DATA_LENGTH; i++, input+= 4) + { + data[i] = READ_UINT32(input); + } + + /* Heavy mangling */ + /* First 16 subrounds that act on the original data */ + + DEBUG(-1); + for (i = 0, d = data, k = table; i<16; i+=8, k += 8, d+= 8) + { + ROUND(A, B, C, D, E, F, G, H, k[0], d[0]); DEBUG(i); + ROUND(H, A, B, C, D, E, F, G, k[1], d[1]); DEBUG(i+1); + ROUND(G, H, A, B, C, D, E, F, k[2], d[2]); + ROUND(F, G, H, A, B, C, D, E, k[3], d[3]); + ROUND(E, F, G, H, A, B, C, D, k[4], d[4]); + ROUND(D, E, F, G, H, A, B, C, k[5], d[5]); + ROUND(C, D, E, F, G, H, A, B, k[6], d[6]); DEBUG(i+6); + ROUND(B, C, D, E, F, G, H, A, k[7], d[7]); DEBUG(i+7); + } - for (; i<64; i += 16, k+= 16) - { - ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data, 0)); DEBUG(i); - ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data, 1)); DEBUG(i+1); - ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data, 2)); DEBUG(i+2); - ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data, 3)); DEBUG(i+3); - ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data, 4)); DEBUG(i+4); - ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data, 5)); DEBUG(i+5); - ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data, 6)); DEBUG(i+6); - ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data, 7)); DEBUG(i+7); - ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data, 8)); DEBUG(i+8); - ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data, 9)); DEBUG(i+9); - ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10); - ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11); - ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12); - ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13); - ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14); - ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15); - } - - /* Update state */ - state[0] += A; - state[1] += B; - state[2] += C; - state[3] += D; - state[4] += E; - state[5] += F; - state[6] += G; - state[7] += H; + for (; i<64; i += 16, k+= 16) + { + 
ROUND(A, B, C, D, E, F, G, H, k[ 0], EXPAND(data, 0)); DEBUG(i); + ROUND(H, A, B, C, D, E, F, G, k[ 1], EXPAND(data, 1)); DEBUG(i+1); + ROUND(G, H, A, B, C, D, E, F, k[ 2], EXPAND(data, 2)); DEBUG(i+2); + ROUND(F, G, H, A, B, C, D, E, k[ 3], EXPAND(data, 3)); DEBUG(i+3); + ROUND(E, F, G, H, A, B, C, D, k[ 4], EXPAND(data, 4)); DEBUG(i+4); + ROUND(D, E, F, G, H, A, B, C, k[ 5], EXPAND(data, 5)); DEBUG(i+5); + ROUND(C, D, E, F, G, H, A, B, k[ 6], EXPAND(data, 6)); DEBUG(i+6); + ROUND(B, C, D, E, F, G, H, A, k[ 7], EXPAND(data, 7)); DEBUG(i+7); + ROUND(A, B, C, D, E, F, G, H, k[ 8], EXPAND(data, 8)); DEBUG(i+8); + ROUND(H, A, B, C, D, E, F, G, k[ 9], EXPAND(data, 9)); DEBUG(i+9); + ROUND(G, H, A, B, C, D, E, F, k[10], EXPAND(data, 10)); DEBUG(i+10); + ROUND(F, G, H, A, B, C, D, E, k[11], EXPAND(data, 11)); DEBUG(i+11); + ROUND(E, F, G, H, A, B, C, D, k[12], EXPAND(data, 12)); DEBUG(i+12); + ROUND(D, E, F, G, H, A, B, C, k[13], EXPAND(data, 13)); DEBUG(i+13); + ROUND(C, D, E, F, G, H, A, B, k[14], EXPAND(data, 14)); DEBUG(i+14); + ROUND(B, C, D, E, F, G, H, A, k[15], EXPAND(data, 15)); DEBUG(i+15); + } + + /* Update state */ + state[0] = A = state[0] + A; + state[1] = B = state[1] + B; + state[2] = C = state[2] + C; + state[3] = D = state[3] + D; + state[4] = E = state[4] + E; + state[5] = F = state[5] + F; + state[6] = G = state[6] + G; + state[7] = H = state[7] + H; #if SHA256_DEBUG - fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n", - state[0], state[1], state[2], state[3], - state[4], state[5], state[6], state[7]); + fprintf(stderr, "99: %8x %8x %8x %8x %8x %8x %8x %8x\n", + state[0], state[1], state[2], state[3], + state[4], state[5], state[6], state[7]); #endif + } + return input; } @@ -46,6 +46,7 @@ #include "sha2-internal.h" #include "macros.h" +#include "md-internal.h" #include "nettle-write.h" /* Generated by the shadata program. 
*/ @@ -70,6 +71,12 @@ K[64] = 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL, }; +void +sha256_compress(uint32_t *state, const uint8_t *input) +{ + _nettle_sha256_compress_n(state, K, 1, input); +} + #define COMPRESS(ctx, data) (sha256_compress((ctx)->state, (data))) /* Initialize the SHA values */ @@ -97,7 +104,22 @@ void sha256_update(struct sha256_ctx *ctx, size_t length, const uint8_t *data) { - MD_UPDATE (ctx, length, data, COMPRESS, ctx->count++); + size_t blocks; + if (ctx->index > 0) + { + /* Try to fill partial block */ + MD_FILL_OR_RETURN (ctx, length, data); + sha256_compress (ctx->state, ctx->block); + ctx->count++; + } + + blocks = length >> 6; + data = _nettle_sha256_compress_n (ctx->state, K, blocks, data); + ctx->count += blocks; + length &= 63; + + memcpy (ctx->block, data, length); + ctx->index = length; } static void @@ -161,9 +183,3 @@ sha224_digest(struct sha256_ctx *ctx, sha256_write_digest(ctx, length, digest); sha224_init(ctx); } - -void -sha256_compress(uint32_t *state, const uint8_t *input) -{ - _nettle_sha256_compress(state, input, K); -} diff --git a/x86_64/fat/sha256-compress-2.asm b/x86_64/fat/sha256-compress-n-2.asm index 996cf8c5..60f7c8f6 100644 --- a/x86_64/fat/sha256-compress-2.asm +++ b/x86_64/fat/sha256-compress-n-2.asm @@ -1,4 +1,4 @@ -C x86_64/fat/sha256-compress-2.asm +C x86_64/fat/sha256-compress-n-2.asm ifelse(` Copyright (C) 2018 Niels Möller @@ -31,4 +31,4 @@ ifelse(` ') define(`fat_transform', `$1_sha_ni') -include_src(`x86_64/sha_ni/sha256-compress.asm') +include_src(`x86_64/sha_ni/sha256-compress-n.asm') diff --git a/x86_64/fat/sha256-compress.asm b/x86_64/fat/sha256-compress-n.asm index 2aaeb5e8..fc358858 100644 --- a/x86_64/fat/sha256-compress.asm +++ b/x86_64/fat/sha256-compress-n.asm @@ -1,4 +1,4 @@ -C x86_64/fat/sha256-compress.asm +C x86_64/fat/sha256-compress-n.asm ifelse(` Copyright (C) 2018 Niels Möller @@ -31,4 +31,4 @@ ifelse(` ') define(`fat_transform', `$1_x86_64') 
-include_src(`x86_64/sha256-compress.asm') +include_src(`x86_64/sha256-compress-n.asm') diff --git a/x86_64/sha256-compress.asm b/x86_64/sha256-compress-n.asm index 5ed669b1..e10d260c 100644 --- a/x86_64/sha256-compress.asm +++ b/x86_64/sha256-compress-n.asm @@ -1,7 +1,7 @@ -C x86_64/sha256-compress.asm +C x86_64/sha256-compress-n.asm ifelse(` - Copyright (C) 2013 Niels Möller + Copyright (C) 2013, 2022 Niels Möller This file is part of GNU Nettle. @@ -30,21 +30,24 @@ ifelse(` not, see http://www.gnu.org/licenses/. ') - .file "sha256-compress.asm" + .file "sha256-compress-n.asm" define(`STATE', `%rdi') -define(`INPUT', `%rsi') -define(`K', `%rdx') +define(`K', `%rsi') +define(`BLOCKS', `%rdx') +define(`INPUT', `%rcx') +define(`STATE_SAVED', `64(%rsp)') + define(`SA', `%eax') define(`SB', `%ebx') -define(`SC', `%ecx') +define(`SC', `%ebp') define(`SD', `%r8d') define(`SE', `%r9d') define(`SF', `%r10d') define(`SG', `%r11d') define(`SH', `%r12d') define(`T0', `%r13d') -define(`T1', `%edi') C Overlap STATE -define(`COUNT', `%r14') +define(`T1', `%r14d') +define(`COUNT', `%rdi') C Overlap STATE define(`W', `%r15d') define(`EXPN', ` @@ -123,18 +126,21 @@ define(`NOEXPN', ` movl W, OFFSET($1)(%rsp, COUNT, 4) ') - C void - C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k) + C const uint8_t * + C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k, + C size_t blocks, const uint8_t *input) .text ALIGN(16) -PROLOGUE(_nettle_sha256_compress) - W64_ENTRY(3, 0) +PROLOGUE(_nettle_sha256_compress_n) + W64_ENTRY(4, 0) + test BLOCKS, BLOCKS + jz .Lend sub $120, %rsp - mov %rbx, 64(%rsp) - mov STATE, 72(%rsp) C Save state, to free a register + mov STATE, STATE_SAVED C Save state, to free a register + mov %rbx, 72(%rsp) mov %rbp, 80(%rsp) mov %r12, 88(%rsp) mov %r13, 96(%rsp) @@ -149,7 +155,9 @@ PROLOGUE(_nettle_sha256_compress) movl 20(STATE), SF movl 24(STATE), SG movl 28(STATE), SH - xor COUNT, COUNT + +.Loop_block: + xorl XREG(COUNT), XREG(COUNT) ALIGN(16)
.Loop1: @@ -161,8 +169,8 @@ PROLOGUE(_nettle_sha256_compress) NOEXPN(5) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,5) NOEXPN(6) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,6) NOEXPN(7) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,7) - add $8, COUNT - cmp $16, COUNT + addl $8, XREG(COUNT) + cmpl $16, XREG(COUNT) jne .Loop1 .Loop2: @@ -182,22 +190,35 @@ PROLOGUE(_nettle_sha256_compress) EXPN(13) ROUND(SD,SE,SF,SG,SH,SA,SB,SC,13) EXPN(14) ROUND(SC,SD,SE,SF,SG,SH,SA,SB,14) EXPN(15) ROUND(SB,SC,SD,SE,SF,SG,SH,SA,15) - add $16, COUNT - cmp $64, COUNT + addl $16, XREG(COUNT) + cmpl $64, XREG(COUNT) jne .Loop2 - mov 72(%rsp), STATE - - addl SA, (STATE) - addl SB, 4(STATE) - addl SC, 8(STATE) - addl SD, 12(STATE) - addl SE, 16(STATE) - addl SF, 20(STATE) - addl SG, 24(STATE) - addl SH, 28(STATE) - - mov 64(%rsp), %rbx + mov STATE_SAVED, STATE + + addl (STATE), SA + addl 4(STATE), SB + addl 8(STATE), SC + addl 12(STATE), SD + addl 16(STATE), SE + addl 20(STATE), SF + addl 24(STATE), SG + addl 28(STATE), SH + + movl SA, (STATE) + movl SB, 4(STATE) + movl SC, 8(STATE) + movl SD, 12(STATE) + movl SE, 16(STATE) + movl SF, 20(STATE) + movl SG, 24(STATE) + movl SH, 28(STATE) + + add $64, INPUT + dec BLOCKS + jnz .Loop_block + + mov 72(%rsp), %rbx mov 80(%rsp), %rbp mov 88(%rsp), %r12 mov 96(%rsp), %r13 @@ -205,6 +226,8 @@ PROLOGUE(_nettle_sha256_compress) mov 112(%rsp),%r15 add $120, %rsp +.Lend: + mov INPUT, %rax - W64_EXIT(3, 0) + W64_EXIT(4, 0) ret -EPILOGUE(_nettle_sha256_compress) +EPILOGUE(_nettle_sha256_compress_n) diff --git a/x86_64/sha_ni/sha256-compress.asm b/x86_64/sha_ni/sha256-compress-n.asm index 00bd3cd3..005909df 100644 --- a/x86_64/sha_ni/sha256-compress.asm +++ b/x86_64/sha_ni/sha256-compress-n.asm @@ -1,7 +1,7 @@ -C x86_64/sha_ni/sha256-compress.asm +C x86_64/sha_ni/sha256-compress-n.asm ifelse(` - Copyright (C) 2018 Niels Möller + Copyright (C) 2018, 2022 Niels Möller This file is part of GNU Nettle. @@ -30,10 +30,11 @@ ifelse(` not, see http://www.gnu.org/licenses/.
') - .file "sha256-compress.asm" + .file "sha256-compress-n.asm" define(`STATE', `%rdi') -define(`INPUT', `%rsi') -define(`K', `%rdx') +define(`K', `%rsi') +define(`BLOCKS', `%rdx') +define(`INPUT', `%rcx') define(`MSGK',`%xmm0') C Implicit operand of sha256rnds2 define(`MSG0',`%xmm1') @@ -45,7 +46,7 @@ define(`CDGH',`%xmm6') define(`ABEF_ORIG',`%xmm7') define(`CDGH_ORIG', `%xmm8') define(`SWAP_MASK',`%xmm9') -define(`TMP', `%xmm9') C Overlaps SWAP_MASK +define(`TMP', `%xmm10') C QROUND(M0, M1, M2, M3, R) define(`QROUND', ` @@ -69,15 +70,19 @@ define(`TRANSPOSE', ` punpcklqdq $1, $3 ') - C void - C _nettle_sha256_compress(uint32_t *state, const uint8_t *input, const uint32_t *k) + C const uint8_t * + C _nettle_sha256_compress_n(uint32_t *state, const uint32_t *k, + C size_t blocks, const uint8_t *input) .text ALIGN(16) .Lswap_mask: .byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12 -PROLOGUE(_nettle_sha256_compress) - W64_ENTRY(3, 10) +PROLOGUE(_nettle_sha256_compress_n) + W64_ENTRY(4, 11) + test BLOCKS, BLOCKS + jz .Lend + movups (STATE), TMP movups 16(STATE), ABEF @@ -88,12 +93,13 @@ PROLOGUE(_nettle_sha256_compress) movdqa .Lswap_mask(%rip), SWAP_MASK - movdqa ABEF, ABEF_ORIG - movdqa CDGH, CDGH_ORIG - +.Loop: movups (INPUT), MSG0 pshufb SWAP_MASK, MSG0 + movdqa ABEF, ABEF_ORIG + movdqa CDGH, CDGH_ORIG + movdqa (K), MSGK paddd MSG0, MSGK sha256rnds2 ABEF, CDGH C Round 0-1 @@ -163,6 +169,10 @@ PROLOGUE(_nettle_sha256_compress) paddd ABEF_ORIG, ABEF paddd CDGH_ORIG, CDGH + add $64, INPUT + dec BLOCKS + jnz .Loop + TRANSPOSE(ABEF, CDGH, TMP) pshufd $0x1b, CDGH, CDGH @@ -170,6 +180,8 @@ PROLOGUE(_nettle_sha256_compress) movups CDGH, 0(STATE) movups TMP, 16(STATE) - W64_EXIT(3, 10) +.Lend: + mov INPUT, %rax + W64_EXIT(4, 11) ret -EPILOGUE(_nettle_sha256_compress) +EPILOGUE(_nettle_sha256_compress_n) |