From 42097e279837ad6f084b7910cea5306c503be988 Mon Sep 17 00:00:00 2001 From: Jeffrey Walton Date: Wed, 21 Apr 2021 03:24:15 -0400 Subject: Align LSH IV's for AVX --- lsh512.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'lsh512.cpp') diff --git a/lsh512.cpp b/lsh512.cpp index f6340314..b26c5718 100644 --- a/lsh512.cpp +++ b/lsh512.cpp @@ -122,7 +122,9 @@ struct LSH512_Internal }; #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) -// Clear upper bits on entry and exit +// Zero the upper 128 bits of all YMM registers +// on entry and exit. It avoids AVX state +// transition penalties when saving state. struct AVX_Cleanup { AVX_Cleanup() { @@ -204,7 +206,7 @@ lsh_u64 ROTL64(lsh_u64 x, lsh_u32 r) { # define MAYBE_CONSTEXPR const #endif -CRYPTOPP_ALIGN_DATA(16) +CRYPTOPP_ALIGN_DATA(32) MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = { W64LIT(0x0C401E9FE8813A55), W64LIT(0x4A5F446268FD3D35), W64LIT(0xFF13E452334F612A), W64LIT(0xF8227661037E354A), W64LIT(0xA5F223723C9CA29D), W64LIT(0x95D965A11AED3979), W64LIT(0x01E23835B9AB02CC), W64LIT(0x52D49CBAD5B30616), @@ -212,7 +214,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV224[CV_WORD_LEN] = { W64LIT(0x31E2B67D25BE3813), W64LIT(0xD522C4DEED8E4D83), W64LIT(0xA79F5509B43FBAFE), W64LIT(0xE00D2CD88B4B6C6A), }; -CRYPTOPP_ALIGN_DATA(16) +CRYPTOPP_ALIGN_DATA(32) MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = { W64LIT(0x6DC57C33DF989423), W64LIT(0xD8EA7F6E8342C199), W64LIT(0x76DF8356F8603AC4), W64LIT(0x40F1B44DE838223A), W64LIT(0x39FFE7CFC31484CD), W64LIT(0x39C4326CC5281548), W64LIT(0x8A2FF85A346045D8), W64LIT(0xFF202AA46DBDD61E), @@ -220,7 +222,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV256[CV_WORD_LEN] = { W64LIT(0xB596875BF8FF6DBA), W64LIT(0xFCCA39B089EF4615), W64LIT(0xECFF4017D020B4B6), W64LIT(0x7E77384C772ED802), }; -CRYPTOPP_ALIGN_DATA(16) +CRYPTOPP_ALIGN_DATA(32) MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = { W64LIT(0x53156A66292808F6), W64LIT(0xB2C4F362B204C2BC), W64LIT(0xB84B7213BFA05C4E), W64LIT(0x976CEB7C1B299F73), W64LIT(0xDF0CC63C0570AE97), W64LIT(0xDA4441BAA486CE3F), W64LIT(0x6559F5D9B5F2ACC2), W64LIT(0x22DACF19B4B52A16), @@ -228,7 +230,7 @@ MAYBE_CONSTEXPR lsh_u64 g_IV384[CV_WORD_LEN] = { W64LIT(0xBB08043FB34E3E30), W64LIT(0xA0DEC48D54618EAD), W64LIT(0x150317267464BC57), W64LIT(0x32D1501FDE63DC93) }; -CRYPTOPP_ALIGN_DATA(16) +CRYPTOPP_ALIGN_DATA(32) MAYBE_CONSTEXPR lsh_u64 g_IV512[CV_WORD_LEN] = { W64LIT(0xadd50f3c7f07094e), W64LIT(0xe3f3cee8f9418a4f), W64LIT(0xb527ecde5b3d0ae9), W64LIT(0x2ef6dec68076f501), W64LIT(0x8cb994cae5aca216), W64LIT(0xfbb9eae4bba48cc7), W64LIT(0x650a526174725fea), W64LIT(0x1f9a61a73f8d8085), @@ -1077,18 +1079,19 @@ inline void compress(LSH512_Context* ctx, const lsh_u8 pdMsgBlk[LSH512_MSG_BLK_B inline void load_iv(word64* cv_l, word64* cv_r, const word64* iv) { + // The IV's are 32-byte aligned so we can use aligned loads. + #if defined(CRYPTOPP_LSH512_AVX_AVAILABLE) _mm256_storeu_si256(M256_CAST(cv_l+0), - _mm256_loadu_si256(CONST_M256_CAST(iv+0))); + _mm256_load_si256(CONST_M256_CAST(iv+0))); _mm256_storeu_si256(M256_CAST(cv_l+4), - _mm256_loadu_si256(CONST_M256_CAST(iv+4))); + _mm256_load_si256(CONST_M256_CAST(iv+4))); _mm256_storeu_si256(M256_CAST(cv_r+0), - _mm256_loadu_si256(CONST_M256_CAST(iv+8))); + _mm256_load_si256(CONST_M256_CAST(iv+8))); _mm256_storeu_si256(M256_CAST(cv_r+4), - _mm256_loadu_si256(CONST_M256_CAST(iv+12))); + _mm256_load_si256(CONST_M256_CAST(iv+12))); #elif defined(CRYPTOPP_LSH512_SSE2_AVAILABLE) - // The IV's are 16-byte aligned so we can use _mm_load_si128. _mm_storeu_si128(M128_CAST(cv_l+0), _mm_load_si128(CONST_M128_CAST(iv+0))); _mm_storeu_si128(M128_CAST(cv_l+2), -- cgit v1.2.1