diff options
author | Makoto Kato <m_kato@ga2.so-net.ne.jp> | 2021-06-25 08:18:13 +0000 |
---|---|---|
committer | Makoto Kato <m_kato@ga2.so-net.ne.jp> | 2021-06-25 08:18:13 +0000 |
commit | a5cf364d1116025f1ec8cf253e8092c4ba83c5af (patch) | |
tree | e57264880002c3dbd9723f1b6c7335957a3ce2c9 | |
parent | acbf97ed8b4945cc9e66ba1fff0749d168361822 (diff) | |
download | nss-hg-a5cf364d1116025f1ec8cf253e8092c4ba83c5af.tar.gz |
Bug 1655493 - Support SHA2 HW acceleration using Intel SHA Extension. r=bbeurdouche
Before applying (on Ryzen 9 3900X)
```
# mode in opreps cxreps context op time(sec) thrgput
sha256_e 1Gb 208Mb 23M 0 0.000 10000.000 10.000 123Mb 301Kb
```
After applying
```
# mode in opreps cxreps context op time(sec) thrgput
sha256_e 5Gb 797Mb 110M 0 0.000 10000.000 10.000 591Mb 769Kb
```
Differential Revision: https://phabricator.services.mozilla.com/D116962
-rw-r--r-- | automation/taskcluster/graph/src/extend.js | 7 | ||||
-rw-r--r-- | coreconf/WIN32.mk | 2 | ||||
-rw-r--r-- | coreconf/config.gypi | 1 | ||||
-rw-r--r-- | lib/freebl/Makefile | 26 | ||||
-rw-r--r-- | lib/freebl/freebl.gyp | 52 | ||||
-rw-r--r-- | lib/freebl/sha256-x86.c | 236 | ||||
-rw-r--r-- | lib/freebl/sha512.c | 28 |
7 files changed, 343 insertions, 9 deletions
diff --git a/automation/taskcluster/graph/src/extend.js b/automation/taskcluster/graph/src/extend.js index 658f06ab1..58b89dc70 100644 --- a/automation/taskcluster/graph/src/extend.js +++ b/automation/taskcluster/graph/src/extend.js @@ -567,6 +567,13 @@ async function scheduleLinux(name, overrides, args = "") { CC: "gcc-4.8", CCC: "g++-4.8" }, + // Use -Ddisable-intelhw_sha=1, GYP doesn't have a proper GCC version + // check for Intel SHA support. + command: [ + "/bin/bash", + "-c", + "bin/checkout.sh && nss/automation/taskcluster/scripts/build_gyp.sh -Ddisable_intel_hw_sha=1" + ], symbol: "gcc-4.8" })); diff --git a/coreconf/WIN32.mk b/coreconf/WIN32.mk index 9e04ad93e..f5db943cf 100644 --- a/coreconf/WIN32.mk +++ b/coreconf/WIN32.mk @@ -56,6 +56,8 @@ else _MSC_VER_GE_11 := $(shell expr $(_MSC_VER) \>= 1700) # VC12 (2013). _MSC_VER_GE_12 := $(shell expr $(_MSC_VER) \>= 1800) + # VC14 (2015). + _MSC_VER_GE_14 := $(shell expr $(_MSC_VER) \>= 1900) ifeq ($(_CC_VMAJOR),14) # -DYNAMICBASE is only supported on VC8SP1 or newer, # so be very specific here! diff --git a/coreconf/config.gypi b/coreconf/config.gypi index 5800f8791..0867ba2c9 100644 --- a/coreconf/config.gypi +++ b/coreconf/config.gypi @@ -99,6 +99,7 @@ 'disable_arm_hw_aes%': 0, 'disable_arm_hw_sha1%': 0, 'disable_arm_hw_sha2%': 0, + 'disable_intel_hw_sha%': 0, 'disable_tests%': 0, 'disable_chachapoly%': 0, 'disable_deprecated_seed%': 0, diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile index d13a5e930..0b8c6f42f 100644 --- a/lib/freebl/Makefile +++ b/lib/freebl/Makefile @@ -118,6 +118,20 @@ ifneq (,$(USE_64)$(USE_X32)) else DEFINES += -DNSS_X86 endif + ifdef CC_IS_CLANG + EXTRA_SRCS += sha256-x86.c + DEFINES += -DUSE_HW_SHA2 + else ifeq (1,$(CC_IS_GCC)) + # Old compiler doesn't support Intel SHA extension + ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION)))) + EXTRA_SRCS += sha256-x86.c + DEFINES += -DUSE_HW_SHA2 + endif + ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION)))) + EXTRA_SRCS += sha256-x86.c + DEFINES += -DUSE_HW_SHA2 + endif + endif endif ifeq ($(CPU_ARCH),aarch64) ifdef CC_IS_CLANG @@ -204,6 +218,11 @@ else INTEL_GCM_CLANG_CL = 1 endif endif + # The Intel SHA extenstion requires Visual C++ 2015. + ifeq ($(_MSC_VER_GE_14),1) + DEFINES += -DUSE_HW_SHA2 + EXTRA_SRCS += sha256-x86.c + endif endif else # -DMP_NO_MP_WORD @@ -230,6 +249,11 @@ ifeq ($(CPU_ARCH),x86_64) INTEL_GCM_CLANG_CL = 1 endif endif + # The Intel SHA extenstion requires Visual C++ 2015. + ifeq ($(_MSC_VER_GE_14),1) + DEFINES += -DUSE_HW_SHA2 + EXTRA_SRCS += sha256-x86.c + endif MPI_SRCS += mpi_amd64.c endif endif @@ -740,6 +764,8 @@ ifdef INTEL_GCM_CLANG_CL $(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): CFLAGS += -mssse3 endif +$(OBJDIR)/$(PROG_PREFIX)sha256-x86$(OBJ_SUFFIX): CFLAGS += -msha -mssse3 -msse4.1 + ifeq ($(CPU_ARCH),arm) # When the compiler uses the softfloat ABI, we want to use the compatible softfp ABI when # enabling NEON for these objects. diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp index 66f30631a..e7703baf8 100644 --- a/lib/freebl/freebl.gyp +++ b/lib/freebl/freebl.gyp @@ -220,6 +220,38 @@ ] }, { + 'target_name': 'sha-x86_c_lib', + 'type': 'static_library', + 'sources': [ + 'sha256-x86.c' + ], + 'dependencies': [ + '<(DEPTH)/exports.gyp:nss_exports' + ], + 'cflags': [ + '-msha', + '-mssse3', + '-msse4.1' + ], + 'cflags_mozilla': [ + '-msha', + '-mssse3', + '-msse4.1' + ], + 'conditions': [ + # macOS build doesn't use cflags. + [ 'OS=="mac" or OS=="ios"', { + 'xcode_settings': { + 'OTHER_CFLAGS': [ + '-msha', + '-mssse3', + '-msse4.1' + ], + }, + }] + ] + }, + { 'target_name': 'gcm-aes-arm32-neon_c_lib', 'type': 'static_library', 'sources': [ @@ -488,6 +520,11 @@ 'armv8_c_lib' ], }], + [ '(target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', { + 'dependencies': [ + 'sha-x86_c_lib', + ], + }], [ 'disable_arm32_neon==0 and target_arch=="arm"', { 'dependencies': [ 'gcm-aes-arm32-neon_c_lib', @@ -570,6 +607,11 @@ 'armv8_c_lib', ], }], + [ '(target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', { + 'dependencies': [ + 'sha-x86_c_lib', + ], + }], [ 'disable_arm32_neon==0 and target_arch=="arm"', { 'dependencies': [ 'gcm-aes-arm32-neon_c_lib', @@ -765,6 +807,11 @@ }, }, }], + [ '(OS=="win" or OS=="mac" or OS=="ios") and (target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], [ '(OS=="win" or OS=="mac" or OS=="ios") and (target_arch=="arm64" or target_arch=="aarch64") and disable_arm_hw_aes==0', { 'defines': [ 'USE_HW_AES', @@ -846,6 +893,11 @@ 'ARMHF', ], }], + [ 'disable_intel_hw_sha==0 and (target_arch=="ia32" or target_arch=="x64")', { + 'defines': [ + 'USE_HW_SHA2', + ], + }], [ 'disable_arm_hw_aes==0 and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', { 'defines': [ 'USE_HW_AES', diff --git a/lib/freebl/sha256-x86.c b/lib/freebl/sha256-x86.c new file mode 100644 index 000000000..3aa30e9cc --- /dev/null +++ b/lib/freebl/sha256-x86.c @@ -0,0 +1,236 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifdef USE_HW_SHA2 + +#include <immintrin.h> + +#ifdef FREEBL_NO_DEPEND +#include "stubs.h" +#endif + +#include "blapii.h" +#include "prcpucfg.h" +#include "prtypes.h" /* for PRUintXX */ +#include "prlong.h" +#include "blapi.h" +#include "sha256.h" + +/* SHA-256 constants, K256. */ +pre_align static const PRUint32 K256[64] post_align = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +#define ROUND(n, a, b, c, d) \ + { \ + __m128i t = _mm_add_epi32(a, k##n); \ + w1 = _mm_sha256rnds2_epu32(w1, w0, t); \ + t = _mm_shuffle_epi32(t, 0x0e); \ + w0 = _mm_sha256rnds2_epu32(w0, w1, t); \ + if (n < 12) { \ + a = _mm_sha256msg1_epu32(a, b); \ + a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \ + a = _mm_sha256msg2_epu32(a, d); \ + } \ + } + +void +SHA256_Compress_Native(SHA256Context *ctx) +{ + __m128i h0, h1, th; + __m128i a, b, c, d; + __m128i w0, w1; + const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + + const __m128i *K = (__m128i *)K256; + const __m128i k0 = _mm_load_si128(K); + const __m128i k1 = _mm_load_si128(K + 1); + const __m128i k2 = _mm_load_si128(K + 2); + const __m128i k3 = _mm_load_si128(K + 3); + const __m128i k4 = _mm_load_si128(K + 4); + const __m128i k5 = _mm_load_si128(K + 5); + const __m128i k6 = _mm_load_si128(K + 6); + const __m128i k7 = _mm_load_si128(K + 7); + const __m128i k8 = _mm_load_si128(K + 8); + const __m128i k9 = _mm_load_si128(K + 9); + const __m128i k10 = _mm_load_si128(K + 10); + const __m128i k11 = _mm_load_si128(K + 11); + const __m128i k12 = _mm_load_si128(K + 12); + const __m128i k13 = _mm_load_si128(K + 13); + const __m128i k14 = _mm_load_si128(K + 14); + const __m128i k15 = _mm_load_si128(K + 15); + + const __m128i *input = (__m128i *)ctx->u.b; + + h0 = _mm_loadu_si128((__m128i *)(ctx->h)); + h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); + + /* H0123:4567 -> H01256:H2367 */ + th = _mm_shuffle_epi32(h0, 0xb1); + h1 = _mm_shuffle_epi32(h1, 0x1b); + h0 = _mm_alignr_epi8(th, h1, 8); + h1 = _mm_blend_epi16(h1, th, 0xf0); + + a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle); + b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle); + c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle); + d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle); + + w0 = h0; + w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = _mm_add_epi32(h0, w0); + h1 = _mm_add_epi32(h1, w1); + + /* H0145:2367 -> H0123:4567 */ + th = _mm_shuffle_epi32(h0, 0x1b); + h1 = _mm_shuffle_epi32(h1, 0xb1); + h0 = _mm_blend_epi16(th, h1, 0xf0); + h1 = _mm_alignr_epi8(h1, th, 8); + + _mm_storeu_si128((__m128i *)ctx->h, h0); + _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); +} + +void +SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input, + unsigned int inputLen) +{ + __m128i h0, h1, th; + const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3); + + const __m128i *K = (__m128i *)K256; + const __m128i k0 = _mm_load_si128(K); + const __m128i k1 = _mm_load_si128(K + 1); + const __m128i k2 = _mm_load_si128(K + 2); + const __m128i k3 = _mm_load_si128(K + 3); + const __m128i k4 = _mm_load_si128(K + 4); + const __m128i k5 = _mm_load_si128(K + 5); + const __m128i k6 = _mm_load_si128(K + 6); + const __m128i k7 = _mm_load_si128(K + 7); + const __m128i k8 = _mm_load_si128(K + 8); + const __m128i k9 = _mm_load_si128(K + 9); + const __m128i k10 = _mm_load_si128(K + 10); + const __m128i k11 = _mm_load_si128(K + 11); + const __m128i k12 = _mm_load_si128(K + 12); + const __m128i k13 = _mm_load_si128(K + 13); + const __m128i k14 = _mm_load_si128(K + 14); + const __m128i k15 = _mm_load_si128(K + 15); + + unsigned int inBuf = ctx->sizeLo & 0x3f; + if (!inputLen) { + return; + } + + /* Add inputLen into the count of bytes processed, before processing */ + if ((ctx->sizeLo += inputLen) < inputLen) { + ctx->sizeHi++; + } + + /* if data already in buffer, attempt to fill rest of buffer */ + if (inBuf) { + unsigned int todo = SHA256_BLOCK_LENGTH - inBuf; + if (inputLen < todo) { + todo = inputLen; + } + memcpy(ctx->u.b + inBuf, input, todo); + input += todo; + inputLen -= todo; + if (inBuf + todo == SHA256_BLOCK_LENGTH) { + SHA256_Compress_Native(ctx); + } + } + + h0 = _mm_loadu_si128((__m128i *)(ctx->h)); + h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4)); + + /* H0123:4567 -> H01256:H2367 */ + th = _mm_shuffle_epi32(h0, 0xb1); + h1 = _mm_shuffle_epi32(h1, 0x1b); + h0 = _mm_alignr_epi8(th, h1, 8); + h1 = _mm_blend_epi16(h1, th, 0xf0); + + /* if enough data to fill one or more whole buffers, process them. */ + while (inputLen >= SHA256_BLOCK_LENGTH) { + __m128i a, b, c, d; + __m128i w0, w1; + a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle); + b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle); + c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle); + d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle); + input += SHA256_BLOCK_LENGTH; + inputLen -= SHA256_BLOCK_LENGTH; + + w0 = h0; + w1 = h1; + + ROUND(0, a, b, c, d) + ROUND(1, b, c, d, a) + ROUND(2, c, d, a, b) + ROUND(3, d, a, b, c) + ROUND(4, a, b, c, d) + ROUND(5, b, c, d, a) + ROUND(6, c, d, a, b) + ROUND(7, d, a, b, c) + ROUND(8, a, b, c, d) + ROUND(9, b, c, d, a) + ROUND(10, c, d, a, b) + ROUND(11, d, a, b, c) + ROUND(12, a, b, c, d) + ROUND(13, b, c, d, a) + ROUND(14, c, d, a, b) + ROUND(15, d, a, b, c) + + h0 = _mm_add_epi32(h0, w0); + h1 = _mm_add_epi32(h1, w1); + } + + // H01234567 -> H01256 and H2367 + th = _mm_shuffle_epi32(h0, 0x1b); + h1 = _mm_shuffle_epi32(h1, 0xb1); + h0 = _mm_blend_epi16(th, h1, 0xf0); + h1 = _mm_alignr_epi8(h1, th, 8); + + _mm_storeu_si128((__m128i *)ctx->h, h0); + _mm_storeu_si128((__m128i *)(ctx->h + 4), h1); + + /* if data left over, fill it into buffer */ + if (inputLen) { + memcpy(ctx->u.b, input, inputLen); + } +} + +#endif /* USE_HW_SHA2 */ diff --git a/lib/freebl/sha512.c b/lib/freebl/sha512.c index dc0ed776b..946ab9f7f 100644 --- a/lib/freebl/sha512.c +++ b/lib/freebl/sha512.c @@ -164,7 +164,7 @@ static void SHA256_Compress_Generic(SHA256Context *ctx); static void SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input, unsigned int inputLen); -#ifndef USE_HW_SHA2 +#if !defined(USE_HW_SHA2) || !defined(IS_LITTLE_ENDIAN) void SHA256_Compress_Native(SHA256Context *ctx) { @@ -200,16 +200,20 @@ SHA256_DestroyContext(SHA256Context *ctx, PRBool freeit) void SHA256_Begin(SHA256Context *ctx) { + PRBool use_hw_sha2 = PR_FALSE; + memset(ctx, 0, sizeof *ctx); memcpy(H, H256, sizeof H256); + #if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) /* arm's implementation is tested on little endian only */ - if (arm_sha2_support()) { + use_hw_sha2 = arm_sha2_support() || (sha_support() && ssse3_support() && sse4_1_support()); +#endif + + if (use_hw_sha2) { ctx->compress = SHA256_Compress_Native; ctx->update = SHA256_Update_Native; - } else -#endif - { + } else { ctx->compress = SHA256_Compress_Generic; ctx->update = SHA256_Update_Generic; } @@ -692,16 +696,22 @@ SHA224_DestroyContext(SHA224Context *ctx, PRBool freeit) void SHA224_Begin(SHA224Context *ctx) { + PRBool use_hw_sha2; + memset(ctx, 0, sizeof *ctx); memcpy(H, H224, sizeof H224); + #if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN) /* arm's implementation is tested on little endian only */ - if (arm_sha2_support()) { + use_hw_sha2 = arm_sha2_support() || (sha_support() && ssse3_support() && sse4_1_support()); +#else + use_hw_sha2 = PR_FALSE; +#endif + + if (use_hw_sha2) { ctx->compress = SHA256_Compress_Native; ctx->update = SHA256_Update_Native; - } else -#endif - { + } else { ctx->compress = SHA256_Compress_Generic; ctx->update = SHA256_Update_Generic; } |