Bug 1655493 - Support SHA2 HW acceleration using Intel SHA Extension. r=bbeurdouche

Before applying (on Ryzen 9 3900X) ``` # mode in opreps cxreps context op time(sec) thrgput sha256_e 1Gb 208Mb 23M 0 0.000 10000.000 10.000 123Mb 301Kb ``` After applying ``` # mode in opreps cxreps context op time(sec) thrgput sha256_e 5Gb 797Mb 110M 0 0.000 10000.000 10.000 591Mb 769Kb ``` Differential Revision: https://phabricator.services.mozilla.com/D116962
author: Makoto Kato <m_kato@ga2.so-net.ne.jp> 2021-06-25 08:18:13 +0000
committer: Makoto Kato <m_kato@ga2.so-net.ne.jp> 2021-06-25 08:18:13 +0000
commit: a5cf364d1116025f1ec8cf253e8092c4ba83c5af (patch)
tree: e57264880002c3dbd9723f1b6c7335957a3ce2c9
parent: acbf97ed8b4945cc9e66ba1fff0749d168361822 (diff)
download: nss-hg-a5cf364d1116025f1ec8cf253e8092c4ba83c5af.tar.gz
7 files changed, 343 insertions, 9 deletions
diff --git a/automation/taskcluster/graph/src/extend.js b/automation/taskcluster/graph/src/extend.js
index 658f06ab1..58b89dc70 100644
--- a/automation/taskcluster/graph/src/extend.js
+++ b/automation/taskcluster/graph/src/extend.js
@@ -567,6 +567,13 @@ async function scheduleLinux(name, overrides, args = "") {
       CC: "gcc-4.8",
       CCC: "g++-4.8"
     },
+    // Use -Ddisable-intelhw_sha=1, GYP doesn't have a proper GCC version
+    // check for Intel SHA support.
+    command: [
+      "/bin/bash",
+      "-c",
+      "bin/checkout.sh && nss/automation/taskcluster/scripts/build_gyp.sh -Ddisable_intel_hw_sha=1"
+    ],
     symbol: "gcc-4.8"
   }));
 
diff --git a/coreconf/WIN32.mk b/coreconf/WIN32.mk
index 9e04ad93e..f5db943cf 100644
--- a/coreconf/WIN32.mk
+++ b/coreconf/WIN32.mk
@@ -56,6 +56,8 @@ else
 	_MSC_VER_GE_11 := $(shell expr $(_MSC_VER) \>= 1700)
 	# VC12 (2013).
 	_MSC_VER_GE_12 := $(shell expr $(_MSC_VER) \>= 1800)
+	# VC14 (2015).
+	_MSC_VER_GE_14 := $(shell expr $(_MSC_VER) \>= 1900)
 	ifeq ($(_CC_VMAJOR),14)
 	    # -DYNAMICBASE is only supported on VC8SP1 or newer,
 	    # so be very specific here!
diff --git a/coreconf/config.gypi b/coreconf/config.gypi
index 5800f8791..0867ba2c9 100644
--- a/coreconf/config.gypi
+++ b/coreconf/config.gypi
@@ -99,6 +99,7 @@
     'disable_arm_hw_aes%': 0,
     'disable_arm_hw_sha1%': 0,
     'disable_arm_hw_sha2%': 0,
+    'disable_intel_hw_sha%': 0,
     'disable_tests%': 0,
     'disable_chachapoly%': 0,
     'disable_deprecated_seed%': 0,
diff --git a/lib/freebl/Makefile b/lib/freebl/Makefile
index d13a5e930..0b8c6f42f 100644
--- a/lib/freebl/Makefile
+++ b/lib/freebl/Makefile
@@ -118,6 +118,20 @@ ifneq (,$(USE_64)$(USE_X32))
 else
         DEFINES += -DNSS_X86
 endif
+    ifdef CC_IS_CLANG
+        EXTRA_SRCS += sha256-x86.c
+        DEFINES += -DUSE_HW_SHA2
+    else ifeq (1,$(CC_IS_GCC))
+        # Old compiler doesn't support Intel SHA extension
+        ifneq (,$(filter 4.9,$(word 1,$(GCC_VERSION)).$(word 2,$(GCC_VERSION))))
+            EXTRA_SRCS += sha256-x86.c
+            DEFINES += -DUSE_HW_SHA2
+        endif
+        ifeq (,$(filter 0 1 2 3 4,$(word 1,$(GCC_VERSION))))
+            EXTRA_SRCS += sha256-x86.c
+            DEFINES += -DUSE_HW_SHA2
+        endif
+    endif
 endif
 ifeq ($(CPU_ARCH),aarch64)
     ifdef CC_IS_CLANG
@@ -204,6 +218,11 @@ else
 	    INTEL_GCM_CLANG_CL = 1
 	endif
     endif
+    # The Intel SHA extenstion requires Visual C++ 2015.
+    ifeq ($(_MSC_VER_GE_14),1)
+        DEFINES += -DUSE_HW_SHA2
+        EXTRA_SRCS += sha256-x86.c
+    endif
 endif
 else
     # -DMP_NO_MP_WORD
@@ -230,6 +249,11 @@ ifeq ($(CPU_ARCH),x86_64)
 	    INTEL_GCM_CLANG_CL = 1
 	endif
     endif
+    # The Intel SHA extenstion requires Visual C++ 2015.
+    ifeq ($(_MSC_VER_GE_14),1)
+        DEFINES += -DUSE_HW_SHA2
+        EXTRA_SRCS += sha256-x86.c
+    endif
     MPI_SRCS += mpi_amd64.c
 endif
 endif
@@ -740,6 +764,8 @@ ifdef INTEL_GCM_CLANG_CL
 $(OBJDIR)/$(PROG_PREFIX)intel-gcm-wrap$(OBJ_SUFFIX): CFLAGS += -mssse3
 endif
 
+$(OBJDIR)/$(PROG_PREFIX)sha256-x86$(OBJ_SUFFIX): CFLAGS += -msha -mssse3 -msse4.1
+
 ifeq ($(CPU_ARCH),arm)
 # When the compiler uses the softfloat ABI, we want to use the compatible softfp ABI when
 # enabling NEON for these objects.
diff --git a/lib/freebl/freebl.gyp b/lib/freebl/freebl.gyp
index 66f30631a..e7703baf8 100644
--- a/lib/freebl/freebl.gyp
+++ b/lib/freebl/freebl.gyp
@@ -220,6 +220,38 @@
       ]
     },
     {
+      'target_name': 'sha-x86_c_lib',
+      'type': 'static_library',
+      'sources': [
+        'sha256-x86.c'
+      ],
+      'dependencies': [
+        '<(DEPTH)/exports.gyp:nss_exports'
+      ],
+      'cflags': [
+        '-msha',
+        '-mssse3',
+        '-msse4.1'
+      ],
+      'cflags_mozilla': [
+        '-msha',
+        '-mssse3',
+        '-msse4.1'
+      ],
+      'conditions': [
+        # macOS build doesn't use cflags.
+        [ 'OS=="mac" or OS=="ios"', {
+          'xcode_settings': {
+            'OTHER_CFLAGS': [
+              '-msha',
+              '-mssse3',
+              '-msse4.1'
+            ],
+          },
+        }]
+      ]
+    },
+    {
       'target_name': 'gcm-aes-arm32-neon_c_lib',
       'type': 'static_library',
       'sources': [
@@ -488,6 +520,11 @@
             'armv8_c_lib'
           ],
         }],
+        [ '(target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', {
+          'dependencies': [
+            'sha-x86_c_lib',
+          ],
+        }],
         [ 'disable_arm32_neon==0 and target_arch=="arm"', {
           'dependencies': [
             'gcm-aes-arm32-neon_c_lib',
@@ -570,6 +607,11 @@
             'armv8_c_lib',
           ],
         }],
+        [ '(target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', {
+          'dependencies': [
+            'sha-x86_c_lib',
+          ],
+        }],
         [ 'disable_arm32_neon==0 and target_arch=="arm"', {
           'dependencies': [
             'gcm-aes-arm32-neon_c_lib',
@@ -765,6 +807,11 @@
           },
         },
       }],
+      [ '(OS=="win" or OS=="mac" or OS=="ios") and (target_arch=="ia32" or target_arch=="x64") and disable_intel_hw_sha==0', {
+        'defines': [
+          'USE_HW_SHA2',
+        ],
+      }],
       [ '(OS=="win" or OS=="mac" or OS=="ios") and (target_arch=="arm64" or target_arch=="aarch64") and disable_arm_hw_aes==0', {
         'defines': [
           'USE_HW_AES',
@@ -846,6 +893,11 @@
               'ARMHF',
             ],
           }],
+          [ 'disable_intel_hw_sha==0 and (target_arch=="ia32" or target_arch=="x64")', {
+            'defines': [
+              'USE_HW_SHA2',
+            ],
+          }],
           [ 'disable_arm_hw_aes==0 and (target_arch=="arm" or target_arch=="arm64" or target_arch=="aarch64")', {
             'defines': [
               'USE_HW_AES',
diff --git a/lib/freebl/sha256-x86.c b/lib/freebl/sha256-x86.c
new file mode 100644
index 000000000..3aa30e9cc
--- /dev/null
+++ b/lib/freebl/sha256-x86.c
@@ -0,0 +1,236 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#ifdef USE_HW_SHA2
+
+#include <immintrin.h>
+
+#ifdef FREEBL_NO_DEPEND
+#include "stubs.h"
+#endif
+
+#include "blapii.h"
+#include "prcpucfg.h"
+#include "prtypes.h" /* for PRUintXX */
+#include "prlong.h"
+#include "blapi.h"
+#include "sha256.h"
+
+/* SHA-256 constants, K256. */
+pre_align static const PRUint32 K256[64] post_align = {
+    0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
+    0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
+    0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
+    0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
+    0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
+    0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
+    0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
+    0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
+    0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
+    0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
+    0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
+    0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
+    0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
+    0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
+    0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
+    0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+};
+
+#define ROUND(n, a, b, c, d)                                \
+    {                                                       \
+        __m128i t = _mm_add_epi32(a, k##n);                 \
+        w1 = _mm_sha256rnds2_epu32(w1, w0, t);              \
+        t = _mm_shuffle_epi32(t, 0x0e);                     \
+        w0 = _mm_sha256rnds2_epu32(w0, w1, t);              \
+        if (n < 12) {                                       \
+            a = _mm_sha256msg1_epu32(a, b);                 \
+            a = _mm_add_epi32(a, _mm_alignr_epi8(d, c, 4)); \
+            a = _mm_sha256msg2_epu32(a, d);                 \
+        }                                                   \
+    }
+
+void
+SHA256_Compress_Native(SHA256Context *ctx)
+{
+    __m128i h0, h1, th;
+    __m128i a, b, c, d;
+    __m128i w0, w1;
+    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+
+    const __m128i *K = (__m128i *)K256;
+    const __m128i k0 = _mm_load_si128(K);
+    const __m128i k1 = _mm_load_si128(K + 1);
+    const __m128i k2 = _mm_load_si128(K + 2);
+    const __m128i k3 = _mm_load_si128(K + 3);
+    const __m128i k4 = _mm_load_si128(K + 4);
+    const __m128i k5 = _mm_load_si128(K + 5);
+    const __m128i k6 = _mm_load_si128(K + 6);
+    const __m128i k7 = _mm_load_si128(K + 7);
+    const __m128i k8 = _mm_load_si128(K + 8);
+    const __m128i k9 = _mm_load_si128(K + 9);
+    const __m128i k10 = _mm_load_si128(K + 10);
+    const __m128i k11 = _mm_load_si128(K + 11);
+    const __m128i k12 = _mm_load_si128(K + 12);
+    const __m128i k13 = _mm_load_si128(K + 13);
+    const __m128i k14 = _mm_load_si128(K + 14);
+    const __m128i k15 = _mm_load_si128(K + 15);
+
+    const __m128i *input = (__m128i *)ctx->u.b;
+
+    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
+    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
+
+    /* H0123:4567 -> H01256:H2367 */
+    th = _mm_shuffle_epi32(h0, 0xb1);
+    h1 = _mm_shuffle_epi32(h1, 0x1b);
+    h0 = _mm_alignr_epi8(th, h1, 8);
+    h1 = _mm_blend_epi16(h1, th, 0xf0);
+
+    a = _mm_shuffle_epi8(_mm_loadu_si128(input), shuffle);
+    b = _mm_shuffle_epi8(_mm_loadu_si128(input + 1), shuffle);
+    c = _mm_shuffle_epi8(_mm_loadu_si128(input + 2), shuffle);
+    d = _mm_shuffle_epi8(_mm_loadu_si128(input + 3), shuffle);
+
+    w0 = h0;
+    w1 = h1;
+
+    ROUND(0, a, b, c, d)
+    ROUND(1, b, c, d, a)
+    ROUND(2, c, d, a, b)
+    ROUND(3, d, a, b, c)
+    ROUND(4, a, b, c, d)
+    ROUND(5, b, c, d, a)
+    ROUND(6, c, d, a, b)
+    ROUND(7, d, a, b, c)
+    ROUND(8, a, b, c, d)
+    ROUND(9, b, c, d, a)
+    ROUND(10, c, d, a, b)
+    ROUND(11, d, a, b, c)
+    ROUND(12, a, b, c, d)
+    ROUND(13, b, c, d, a)
+    ROUND(14, c, d, a, b)
+    ROUND(15, d, a, b, c)
+
+    h0 = _mm_add_epi32(h0, w0);
+    h1 = _mm_add_epi32(h1, w1);
+
+    /* H0145:2367 -> H0123:4567 */
+    th = _mm_shuffle_epi32(h0, 0x1b);
+    h1 = _mm_shuffle_epi32(h1, 0xb1);
+    h0 = _mm_blend_epi16(th, h1, 0xf0);
+    h1 = _mm_alignr_epi8(h1, th, 8);
+
+    _mm_storeu_si128((__m128i *)ctx->h, h0);
+    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
+}
+
+void
+SHA256_Update_Native(SHA256Context *ctx, const unsigned char *input,
+                     unsigned int inputLen)
+{
+    __m128i h0, h1, th;
+    const __m128i shuffle = _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3);
+
+    const __m128i *K = (__m128i *)K256;
+    const __m128i k0 = _mm_load_si128(K);
+    const __m128i k1 = _mm_load_si128(K + 1);
+    const __m128i k2 = _mm_load_si128(K + 2);
+    const __m128i k3 = _mm_load_si128(K + 3);
+    const __m128i k4 = _mm_load_si128(K + 4);
+    const __m128i k5 = _mm_load_si128(K + 5);
+    const __m128i k6 = _mm_load_si128(K + 6);
+    const __m128i k7 = _mm_load_si128(K + 7);
+    const __m128i k8 = _mm_load_si128(K + 8);
+    const __m128i k9 = _mm_load_si128(K + 9);
+    const __m128i k10 = _mm_load_si128(K + 10);
+    const __m128i k11 = _mm_load_si128(K + 11);
+    const __m128i k12 = _mm_load_si128(K + 12);
+    const __m128i k13 = _mm_load_si128(K + 13);
+    const __m128i k14 = _mm_load_si128(K + 14);
+    const __m128i k15 = _mm_load_si128(K + 15);
+
+    unsigned int inBuf = ctx->sizeLo & 0x3f;
+    if (!inputLen) {
+        return;
+    }
+
+    /* Add inputLen into the count of bytes processed, before processing */
+    if ((ctx->sizeLo += inputLen) < inputLen) {
+        ctx->sizeHi++;
+    }
+
+    /* if data already in buffer, attempt to fill rest of buffer */
+    if (inBuf) {
+        unsigned int todo = SHA256_BLOCK_LENGTH - inBuf;
+        if (inputLen < todo) {
+            todo = inputLen;
+        }
+        memcpy(ctx->u.b + inBuf, input, todo);
+        input += todo;
+        inputLen -= todo;
+        if (inBuf + todo == SHA256_BLOCK_LENGTH) {
+            SHA256_Compress_Native(ctx);
+        }
+    }
+
+    h0 = _mm_loadu_si128((__m128i *)(ctx->h));
+    h1 = _mm_loadu_si128((__m128i *)(ctx->h + 4));
+
+    /* H0123:4567 -> H01256:H2367 */
+    th = _mm_shuffle_epi32(h0, 0xb1);
+    h1 = _mm_shuffle_epi32(h1, 0x1b);
+    h0 = _mm_alignr_epi8(th, h1, 8);
+    h1 = _mm_blend_epi16(h1, th, 0xf0);
+
+    /* if enough data to fill one or more whole buffers, process them. */
+    while (inputLen >= SHA256_BLOCK_LENGTH) {
+        __m128i a, b, c, d;
+        __m128i w0, w1;
+        a = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)input), shuffle);
+        b = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 16)), shuffle);
+        c = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 32)), shuffle);
+        d = _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(input + 48)), shuffle);
+        input += SHA256_BLOCK_LENGTH;
+        inputLen -= SHA256_BLOCK_LENGTH;
+
+        w0 = h0;
+        w1 = h1;
+
+        ROUND(0, a, b, c, d)
+        ROUND(1, b, c, d, a)
+        ROUND(2, c, d, a, b)
+        ROUND(3, d, a, b, c)
+        ROUND(4, a, b, c, d)
+        ROUND(5, b, c, d, a)
+        ROUND(6, c, d, a, b)
+        ROUND(7, d, a, b, c)
+        ROUND(8, a, b, c, d)
+        ROUND(9, b, c, d, a)
+        ROUND(10, c, d, a, b)
+        ROUND(11, d, a, b, c)
+        ROUND(12, a, b, c, d)
+        ROUND(13, b, c, d, a)
+        ROUND(14, c, d, a, b)
+        ROUND(15, d, a, b, c)
+
+        h0 = _mm_add_epi32(h0, w0);
+        h1 = _mm_add_epi32(h1, w1);
+    }
+
+    // H01234567 -> H01256 and H2367
+    th = _mm_shuffle_epi32(h0, 0x1b);
+    h1 = _mm_shuffle_epi32(h1, 0xb1);
+    h0 = _mm_blend_epi16(th, h1, 0xf0);
+    h1 = _mm_alignr_epi8(h1, th, 8);
+
+    _mm_storeu_si128((__m128i *)ctx->h, h0);
+    _mm_storeu_si128((__m128i *)(ctx->h + 4), h1);
+
+    /* if data left over, fill it into buffer */
+    if (inputLen) {
+        memcpy(ctx->u.b, input, inputLen);
+    }
+}
+
+#endif /* USE_HW_SHA2 */
diff --git a/lib/freebl/sha512.c b/lib/freebl/sha512.c
index dc0ed776b..946ab9f7f 100644
--- a/lib/freebl/sha512.c
+++ b/lib/freebl/sha512.c
@@ -164,7 +164,7 @@ static void SHA256_Compress_Generic(SHA256Context *ctx);
 static void SHA256_Update_Generic(SHA256Context *ctx, const unsigned char *input,
                                   unsigned int inputLen);
 
-#ifndef USE_HW_SHA2
+#if !defined(USE_HW_SHA2) || !defined(IS_LITTLE_ENDIAN)
 void
 SHA256_Compress_Native(SHA256Context *ctx)
 {
@@ -200,16 +200,20 @@ SHA256_DestroyContext(SHA256Context *ctx, PRBool freeit)
 void
 SHA256_Begin(SHA256Context *ctx)
 {
+    PRBool use_hw_sha2 = PR_FALSE;
+
     memset(ctx, 0, sizeof *ctx);
     memcpy(H, H256, sizeof H256);
+
 #if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN)
     /* arm's implementation is tested on little endian only */
-    if (arm_sha2_support()) {
+    use_hw_sha2 = arm_sha2_support() || (sha_support() && ssse3_support() && sse4_1_support());
+#endif
+
+    if (use_hw_sha2) {
         ctx->compress = SHA256_Compress_Native;
         ctx->update = SHA256_Update_Native;
-    } else
-#endif
-    {
+    } else {
         ctx->compress = SHA256_Compress_Generic;
         ctx->update = SHA256_Update_Generic;
     }
@@ -692,16 +696,22 @@ SHA224_DestroyContext(SHA224Context *ctx, PRBool freeit)
 void
 SHA224_Begin(SHA224Context *ctx)
 {
+    PRBool use_hw_sha2;
+
     memset(ctx, 0, sizeof *ctx);
     memcpy(H, H224, sizeof H224);
+
 #if defined(USE_HW_SHA2) && defined(IS_LITTLE_ENDIAN)
     /* arm's implementation is tested on little endian only */
-    if (arm_sha2_support()) {
+    use_hw_sha2 = arm_sha2_support() || (sha_support() && ssse3_support() && sse4_1_support());
+#else
+    use_hw_sha2 = PR_FALSE;
+#endif
+
+    if (use_hw_sha2) {
         ctx->compress = SHA256_Compress_Native;
         ctx->update = SHA256_Update_Native;
-    } else
-#endif
-    {
+    } else {
         ctx->compress = SHA256_Compress_Generic;
         ctx->update = SHA256_Update_Generic;
     }
author	Makoto Kato <m_kato@ga2.so-net.ne.jp>	2021-06-25 08:18:13 +0000
committer	Makoto Kato <m_kato@ga2.so-net.ne.jp>	2021-06-25 08:18:13 +0000
commit	a5cf364d1116025f1ec8cf253e8092c4ba83c5af (patch)
tree	e57264880002c3dbd9723f1b6c7335957a3ce2c9
parent	acbf97ed8b4945cc9e66ba1fff0749d168361822 (diff)
download	nss-hg-a5cf364d1116025f1ec8cf253e8092c4ba83c5af.tar.gz