Merge pull request #19478 from r-devulap/svml

ENH: Vectorizing umath module using AVX-512 (open sourced from Intel Short Vector Math Library, SVML)
author: Matti Picus <matti.picus@gmail.com> 2021-10-10 10:31:32 +0300
committer: GitHub <noreply@github.com> 2021-10-10 10:31:32 +0300
commit: 1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2 (patch)
tree: bfa9c80c3236b0ee7cf0ca9baa77d70e7b2f647e
parent: cc545bfea39002a165f0d3401a5a2739317460c5 (diff)
parent: 1cbf913d7dda9441118c0b51f4020048334cd451 (diff)
download: numpy-1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2.tar.gz
13 files changed, 338 insertions, 40 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index c343e9168..de7f52f81 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,6 +24,12 @@ jobs:
             if [[ -v CI_PULL_REQUEST ]] ; then git pull --ff-only origin "refs/pull/${CI_PULL_REQUEST//*pull\//}/merge" ; fi
 
       - run:
+          name: update submodules
+          command: |
+            git submodule init
+            git submodule update
+
+      - run:
           name: create virtual environment, install dependencies
           command: |
             sudo apt-get update
diff --git a/.gitattributes b/.gitattributes
index a0676bee4..911db2b72 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -11,6 +11,7 @@ numpy/linalg/lapack_lite/f2c.c linguist-vendored
 numpy/linalg/lapack_lite/f2c.h linguist-vendored
 tools/npy_tempita/* linguist-vendored
 numpy/core/include/numpy/libdivide/* linguist-vendored
+numpy/core/src/umath/svml/* linguist-vendored
 
 # Mark some files as generated
 numpy/linalg/lapack_lite/f2c_*.c linguist-generated
diff --git a/.gitmodules b/.gitmodules
index 0d6857868..1ea274daf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
 [submodule "doc/source/_static/scipy-mathjax"]
 	path = doc/source/_static/scipy-mathjax
 	url = https://github.com/scipy/scipy-mathjax.git
+[submodule "numpy/core/src/umath/svml"]
+	path = numpy/core/src/umath/svml
+	url = https://github.com/numpy/SVML.git
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 3255b0758..f0c67b4aa 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -24,6 +24,9 @@ stages:
       vmImage: 'ubuntu-20.04'
     steps:
     - script: |
+        git submodule update --init
+      displayName: 'Fetch submodules'
+    - script: |
             if ! `gcc 2>/dev/null`; then
                 sudo apt install gcc
             fi
@@ -72,6 +75,9 @@ stages:
       vmImage: 'ubuntu-20.04'
     steps:
     - script: |
+        git submodule update --init
+      displayName: 'Fetch submodules'
+    - script: |
             docker run -v $(pwd):/numpy -e CFLAGS="-msse2 -std=c99 -UNDEBUG" \
             -e F77=gfortran-5 -e F90=gfortran-5 quay.io/pypa/manylinux2014_i686 \
             /bin/bash -xc "cd numpy && \
@@ -265,6 +271,9 @@ stages:
       vmImage: 'ubuntu-20.04'
     steps:
     - script: |
+        git submodule update --init
+      displayName: 'Fetch submodules'
+    - script: |
             # create and activate conda environment
             conda env create -f environment.yml
       displayName: 'Create conda environment.'
diff --git a/doc/release/upcoming_changes/19478.performance.rst b/doc/release/upcoming_changes/19478.performance.rst
new file mode 100644
index 000000000..6a389c20e
--- /dev/null
+++ b/doc/release/upcoming_changes/19478.performance.rst
@@ -0,0 +1,11 @@
+Vectorize umath module using AVX-512
+-------------------------------------
+
+By leveraging the Intel Short Vector Math Library (SVML), 18 umath functions
+(``exp2``, ``log2``, ``log10``, ``expm1``, ``log1p``, ``cbrt``, ``sin``,
+``cos``, ``tan``, ``arcsin``, ``arccos``, ``arctan``, ``sinh``, ``cosh``,
+``tanh``, ``arcsinh``, ``arccosh``, ``arctanh``) are vectorized using the
+AVX-512 instruction set for both single and double precision implementations.
+This change is currently enabled only for Linux users and on processors with
+the AVX-512 instruction set.  It provides an average speedup of 32x and 14x
+for single and double precision functions respectively.
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 4891e8f23..3a27a34cd 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -359,7 +359,7 @@ defdict = {
           docstrings.get('numpy.core.umath.fmod'),
           None,
           TD(ints),
-          TD(flts, f='fmod', astype={'e':'f'}),
+          TD(flts, f='fmod', astype={'e': 'f'}),
           TD(P, f='fmod'),
           ),
 'square':
@@ -390,7 +390,7 @@ defdict = {
           docstrings.get('numpy.core.umath.power'),
           None,
           TD(ints),
-          TD(inexact, f='pow', astype={'e':'f'}),
+          TD(inexact, f='pow', astype={'e': 'f'}),
           TD(O, f='npy_ObjectPower'),
           ),
 'float_power':
@@ -551,13 +551,13 @@ defdict = {
     Ufunc(2, 1, MinusInfinity,
           docstrings.get('numpy.core.umath.logaddexp'),
           None,
-          TD(flts, f="logaddexp", astype={'e':'f'})
+          TD(flts, f="logaddexp", astype={'e': 'f'})
           ),
 'logaddexp2':
     Ufunc(2, 1, MinusInfinity,
           docstrings.get('numpy.core.umath.logaddexp2'),
           None,
-          TD(flts, f="logaddexp2", astype={'e':'f'})
+          TD(flts, f="logaddexp2", astype={'e': 'f'})
           ),
 'bitwise_and':
     Ufunc(2, 1, AllOnes,
@@ -605,80 +605,93 @@ defdict = {
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.heaviside'),
           None,
-          TD(flts, f='heaviside', astype={'e':'f'}),
+          TD(flts, f='heaviside', astype={'e': 'f'}),
           ),
 'degrees':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.degrees'),
           None,
-          TD(fltsP, f='degrees', astype={'e':'f'}),
+          TD(fltsP, f='degrees', astype={'e': 'f'}),
           ),
 'rad2deg':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.rad2deg'),
           None,
-          TD(fltsP, f='rad2deg', astype={'e':'f'}),
+          TD(fltsP, f='rad2deg', astype={'e': 'f'}),
           ),
 'radians':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.radians'),
           None,
-          TD(fltsP, f='radians', astype={'e':'f'}),
+          TD(fltsP, f='radians', astype={'e': 'f'}),
           ),
 'deg2rad':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.deg2rad'),
           None,
-          TD(fltsP, f='deg2rad', astype={'e':'f'}),
+          TD(fltsP, f='deg2rad', astype={'e': 'f'}),
           ),
 'arccos':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arccos'),
           None,
-          TD(inexact, f='acos', astype={'e':'f'}),
+          TD('e', f='acos', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='acos', astype={'e': 'f'}),
           TD(P, f='arccos'),
           ),
 'arccosh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arccosh'),
           None,
-          TD(inexact, f='acosh', astype={'e':'f'}),
+          TD('e', f='acosh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='acosh', astype={'e': 'f'}),
           TD(P, f='arccosh'),
           ),
 'arcsin':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arcsin'),
           None,
-          TD(inexact, f='asin', astype={'e':'f'}),
+          TD('e', f='asin', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='asin', astype={'e': 'f'}),
           TD(P, f='arcsin'),
           ),
 'arcsinh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arcsinh'),
           None,
-          TD(inexact, f='asinh', astype={'e':'f'}),
+          TD('e', f='asinh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='asinh', astype={'e': 'f'}),
           TD(P, f='arcsinh'),
           ),
 'arctan':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arctan'),
           None,
-          TD(inexact, f='atan', astype={'e':'f'}),
+          TD('e', f='atan', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='atan', astype={'e': 'f'}),
           TD(P, f='arctan'),
           ),
 'arctanh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.arctanh'),
           None,
-          TD(inexact, f='atanh', astype={'e':'f'}),
+          TD('e', f='atanh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='atanh', astype={'e': 'f'}),
           TD(P, f='arctanh'),
           ),
 'cos':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cos'),
           None,
-          TD('e', f='cos', astype={'e':'f'}),
+          TD('e', f='cos', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='cos'),
           TD(P, f='cos'),
           ),
@@ -686,8 +699,9 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sin'),
           None,
-          TD('e', f='sin', astype={'e':'f'}),
+          TD('e', f='sin', astype={'e': 'f'}),
           TD('f', dispatch=[('loops_trigonometric', 'f')]),
+          TD('d', dispatch=[('loops_umath_fp', 'd')]),
           TD('fdg' + cmplx, f='sin'),
           TD(P, f='sin'),
           ),
@@ -695,35 +709,43 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.tan'),
           None,
-          TD(inexact, f='tan', astype={'e':'f'}),
+          TD('e', f='tan', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='tan', astype={'e': 'f'}),
           TD(P, f='tan'),
           ),
 'cosh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cosh'),
           None,
-          TD(inexact, f='cosh', astype={'e':'f'}),
+          TD('e', f='cosh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='cosh', astype={'e': 'f'}),
           TD(P, f='cosh'),
           ),
 'sinh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sinh'),
           None,
-          TD(inexact, f='sinh', astype={'e':'f'}),
+          TD('e', f='sinh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='sinh', astype={'e': 'f'}),
           TD(P, f='sinh'),
           ),
 'tanh':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.tanh'),
           None,
-          TD(inexact, f='tanh', astype={'e':'f'}),
+          TD('e', f='tanh', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='tanh', astype={'e': 'f'}),
           TD(P, f='tanh'),
           ),
 'exp':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.exp'),
           None,
-          TD('e', f='exp', astype={'e':'f'}),
+          TD('e', f='exp', astype={'e': 'f'}),
           TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
           TD('fdg' + cmplx, f='exp'),
           TD(P, f='exp'),
@@ -732,21 +754,25 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.exp2'),
           None,
-          TD(inexact, f='exp2', astype={'e':'f'}),
+          TD('e', f='exp2', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='exp2', astype={'e': 'f'}),
           TD(P, f='exp2'),
           ),
 'expm1':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.expm1'),
           None,
-          TD(inexact, f='expm1', astype={'e':'f'}),
+          TD('e', f='expm1', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='expm1', astype={'e': 'f'}),
           TD(P, f='expm1'),
           ),
 'log':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log'),
           None,
-          TD('e', f='log', astype={'e':'f'}),
+          TD('e', f='log', astype={'e': 'f'}),
           TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
           TD('fdg' + cmplx, f='log'),
           TD(P, f='log'),
@@ -755,28 +781,34 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log2'),
           None,
-          TD(inexact, f='log2', astype={'e':'f'}),
+          TD('e', f='log2', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='log2', astype={'e': 'f'}),
           TD(P, f='log2'),
           ),
 'log10':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log10'),
           None,
-          TD(inexact, f='log10', astype={'e':'f'}),
+          TD('e', f='log10', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='log10', astype={'e': 'f'}),
           TD(P, f='log10'),
           ),
 'log1p':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.log1p'),
           None,
-          TD(inexact, f='log1p', astype={'e':'f'}),
+          TD('e', f='log1p', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(inexact, f='log1p', astype={'e': 'f'}),
           TD(P, f='log1p'),
           ),
 'sqrt':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.sqrt'),
           None,
-          TD('e', f='sqrt', astype={'e':'f'}),
+          TD('e', f='sqrt', astype={'e': 'f'}),
           TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
           TD('fdg' + cmplx, f='sqrt'),
           TD(P, f='sqrt'),
@@ -785,14 +817,16 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.cbrt'),
           None,
-          TD(flts, f='cbrt', astype={'e':'f'}),
+          TD('e', f='cbrt', astype={'e': 'f'}),
+          TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+          TD(flts, f='cbrt', astype={'e': 'f'}),
           TD(P, f='cbrt'),
           ),
 'ceil':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.ceil'),
           None,
-          TD('e', f='ceil', astype={'e':'f'}),
+          TD('e', f='ceil', astype={'e': 'f'}),
           TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg', f='ceil'),
           TD(O, f='npy_ObjectCeil'),
@@ -801,7 +835,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.trunc'),
           None,
-          TD('e', f='trunc', astype={'e':'f'}),
+          TD('e', f='trunc', astype={'e': 'f'}),
           TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg', f='trunc'),
           TD(O, f='npy_ObjectTrunc'),
@@ -810,14 +844,14 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.fabs'),
           None,
-          TD(flts, f='fabs', astype={'e':'f'}),
+          TD(flts, f='fabs', astype={'e': 'f'}),
           TD(P, f='fabs'),
        ),
 'floor':
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.floor'),
           None,
-          TD('e', f='floor', astype={'e':'f'}),
+          TD('e', f='floor', astype={'e': 'f'}),
           TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg', f='floor'),
           TD(O, f='npy_ObjectFloor'),
@@ -826,7 +860,7 @@ defdict = {
     Ufunc(1, 1, None,
           docstrings.get('numpy.core.umath.rint'),
           None,
-          TD('e', f='rint', astype={'e':'f'}),
+          TD('e', f='rint', astype={'e': 'f'}),
           TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
           TD('fdg' + cmplx, f='rint'),
           TD(P, f='rint'),
@@ -835,7 +869,7 @@ defdict = {
     Ufunc(2, 1, None,
           docstrings.get('numpy.core.umath.arctan2'),
           None,
-          TD(flts, f='atan2', astype={'e':'f'}),
+          TD(flts, f='atan2', astype={'e': 'f'}),
           TD(P, f='arctan2'),
           ),
 'remainder':
@@ -858,7 +892,7 @@ defdict = {
     Ufunc(2, 1, Zero,
           docstrings.get('numpy.core.umath.hypot'),
           None,
-          TD(flts, f='hypot', astype={'e':'f'}),
+          TD(flts, f='hypot', astype={'e': 'f'}),
           TD(P, f='hypot'),
           ),
 'isnan':
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2f495c48b..bde81bf2f 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -5,6 +5,7 @@ import copy
 import warnings
 import platform
 import textwrap
+import glob
 from os.path import join
 
 from numpy.distutils import log
@@ -63,6 +64,20 @@ class CallOnceOnly:
             out = copy.deepcopy(pickle.loads(self._check_complex))
         return out
 
+def can_link_svml():
+    """The SVML library is supported only on the x86_64 architecture and
+    currently only on Linux.
+    """
+    machine = platform.machine()
+    system = platform.system()
+    return "x86_64" in machine and system == "Linux"
+
+def check_svml_submodule(svmlpath):
+    if not os.path.exists(svmlpath + "/README.md"):
+        raise RuntimeError("Missing `SVML` submodule! Run `git submodule "
+                           "update --init` to fix this.")
+    return True
+
 def pythonlib_dir():
     """return path where libpython* is."""
     if sys.platform == 'win32':
@@ -455,6 +470,9 @@ def configuration(parent_package='',top_path=None):
             # Inline check
             inline = config_cmd.check_inline()
 
+            if can_link_svml():
+                moredefs.append(('NPY_CAN_LINK_SVML', 1))
+
             # Use relaxed stride checking
             if NPY_RELAXED_STRIDES_CHECKING:
                 moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1))
@@ -727,6 +745,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'common', 'npy_import.h'),
             join('src', 'common', 'npy_hashtable.h'),
             join('src', 'common', 'npy_longdouble.h'),
+            join('src', 'common', 'npy_svml.h'),
             join('src', 'common', 'templ_common.h.src'),
             join('src', 'common', 'ucsnarrow.h'),
             join('src', 'common', 'ufunc_override.h'),
@@ -923,6 +942,7 @@ def configuration(parent_package='',top_path=None):
             join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
             join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
+            join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
             join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
             join('src', 'umath', 'matmul.h.src'),
             join('src', 'umath', 'matmul.c.src'),
@@ -951,6 +971,11 @@ def configuration(parent_package='',top_path=None):
             join(codegen_dir, 'generate_ufunc_api.py'),
             ]
 
+    svml_path = join('numpy', 'core', 'src', 'umath', 'svml')
+    svml_objs = []
+    if can_link_svml() and check_svml_submodule(svml_path):
+        svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
+
     config.add_extension('_multiarray_umath',
                          sources=multiarray_src + umath_src +
                                  common_src +
@@ -965,6 +990,7 @@ def configuration(parent_package='',top_path=None):
                          depends=deps + multiarray_deps + umath_deps +
                                 common_deps,
                          libraries=['npymath'],
+                         extra_objects=svml_objs,
                          extra_info=extra_info)
 
     #######################################################################
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
new file mode 100644
index 000000000..4292f7090
--- /dev/null
+++ b/numpy/core/src/common/npy_svml.h
@@ -0,0 +1,41 @@
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+extern __m512 __svml_exp2f16(__m512 x);
+extern __m512 __svml_log2f16(__m512 x);
+extern __m512 __svml_log10f16(__m512 x);
+extern __m512 __svml_expm1f16(__m512 x);
+extern __m512 __svml_log1pf16(__m512 x);
+extern __m512 __svml_cbrtf16(__m512 x);
+extern __m512 __svml_sinf16(__m512 x);
+extern __m512 __svml_cosf16(__m512 x);
+extern __m512 __svml_tanf16(__m512 x);
+extern __m512 __svml_asinf16(__m512 x);
+extern __m512 __svml_acosf16(__m512 x);
+extern __m512 __svml_atanf16(__m512 x);
+extern __m512 __svml_atan2f16(__m512 x);
+extern __m512 __svml_sinhf16(__m512 x);
+extern __m512 __svml_coshf16(__m512 x);
+extern __m512 __svml_tanhf16(__m512 x);
+extern __m512 __svml_asinhf16(__m512 x);
+extern __m512 __svml_acoshf16(__m512 x);
+extern __m512 __svml_atanhf16(__m512 x);
+
+extern __m512d __svml_exp28(__m512d x);
+extern __m512d __svml_log28(__m512d x);
+extern __m512d __svml_log108(__m512d x);
+extern __m512d __svml_expm18(__m512d x);
+extern __m512d __svml_log1p8(__m512d x);
+extern __m512d __svml_cbrt8(__m512d x);
+extern __m512d __svml_sin8(__m512d x);
+extern __m512d __svml_cos8(__m512d x);
+extern __m512d __svml_tan8(__m512d x);
+extern __m512d __svml_asin8(__m512d x);
+extern __m512d __svml_acos8(__m512d x);
+extern __m512d __svml_atan8(__m512d x);
+extern __m512d __svml_atan28(__m512d x);
+extern __m512d __svml_sinh8(__m512d x);
+extern __m512d __svml_cosh8(__m512d x);
+extern __m512d __svml_tanh8(__m512d x);
+extern __m512d __svml_asinh8(__m512d x);
+extern __m512d __svml_acosh8(__m512d x);
+extern __m512d __svml_atanh8(__m512d x);
+#endif
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 02d749a5e..0938cd050 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -210,6 +210,32 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
 /**end repeat1**/
 /**end repeat**/
 
+#ifndef NPY_DISABLE_OPTIMIZATION
+    #include "loops_umath_fp.dispatch.h"
+#endif
+
+/**begin repeat
+ *  #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ */
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos# 
+ */
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
+    (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/**end repeat**/
+
 /**begin repeat
  *  #TYPE = FLOAT, DOUBLE#
  */
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
new file mode 100644
index 000000000..852604655
--- /dev/null
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -0,0 +1,141 @@
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ */
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
+                        npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_@sfx@;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_@sfx@ x;
+        #if @default_val@
+            if (ssrc == 1) {
+                x = npyv_load_till_@sfx@(src, len, @default_val@);
+            } else {
+                x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
+            }
+        #else
+            if (ssrc == 1) {
+                x = npyv_load_tillz_@sfx@(src, len);
+            } else {
+                x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+            }
+        #endif
+        npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+        if (sdst == 1) {
+            npyv_store_till_@sfx@(dst, len, out);
+        } else {
+            npyv_storen_till_@sfx@(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+static void
+simd_@func@_f64(const double *src, npy_intp ssrc,
+                      double *dst, npy_intp sdst, npy_intp len)
+{
+    const int vstep = npyv_nlanes_f64;
+    for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+        npyv_f64 x;
+        if (ssrc == 1) {
+            x = npyv_load_tillz_f64(src, len);
+        } else {
+            x = npyv_loadn_tillz_f64(src, ssrc, len);
+        }
+        npyv_f64 out = __svml_@func@8(x);
+        if (sdst == 1) {
+            npyv_store_till_f64(dst, len, out);
+        } else {
+            npyv_storen_till_f64(dst, sdst, len, out);
+        }
+    }
+    npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ *  #TYPE = DOUBLE, FLOAT#
+ *  #type = npy_double, npy_float#
+ *  #vsub = , f#
+ *  #sfx  = f64, f32#
+ */
+/**begin repeat1
+ *  #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ *  #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const @type@ *src = (@type@*)args[0];
+          @type@ *dst = (@type@*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_@sfx@(ssrc) &&
+        npyv_storable_stride_@sfx@(sdst)) {
+        simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const @type@ in1 = *(@type@ *)ip1;
+        *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
+    }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ *  #func = sin, cos#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+    const double *src = (double*)args[0];
+          double *dst = (double*)args[1];
+    const int lsize = sizeof(src[0]);
+    const npy_intp ssrc = steps[0] / lsize;
+    const npy_intp sdst = steps[1] / lsize;
+    const npy_intp len = dimensions[0];
+    assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+    if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+        npyv_loadable_stride_f64(ssrc) &&
+        npyv_storable_stride_f64(sdst)) {
+        simd_@func@_f64(src, ssrc, dst, sdst, len);
+        return;
+    }
+#endif
+    UNARY_LOOP {
+        const npy_double in1 = *(npy_double *)ip1;
+        *(npy_double *)op1 = npy_@func@(in1);
+    }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 654ab81cc..d47be9a30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -116,9 +116,8 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
 #endif
     return 0;
 }
-
-
 /**end repeat1**/
+
 /**end repeat**/
 
 /**begin repeat
@@ -1152,6 +1151,7 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
  * #is_finite = 0, 1, 0, 0#
  * #is_signbit = 0, 0, 0, 1#
  */
+
 #if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
 static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
 AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml
new file mode 160000
+Subproject 9f8af767ed6c75455d9a382af829048f8dd1806
diff --git a/numpy/polynomial/tests/test_classes.py b/numpy/polynomial/tests/test_classes.py
index 8e71a1945..6322062f2 100644
--- a/numpy/polynomial/tests/test_classes.py
+++ b/numpy/polynomial/tests/test_classes.py
@@ -597,4 +597,4 @@ class TestInterpolate:
         for deg in range(0, 10):
             for t in range(0, deg + 1):
                 p = Chebyshev.interpolate(powx, deg, domain=[0, 2], args=(t,))
-                assert_almost_equal(p(x), powx(x, t), decimal=12)
+                assert_almost_equal(p(x), powx(x, t), decimal=11)
author	Matti Picus <matti.picus@gmail.com>	2021-10-10 10:31:32 +0300
committer	GitHub <noreply@github.com>	2021-10-10 10:31:32 +0300
commit	1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2 (patch)
tree	bfa9c80c3236b0ee7cf0ca9baa77d70e7b2f647e
parent	cc545bfea39002a165f0d3401a5a2739317460c5 (diff)
parent	1cbf913d7dda9441118c0b51f4020048334cd451 (diff)
download	numpy-1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2.tar.gz