summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2021-10-10 10:31:32 +0300
committerGitHub <noreply@github.com>2021-10-10 10:31:32 +0300
commit1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2 (patch)
treebfa9c80c3236b0ee7cf0ca9baa77d70e7b2f647e
parentcc545bfea39002a165f0d3401a5a2739317460c5 (diff)
parent1cbf913d7dda9441118c0b51f4020048334cd451 (diff)
downloadnumpy-1eff1c543a8f1e9d7ea29182b8c76db5a2efc3c2.tar.gz
Merge pull request #19478 from r-devulap/svml
ENH: Vectorizing umath module using AVX-512 (open sourced from Intel Short Vector Math Library, SVML)
-rw-r--r--.circleci/config.yml6
-rw-r--r--.gitattributes1
-rw-r--r--.gitmodules3
-rw-r--r--azure-pipelines.yml9
-rw-r--r--doc/release/upcoming_changes/19478.performance.rst11
-rw-r--r--numpy/core/code_generators/generate_umath.py108
-rw-r--r--numpy/core/setup.py26
-rw-r--r--numpy/core/src/common/npy_svml.h41
-rw-r--r--numpy/core/src/umath/loops.h.src26
-rw-r--r--numpy/core/src/umath/loops_umath_fp.dispatch.c.src141
-rw-r--r--numpy/core/src/umath/simd.inc.src4
m---------numpy/core/src/umath/svml0
-rw-r--r--numpy/polynomial/tests/test_classes.py2
13 files changed, 338 insertions, 40 deletions
diff --git a/.circleci/config.yml b/.circleci/config.yml
index c343e9168..de7f52f81 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -24,6 +24,12 @@ jobs:
if [[ -v CI_PULL_REQUEST ]] ; then git pull --ff-only origin "refs/pull/${CI_PULL_REQUEST//*pull\//}/merge" ; fi
- run:
+ name: update submodules
+ command: |
+ git submodule init
+ git submodule update
+
+ - run:
name: create virtual environment, install dependencies
command: |
sudo apt-get update
diff --git a/.gitattributes b/.gitattributes
index a0676bee4..911db2b72 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -11,6 +11,7 @@ numpy/linalg/lapack_lite/f2c.c linguist-vendored
numpy/linalg/lapack_lite/f2c.h linguist-vendored
tools/npy_tempita/* linguist-vendored
numpy/core/include/numpy/libdivide/* linguist-vendored
+numpy/core/src/umath/svml/* linguist-vendored
# Mark some files as generated
numpy/linalg/lapack_lite/f2c_*.c linguist-generated
diff --git a/.gitmodules b/.gitmodules
index 0d6857868..1ea274daf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,3 +1,6 @@
[submodule "doc/source/_static/scipy-mathjax"]
path = doc/source/_static/scipy-mathjax
url = https://github.com/scipy/scipy-mathjax.git
+[submodule "numpy/core/src/umath/svml"]
+ path = numpy/core/src/umath/svml
+ url = https://github.com/numpy/SVML.git
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 3255b0758..f0c67b4aa 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -24,6 +24,9 @@ stages:
vmImage: 'ubuntu-20.04'
steps:
- script: |
+ git submodule update --init
+ displayName: 'Fetch submodules'
+ - script: |
if ! `gcc 2>/dev/null`; then
sudo apt install gcc
fi
@@ -72,6 +75,9 @@ stages:
vmImage: 'ubuntu-20.04'
steps:
- script: |
+ git submodule update --init
+ displayName: 'Fetch submodules'
+ - script: |
docker run -v $(pwd):/numpy -e CFLAGS="-msse2 -std=c99 -UNDEBUG" \
-e F77=gfortran-5 -e F90=gfortran-5 quay.io/pypa/manylinux2014_i686 \
/bin/bash -xc "cd numpy && \
@@ -265,6 +271,9 @@ stages:
vmImage: 'ubuntu-20.04'
steps:
- script: |
+ git submodule update --init
+ displayName: 'Fetch submodules'
+ - script: |
# create and activate conda environment
conda env create -f environment.yml
displayName: 'Create conda environment.'
diff --git a/doc/release/upcoming_changes/19478.performance.rst b/doc/release/upcoming_changes/19478.performance.rst
new file mode 100644
index 000000000..6a389c20e
--- /dev/null
+++ b/doc/release/upcoming_changes/19478.performance.rst
@@ -0,0 +1,11 @@
+Vectorize umath module using AVX-512
+-------------------------------------
+
+By leveraging the Intel Short Vector Math Library (SVML), 18 umath functions
+(``exp2``, ``log2``, ``log10``, ``expm1``, ``log1p``, ``cbrt``, ``sin``,
+``cos``, ``tan``, ``arcsin``, ``arccos``, ``arctan``, ``sinh``, ``cosh``,
+``tanh``, ``arcsinh``, ``arccosh``, ``arctanh``) are vectorized using the
+AVX-512 instruction set for both single and double precision implementations.
+This change is currently enabled only for Linux users and on processors with
+the AVX-512 instruction set. It provides an average speedup of 32x and 14x for
+single and double precision functions, respectively.
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 4891e8f23..3a27a34cd 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -359,7 +359,7 @@ defdict = {
docstrings.get('numpy.core.umath.fmod'),
None,
TD(ints),
- TD(flts, f='fmod', astype={'e':'f'}),
+ TD(flts, f='fmod', astype={'e': 'f'}),
TD(P, f='fmod'),
),
'square':
@@ -390,7 +390,7 @@ defdict = {
docstrings.get('numpy.core.umath.power'),
None,
TD(ints),
- TD(inexact, f='pow', astype={'e':'f'}),
+ TD(inexact, f='pow', astype={'e': 'f'}),
TD(O, f='npy_ObjectPower'),
),
'float_power':
@@ -551,13 +551,13 @@ defdict = {
Ufunc(2, 1, MinusInfinity,
docstrings.get('numpy.core.umath.logaddexp'),
None,
- TD(flts, f="logaddexp", astype={'e':'f'})
+ TD(flts, f="logaddexp", astype={'e': 'f'})
),
'logaddexp2':
Ufunc(2, 1, MinusInfinity,
docstrings.get('numpy.core.umath.logaddexp2'),
None,
- TD(flts, f="logaddexp2", astype={'e':'f'})
+ TD(flts, f="logaddexp2", astype={'e': 'f'})
),
'bitwise_and':
Ufunc(2, 1, AllOnes,
@@ -605,80 +605,93 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.heaviside'),
None,
- TD(flts, f='heaviside', astype={'e':'f'}),
+ TD(flts, f='heaviside', astype={'e': 'f'}),
),
'degrees':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.degrees'),
None,
- TD(fltsP, f='degrees', astype={'e':'f'}),
+ TD(fltsP, f='degrees', astype={'e': 'f'}),
),
'rad2deg':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.rad2deg'),
None,
- TD(fltsP, f='rad2deg', astype={'e':'f'}),
+ TD(fltsP, f='rad2deg', astype={'e': 'f'}),
),
'radians':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.radians'),
None,
- TD(fltsP, f='radians', astype={'e':'f'}),
+ TD(fltsP, f='radians', astype={'e': 'f'}),
),
'deg2rad':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.deg2rad'),
None,
- TD(fltsP, f='deg2rad', astype={'e':'f'}),
+ TD(fltsP, f='deg2rad', astype={'e': 'f'}),
),
'arccos':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arccos'),
None,
- TD(inexact, f='acos', astype={'e':'f'}),
+ TD('e', f='acos', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='acos', astype={'e': 'f'}),
TD(P, f='arccos'),
),
'arccosh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arccosh'),
None,
- TD(inexact, f='acosh', astype={'e':'f'}),
+ TD('e', f='acosh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='acosh', astype={'e': 'f'}),
TD(P, f='arccosh'),
),
'arcsin':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arcsin'),
None,
- TD(inexact, f='asin', astype={'e':'f'}),
+ TD('e', f='asin', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='asin', astype={'e': 'f'}),
TD(P, f='arcsin'),
),
'arcsinh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arcsinh'),
None,
- TD(inexact, f='asinh', astype={'e':'f'}),
+ TD('e', f='asinh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='asinh', astype={'e': 'f'}),
TD(P, f='arcsinh'),
),
'arctan':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arctan'),
None,
- TD(inexact, f='atan', astype={'e':'f'}),
+ TD('e', f='atan', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='atan', astype={'e': 'f'}),
TD(P, f='arctan'),
),
'arctanh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.arctanh'),
None,
- TD(inexact, f='atanh', astype={'e':'f'}),
+ TD('e', f='atanh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='atanh', astype={'e': 'f'}),
TD(P, f='arctanh'),
),
'cos':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.cos'),
None,
- TD('e', f='cos', astype={'e':'f'}),
+ TD('e', f='cos', astype={'e': 'f'}),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
+ TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('fdg' + cmplx, f='cos'),
TD(P, f='cos'),
),
@@ -686,8 +699,9 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sin'),
None,
- TD('e', f='sin', astype={'e':'f'}),
+ TD('e', f='sin', astype={'e': 'f'}),
TD('f', dispatch=[('loops_trigonometric', 'f')]),
+ TD('d', dispatch=[('loops_umath_fp', 'd')]),
TD('fdg' + cmplx, f='sin'),
TD(P, f='sin'),
),
@@ -695,35 +709,43 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.tan'),
None,
- TD(inexact, f='tan', astype={'e':'f'}),
+ TD('e', f='tan', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='tan', astype={'e': 'f'}),
TD(P, f='tan'),
),
'cosh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.cosh'),
None,
- TD(inexact, f='cosh', astype={'e':'f'}),
+ TD('e', f='cosh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='cosh', astype={'e': 'f'}),
TD(P, f='cosh'),
),
'sinh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sinh'),
None,
- TD(inexact, f='sinh', astype={'e':'f'}),
+ TD('e', f='sinh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='sinh', astype={'e': 'f'}),
TD(P, f='sinh'),
),
'tanh':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.tanh'),
None,
- TD(inexact, f='tanh', astype={'e':'f'}),
+ TD('e', f='tanh', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='tanh', astype={'e': 'f'}),
TD(P, f='tanh'),
),
'exp':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.exp'),
None,
- TD('e', f='exp', astype={'e':'f'}),
+ TD('e', f='exp', astype={'e': 'f'}),
TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
TD('fdg' + cmplx, f='exp'),
TD(P, f='exp'),
@@ -732,21 +754,25 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.exp2'),
None,
- TD(inexact, f='exp2', astype={'e':'f'}),
+ TD('e', f='exp2', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='exp2', astype={'e': 'f'}),
TD(P, f='exp2'),
),
'expm1':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.expm1'),
None,
- TD(inexact, f='expm1', astype={'e':'f'}),
+ TD('e', f='expm1', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='expm1', astype={'e': 'f'}),
TD(P, f='expm1'),
),
'log':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log'),
None,
- TD('e', f='log', astype={'e':'f'}),
+ TD('e', f='log', astype={'e': 'f'}),
TD('fd', dispatch=[('loops_exponent_log', 'fd')]),
TD('fdg' + cmplx, f='log'),
TD(P, f='log'),
@@ -755,28 +781,34 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log2'),
None,
- TD(inexact, f='log2', astype={'e':'f'}),
+ TD('e', f='log2', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='log2', astype={'e': 'f'}),
TD(P, f='log2'),
),
'log10':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log10'),
None,
- TD(inexact, f='log10', astype={'e':'f'}),
+ TD('e', f='log10', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='log10', astype={'e': 'f'}),
TD(P, f='log10'),
),
'log1p':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.log1p'),
None,
- TD(inexact, f='log1p', astype={'e':'f'}),
+ TD('e', f='log1p', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(inexact, f='log1p', astype={'e': 'f'}),
TD(P, f='log1p'),
),
'sqrt':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.sqrt'),
None,
- TD('e', f='sqrt', astype={'e':'f'}),
+ TD('e', f='sqrt', astype={'e': 'f'}),
TD(inexactvec, dispatch=[('loops_unary_fp', 'fd')]),
TD('fdg' + cmplx, f='sqrt'),
TD(P, f='sqrt'),
@@ -785,14 +817,16 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.cbrt'),
None,
- TD(flts, f='cbrt', astype={'e':'f'}),
+ TD('e', f='cbrt', astype={'e': 'f'}),
+ TD('fd', dispatch=[('loops_umath_fp', 'fd')]),
+ TD(flts, f='cbrt', astype={'e': 'f'}),
TD(P, f='cbrt'),
),
'ceil':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.ceil'),
None,
- TD('e', f='ceil', astype={'e':'f'}),
+ TD('e', f='ceil', astype={'e': 'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='ceil'),
TD(O, f='npy_ObjectCeil'),
@@ -801,7 +835,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.trunc'),
None,
- TD('e', f='trunc', astype={'e':'f'}),
+ TD('e', f='trunc', astype={'e': 'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='trunc'),
TD(O, f='npy_ObjectTrunc'),
@@ -810,14 +844,14 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.fabs'),
None,
- TD(flts, f='fabs', astype={'e':'f'}),
+ TD(flts, f='fabs', astype={'e': 'f'}),
TD(P, f='fabs'),
),
'floor':
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.floor'),
None,
- TD('e', f='floor', astype={'e':'f'}),
+ TD('e', f='floor', astype={'e': 'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg', f='floor'),
TD(O, f='npy_ObjectFloor'),
@@ -826,7 +860,7 @@ defdict = {
Ufunc(1, 1, None,
docstrings.get('numpy.core.umath.rint'),
None,
- TD('e', f='rint', astype={'e':'f'}),
+ TD('e', f='rint', astype={'e': 'f'}),
TD(inexactvec, simd=[('fma', 'fd'), ('avx512f', 'fd')]),
TD('fdg' + cmplx, f='rint'),
TD(P, f='rint'),
@@ -835,7 +869,7 @@ defdict = {
Ufunc(2, 1, None,
docstrings.get('numpy.core.umath.arctan2'),
None,
- TD(flts, f='atan2', astype={'e':'f'}),
+ TD(flts, f='atan2', astype={'e': 'f'}),
TD(P, f='arctan2'),
),
'remainder':
@@ -858,7 +892,7 @@ defdict = {
Ufunc(2, 1, Zero,
docstrings.get('numpy.core.umath.hypot'),
None,
- TD(flts, f='hypot', astype={'e':'f'}),
+ TD(flts, f='hypot', astype={'e': 'f'}),
TD(P, f='hypot'),
),
'isnan':
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 2f495c48b..bde81bf2f 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -5,6 +5,7 @@ import copy
import warnings
import platform
import textwrap
+import glob
from os.path import join
from numpy.distutils import log
@@ -63,6 +64,20 @@ class CallOnceOnly:
out = copy.deepcopy(pickle.loads(self._check_complex))
return out
+def can_link_svml():
+ """Return True when SVML can be linked: only the x86_64 architecture is
+ supported, and currently only on Linux.
+ """
+ machine = platform.machine()
+ system = platform.system()
+ return "x86_64" in machine and system == "Linux"
+
+def check_svml_submodule(svmlpath):
+ if not os.path.exists(svmlpath + "/README.md"):
+ raise RuntimeError("Missing `SVML` submodule! Run `git submodule "
+ "update --init` to fix this.")
+ return True
+
def pythonlib_dir():
"""return path where libpython* is."""
if sys.platform == 'win32':
@@ -455,6 +470,9 @@ def configuration(parent_package='',top_path=None):
# Inline check
inline = config_cmd.check_inline()
+ if can_link_svml():
+ moredefs.append(('NPY_CAN_LINK_SVML', 1))
+
# Use relaxed stride checking
if NPY_RELAXED_STRIDES_CHECKING:
moredefs.append(('NPY_RELAXED_STRIDES_CHECKING', 1))
@@ -727,6 +745,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'npy_import.h'),
join('src', 'common', 'npy_hashtable.h'),
join('src', 'common', 'npy_longdouble.h'),
+ join('src', 'common', 'npy_svml.h'),
join('src', 'common', 'templ_common.h.src'),
join('src', 'common', 'ucsnarrow.h'),
join('src', 'common', 'ufunc_override.h'),
@@ -923,6 +942,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'umath', 'loops_arithm_fp.dispatch.c.src'),
join('src', 'umath', 'loops_arithmetic.dispatch.c.src'),
join('src', 'umath', 'loops_trigonometric.dispatch.c.src'),
+ join('src', 'umath', 'loops_umath_fp.dispatch.c.src'),
join('src', 'umath', 'loops_exponent_log.dispatch.c.src'),
join('src', 'umath', 'matmul.h.src'),
join('src', 'umath', 'matmul.c.src'),
@@ -951,6 +971,11 @@ def configuration(parent_package='',top_path=None):
join(codegen_dir, 'generate_ufunc_api.py'),
]
+ svml_path = join('numpy', 'core', 'src', 'umath', 'svml')
+ svml_objs = []
+ if can_link_svml() and check_svml_submodule(svml_path):
+ svml_objs = glob.glob(svml_path + '/**/*.s', recursive=True)
+
config.add_extension('_multiarray_umath',
sources=multiarray_src + umath_src +
common_src +
@@ -965,6 +990,7 @@ def configuration(parent_package='',top_path=None):
depends=deps + multiarray_deps + umath_deps +
common_deps,
libraries=['npymath'],
+ extra_objects=svml_objs,
extra_info=extra_info)
#######################################################################
diff --git a/numpy/core/src/common/npy_svml.h b/numpy/core/src/common/npy_svml.h
new file mode 100644
index 000000000..4292f7090
--- /dev/null
+++ b/numpy/core/src/common/npy_svml.h
@@ -0,0 +1,41 @@
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+extern __m512 __svml_exp2f16(__m512 x);
+extern __m512 __svml_log2f16(__m512 x);
+extern __m512 __svml_log10f16(__m512 x);
+extern __m512 __svml_expm1f16(__m512 x);
+extern __m512 __svml_log1pf16(__m512 x);
+extern __m512 __svml_cbrtf16(__m512 x);
+extern __m512 __svml_sinf16(__m512 x);
+extern __m512 __svml_cosf16(__m512 x);
+extern __m512 __svml_tanf16(__m512 x);
+extern __m512 __svml_asinf16(__m512 x);
+extern __m512 __svml_acosf16(__m512 x);
+extern __m512 __svml_atanf16(__m512 x);
+extern __m512 __svml_atan2f16(__m512 x1, __m512 x2); /* binary: two vector operands */
+extern __m512 __svml_sinhf16(__m512 x);
+extern __m512 __svml_coshf16(__m512 x);
+extern __m512 __svml_tanhf16(__m512 x);
+extern __m512 __svml_asinhf16(__m512 x);
+extern __m512 __svml_acoshf16(__m512 x);
+extern __m512 __svml_atanhf16(__m512 x);
+
+extern __m512d __svml_exp28(__m512d x);
+extern __m512d __svml_log28(__m512d x);
+extern __m512d __svml_log108(__m512d x);
+extern __m512d __svml_expm18(__m512d x);
+extern __m512d __svml_log1p8(__m512d x);
+extern __m512d __svml_cbrt8(__m512d x);
+extern __m512d __svml_sin8(__m512d x);
+extern __m512d __svml_cos8(__m512d x);
+extern __m512d __svml_tan8(__m512d x);
+extern __m512d __svml_asin8(__m512d x);
+extern __m512d __svml_acos8(__m512d x);
+extern __m512d __svml_atan8(__m512d x);
+extern __m512d __svml_atan28(__m512d x1, __m512d x2); /* binary: two vector operands */
+extern __m512d __svml_sinh8(__m512d x);
+extern __m512d __svml_cosh8(__m512d x);
+extern __m512d __svml_tanh8(__m512d x);
+extern __m512d __svml_asinh8(__m512d x);
+extern __m512d __svml_acosh8(__m512d x);
+extern __m512d __svml_atanh8(__m512d x);
+#endif
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 02d749a5e..0938cd050 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -210,6 +210,32 @@ NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@kind@,
/**end repeat1**/
/**end repeat**/
+#ifndef NPY_DISABLE_OPTIMIZATION
+ #include "loops_umath_fp.dispatch.h"
+#endif
+
+/**begin repeat
+ * #TYPE = FLOAT, DOUBLE#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ */
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void @TYPE@_@func@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+
+NPY_CPU_DISPATCH_DECLARE(NPY_NO_EXPORT void DOUBLE_@func@,
+ (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)))
+
+/**end repeat**/
+
/**begin repeat
* #TYPE = FLOAT, DOUBLE#
*/
diff --git a/numpy/core/src/umath/loops_umath_fp.dispatch.c.src b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
new file mode 100644
index 000000000..852604655
--- /dev/null
+++ b/numpy/core/src/umath/loops_umath_fp.dispatch.c.src
@@ -0,0 +1,141 @@
+/*@targets
+ ** $maxopt baseline avx512_skx
+ */
+#include "numpy/npy_math.h"
+#include "simd/simd.h"
+#include "loops_utils.h"
+#include "loops.h"
+#include "npy_svml.h"
+#include "fast_loop_macros.h"
+
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+/**begin repeat
+ * #sfx = f32, f64#
+ * #func_suffix = f16, 8#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ * #default_val = 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0#
+ */
+static void
+simd_@func@_@sfx@(const npyv_lanetype_@sfx@ *src, npy_intp ssrc,
+ npyv_lanetype_@sfx@ *dst, npy_intp sdst, npy_intp len)
+{
+ const int vstep = npyv_nlanes_@sfx@;
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_@sfx@ x;
+ #if @default_val@
+ if (ssrc == 1) {
+ x = npyv_load_till_@sfx@(src, len, @default_val@);
+ } else {
+ x = npyv_loadn_till_@sfx@(src, ssrc, len, @default_val@);
+ }
+ #else
+ if (ssrc == 1) {
+ x = npyv_load_tillz_@sfx@(src, len);
+ } else {
+ x = npyv_loadn_tillz_@sfx@(src, ssrc, len);
+ }
+ #endif
+ npyv_@sfx@ out = __svml_@func@@func_suffix@(x);
+ if (sdst == 1) {
+ npyv_store_till_@sfx@(dst, len, out);
+ } else {
+ npyv_storen_till_@sfx@(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+static void
+simd_@func@_f64(const double *src, npy_intp ssrc,
+ double *dst, npy_intp sdst, npy_intp len)
+{
+ const int vstep = npyv_nlanes_f64;
+ for (; len > 0; len -= vstep, src += ssrc*vstep, dst += sdst*vstep) {
+ npyv_f64 x;
+ if (ssrc == 1) {
+ x = npyv_load_tillz_f64(src, len);
+ } else {
+ x = npyv_loadn_tillz_f64(src, ssrc, len);
+ }
+ npyv_f64 out = __svml_@func@8(x);
+ if (sdst == 1) {
+ npyv_store_till_f64(dst, len, out);
+ } else {
+ npyv_storen_till_f64(dst, sdst, len, out);
+ }
+ }
+ npyv_cleanup();
+}
+/**end repeat**/
+#endif
+
+/**begin repeat
+ * #TYPE = DOUBLE, FLOAT#
+ * #type = npy_double, npy_float#
+ * #vsub = , f#
+ * #sfx = f64, f32#
+ */
+/**begin repeat1
+ * #func = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, arcsin, arccos, arctan, sinh, cosh, arcsinh, arccosh, arctanh#
+ * #intrin = tanh, exp2, log2, log10, expm1, log1p, cbrt, tan, asin, acos, atan, sinh, cosh, asinh, acosh, atanh#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+ const @type@ *src = (@type@*)args[0];
+ @type@ *dst = (@type@*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ const npy_intp len = dimensions[0];
+ assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+ if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+ npyv_loadable_stride_@sfx@(ssrc) &&
+ npyv_storable_stride_@sfx@(sdst)) {
+ simd_@intrin@_@sfx@(src, ssrc, dst, sdst, len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const @type@ in1 = *(@type@ *)ip1;
+ *(@type@ *)op1 = npy_@intrin@@vsub@(in1);
+ }
+}
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
+ * #func = sin, cos#
+ */
+NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(DOUBLE_@func@)
+(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+#if NPY_SIMD && defined(NPY_HAVE_AVX512_SKX) && defined(NPY_CAN_LINK_SVML)
+ const double *src = (double*)args[0];
+ double *dst = (double*)args[1];
+ const int lsize = sizeof(src[0]);
+ const npy_intp ssrc = steps[0] / lsize;
+ const npy_intp sdst = steps[1] / lsize;
+ const npy_intp len = dimensions[0];
+ assert(steps[0] % lsize == 0 && steps[1] % lsize == 0);
+ if (!is_mem_overlap(src, steps[0], dst, steps[1], len) &&
+ npyv_loadable_stride_f64(ssrc) &&
+ npyv_storable_stride_f64(sdst)) {
+ simd_@func@_f64(src, ssrc, dst, sdst, len);
+ return;
+ }
+#endif
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_@func@(in1);
+ }
+}
+/**end repeat**/
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 654ab81cc..d47be9a30 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -116,9 +116,8 @@ run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_in
#endif
return 0;
}
-
-
/**end repeat1**/
+
/**end repeat**/
/**begin repeat
@@ -1152,6 +1151,7 @@ NPY_FINLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
* #is_finite = 0, 1, 0, 0#
* #is_signbit = 0, 0, 0, 1#
*/
+
#if defined HAVE_ATTRIBUTE_TARGET_AVX512_SKX_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
static NPY_INLINE NPY_GCC_TARGET_AVX512_SKX void
AVX512_SKX_@func@_@TYPE@(npy_bool* op, @type@* ip, const npy_intp array_size, const npy_intp steps)
diff --git a/numpy/core/src/umath/svml b/numpy/core/src/umath/svml
new file mode 160000
+Subproject 9f8af767ed6c75455d9a382af829048f8dd1806
diff --git a/numpy/polynomial/tests/test_classes.py b/numpy/polynomial/tests/test_classes.py
index 8e71a1945..6322062f2 100644
--- a/numpy/polynomial/tests/test_classes.py
+++ b/numpy/polynomial/tests/test_classes.py
@@ -597,4 +597,4 @@ class TestInterpolate:
for deg in range(0, 10):
for t in range(0, deg + 1):
p = Chebyshev.interpolate(powx, deg, domain=[0, 2], args=(t,))
- assert_almost_equal(p(x), powx(x, t), decimal=12)
+ assert_almost_equal(p(x), powx(x, t), decimal=11)