summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatti Picus <matti.picus@gmail.com>2020-07-14 10:36:23 +0300
committerGitHub <noreply@github.com>2020-07-14 10:36:23 +0300
commitc02d02024c0ea2195b87dba690c6c6b36bfd74a4 (patch)
treef9a0a90c23fb6d930f245cf8190a1affc120d424
parentaa935dd812f2475797d39742fe3539572c974e0e (diff)
parenta1ef8a3e462676238b9604d6c1acd1880c8262ac (diff)
downloadnumpy-c02d02024c0ea2195b87dba690c6c6b36bfd74a4.tar.gz
Merge pull request #16605 from xiegengxin/avx512-log-float64
MAINT: Avx512 intrinsics implementation for float64 input np.log
-rw-r--r--benchmarks/benchmarks/bench_avx.py13
-rw-r--r--numpy/core/code_generators/generate_umath.py1
-rw-r--r--numpy/core/src/umath/loops.c.src18
-rw-r--r--numpy/core/src/umath/loops.h.src6
-rw-r--r--numpy/core/src/umath/npy_simd_data.h154
-rw-r--r--numpy/core/src/umath/simd.inc.src239
-rw-r--r--numpy/core/tests/data/umath-validation-set-log153
-rw-r--r--numpy/core/tests/test_umath.py13
8 files changed, 594 insertions, 3 deletions
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
index ff105811d..82866c170 100644
--- a/benchmarks/benchmarks/bench_avx.py
+++ b/benchmarks/benchmarks/bench_avx.py
@@ -39,6 +39,19 @@ class AVX_UFunc(Benchmark):
def time_ufunc(self, ufuncname, stride, dtype):
self.f(self.arr[::stride])
+class AVX_UFunc_log(Benchmark):
+ params = [stride, dtype]
+ param_names = ['stride', 'dtype']
+ timeout = 10
+
+ def setup(self, stride, dtype):
+ np.seterr(all='ignore')
+ N = 10000
+ self.arr = np.array(np.random.random_sample(stride*N), dtype=dtype)
+
+ def time_log(self, stride, dtype):
+ np.log(self.arr[::stride])
+
avx_bfuncs = ['maximum',
'minimum']
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index 2ce2fdb55..4c8884dc5 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -726,6 +726,7 @@ defdict = {
None,
TD('e', f='log', astype={'e':'f'}),
TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]),
+ TD('d', simd=[('avx512f', 'd')]),
TD('fdg' + cmplx, f='log'),
TD(P, f='log'),
),
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 0cfa1cea7..d9591ab33 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1558,6 +1558,14 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
*(npy_double *)op1 = npy_exp(in1);
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+}
/**begin repeat
* #isa = avx512f, fma#
@@ -1700,6 +1708,16 @@ DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step
}
}
+NPY_NO_EXPORT NPY_GCC_OPT_3 void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data))
+{
+ if (!run_unary_avx512f_log_DOUBLE(args, dimensions, steps)) {
+ UNARY_LOOP {
+ const npy_double in1 = *(npy_double *)ip1;
+ *(npy_double *)op1 = npy_log(in1);
+ }
+ }
+}
/**begin repeat
* Float types
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 5dd49c465..6b8b77e99 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -202,6 +202,12 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void
NPY_NO_EXPORT void
DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+NPY_NO_EXPORT void
+DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+NPY_NO_EXPORT void
+DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
/**begin repeat
* #func = sin, cos, exp, log#
*/
diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h
index 36c8b6c03..45487d0a8 100644
--- a/numpy/core/src/umath/npy_simd_data.h
+++ b/numpy/core/src/umath/npy_simd_data.h
@@ -1,6 +1,7 @@
#ifndef __NPY_SIMD_DATA_H_
#define __NPY_SIMD_DATA_H_
#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
/*
* Constants used in vector implementation of float64 exp(x)
*/
@@ -85,6 +86,7 @@ static npy_uint64 EXP_Table_tail[32] = {
0x3C99D3E12DD8A18B,
};
#endif
+#endif
/*
* Constants used in vector implementation of exp(x)
@@ -134,4 +136,156 @@ static npy_uint64 EXP_Table_tail[32] = {
#define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f
#define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f
+/*
+ * Lookup table of log(c_k)
+ * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the
+ * logarithm function in IEEE floating-point arithmetic." ACM Transactions
+ * on Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static npy_uint64 LOG_TABLE_TOP[64] = {
+ 0x0000000000000000,
+ 0x3F8FC0A8B1000000,
+ 0x3F9F829B0E780000,
+ 0x3FA77458F6340000,
+ 0x3FAF0A30C0100000,
+ 0x3FB341D7961C0000,
+ 0x3FB6F0D28AE60000,
+ 0x3FBA926D3A4A0000,
+ 0x3FBE27076E2A0000,
+ 0x3FC0D77E7CD10000,
+ 0x3FC29552F8200000,
+ 0x3FC44D2B6CCB0000,
+ 0x3FC5FF3070A80000,
+ 0x3FC7AB8902110000,
+ 0x3FC9525A9CF40000,
+ 0x3FCAF3C94E810000,
+ 0x3FCC8FF7C79B0000,
+ 0x3FCE27076E2B0000,
+ 0x3FCFB9186D5E0000,
+ 0x3FD0A324E2738000,
+ 0x3FD1675CABAB8000,
+ 0x3FD22941FBCF8000,
+ 0x3FD2E8E2BAE10000,
+ 0x3FD3A64C55698000,
+ 0x3FD4618BC21C8000,
+ 0x3FD51AAD872E0000,
+ 0x3FD5D1BDBF580000,
+ 0x3FD686C81E9B0000,
+ 0x3FD739D7F6BC0000,
+ 0x3FD7EAF83B828000,
+ 0x3FD89A3386C18000,
+ 0x3FD947941C210000,
+ 0x3FD9F323ECBF8000,
+ 0x3FDA9CEC9A9A0000,
+ 0x3FDB44F77BCC8000,
+ 0x3FDBEB4D9DA70000,
+ 0x3FDC8FF7C79A8000,
+ 0x3FDD32FE7E010000,
+ 0x3FDDD46A04C20000,
+ 0x3FDE744261D68000,
+ 0x3FDF128F5FAF0000,
+ 0x3FDFAF588F790000,
+ 0x3FE02552A5A5C000,
+ 0x3FE0723E5C1CC000,
+ 0x3FE0BE72E4254000,
+ 0x3FE109F39E2D4000,
+ 0x3FE154C3D2F4C000,
+ 0x3FE19EE6B467C000,
+ 0x3FE1E85F5E704000,
+ 0x3FE23130D7BEC000,
+ 0x3FE2795E1289C000,
+ 0x3FE2C0E9ED448000,
+ 0x3FE307D7334F0000,
+ 0x3FE34E289D9D0000,
+ 0x3FE393E0D3564000,
+ 0x3FE3D9026A714000,
+ 0x3FE41D8FE8468000,
+ 0x3FE4618BC21C4000,
+ 0x3FE4A4F85DB04000,
+ 0x3FE4E7D811B74000,
+ 0x3FE52A2D265BC000,
+ 0x3FE56BF9D5B40000,
+ 0x3FE5AD404C358000,
+ 0x3FE5EE02A9240000,
+};
+
+static npy_uint64 LOG_TABLE_TAIL[64] = {
+ 0x0000000000000000,
+ 0xBD5FE0E183092C59,
+ 0x3D2980267C7E09E4,
+ 0xBD62303B9CB0D5E1,
+ 0x3D662A6617CC9717,
+ 0xBD4717B6B33E44F8,
+ 0xBD62968C836CC8C2,
+ 0x3D6AAC6CA17A4554,
+ 0x3D6E5CBD3D50FFFC,
+ 0xBD6C69A65A23A170,
+ 0xBD35B967F4471DFC,
+ 0x3D6F4799F4F6543E,
+ 0xBD6B0B0DE3077D7E,
+ 0xBD537B720E4A694B,
+ 0x3D65AD1D904C1D4E,
+ 0xBD600349CC67F9B2,
+ 0xBD697794F689F843,
+ 0xBD3A342C2AF0003C,
+ 0x3D5F1546AAA3361C,
+ 0x3D50E35F73F7A018,
+ 0x3D630701CE63EAB9,
+ 0xBD3A6976F5EB0963,
+ 0x3D5D309C2CC91A85,
+ 0xBD6D0B1C68651946,
+ 0xBD609EC17A426426,
+ 0xBD3F4BD8DB0A7CC1,
+ 0x3D4394A11B1C1EE4,
+ 0x3D54AEC442BE1015,
+ 0xBD67FCB18ED9D603,
+ 0x3D67E1B259D2F3DA,
+ 0xBD6ED2A52C73BF78,
+ 0x3D56FABA4CDD147D,
+ 0x3D584BF2B68D766F,
+ 0x3D40931A909FEA5E,
+ 0x3D4EC5197DDB55D3,
+ 0x3D5B7BF7861D37AC,
+ 0x3D5A21AC25DB1EF3,
+ 0xBD542A9E21373414,
+ 0xBD6DAFA08CECADB1,
+ 0x3D3E1F8DF68DBCF3,
+ 0x3D3BB2CD720EC44C,
+ 0xBD49C24CA098362B,
+ 0x3D60FEC69C695D7F,
+ 0x3D6F404E57963891,
+ 0xBD657D49676844CC,
+ 0x3D592DFBC7D93617,
+ 0x3D65E9A98F33A396,
+ 0x3D52DD98B97BAEF0,
+ 0x3D1A07BD8B34BE7C,
+ 0xBD17AFA4392F1BA7,
+ 0xBD5DCA290F818480,
+ 0x3D5D1772F5386374,
+ 0x3D60BE1FB590A1F5,
+ 0xBD6E2CE9146D271A,
+ 0xBD65E6563BBD9FC9,
+ 0x3D66FAA404263D0B,
+ 0xBD5AA33736867A17,
+ 0x3D6EC27D0B7B37B3,
+ 0xBD244FDD840B8591,
+ 0x3D6BB09CB0985646,
+ 0x3D46ABB9DF22BC57,
+ 0xBD58CD7DC73BD194,
+ 0x3D6F2CFB29AAA5F0,
+ 0x3D66757006095FD2,
+};
+
+#define NPY_TANG_LOG_A1 0x1.55555555554e6p-4
+#define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7
+#define NPY_TANG_LOG_A3 0x1.2492307f1519fp-9
+#define NPY_TANG_LOG_A4 0x1.c8034c85dfffp-12
+
+#define NPY_TANG_LOG_LN2HI 0x1.62e42fefa4p-1
+#define NPY_TANG_LOG_LN2LO -0x1.8432a1b0e2634p-43
+#endif
+#endif
+
#endif
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 8f01d33fa..2df1b7225 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -480,6 +480,26 @@ run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c
return 0;
}
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE void
+AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride);
+#endif
+static NPY_INLINE int
+run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+ if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), 64)) {
+ AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]);
+ return 1;
+ }
+ else
+ return 0;
+#endif
+#endif
+ return 0;
+}
+
/**begin repeat
* Float types
* #type = npy_float, npy_double, npy_longdouble#
@@ -1795,14 +1815,27 @@ avx512_permute_x4var_pd(__m512d t0,
__m512d t3,
__m512i index)
{
-
- __mmask8 lut_mask = _mm512_cmp_epi64_mask(index, _mm512_set1_epi64(15),
- _MM_CMPINT_GT);
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
__m512d res1 = _mm512_permutex2var_pd(t0, index, t1);
__m512d res2 = _mm512_permutex2var_pd(t2, index, t3);
return _mm512_mask_blend_pd(lut_mask, res1, res2);
}
+static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d
+avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3,
+ __m512d t4, __m512d t5, __m512d t6, __m512d t7,
+ __m512i index)
+{
+ __mmask8 lut_mask = _mm512_cmp_epi64_mask(
+ _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index),
+ _mm512_set1_epi64(0), _MM_CMPINT_GT);
+ __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index);
+ __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index);
+ return _mm512_mask_blend_pd(lut_mask, res1, res2);
+}
+
/**begin repeat
* #vsub = ps, pd#
* #type= npy_float, npy_double#
@@ -3145,6 +3178,206 @@ AVX512F_exp_DOUBLE(npy_double * op,
#endif
#endif
+/*
+ * Vectorized implementation of log double using AVX512
+ * Reference:
+ * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions
+ * and their error analysis. No. CONF-9106103-1. Argonne National Lab.,
+ * IL (USA), 1991.
+ * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm
+ * function in IEEE floating-point arithmetic." ACM Transactions on
+ * Mathematical Software (TOMS) 16.4 (1990): 378-400.
+ * [3] Muller, Jean-Michel. "Elementary functions: algorithms and
+ * implementation." (2016).
+ * 1) if x = 0; return -INF
+ * 2) if x < 0; return NAN
+ * 3) if x is INF; return INF
+ * 4) if x is NAN; return NAN
+ * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log()
+ * 6) Range reduction:
+ * log(x) = log(2^m * z)
+ * = mln2 + log(z)
+ * 7) log(z) = log(z / c_k) + log(c_k);
+ * where c_k = 1 + k/64, k = 0,1,...,64
+ * s.t. |x - c_k| <= 1/128 when x on[1,2].
+ * 8) r = 2(x - c_k)/(x + c_k)
+ * log(x/c_k) = log((1 + r/2) / (1 - r/2))
+ * = p(r)
+ * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...)
+ */
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS
+#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1)))
+static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void
+AVX512F_log_DOUBLE(npy_double * op,
+ npy_double * ip,
+ const npy_intp array_size,
+ const npy_intp steps)
+{
+ npy_intp num_remaining_elements = array_size;
+ const npy_intp stride = steps / (npy_intp)sizeof(npy_double);
+ const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double);
+ npy_int32 indexarr[8];
+ for (npy_int32 ii = 0; ii < 8; ii++) {
+ indexarr[ii] = ii*stride;
+ }
+
+ __m512d zeros_d = _mm512_set1_pd(0.0f);
+ __m512d ones_d = _mm512_set1_pd(1.0f);
+ __m512d mInf = _mm512_set1_pd(NPY_INFINITY);
+ __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000));
+ __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN);
+ __m512d mNan = _mm512_set1_pd(NPY_NAN);
+ __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY);
+ __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1);
+ __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2);
+ __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3);
+ __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4);
+ __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI);
+ __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO);
+
+ __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4);
+ __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4);
+ __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]);
+
+ /* Load lookup table data */
+ /**begin repeat
+ * #i = 0, 1, 2, 3, 4, 5, 6, 7#
+ */
+
+ __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@]));
+ __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@]));
+
+ /**end repeat**/
+
+ __mmask8 load_mask = avx512_get_full_load_mask_pd();
+ __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes);
+ __mmask8 divide_by_zero_mask = invalid_mask;
+
+ __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask,
+ glibc_mask;
+
+ __m512d x;
+ while (num_remaining_elements > 0) {
+ if (num_remaining_elements < num_lanes) {
+ load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements,
+ num_lanes);
+ }
+
+ if (1 == stride) {
+ x = avx512_masked_load_pd(load_mask, ip);
+ }
+ else {
+ x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask);
+ }
+
+ /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */
+ __mmask8 m1 = _mm512_cmp_pd_mask(x, mTo_glibc_max, _CMP_LT_OQ);
+ __mmask8 m2 = _mm512_cmp_pd_mask(x, mTo_glibc_min, _CMP_GT_OQ);
+ glibc_mask = m1 & m2;
+
+ if (glibc_mask != 0xFF) {
+ zero_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_EQ_OQ);
+ inf_mask = _mm512_cmp_pd_mask(x, mInf, _CMP_EQ_OQ);
+ negx_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_LT_OQ);
+ nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ);
+
+ divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask);
+ invalid_mask = invalid_mask | negx_mask;
+
+ x = avx512_set_masked_lanes_pd(x, zeros_d, negx_mask);
+ __m512i ix = (__m512i)x;
+
+ /* Normalize x when it is denormal */
+ __m512i top12 = _mm512_and_epi64(ix,
+ _mm512_set1_epi64(0xfff0000000000000));
+ denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0),
+ _CMP_EQ_OQ);
+ denormal_mask = (~zero_mask) & denormal_mask;
+ ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask,
+ x, _mm512_set1_pd(0x1p52));
+ ix = _mm512_mask_sub_epi64(ix, denormal_mask,
+ ix, _mm512_set1_epi64(52ULL << 52));
+
+ /*
+ * x = 2^k * z; where z in range [1,2]
+ */
+ __m512i tmp = _mm512_sub_epi64(ix,
+ _mm512_set1_epi64(0x3ff0000000000000));
+ __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6),
+ _mm512_set1_epi64(0x3fULL));
+ __m512i ik = _mm512_srai_epi64(tmp, 52);
+ __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp,
+ _mm512_set1_epi64(0xfff0000000000000))));
+ /* c = i/64 + 1 */
+ __m256i i_32 = _mm512_cvtepi64_epi32(i);
+ __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d);
+
+ /* u = 2 * (z - c) / (z + c) */
+ __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c));
+ u = _mm512_mul_pd(_mm512_set1_pd(2.0), u);
+
+ /* v = u * u */
+ __m512d v = _mm512_mul_pd(u,u);
+
+ /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */
+ __m512d res = _mm512_fmadd_pd(v, mA4, mA3);
+ res = _mm512_fmadd_pd(v, res, mA2);
+ res = _mm512_fmadd_pd(v, res, mA1);
+ res = _mm512_mul_pd(v, res);
+ res = _mm512_fmadd_pd(u, res, u);
+
+ /* Load lookup table data */
+ __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1,
+ mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5,
+ mLUT_TOP_6, mLUT_TOP_7, i);
+ __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1,
+ mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5,
+ mLUT_TAIL_6, mLUT_TAIL_7, i);
+
+ /*
+ * log(x) = k * ln2_hi + c_hi +
+ * k * ln2_lo + c_lo +
+ * log(z/c)
+ */
+ __m256i ik_32 = _mm512_cvtepi64_epi32(ik);
+ __m512d k = _mm512_cvtepi32_pd(ik_32);
+ __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi);
+ __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo);
+ tt = _mm512_add_pd(tt, tt2);
+ res = _mm512_add_pd(tt, res);
+
+ /* return special cases */
+ res = avx512_set_masked_lanes_pd(res, mNan, nan_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask);
+ res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask);
+ res = avx512_set_masked_lanes_pd(res, mInf, inf_mask);
+
+ _mm512_mask_storeu_pd(op, load_mask, res);
+ }
+
+ /* call glibc's log func when x around 1.0f */
+ for (int ii = 0, jj = 0; glibc_mask != 0; ii++, jj += stride) {
+ if (glibc_mask & 0x01) {
+ op[ii] = npy_log(ip[jj]);
+ }
+ glibc_mask = glibc_mask >> 1;
+ }
+
+ ip += num_lanes * stride;
+ op += num_lanes;
+ num_remaining_elements -= num_lanes;
+ }
+
+ if (invalid_mask) {
+ npy_set_floatstatus_invalid();
+ }
+ if (divide_by_zero_mask) {
+ npy_set_floatstatus_divbyzero();
+ }
+}
+#endif
+#endif
+
/**begin repeat
* #TYPE = CFLOAT, CDOUBLE#
* #type = npy_float, npy_double#
diff --git a/numpy/core/tests/data/umath-validation-set-log b/numpy/core/tests/data/umath-validation-set-log
index a7bd98481..b8f6b0875 100644
--- a/numpy/core/tests/data/umath-validation-set-log
+++ b/numpy/core/tests/data/umath-validation-set-log
@@ -116,3 +116,156 @@ np.float32,0x3f494ab1,0xbe763131,4
np.float32,0x3f476b69,0xbe7fc2c6,4
np.float32,0x3f4884e8,0xbe7a214a,4
np.float32,0x3f486945,0xbe7aae76,4
+#float64
+## +ve denormal ##
+np.float64,0x0000000000000001,0xc0874385446d71c3,1
+np.float64,0x0001000000000000,0xc086395a2079b70c,1
+np.float64,0x000fffffffffffff,0xc086232bdd7abcd2,1
+np.float64,0x0007ad63e2168cb6,0xc086290bc0b2980f,1
+## -ve denormal ##
+np.float64,0x8000000000000001,0xfff8000000000001,1
+np.float64,0x8001000000000000,0xfff8000000000001,1
+np.float64,0x800fffffffffffff,0xfff8000000000001,1
+np.float64,0x8007ad63e2168cb6,0xfff8000000000001,1
+## +/-0.0f, MAX, MIN##
+np.float64,0x0000000000000000,0xfff0000000000000,1
+np.float64,0x8000000000000000,0xfff0000000000000,1
+np.float64,0x7fefffffffffffff,0x40862e42fefa39ef,1
+np.float64,0xffefffffffffffff,0xfff8000000000001,1
+## near 1.0f ##
+np.float64,0x3ff0000000000000,0x0000000000000000,1
+np.float64,0x3fe8000000000000,0xbfd269621134db92,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3ff0000020000000,0x3e7fffffe000002b,1
+np.float64,0x3ff0000000000001,0x3cafffffffffffff,1
+np.float64,0x3fefffffe0000000,0xbe70000008000005,1
+np.float64,0x3fefffffffffffff,0xbca0000000000000,1
+## random numbers ##
+np.float64,0x02500186f3d9da56,0xc0855b8abf135773,1
+np.float64,0x09200815a3951173,0xc082ff1ad7131bdc,1
+np.float64,0x0da029623b0243d4,0xc0816fc994695bb5,1
+np.float64,0x48703b8ac483a382,0x40579213a313490b,1
+np.float64,0x09207b74c87c9860,0xc082fee20ff349ef,1
+np.float64,0x62c077698e8df947,0x407821c996d110f0,1
+np.float64,0x2350b45e87c3cfb0,0xc073d6b16b51d072,1
+np.float64,0x3990a23f9ff2b623,0xc051aa60eadd8c61,1
+np.float64,0x0d011386a116c348,0xc081a6cc7ea3b8fb,1
+np.float64,0x1fe0f0303ebe273a,0xc0763870b78a81ca,1
+np.float64,0x0cd1260121d387da,0xc081b7668d61a9d1,1
+np.float64,0x1e6135a8f581d422,0xc077425ac10f08c2,1
+np.float64,0x622168db5fe52d30,0x4077b3c669b9fadb,1
+np.float64,0x69f188e1ec6d1718,0x407d1e2f18c63889,1
+np.float64,0x3aa1bf1d9c4dd1a3,0xc04d682e24bde479,1
+np.float64,0x6c81c4011ce4f683,0x407ee5190e8a8e6a,1
+np.float64,0x2191fa55aa5a5095,0xc0750c0c318b5e2d,1
+np.float64,0x32a1f602a32bf360,0xc06270caa493fc17,1
+np.float64,0x16023c90ba93249b,0xc07d0f88e0801638,1
+np.float64,0x1c525fe6d71fa9ff,0xc078af49c66a5d63,1
+np.float64,0x1a927675815d65b7,0xc079e5bdd7fe376e,1
+np.float64,0x41227b8fe70da028,0x402aa0c9f9a84c71,1
+np.float64,0x4962bb6e853fe87d,0x405a34aa04c83747,1
+np.float64,0x23d2cda00b26b5a4,0xc0737c13a06d00ea,1
+np.float64,0x2d13083fd62987fa,0xc06a25055aeb474e,1
+np.float64,0x10e31e4c9b4579a1,0xc0804e181929418e,1
+np.float64,0x26d3247d556a86a9,0xc0716774171da7e8,1
+np.float64,0x6603379398d0d4ac,0x407a64f51f8a887b,1
+np.float64,0x02d38af17d9442ba,0xc0852d955ac9dd68,1
+np.float64,0x6a2382b4818dd967,0x407d4129d688e5d4,1
+np.float64,0x2ee3c403c79b3934,0xc067a091fefaf8b6,1
+np.float64,0x6493a699acdbf1a4,0x4079663c8602bfc5,1
+np.float64,0x1c8413c4f0de3100,0xc0788c99697059b6,1
+np.float64,0x4573f1ed350d9622,0x404e9bd1e4c08920,1
+np.float64,0x2f34265c9200b69c,0xc067310cfea4e986,1
+np.float64,0x19b43e65fa22029b,0xc07a7f8877de22d6,1
+np.float64,0x0af48ab7925ed6bc,0xc0825c4fbc0e5ade,1
+np.float64,0x4fa49699cad82542,0x4065c76d2a318235,1
+np.float64,0x7204a15e56ade492,0x40815bb87484dffb,1
+np.float64,0x4734aa08a230982d,0x40542a4bf7a361a9,1
+np.float64,0x1ae4ed296c2fd749,0xc079ac4921f20abb,1
+np.float64,0x472514ea4370289c,0x4053ff372bd8f18f,1
+np.float64,0x53a54b3f73820430,0x406b5411fc5f2e33,1
+np.float64,0x64754de5a15684fa,0x407951592e99a5ab,1
+np.float64,0x69358e279868a7c3,0x407c9c671a882c31,1
+np.float64,0x284579ec61215945,0xc0706688e55f0927,1
+np.float64,0x68b5c58806447adc,0x407c43d6f4eff760,1
+np.float64,0x1945a83f98b0e65d,0xc07acc15eeb032cc,1
+np.float64,0x0fc5eb98a16578bf,0xc080b0d02eddca0e,1
+np.float64,0x6a75e208f5784250,0x407d7a7383bf8f05,1
+np.float64,0x0fe63a029c47645d,0xc080a59ca1e98866,1
+np.float64,0x37963ac53f065510,0xc057236281f7bdb6,1
+np.float64,0x135661bb07067ff7,0xc07ee924930c21e4,1
+np.float64,0x4b4699469d458422,0x405f73843756e887,1
+np.float64,0x1a66d73e4bf4881b,0xc07a039ba1c63adf,1
+np.float64,0x12a6b9b119a7da59,0xc07f62e49c6431f3,1
+np.float64,0x24c719aa8fd1bdb5,0xc072d26da4bf84d3,1
+np.float64,0x0fa6ff524ffef314,0xc080bb8514662e77,1
+np.float64,0x1db751d66fdd4a9a,0xc077b77cb50d7c92,1
+np.float64,0x4947374c516da82c,0x4059e9acfc7105bf,1
+np.float64,0x1b1771ab98f3afc8,0xc07989326b8e1f66,1
+np.float64,0x25e78805baac8070,0xc0720a818e6ef080,1
+np.float64,0x4bd7a148225d3687,0x406082d004ea3ee7,1
+np.float64,0x53d7d6b2bbbda00a,0x406b9a398967cbd5,1
+np.float64,0x6997fb9f4e1c685f,0x407ce0a703413eba,1
+np.float64,0x069802c2ff71b951,0xc083df39bf7acddc,1
+np.float64,0x4d683ac9890f66d8,0x4062ae21d8c2acf0,1
+np.float64,0x5a2825863ec14f4c,0x40722d718d549552,1
+np.float64,0x0398799a88f4db80,0xc084e93dab8e2158,1
+np.float64,0x5ed87a8b77e135a5,0x40756d7051777b33,1
+np.float64,0x5828cd6d79b9bede,0x4070cafb22fc6ca1,1
+np.float64,0x7b18ba2a5ec6f068,0x408481386b3ed6fe,1
+np.float64,0x4938fd60922198fe,0x4059c206b762ea7e,1
+np.float64,0x31b8f44fcdd1a46e,0xc063b2faa8b6434e,1
+np.float64,0x5729341c0d918464,0x407019cac0c4a7d7,1
+np.float64,0x13595e9228ee878e,0xc07ee7235a7d8088,1
+np.float64,0x17698b0dc9dd4135,0xc07c1627e3a5ad5f,1
+np.float64,0x63b977c283abb0cc,0x4078cf1ec6ed65be,1
+np.float64,0x7349cc0d4dc16943,0x4081cc697ce4cb53,1
+np.float64,0x4e49a80b732fb28d,0x4063e67e3c5cbe90,1
+np.float64,0x07ba14b848a8ae02,0xc0837ac032a094e0,1
+np.float64,0x3da9f17b691bfddc,0xc03929c25366acda,1
+np.float64,0x02ea39aa6c3ac007,0xc08525af6f21e1c4,1
+np.float64,0x3a6a42f04ed9563d,0xc04e98e825dca46b,1
+np.float64,0x1afa877cd7900be7,0xc0799d6648cb34a9,1
+np.float64,0x58ea986649e052c6,0x4071512e939ad790,1
+np.float64,0x691abbc04647f536,0x407c89aaae0fcb83,1
+np.float64,0x43aabc5063e6f284,0x4044b45d18106fd2,1
+np.float64,0x488b003c893e0bea,0x4057df012a2dafbe,1
+np.float64,0x77eb076ed67caee5,0x40836720de94769e,1
+np.float64,0x5c1b46974aba46f4,0x40738731ba256007,1
+np.float64,0x1a5b29ecb5d3c261,0xc07a0becc77040d6,1
+np.float64,0x5d8b6ccf868c6032,0x4074865c1865e2db,1
+np.float64,0x4cfb6690b4aaf5af,0x406216cd8c7e8ddb,1
+np.float64,0x76cbd8eb5c5fc39e,0x4083038dc66d682b,1
+np.float64,0x28bbd1fec5012814,0xc07014c2dd1b9711,1
+np.float64,0x33dc1b3a4fd6bf7a,0xc060bd0756e07d8a,1
+np.float64,0x52bbe89b37de99f3,0x406a10041aa7d343,1
+np.float64,0x07bc479d15eb2dd3,0xc0837a1a6e3a3b61,1
+np.float64,0x18fc5275711a901d,0xc07aff3e9d62bc93,1
+np.float64,0x114c9758e247dc71,0xc080299a7cf15b05,1
+np.float64,0x25ac8f6d60755148,0xc07233c4c0c511d4,1
+np.float64,0x260cae2bb9e9fd7e,0xc071f128c7e82eac,1
+np.float64,0x572ccdfe0241de82,0x40701bedc84bb504,1
+np.float64,0x0ddcef6c8d41f5ee,0xc0815a7e16d07084,1
+np.float64,0x6dad1d59c988af68,0x407fb4a0bc0142b1,1
+np.float64,0x025d200580d8b6d1,0xc08556c0bc32b1b2,1
+np.float64,0x7aad344b6aa74c18,0x40845bbc453f22be,1
+np.float64,0x5b5d9d6ad9d14429,0x4073036d2d21f382,1
+np.float64,0x49cd8d8dcdf19954,0x405b5c034f5c7353,1
+np.float64,0x63edb9483335c1e6,0x4078f2dd21378786,1
+np.float64,0x7b1dd64c9d2c26bd,0x408482b922017bc9,1
+np.float64,0x782e13e0b574be5f,0x40837e2a0090a5ad,1
+np.float64,0x592dfe18b9d6db2f,0x40717f777fbcb1ec,1
+np.float64,0x654e3232ac60d72c,0x4079e71a95a70446,1
+np.float64,0x7b8e42ad22091456,0x4084a9a6f1e61722,1
+np.float64,0x570e88dfd5860ae6,0x407006ae6c0d137a,1
+np.float64,0x294e98346cb98ef1,0xc06f5edaac12bd44,1
+np.float64,0x1adeaa4ab792e642,0xc079b1431d5e2633,1
+np.float64,0x7b6ead3377529ac8,0x40849eabc8c7683c,1
+np.float64,0x2b8eedae8a9b2928,0xc06c400054deef11,1
+np.float64,0x65defb45b2dcf660,0x407a4b53f181c05a,1
+np.float64,0x1baf582d475e7701,0xc07920bcad4a502c,1
+np.float64,0x461f39cf05a0f15a,0x405126368f984fa1,1
+np.float64,0x7e5f6f5dcfff005b,0x4085a37d610439b4,1
+np.float64,0x136f66e4d09bd662,0xc07ed8a2719f2511,1
+np.float64,0x65afd8983fb6ca1f,0x407a2a7f48bf7fc1,1
+np.float64,0x572fa7f95ed22319,0x40701d706cf82e6f,1
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 941d99521..b4a129502 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -657,6 +657,19 @@ class TestLog:
yf = np.array(y, dtype=dt)*log2_
assert_almost_equal(np.log(xf), yf)
+ def test_log_strides(self):
+ np.random.seed(42)
+ strides = np.array([-4,-3,-2,-1,1,2,3,4])
+ sizes = np.arange(2,100)
+ for ii in sizes:
+ x_f64 = np.float64(np.random.uniform(low=0.01, high=100.0,size=ii))
+ x_special = x_f64.copy()
+ x_special[3:-1:4] = 1.0
+ y_true = np.log(x_f64)
+ y_special = np.log(x_special)
+ for jj in strides:
+ assert_array_almost_equal_nulp(np.log(x_f64[::jj]), y_true[::jj], nulp=2)
+ assert_array_almost_equal_nulp(np.log(x_special[::jj]), y_special[::jj], nulp=2)
class TestExp:
def test_exp_values(self):