diff options
author | Matti Picus <matti.picus@gmail.com> | 2020-07-14 10:36:23 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-14 10:36:23 +0300 |
commit | c02d02024c0ea2195b87dba690c6c6b36bfd74a4 (patch) | |
tree | f9a0a90c23fb6d930f245cf8190a1affc120d424 | |
parent | aa935dd812f2475797d39742fe3539572c974e0e (diff) | |
parent | a1ef8a3e462676238b9604d6c1acd1880c8262ac (diff) | |
download | numpy-c02d02024c0ea2195b87dba690c6c6b36bfd74a4.tar.gz |
Merge pull request #16605 from xiegengxin/avx512-log-float64
MAINT: Avx512 intrinsics implementation for float64 input np.log
-rw-r--r-- | benchmarks/benchmarks/bench_avx.py | 13 | ||||
-rw-r--r-- | numpy/core/code_generators/generate_umath.py | 1 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.c.src | 18 | ||||
-rw-r--r-- | numpy/core/src/umath/loops.h.src | 6 | ||||
-rw-r--r-- | numpy/core/src/umath/npy_simd_data.h | 154 | ||||
-rw-r--r-- | numpy/core/src/umath/simd.inc.src | 239 | ||||
-rw-r--r-- | numpy/core/tests/data/umath-validation-set-log | 153 | ||||
-rw-r--r-- | numpy/core/tests/test_umath.py | 13 |
8 files changed, 594 insertions, 3 deletions
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py index ff105811d..82866c170 100644 --- a/benchmarks/benchmarks/bench_avx.py +++ b/benchmarks/benchmarks/bench_avx.py @@ -39,6 +39,19 @@ class AVX_UFunc(Benchmark): def time_ufunc(self, ufuncname, stride, dtype): self.f(self.arr[::stride]) +class AVX_UFunc_log(Benchmark): + params = [stride, dtype] + param_names = ['stride', 'dtype'] + timeout = 10 + + def setup(self, stride, dtype): + np.seterr(all='ignore') + N = 10000 + self.arr = np.array(np.random.random_sample(stride*N), dtype=dtype) + + def time_log(self, stride, dtype): + np.log(self.arr[::stride]) + avx_bfuncs = ['maximum', 'minimum'] diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py index 2ce2fdb55..4c8884dc5 100644 --- a/numpy/core/code_generators/generate_umath.py +++ b/numpy/core/code_generators/generate_umath.py @@ -726,6 +726,7 @@ defdict = { None, TD('e', f='log', astype={'e':'f'}), TD('f', simd=[('fma', 'f'), ('avx512f', 'f')]), + TD('d', simd=[('avx512f', 'd')]), TD('fdg' + cmplx, f='log'), TD(P, f='log'), ), diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src index 0cfa1cea7..d9591ab33 100644 --- a/numpy/core/src/umath/loops.c.src +++ b/numpy/core/src/umath/loops.c.src @@ -1558,6 +1558,14 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void *(npy_double *)op1 = npy_exp(in1); } } +NPY_NO_EXPORT NPY_GCC_OPT_3 void +DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_log(in1); + } +} /**begin repeat * #isa = avx512f, fma# @@ -1700,6 +1708,16 @@ DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *step } } +NPY_NO_EXPORT NPY_GCC_OPT_3 void +DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(data)) +{ + if (!run_unary_avx512f_log_DOUBLE(args, dimensions, steps)) { + UNARY_LOOP { + const npy_double in1 = *(npy_double *)ip1; + *(npy_double *)op1 = npy_log(in1); + } + } +} /**begin repeat * Float types diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src index 5dd49c465..6b8b77e99 100644 --- a/numpy/core/src/umath/loops.h.src +++ b/numpy/core/src/umath/loops.h.src @@ -202,6 +202,12 @@ DOUBLE_exp(char **args, npy_intp const *dimensions, npy_intp const *steps, void NPY_NO_EXPORT void DOUBLE_exp_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); +NPY_NO_EXPORT void +DOUBLE_log(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + +NPY_NO_EXPORT void +DOUBLE_log_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)); + /**begin repeat * #func = sin, cos, exp, log# */ diff --git a/numpy/core/src/umath/npy_simd_data.h b/numpy/core/src/umath/npy_simd_data.h index 36c8b6c03..45487d0a8 100644 --- a/numpy/core/src/umath/npy_simd_data.h +++ b/numpy/core/src/umath/npy_simd_data.h @@ -1,6 +1,7 @@ #ifndef __NPY_SIMD_DATA_H_ #define __NPY_SIMD_DATA_H_ #if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) /* * Constants used in vector implementation of float64 exp(x) */ @@ -85,6 +86,7 @@ static npy_uint64 EXP_Table_tail[32] = { 0x3C99D3E12DD8A18B, }; #endif +#endif /* * Constants used in vector implementation of exp(x) @@ -134,4 +136,156 @@ static npy_uint64 EXP_Table_tail[32] = { #define NPY_COEFF_INVF7_SINEf -0x1.a06bbap-13f #define NPY_COEFF_INVF9_SINEf 0x1.7d3bbcp-19f +/* + * Lookup table of log(c_k) + * Reference form: Tang, Ping-Tak Peter. "Table-driven implementation of the + * logarithm function in IEEE floating-point arithmetic." ACM Transactions + * on Mathematical Software (TOMS) 16.4 (1990): 378-400. + */ +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) +static npy_uint64 LOG_TABLE_TOP[64] = { + 0x0000000000000000, + 0x3F8FC0A8B1000000, + 0x3F9F829B0E780000, + 0x3FA77458F6340000, + 0x3FAF0A30C0100000, + 0x3FB341D7961C0000, + 0x3FB6F0D28AE60000, + 0x3FBA926D3A4A0000, + 0x3FBE27076E2A0000, + 0x3FC0D77E7CD10000, + 0x3FC29552F8200000, + 0x3FC44D2B6CCB0000, + 0x3FC5FF3070A80000, + 0x3FC7AB8902110000, + 0x3FC9525A9CF40000, + 0x3FCAF3C94E810000, + 0x3FCC8FF7C79B0000, + 0x3FCE27076E2B0000, + 0x3FCFB9186D5E0000, + 0x3FD0A324E2738000, + 0x3FD1675CABAB8000, + 0x3FD22941FBCF8000, + 0x3FD2E8E2BAE10000, + 0x3FD3A64C55698000, + 0x3FD4618BC21C8000, + 0x3FD51AAD872E0000, + 0x3FD5D1BDBF580000, + 0x3FD686C81E9B0000, + 0x3FD739D7F6BC0000, + 0x3FD7EAF83B828000, + 0x3FD89A3386C18000, + 0x3FD947941C210000, + 0x3FD9F323ECBF8000, + 0x3FDA9CEC9A9A0000, + 0x3FDB44F77BCC8000, + 0x3FDBEB4D9DA70000, + 0x3FDC8FF7C79A8000, + 0x3FDD32FE7E010000, + 0x3FDDD46A04C20000, + 0x3FDE744261D68000, + 0x3FDF128F5FAF0000, + 0x3FDFAF588F790000, + 0x3FE02552A5A5C000, + 0x3FE0723E5C1CC000, + 0x3FE0BE72E4254000, + 0x3FE109F39E2D4000, + 0x3FE154C3D2F4C000, + 0x3FE19EE6B467C000, + 0x3FE1E85F5E704000, + 0x3FE23130D7BEC000, + 0x3FE2795E1289C000, + 0x3FE2C0E9ED448000, + 0x3FE307D7334F0000, + 0x3FE34E289D9D0000, + 0x3FE393E0D3564000, + 0x3FE3D9026A714000, + 0x3FE41D8FE8468000, + 0x3FE4618BC21C4000, + 0x3FE4A4F85DB04000, + 0x3FE4E7D811B74000, + 0x3FE52A2D265BC000, + 0x3FE56BF9D5B40000, + 0x3FE5AD404C358000, + 0x3FE5EE02A9240000, +}; + +static npy_uint64 LOG_TABLE_TAIL[64] = { + 0x0000000000000000, + 0xBD5FE0E183092C59, + 0x3D2980267C7E09E4, + 0xBD62303B9CB0D5E1, + 0x3D662A6617CC9717, + 0xBD4717B6B33E44F8, + 0xBD62968C836CC8C2, + 0x3D6AAC6CA17A4554, + 0x3D6E5CBD3D50FFFC, + 0xBD6C69A65A23A170, + 0xBD35B967F4471DFC, + 0x3D6F4799F4F6543E, + 0xBD6B0B0DE3077D7E, + 0xBD537B720E4A694B, + 0x3D65AD1D904C1D4E, + 0xBD600349CC67F9B2, + 0xBD697794F689F843, + 0xBD3A342C2AF0003C, + 0x3D5F1546AAA3361C, + 0x3D50E35F73F7A018, + 0x3D630701CE63EAB9, + 0xBD3A6976F5EB0963, + 0x3D5D309C2CC91A85, + 0xBD6D0B1C68651946, + 0xBD609EC17A426426, + 0xBD3F4BD8DB0A7CC1, + 0x3D4394A11B1C1EE4, + 0x3D54AEC442BE1015, + 0xBD67FCB18ED9D603, + 0x3D67E1B259D2F3DA, + 0xBD6ED2A52C73BF78, + 0x3D56FABA4CDD147D, + 0x3D584BF2B68D766F, + 0x3D40931A909FEA5E, + 0x3D4EC5197DDB55D3, + 0x3D5B7BF7861D37AC, + 0x3D5A21AC25DB1EF3, + 0xBD542A9E21373414, + 0xBD6DAFA08CECADB1, + 0x3D3E1F8DF68DBCF3, + 0x3D3BB2CD720EC44C, + 0xBD49C24CA098362B, + 0x3D60FEC69C695D7F, + 0x3D6F404E57963891, + 0xBD657D49676844CC, + 0x3D592DFBC7D93617, + 0x3D65E9A98F33A396, + 0x3D52DD98B97BAEF0, + 0x3D1A07BD8B34BE7C, + 0xBD17AFA4392F1BA7, + 0xBD5DCA290F818480, + 0x3D5D1772F5386374, + 0x3D60BE1FB590A1F5, + 0xBD6E2CE9146D271A, + 0xBD65E6563BBD9FC9, + 0x3D66FAA404263D0B, + 0xBD5AA33736867A17, + 0x3D6EC27D0B7B37B3, + 0xBD244FDD840B8591, + 0x3D6BB09CB0985646, + 0x3D46ABB9DF22BC57, + 0xBD58CD7DC73BD194, + 0x3D6F2CFB29AAA5F0, + 0x3D66757006095FD2, +}; + +#define NPY_TANG_LOG_A1 0x1.55555555554e6p-4 +#define NPY_TANG_LOG_A2 0x1.9999999bac6d4p-7 +#define NPY_TANG_LOG_A3 0x1.2492307f1519fp-9 +#define NPY_TANG_LOG_A4 0x1.c8034c85dfffp-12 + +#define NPY_TANG_LOG_LN2HI 0x1.62e42fefa4p-1 +#define NPY_TANG_LOG_LN2LO -0x1.8432a1b0e2634p-43 +#endif +#endif + #endif diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src index 8f01d33fa..2df1b7225 100644 --- a/numpy/core/src/umath/simd.inc.src +++ b/numpy/core/src/umath/simd.inc.src @@ -480,6 +480,26 @@ run_unary_avx512f_exp_DOUBLE(char **args, npy_intp const *dimensions, npy_intp c return 0; } +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +static NPY_INLINE void +AVX512F_log_DOUBLE(npy_double *, npy_double *, const npy_intp n, const npy_intp stride); +#endif +static NPY_INLINE int +run_unary_avx512f_log_DOUBLE(char **args, npy_intp const *dimensions, npy_intp const *steps) +{ +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS +#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) + if (IS_OUTPUT_BLOCKABLE_UNARY(sizeof(npy_double), 64)) { + AVX512F_log_DOUBLE((npy_double*)args[1], (npy_double*)args[0], dimensions[0], steps[0]); + return 1; + } + else + return 0; +#endif +#endif + return 0; +} + /**begin repeat * Float types * #type = npy_float, npy_double, npy_longdouble# @@ -1795,14 +1815,27 @@ avx512_permute_x4var_pd(__m512d t0, __m512d t3, __m512i index) { - - __mmask8 lut_mask = _mm512_cmp_epi64_mask(index, _mm512_set1_epi64(15), - _MM_CMPINT_GT); + __mmask8 lut_mask = _mm512_cmp_epi64_mask( + _mm512_and_epi64(_mm512_set1_epi64(0x10ULL), index), + _mm512_set1_epi64(0), _MM_CMPINT_GT); __m512d res1 = _mm512_permutex2var_pd(t0, index, t1); __m512d res2 = _mm512_permutex2var_pd(t2, index, t3); return _mm512_mask_blend_pd(lut_mask, res1, res2); } +static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F __m512d +avx512_permute_x8var_pd(__m512d t0, __m512d t1, __m512d t2, __m512d t3, + __m512d t4, __m512d t5, __m512d t6, __m512d t7, + __m512i index) +{ + __mmask8 lut_mask = _mm512_cmp_epi64_mask( + _mm512_and_epi64(_mm512_set1_epi64(0x20ULL), index), + _mm512_set1_epi64(0), _MM_CMPINT_GT); + __m512d res1 = avx512_permute_x4var_pd(t0, t1, t2, t3, index); + __m512d res2 = avx512_permute_x4var_pd(t4, t5, t6, t7, index); + return _mm512_mask_blend_pd(lut_mask, res1, res2); +} + /**begin repeat * #vsub = ps, pd# * #type= npy_float, npy_double# @@ -3145,6 +3178,206 @@ AVX512F_exp_DOUBLE(npy_double * op, #endif #endif +/* + * Vectorized implementation of log double using AVX512 + * Reference: + * [1] Tang, Ping Tak Peter. Table-lookup algorithms for elementary functions + * and their error analysis. No. CONF-9106103-1. Argonne National Lab., + * IL (USA), 1991. + * [2] Tang, Ping-Tak Peter. "Table-driven implementation of the logarithm + * function in IEEE floating-point arithmetic." ACM Transactions on + * Mathematical Software (TOMS) 16.4 (1990): 378-400. + * [3] Muller, Jean-Michel. "Elementary functions: algorithms and + * implementation." (2016). + * 1) if x = 0; return -INF + * 2) if x < 0; return NAN + * 3) if x is INF; return INF + * 4) if x is NAN; return NAN + * 5) if x on (1.0 - 0x1p-4, 1.0 + 0x1.09p-4), calling npy_log() + * 6) Range reduction: + * log(x) = log(2^m * z) + * = mln2 + log(z) + * 7) log(z) = log(z / c_k) + log(c_k); + * where c_k = 1 + k/64, k = 0,1,...,64 + * s.t. |x - c_k| <= 1/128 when x on[1,2]. + * 8) r = 2(x - c_k)/(x + c_k) + * log(x/c_k) = log((1 + r/2) / (1 - r/2)) + * = p(r) + * = 2((r/2) + 1/3*(r/2)^3 + 1/5*(r/2)^5 + ...) + */ +#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS +#if !(defined(__clang__) && (__clang_major__ < 10 || (__clang_major__ == 10 && __clang_minor__ < 1))) +static NPY_GCC_OPT_3 NPY_GCC_TARGET_AVX512F void +AVX512F_log_DOUBLE(npy_double * op, + npy_double * ip, + const npy_intp array_size, + const npy_intp steps) +{ + npy_intp num_remaining_elements = array_size; + const npy_intp stride = steps / (npy_intp)sizeof(npy_double); + const npy_int num_lanes = 64 / (npy_intp)sizeof(npy_double); + npy_int32 indexarr[8]; + for (npy_int32 ii = 0; ii < 8; ii++) { + indexarr[ii] = ii*stride; + } + + __m512d zeros_d = _mm512_set1_pd(0.0f); + __m512d ones_d = _mm512_set1_pd(1.0f); + __m512d mInf = _mm512_set1_pd(NPY_INFINITY); + __m512d mInv64 = (__m512d)(_mm512_set1_epi64(0x3f90000000000000)); + __m512d mNeg_nan = _mm512_set1_pd(-NPY_NAN); + __m512d mNan = _mm512_set1_pd(NPY_NAN); + __m512d mNeg_inf = _mm512_set1_pd(-NPY_INFINITY); + __m512d mA1 = _mm512_set1_pd(NPY_TANG_LOG_A1); + __m512d mA2 = _mm512_set1_pd(NPY_TANG_LOG_A2); + __m512d mA3 = _mm512_set1_pd(NPY_TANG_LOG_A3); + __m512d mA4 = _mm512_set1_pd(NPY_TANG_LOG_A4); + __m512d mLN2HI = _mm512_set1_pd(NPY_TANG_LOG_LN2HI); + __m512d mLN2LO = _mm512_set1_pd(NPY_TANG_LOG_LN2LO); + + __m512d mTo_glibc_min = _mm512_set1_pd(1.0 - 0x1p-4); + __m512d mTo_glibc_max = _mm512_set1_pd(1.0 + 0x1.09p-4); + __m256i vindex = _mm256_loadu_si256((__m256i*)&indexarr[0]); + + /* Load lookup table data */ + /**begin repeat + * #i = 0, 1, 2, 3, 4, 5, 6, 7# + */ + + __m512d mLUT_TOP_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TOP[8*@i@])); + __m512d mLUT_TAIL_@i@ = _mm512_loadu_pd(&(LOG_TABLE_TAIL[8*@i@])); + + /**end repeat**/ + + __mmask8 load_mask = avx512_get_full_load_mask_pd(); + __mmask8 invalid_mask = avx512_get_partial_load_mask_pd(0, num_lanes); + __mmask8 divide_by_zero_mask = invalid_mask; + + __mmask8 inf_mask, nan_mask, zero_mask, negx_mask, denormal_mask, + glibc_mask; + + __m512d x; + while (num_remaining_elements > 0) { + if (num_remaining_elements < num_lanes) { + load_mask = avx512_get_partial_load_mask_pd(num_remaining_elements, + num_lanes); + } + + if (1 == stride) { + x = avx512_masked_load_pd(load_mask, ip); + } + else { + x = avx512_masked_gather_pd(zeros_d, ip, vindex, load_mask); + } + + /* call glibc when x on [1.0 - 0x1p-4, 1.0 + 0x1.09p-4] */ + __mmask8 m1 = _mm512_cmp_pd_mask(x, mTo_glibc_max, _CMP_LT_OQ); + __mmask8 m2 = _mm512_cmp_pd_mask(x, mTo_glibc_min, _CMP_GT_OQ); + glibc_mask = m1 & m2; + + if (glibc_mask != 0xFF) { + zero_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_EQ_OQ); + inf_mask = _mm512_cmp_pd_mask(x, mInf, _CMP_EQ_OQ); + negx_mask = _mm512_cmp_pd_mask(x, zeros_d, _CMP_LT_OQ); + nan_mask = _mm512_cmp_pd_mask(x, x, _CMP_NEQ_UQ); + + divide_by_zero_mask = divide_by_zero_mask | (zero_mask & load_mask); + invalid_mask = invalid_mask | negx_mask; + + x = avx512_set_masked_lanes_pd(x, zeros_d, negx_mask); + __m512i ix = (__m512i)x; + + /* Normalize x when it is denormal */ + __m512i top12 = _mm512_and_epi64(ix, + _mm512_set1_epi64(0xfff0000000000000)); + denormal_mask = _mm512_cmp_epi64_mask(top12, _mm512_set1_epi64(0), + _CMP_EQ_OQ); + denormal_mask = (~zero_mask) & denormal_mask; + ix = (__m512i)_mm512_mask_mul_pd(x, denormal_mask, + x, _mm512_set1_pd(0x1p52)); + ix = _mm512_mask_sub_epi64(ix, denormal_mask, + ix, _mm512_set1_epi64(52ULL << 52)); + + /* + * x = 2^k * z; where z in range [1,2] + */ + __m512i tmp = _mm512_sub_epi64(ix, + _mm512_set1_epi64(0x3ff0000000000000)); + __m512i i = _mm512_and_epi64(_mm512_srai_epi64(tmp, 52 - 6), + _mm512_set1_epi64(0x3fULL)); + __m512i ik = _mm512_srai_epi64(tmp, 52); + __m512d z = (__m512d)(_mm512_sub_epi64(ix, _mm512_and_epi64(tmp, + _mm512_set1_epi64(0xfff0000000000000)))); + /* c = i/64 + 1 */ + __m256i i_32 = _mm512_cvtepi64_epi32(i); + __m512d c = _mm512_fmadd_pd(_mm512_cvtepi32_pd(i_32), mInv64, ones_d); + + /* u = 2 * (z - c) / (z + c) */ + __m512d u = _mm512_div_pd(_mm512_sub_pd(z, c), _mm512_add_pd(z, c)); + u = _mm512_mul_pd(_mm512_set1_pd(2.0), u); + + /* v = u * u */ + __m512d v = _mm512_mul_pd(u,u); + + /* log(z/c) = u + u*v*(A1 + v*(A2 + v*(A3 + v*A4))) */ + __m512d res = _mm512_fmadd_pd(v, mA4, mA3); + res = _mm512_fmadd_pd(v, res, mA2); + res = _mm512_fmadd_pd(v, res, mA1); + res = _mm512_mul_pd(v, res); + res = _mm512_fmadd_pd(u, res, u); + + /* Load lookup table data */ + __m512d c_hi = avx512_permute_x8var_pd(mLUT_TOP_0, mLUT_TOP_1, + mLUT_TOP_2, mLUT_TOP_3, mLUT_TOP_4, mLUT_TOP_5, + mLUT_TOP_6, mLUT_TOP_7, i); + __m512d c_lo = avx512_permute_x8var_pd(mLUT_TAIL_0, mLUT_TAIL_1, + mLUT_TAIL_2, mLUT_TAIL_3, mLUT_TAIL_4, mLUT_TAIL_5, + mLUT_TAIL_6, mLUT_TAIL_7, i); + + /* + * log(x) = k * ln2_hi + c_hi + + * k * ln2_lo + c_lo + + * log(z/c) + */ + __m256i ik_32 = _mm512_cvtepi64_epi32(ik); + __m512d k = _mm512_cvtepi32_pd(ik_32); + __m512d tt = _mm512_fmadd_pd(k, mLN2HI, c_hi); + __m512d tt2 = _mm512_fmadd_pd(k, mLN2LO, c_lo); + tt = _mm512_add_pd(tt, tt2); + res = _mm512_add_pd(tt, res); + + /* return special cases */ + res = avx512_set_masked_lanes_pd(res, mNan, nan_mask); + res = avx512_set_masked_lanes_pd(res, mNeg_nan, negx_mask); + res = avx512_set_masked_lanes_pd(res, mNeg_inf, zero_mask); + res = avx512_set_masked_lanes_pd(res, mInf, inf_mask); + + _mm512_mask_storeu_pd(op, load_mask, res); + } + + /* call glibc's log func when x around 1.0f */ + for (int ii = 0, jj = 0; glibc_mask != 0; ii++, jj += stride) { + if (glibc_mask & 0x01) { + op[ii] = npy_log(ip[jj]); + } + glibc_mask = glibc_mask >> 1; + } + + ip += num_lanes * stride; + op += num_lanes; + num_remaining_elements -= num_lanes; + } + + if (invalid_mask) { + npy_set_floatstatus_invalid(); + } + if (divide_by_zero_mask) { + npy_set_floatstatus_divbyzero(); + } +} +#endif +#endif + /**begin repeat * #TYPE = CFLOAT, CDOUBLE# * #type = npy_float, npy_double# diff --git a/numpy/core/tests/data/umath-validation-set-log b/numpy/core/tests/data/umath-validation-set-log index a7bd98481..b8f6b0875 100644 --- a/numpy/core/tests/data/umath-validation-set-log +++ b/numpy/core/tests/data/umath-validation-set-log @@ -116,3 +116,156 @@ np.float32,0x3f494ab1,0xbe763131,4 np.float32,0x3f476b69,0xbe7fc2c6,4 np.float32,0x3f4884e8,0xbe7a214a,4 np.float32,0x3f486945,0xbe7aae76,4 +#float64 +## +ve denormal ## +np.float64,0x0000000000000001,0xc0874385446d71c3,1 +np.float64,0x0001000000000000,0xc086395a2079b70c,1 +np.float64,0x000fffffffffffff,0xc086232bdd7abcd2,1 +np.float64,0x0007ad63e2168cb6,0xc086290bc0b2980f,1 +## -ve denormal ## +np.float64,0x8000000000000001,0xfff8000000000001,1 +np.float64,0x8001000000000000,0xfff8000000000001,1 +np.float64,0x800fffffffffffff,0xfff8000000000001,1 +np.float64,0x8007ad63e2168cb6,0xfff8000000000001,1 +## +/-0.0f, MAX, MIN## +np.float64,0x0000000000000000,0xfff0000000000000,1 +np.float64,0x8000000000000000,0xfff0000000000000,1 +np.float64,0x7fefffffffffffff,0x40862e42fefa39ef,1 +np.float64,0xffefffffffffffff,0xfff8000000000001,1 +## near 1.0f ## +np.float64,0x3ff0000000000000,0x0000000000000000,1 +np.float64,0x3fe8000000000000,0xbfd269621134db92,1 +np.float64,0x3ff0000000000001,0x3cafffffffffffff,1 +np.float64,0x3ff0000020000000,0x3e7fffffe000002b,1 +np.float64,0x3ff0000000000001,0x3cafffffffffffff,1 +np.float64,0x3fefffffe0000000,0xbe70000008000005,1 +np.float64,0x3fefffffffffffff,0xbca0000000000000,1 +## random numbers ## +np.float64,0x02500186f3d9da56,0xc0855b8abf135773,1 +np.float64,0x09200815a3951173,0xc082ff1ad7131bdc,1 +np.float64,0x0da029623b0243d4,0xc0816fc994695bb5,1 +np.float64,0x48703b8ac483a382,0x40579213a313490b,1 +np.float64,0x09207b74c87c9860,0xc082fee20ff349ef,1 +np.float64,0x62c077698e8df947,0x407821c996d110f0,1 +np.float64,0x2350b45e87c3cfb0,0xc073d6b16b51d072,1 +np.float64,0x3990a23f9ff2b623,0xc051aa60eadd8c61,1 +np.float64,0x0d011386a116c348,0xc081a6cc7ea3b8fb,1 +np.float64,0x1fe0f0303ebe273a,0xc0763870b78a81ca,1 +np.float64,0x0cd1260121d387da,0xc081b7668d61a9d1,1 +np.float64,0x1e6135a8f581d422,0xc077425ac10f08c2,1 +np.float64,0x622168db5fe52d30,0x4077b3c669b9fadb,1 +np.float64,0x69f188e1ec6d1718,0x407d1e2f18c63889,1 +np.float64,0x3aa1bf1d9c4dd1a3,0xc04d682e24bde479,1 +np.float64,0x6c81c4011ce4f683,0x407ee5190e8a8e6a,1 +np.float64,0x2191fa55aa5a5095,0xc0750c0c318b5e2d,1 +np.float64,0x32a1f602a32bf360,0xc06270caa493fc17,1 +np.float64,0x16023c90ba93249b,0xc07d0f88e0801638,1 +np.float64,0x1c525fe6d71fa9ff,0xc078af49c66a5d63,1 +np.float64,0x1a927675815d65b7,0xc079e5bdd7fe376e,1 +np.float64,0x41227b8fe70da028,0x402aa0c9f9a84c71,1 +np.float64,0x4962bb6e853fe87d,0x405a34aa04c83747,1 +np.float64,0x23d2cda00b26b5a4,0xc0737c13a06d00ea,1 +np.float64,0x2d13083fd62987fa,0xc06a25055aeb474e,1 +np.float64,0x10e31e4c9b4579a1,0xc0804e181929418e,1 +np.float64,0x26d3247d556a86a9,0xc0716774171da7e8,1 +np.float64,0x6603379398d0d4ac,0x407a64f51f8a887b,1 +np.float64,0x02d38af17d9442ba,0xc0852d955ac9dd68,1 +np.float64,0x6a2382b4818dd967,0x407d4129d688e5d4,1 +np.float64,0x2ee3c403c79b3934,0xc067a091fefaf8b6,1 +np.float64,0x6493a699acdbf1a4,0x4079663c8602bfc5,1 +np.float64,0x1c8413c4f0de3100,0xc0788c99697059b6,1 +np.float64,0x4573f1ed350d9622,0x404e9bd1e4c08920,1 +np.float64,0x2f34265c9200b69c,0xc067310cfea4e986,1 +np.float64,0x19b43e65fa22029b,0xc07a7f8877de22d6,1 +np.float64,0x0af48ab7925ed6bc,0xc0825c4fbc0e5ade,1 +np.float64,0x4fa49699cad82542,0x4065c76d2a318235,1 +np.float64,0x7204a15e56ade492,0x40815bb87484dffb,1 +np.float64,0x4734aa08a230982d,0x40542a4bf7a361a9,1 +np.float64,0x1ae4ed296c2fd749,0xc079ac4921f20abb,1 +np.float64,0x472514ea4370289c,0x4053ff372bd8f18f,1 +np.float64,0x53a54b3f73820430,0x406b5411fc5f2e33,1 +np.float64,0x64754de5a15684fa,0x407951592e99a5ab,1 +np.float64,0x69358e279868a7c3,0x407c9c671a882c31,1 +np.float64,0x284579ec61215945,0xc0706688e55f0927,1 +np.float64,0x68b5c58806447adc,0x407c43d6f4eff760,1 +np.float64,0x1945a83f98b0e65d,0xc07acc15eeb032cc,1 +np.float64,0x0fc5eb98a16578bf,0xc080b0d02eddca0e,1 +np.float64,0x6a75e208f5784250,0x407d7a7383bf8f05,1 +np.float64,0x0fe63a029c47645d,0xc080a59ca1e98866,1 +np.float64,0x37963ac53f065510,0xc057236281f7bdb6,1 +np.float64,0x135661bb07067ff7,0xc07ee924930c21e4,1 +np.float64,0x4b4699469d458422,0x405f73843756e887,1 +np.float64,0x1a66d73e4bf4881b,0xc07a039ba1c63adf,1 +np.float64,0x12a6b9b119a7da59,0xc07f62e49c6431f3,1 +np.float64,0x24c719aa8fd1bdb5,0xc072d26da4bf84d3,1 +np.float64,0x0fa6ff524ffef314,0xc080bb8514662e77,1 +np.float64,0x1db751d66fdd4a9a,0xc077b77cb50d7c92,1 +np.float64,0x4947374c516da82c,0x4059e9acfc7105bf,1 +np.float64,0x1b1771ab98f3afc8,0xc07989326b8e1f66,1 +np.float64,0x25e78805baac8070,0xc0720a818e6ef080,1 +np.float64,0x4bd7a148225d3687,0x406082d004ea3ee7,1 +np.float64,0x53d7d6b2bbbda00a,0x406b9a398967cbd5,1 +np.float64,0x6997fb9f4e1c685f,0x407ce0a703413eba,1 +np.float64,0x069802c2ff71b951,0xc083df39bf7acddc,1 +np.float64,0x4d683ac9890f66d8,0x4062ae21d8c2acf0,1 +np.float64,0x5a2825863ec14f4c,0x40722d718d549552,1 +np.float64,0x0398799a88f4db80,0xc084e93dab8e2158,1 +np.float64,0x5ed87a8b77e135a5,0x40756d7051777b33,1 +np.float64,0x5828cd6d79b9bede,0x4070cafb22fc6ca1,1 +np.float64,0x7b18ba2a5ec6f068,0x408481386b3ed6fe,1 +np.float64,0x4938fd60922198fe,0x4059c206b762ea7e,1 +np.float64,0x31b8f44fcdd1a46e,0xc063b2faa8b6434e,1 +np.float64,0x5729341c0d918464,0x407019cac0c4a7d7,1 +np.float64,0x13595e9228ee878e,0xc07ee7235a7d8088,1 +np.float64,0x17698b0dc9dd4135,0xc07c1627e3a5ad5f,1 +np.float64,0x63b977c283abb0cc,0x4078cf1ec6ed65be,1 +np.float64,0x7349cc0d4dc16943,0x4081cc697ce4cb53,1 +np.float64,0x4e49a80b732fb28d,0x4063e67e3c5cbe90,1 +np.float64,0x07ba14b848a8ae02,0xc0837ac032a094e0,1 +np.float64,0x3da9f17b691bfddc,0xc03929c25366acda,1 +np.float64,0x02ea39aa6c3ac007,0xc08525af6f21e1c4,1 +np.float64,0x3a6a42f04ed9563d,0xc04e98e825dca46b,1 +np.float64,0x1afa877cd7900be7,0xc0799d6648cb34a9,1 +np.float64,0x58ea986649e052c6,0x4071512e939ad790,1 +np.float64,0x691abbc04647f536,0x407c89aaae0fcb83,1 +np.float64,0x43aabc5063e6f284,0x4044b45d18106fd2,1 +np.float64,0x488b003c893e0bea,0x4057df012a2dafbe,1 +np.float64,0x77eb076ed67caee5,0x40836720de94769e,1 +np.float64,0x5c1b46974aba46f4,0x40738731ba256007,1 +np.float64,0x1a5b29ecb5d3c261,0xc07a0becc77040d6,1 +np.float64,0x5d8b6ccf868c6032,0x4074865c1865e2db,1 +np.float64,0x4cfb6690b4aaf5af,0x406216cd8c7e8ddb,1 +np.float64,0x76cbd8eb5c5fc39e,0x4083038dc66d682b,1 +np.float64,0x28bbd1fec5012814,0xc07014c2dd1b9711,1 +np.float64,0x33dc1b3a4fd6bf7a,0xc060bd0756e07d8a,1 +np.float64,0x52bbe89b37de99f3,0x406a10041aa7d343,1 +np.float64,0x07bc479d15eb2dd3,0xc0837a1a6e3a3b61,1 +np.float64,0x18fc5275711a901d,0xc07aff3e9d62bc93,1 +np.float64,0x114c9758e247dc71,0xc080299a7cf15b05,1 +np.float64,0x25ac8f6d60755148,0xc07233c4c0c511d4,1 +np.float64,0x260cae2bb9e9fd7e,0xc071f128c7e82eac,1 +np.float64,0x572ccdfe0241de82,0x40701bedc84bb504,1 +np.float64,0x0ddcef6c8d41f5ee,0xc0815a7e16d07084,1 +np.float64,0x6dad1d59c988af68,0x407fb4a0bc0142b1,1 +np.float64,0x025d200580d8b6d1,0xc08556c0bc32b1b2,1 +np.float64,0x7aad344b6aa74c18,0x40845bbc453f22be,1 +np.float64,0x5b5d9d6ad9d14429,0x4073036d2d21f382,1 +np.float64,0x49cd8d8dcdf19954,0x405b5c034f5c7353,1 +np.float64,0x63edb9483335c1e6,0x4078f2dd21378786,1 +np.float64,0x7b1dd64c9d2c26bd,0x408482b922017bc9,1 +np.float64,0x782e13e0b574be5f,0x40837e2a0090a5ad,1 +np.float64,0x592dfe18b9d6db2f,0x40717f777fbcb1ec,1 +np.float64,0x654e3232ac60d72c,0x4079e71a95a70446,1 +np.float64,0x7b8e42ad22091456,0x4084a9a6f1e61722,1 +np.float64,0x570e88dfd5860ae6,0x407006ae6c0d137a,1 +np.float64,0x294e98346cb98ef1,0xc06f5edaac12bd44,1 +np.float64,0x1adeaa4ab792e642,0xc079b1431d5e2633,1 +np.float64,0x7b6ead3377529ac8,0x40849eabc8c7683c,1 +np.float64,0x2b8eedae8a9b2928,0xc06c400054deef11,1 +np.float64,0x65defb45b2dcf660,0x407a4b53f181c05a,1 +np.float64,0x1baf582d475e7701,0xc07920bcad4a502c,1 +np.float64,0x461f39cf05a0f15a,0x405126368f984fa1,1 +np.float64,0x7e5f6f5dcfff005b,0x4085a37d610439b4,1 +np.float64,0x136f66e4d09bd662,0xc07ed8a2719f2511,1 +np.float64,0x65afd8983fb6ca1f,0x407a2a7f48bf7fc1,1 +np.float64,0x572fa7f95ed22319,0x40701d706cf82e6f,1 diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py index 941d99521..b4a129502 100644 --- a/numpy/core/tests/test_umath.py +++ b/numpy/core/tests/test_umath.py @@ -657,6 +657,19 @@ class TestLog: yf = np.array(y, dtype=dt)*log2_ assert_almost_equal(np.log(xf), yf) + def test_log_strides(self): + np.random.seed(42) + strides = np.array([-4,-3,-2,-1,1,2,3,4]) + sizes = np.arange(2,100) + for ii in sizes: + x_f64 = np.float64(np.random.uniform(low=0.01, high=100.0,size=ii)) + x_special = x_f64.copy() + x_special[3:-1:4] = 1.0 + y_true = np.log(x_f64) + y_special = np.log(x_special) + for jj in strides: + assert_array_almost_equal_nulp(np.log(x_f64[::jj]), y_true[::jj], nulp=2) + assert_array_almost_equal_nulp(np.log(x_special[::jj]), y_special[::jj], nulp=2) class TestExp: def test_exp_values(self): |