author     Matti Picus <matti.picus@gmail.com>    2020-02-01 22:52:36 +0200
committer  GitHub <noreply@github.com>            2020-02-01 22:52:36 +0200
commit     94e516b7d55ea762e13f690c095172b8fcd47bc8 (patch)
tree       4120a63181179da47c13e1d5592dcb7f480e7f18
parent     3873ec46816f3fe0f705394242f83a764d4ce90e (diff)
parent     89618ad43b1fa867888fe8e18b7d85e264eb4f27 (diff)
download   numpy-94e516b7d55ea762e13f690c095172b8fcd47bc8.tar.gz
Merge pull request #14867 from r-devulap/maximum-avx
ENH: Use AVX-512F for np.maximum and np.minimum
-rw-r--r--  benchmarks/benchmarks/bench_avx.py           |  22
-rw-r--r--  numpy/core/code_generators/generate_umath.py |   4
-rw-r--r--  numpy/core/src/umath/loops.c.src             |  28
-rw-r--r--  numpy/core/src/umath/loops.h.src             |   8
-rw-r--r--  numpy/core/src/umath/simd.inc.src            | 234
-rw-r--r--  numpy/core/tests/test_umath.py               |  25
6 files changed, 298 insertions(+), 23 deletions(-)
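
For context on the behaviour the new loops have to preserve (an illustration, not part of the patch): np.maximum and np.minimum propagate a NaN found in either operand, whereas the bare AVX-512 maxps/maxpd instructions return the second operand whenever the inputs are unordered. In plain NumPy:

    import numpy as np

    a = np.array([1.0, np.nan, 3.0])
    b = np.array([np.nan, 2.0, 1.0])
    print(np.maximum(a, b))   # [nan nan  3.] -- a NaN on either side wins
    print(np.minimum(a, b))   # [nan nan  1.]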
diff --git a/benchmarks/benchmarks/bench_avx.py b/benchmarks/benchmarks/bench_avx.py
index 023696b90..6428b3764 100644
--- a/benchmarks/benchmarks/bench_avx.py
+++ b/benchmarks/benchmarks/bench_avx.py
@@ -29,3 +29,25 @@ class AVX_UFunc(Benchmark):
 
     def time_ufunc(self, ufuncname, stride, dtype):
         self.f(self.arr[::stride])
+
+avx_bfuncs = ['maximum',
+              'minimum']
+
+class AVX_BFunc(Benchmark):
+
+    params = [avx_bfuncs, dtype, stride]
+    param_names = ['avx_based_bfunc', 'dtype', 'stride']
+    timeout = 10
+
+    def setup(self, ufuncname, dtype, stride):
+        np.seterr(all='ignore')
+        try:
+            self.f = getattr(np, ufuncname)
+        except AttributeError:
+            raise NotImplementedError()
+        N = 10000
+        self.arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
+        self.arr2 = np.array(np.random.rand(stride*N), dtype=dtype)
+
+    def time_ufunc(self, ufuncname, dtype, stride):
+        self.f(self.arr1[::stride], self.arr2[::stride])
diff --git a/numpy/core/code_generators/generate_umath.py b/numpy/core/code_generators/generate_umath.py
index f9ee7d993..1fd08241d 100644
--- a/numpy/core/code_generators/generate_umath.py
+++ b/numpy/core/code_generators/generate_umath.py
@@ -492,14 +492,14 @@ defdict = {
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.maximum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMax')
           ),
 'minimum':
     Ufunc(2, 1, ReorderableNone,
           docstrings.get('numpy.core.umath.minimum'),
           'PyUFunc_SimpleUniformOperationTypeResolver',
-          TD(noobj),
+          TD(noobj, simd=[('avx512f', 'fd')]),
           TD(O, f='npy_ObjectMin')
           ),
 'clip':
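
The benchmark above is driven by asv; as a rough standalone sketch (not the benchmark itself), the work it times amounts to calling the ufunc on strided views, where stride 1 exercises the contiguous masked-load path of the new loops and larger strides exercise gather/scatter:

    import numpy as np

    N = 10000
    for dtype in ('f', 'd'):     # float32/float64, the 'fd' in simd=[('avx512f', 'fd')]
        for stride in (1, 2, 4):
            arr1 = np.array(np.random.rand(stride*N), dtype=dtype)
            arr2 = np.array(np.random.rand(stride*N), dtype=dtype)
            np.maximum(arr1[::stride], arr2[::stride])   # the call time_ufunc measures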
diff --git a/numpy/core/src/umath/loops.c.src b/numpy/core/src/umath/loops.c.src
index 3b180ce59..b310d73ff 100644
--- a/numpy/core/src/umath/loops.c.src
+++ b/numpy/core/src/umath/loops.c.src
@@ -1898,6 +1898,34 @@ NPY_NO_EXPORT void
  *  #OP = >=, <=#
  **/
 NPY_NO_EXPORT void
+@TYPE@_@kind@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
+{
+    /*  */
+    if (IS_BINARY_REDUCE) {
+        if (!run_unary_reduce_simd_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_REDUCE_LOOP(@type@) {
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                io1 = (io1 @OP@ in2 || npy_isnan(io1)) ? io1 : in2;
+            }
+            *((@type@ *)iop1) = io1;
+        }
+    }
+    else {
+        if (!run_binary_avx512f_@kind@_@TYPE@(args, dimensions, steps)) {
+            BINARY_LOOP {
+                @type@ in1 = *(@type@ *)ip1;
+                const @type@ in2 = *(@type@ *)ip2;
+                /* Order of operations important for MSVC 2015 */
+                in1 = (in1 @OP@ in2 || npy_isnan(in1)) ? in1 : in2;
+                *((@type@ *)op1) = in1;
+            }
+        }
+    }
+    npy_clear_floatstatus_barrier((char*)dimensions);
+}
+
+NPY_NO_EXPORT void
 @TYPE@_@kind@(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
     /*  */
diff --git a/numpy/core/src/umath/loops.h.src b/numpy/core/src/umath/loops.h.src
index 8ddf201d7..6c89627ca 100644
--- a/numpy/core/src/umath/loops.h.src
+++ b/numpy/core/src/umath/loops.h.src
@@ -175,6 +175,14 @@ NPY_NO_EXPORT void
 @TYPE@_sqrt(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
 
 /**begin repeat1
+ * #func = maximum, minimum#
+ */
+NPY_NO_EXPORT void
+@TYPE@_@func@_avx512f(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func));
+
+/**end repeat1**/
+
+/**begin repeat1
  * #isa = avx512f, fma#
  */
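
The scalar loop body keeps in1 when it compares true against in2 or is itself NaN; since every comparison against NaN is false, a NaN in either operand reaches the result. A Python rendering of that one-liner (illustrative only):

    import math

    def scalar_maximum(in1, in2):
        # mirrors: in1 = (in1 >= in2 || npy_isnan(in1)) ? in1 : in2;
        return in1 if (in1 >= in2 or math.isnan(in1)) else in2

    assert scalar_maximum(1.0, 2.0) == 2.0
    assert math.isnan(scalar_maximum(float('nan'), 2.0))  # in1 NaN: kept by the isnan test
    assert math.isnan(scalar_maximum(1.0, float('nan')))  # in2 NaN: comparison false, in2 returned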
diff --git a/numpy/core/src/umath/simd.inc.src b/numpy/core/src/umath/simd.inc.src
index 5473b58f1..cd485034e 100644
--- a/numpy/core/src/umath/simd.inc.src
+++ b/numpy/core/src/umath/simd.inc.src
@@ -34,6 +34,21 @@
 
 #define VECTOR_SIZE_BYTES 16
 
+/*
+ * MAX_STEP_SIZE is used to determine if we need to use SIMD version of the ufunc.
+ * Very large step size can be as slow as processing it using scalar. The
+ * value of 2097152 ( = 2MB) was chosen using 2 considerations:
+ * 1) Typical linux kernel page size is 4Kb, but sometimes it could also be 2MB
+ *    which is == 2097152 Bytes. For a step size as large as this, surely all
+ *    the loads/stores of gather/scatter instructions falls on 16 different pages
+ *    which one would think would slow down gather/scatter instructions.
+ * 2) It additionally satisfies MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 which
+ *    allows us to use i32 version of gather/scatter (as opposed to the i64 version)
+ *    without problems (step larger than NPY_MAX_INT32*esize/16 would require use of
+ *    i64gather/scatter). esize = element size = 4/8 bytes for float/double.
+ */
+#define MAX_STEP_SIZE 2097152
+
 static NPY_INLINE npy_uintp
 abs_ptrdiff(char *a, char *b)
 {
@@ -52,10 +67,28 @@ abs_ptrdiff(char *a, char *b)
     ((abs_ptrdiff(args[1], args[0]) == 0))))
 
 /*
+ * Avoid using SIMD for very large step sizes for several reasons:
+ * 1) Supporting large step sizes requires use of i64gather/scatter_ps instructions,
+ *    in which case we need two i64gather instructions and an additional vinsertf32x8
+ *    instruction to load a single zmm register (since one i64gather instruction
+ *    loads into a ymm register). This is not ideal for performance.
+ * 2) Gather and scatter instructions can be slow when the loads/stores
+ *    cross page boundaries.
+ *
+ * We instead rely on i32gather/scatter_ps instructions which use a 32-bit index
+ * element. The index needs to be < INT_MAX to avoid overflow. MAX_STEP_SIZE ensures this.
+ */
+#define IS_BINARY_SMALL_STEPS \
+    ((abs(steps[0]) < MAX_STEP_SIZE) && \
+     (abs(steps[1]) < MAX_STEP_SIZE) && \
+     (abs(steps[2]) < MAX_STEP_SIZE))
+
+/*
  * output should be contiguous, can handle strided input data
+ * Input step should be smaller than MAX_STEP_SIZE for performance
  */
 #define IS_OUTPUT_BLOCKABLE_UNARY(esize, vsize) \
-    (steps[1] == (esize) && \
+    (steps[1] == (esize) && abs(steps[0]) < MAX_STEP_SIZE && \
     (npy_is_aligned(args[0], esize) && npy_is_aligned(args[1], esize)) && \
     ((abs_ptrdiff(args[1], args[0]) >= (vsize)) || \
      ((abs_ptrdiff(args[1], args[0]) == 0))))
@@ -130,6 +163,39 @@ abs_ptrdiff(char *a, char *b)
  */
 
 /**begin repeat
+ * #type = npy_float, npy_double, npy_longdouble#
+ * #TYPE = FLOAT, DOUBLE, LONGDOUBLE#
+ * #EXISTS = 1, 1, 0#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps);
+#endif
+
+static NPY_INLINE int
+run_binary_avx512f_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS && @EXISTS@
+    if (IS_BINARY_SMALL_STEPS) {
+        AVX512F_@func@_@TYPE@(args, dimensions, steps);
+        return 1;
+    }
+    else
+        return 0;
+#endif
+    return 0;
+}
+
+
+/**end repeat1**/
+/**end repeat**/
+
+/**begin repeat
  * #ISA = FMA, AVX512F#
  * #isa = fma, avx512f#
  * #CHK = HAVE_ATTRIBUTE_TARGET_AVX2_WITH_INTRINSICS, HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS#
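
Before the kernels themselves (next hunk), the MAX_STEP_SIZE bound from the comments above can be checked directly; this sketch (not part of the patch) redoes the MAX_STEP_SIZE*16/esize < NPY_MAX_INT32 arithmetic that justifies 32-bit gather/scatter indices:

    MAX_STEP_SIZE = 2097152        # 2MB, as #defined above
    NPY_MAX_INT32 = 2**31 - 1
    for esize in (4, 8):           # element size of float and double
        max_index = MAX_STEP_SIZE * 16 // esize
        assert max_index < NPY_MAX_INT32
        print(esize, max_index)    # 4 -> 8388608, 8 -> 4194304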
@@ -1671,6 +1737,101 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ @vtype@d
 #endif
 /**end repeat**/
 
+/**begin repeat
+ * #type = npy_float, npy_double#
+ * #TYPE = FLOAT, DOUBLE#
+ * #num_lanes = 16, 8#
+ * #vsuffix = ps, pd#
+ * #mask = __mmask16, __mmask8#
+ * #vtype = __m512, __m512d#
+ * #scale = 4, 8#
+ * #vindextype = __m512i, __m256i#
+ * #vindexsize = 512, 256#
+ * #vindexload = _mm512_loadu_si512, _mm256_loadu_si256#
+ */
+
+/**begin repeat1
+ * #func = maximum, minimum#
+ * #vectorf = max, min#
+ */
+
+#if defined HAVE_ATTRIBUTE_TARGET_AVX512F_WITH_INTRINSICS && defined NPY_HAVE_SSE2_INTRINSICS
+static NPY_INLINE NPY_GCC_TARGET_AVX512F void
+AVX512F_@func@_@TYPE@(char **args, npy_intp const *dimensions, npy_intp const *steps)
+{
+    const npy_intp stride_ip1 = steps[0]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_ip2 = steps[1]/(npy_intp)sizeof(@type@);
+    const npy_intp stride_op = steps[2]/(npy_intp)sizeof(@type@);
+    const npy_intp array_size = dimensions[0];
+    npy_intp num_remaining_elements = array_size;
+    @type@* ip1 = (@type@*) args[0];
+    @type@* ip2 = (@type@*) args[1];
+    @type@* op = (@type@*) args[2];
+
+    @mask@ load_mask = avx512_get_full_load_mask_@vsuffix@();
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_BINARY_SMALL_STEPS
+     */
+
+    npy_int32 index_ip1[@num_lanes@], index_ip2[@num_lanes@], index_op[@num_lanes@];
+    for (npy_int32 ii = 0; ii < @num_lanes@; ii++) {
+        index_ip1[ii] = ii*stride_ip1;
+        index_ip2[ii] = ii*stride_ip2;
+        index_op[ii] = ii*stride_op;
+    }
+    @vindextype@ vindex_ip1 = @vindexload@((@vindextype@*)&index_ip1[0]);
+    @vindextype@ vindex_ip2 = @vindexload@((@vindextype@*)&index_ip2[0]);
+    @vindextype@ vindex_op = @vindexload@((@vindextype@*)&index_op[0]);
+    @vtype@ zeros_f = _mm512_setzero_@vsuffix@();
+
+    while (num_remaining_elements > 0) {
+        if (num_remaining_elements < @num_lanes@) {
+            load_mask = avx512_get_partial_load_mask_@vsuffix@(
+                    num_remaining_elements, @num_lanes@);
+        }
+        @vtype@ x1, x2;
+        if (stride_ip1 == 1) {
+            x1 = avx512_masked_load_@vsuffix@(load_mask, ip1);
+        }
+        else {
+            x1 = avx512_masked_gather_@vsuffix@(zeros_f, ip1, vindex_ip1, load_mask);
+        }
+        if (stride_ip2 == 1) {
+            x2 = avx512_masked_load_@vsuffix@(load_mask, ip2);
+        }
+        else {
+            x2 = avx512_masked_gather_@vsuffix@(zeros_f, ip2, vindex_ip2, load_mask);
+        }
+
+        /*
+         * when only one of the argument is a nan, the maxps/maxpd instruction
+         * returns the second argument. The additional blend instruction fixes
+         * this issue to conform with NumPy behaviour.
+         */
+        @mask@ nan_mask = _mm512_cmp_@vsuffix@_mask(x1, x1, _CMP_NEQ_UQ);
+        @vtype@ out = _mm512_@vectorf@_@vsuffix@(x1, x2);
+        out = _mm512_mask_blend_@vsuffix@(nan_mask, out, x1);
+
+        if (stride_op == 1) {
+            _mm512_mask_storeu_@vsuffix@(op, load_mask, out);
+        }
+        else {
+            /* scatter! */
+            _mm512_mask_i32scatter_@vsuffix@(op, load_mask, vindex_op, out, @scale@);
+        }
+
+        ip1 += @num_lanes@*stride_ip1;
+        ip2 += @num_lanes@*stride_ip2;
+        op += @num_lanes@*stride_op;
+        num_remaining_elements -= @num_lanes@;
+    }
+}
+#endif
+/**end repeat1**/
+/**end repeat**/
 
 /**begin repeat
  * #ISA = FMA, AVX512F#
@@ -1699,16 +1860,23 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                           const npy_intp array_size,
                           const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_intp num_remaining_elements = array_size;
     @vtype@ ones_f = _mm@vsize@_set1_ps(1.0f);
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
 #if @replace_0_with_1@
     @mask@ inv_load_mask = @isa@_invert_mask_ps(load_mask);
 #endif
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
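
The blend in the AVX512F kernel above can be emulated in NumPy terms to see why it is needed (an illustrative sketch, not the intrinsics): maxps/maxpd return the second operand whenever the inputs are unordered, so the kernel re-selects x1 wherever x1 != x1, which is exactly the _CMP_NEQ_UQ nan_mask:

    import numpy as np

    x1 = np.array([np.nan, 2.0, 1.0, 7.0])
    x2 = np.array([5.0, np.nan, 0.0, 3.0])
    either_nan = np.isnan(x1) | np.isnan(x2)
    hw_max = np.where(either_nan, x2, np.maximum(x1, x2))  # what maxps alone would produce
    fixed = np.where(x1 != x1, x1, hw_max)                 # the blend step
    want = np.maximum(x1, x2)
    assert ((fixed == want) | (np.isnan(fixed) & np.isnan(want))).all()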
@@ -1778,16 +1946,22 @@ static NPY_INLINE NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                           const npy_intp array_size,
                           const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_double);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_double);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_double);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_double);
     npy_intp num_remaining_elements = array_size;
     @mask@ load_mask = @isa@_get_full_load_mask_pd();
 #if @replace_0_with_1@
     @mask@ inv_load_mask = @isa@_invert_mask_pd(load_mask);
 #endif
     @vtype@ ones_d = _mm@vsize@_set1_pd(1.0f);
-    npy_int indexarr[8];
-    for (npy_int ii = 0; ii < 8; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[8];
+    for (npy_int32 ii = 0; ii < 8; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vindextype@ vindex = @vindexload@((@vindextype@*)&indexarr[0]);
@@ -1874,8 +2048,8 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                         const npy_intp steps,
                         NPY_TRIG_OP my_trig_op)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_float large_number = 71476.0625f;
     if (my_trig_op == npy_compute_sin) {
         large_number = 117435.992f;
@@ -1905,8 +2079,14 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
     @mask@ nan_mask, glibc_mask, sine_mask, negate_mask;
     @mask@ load_mask = @isa@_get_full_load_mask_ps();
     npy_intp num_remaining_elements = array_size;
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
     @vtype@i vindex = _mm@vsize@_loadu_si@vsize@((@vtype@i*)&indexarr[0]);
@@ -2017,12 +2197,18 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                      const npy_intp array_size,
                      const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
     npy_float xmax = 88.72283935546875f;
     npy_float xmin = -103.97208404541015625f;
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
@@ -2143,10 +2329,16 @@ static NPY_GCC_OPT_3 NPY_GCC_TARGET_@ISA@ void
                      const npy_intp array_size,
                      const npy_intp steps)
 {
-    const npy_intp stride = steps/sizeof(npy_float);
-    const npy_int num_lanes = @BYTES@/sizeof(npy_float);
-    npy_int indexarr[16];
-    for (npy_int ii = 0; ii < 16; ii++) {
+    const npy_intp stride = steps/(npy_intp)sizeof(npy_float);
+    const npy_int num_lanes = @BYTES@/(npy_intp)sizeof(npy_float);
+
+    /*
+     * Note: while generally indices are npy_intp, we ensure that our maximum index
+     * will fit in an int32 as a precondition for this function via
+     * IS_OUTPUT_BLOCKABLE_UNARY
+     */
+    npy_int32 indexarr[16];
+    for (npy_int32 ii = 0; ii < 16; ii++) {
         indexarr[ii] = ii*stride;
     }
diff --git a/numpy/core/tests/test_umath.py b/numpy/core/tests/test_umath.py
index 10a1c0803..d1d4467d6 100644
--- a/numpy/core/tests/test_umath.py
+++ b/numpy/core/tests/test_umath.py
@@ -1108,6 +1108,19 @@ class TestMaximum(_FilterInvalids):
         arg2 = arg1 + 1
         assert_equal(np.maximum(arg1, arg2), arg2)
 
+    def test_strided_array(self):
+        arr1 = np.array([-4.0, 1.0, 10.0, 0.0, np.nan, -np.nan, np.inf, -np.inf])
+        arr2 = np.array([-2.0, -1.0, np.nan, 1.0, 0.0, np.nan, 1.0, -3.0])
+        maxtrue = np.array([-2.0, 1.0, np.nan, 1.0, np.nan, np.nan, np.inf, -3.0])
+        out = np.ones(8)
+        out_maxtrue = np.array([-2.0, 1.0, 1.0, 10.0, 1.0, 1.0, np.nan, 1.0])
+        assert_equal(np.maximum(arr1, arr2), maxtrue)
+        assert_equal(np.maximum(arr1[::2], arr2[::2]), maxtrue[::2])
+        assert_equal(np.maximum(arr1[:4:], arr2[::2]), np.array([-2.0, np.nan, 10.0, 1.0]))
+        assert_equal(np.maximum(arr1[::3], arr2[:3:]), np.array([-2.0, 0.0, np.nan]))
+        assert_equal(np.maximum(arr1[:6:2], arr2[::3], out=out[::3]), np.array([-2.0, 10., np.nan]))
+        assert_equal(out, out_maxtrue)
+
 
 class TestMinimum(_FilterInvalids):
     def test_reduce(self):
@@ -1166,6 +1179,18 @@ class TestMinimum(_FilterInvalids):
         arg2 = arg1 + 1
         assert_equal(np.minimum(arg1, arg2), arg1)
 
+    def test_strided_array(self):
+        arr1 = np.array([-4.0, 1.0, 10.0, 0.0, np.nan, -np.nan, np.inf, -np.inf])
+        arr2 = np.array([-2.0, -1.0, np.nan, 1.0, 0.0, np.nan, 1.0, -3.0])
+        mintrue = np.array([-4.0, -1.0, np.nan, 0.0, np.nan, np.nan, 1.0, -np.inf])
+        out = np.ones(8)
+        out_mintrue = np.array([-4.0, 1.0, 1.0, 1.0, 1.0, 1.0, np.nan, 1.0])
+        assert_equal(np.minimum(arr1, arr2), mintrue)
+        assert_equal(np.minimum(arr1[::2], arr2[::2]), mintrue[::2])
+        assert_equal(np.minimum(arr1[:4:], arr2[::2]), np.array([-4.0, np.nan, 0.0, 0.0]))
+        assert_equal(np.minimum(arr1[::3], arr2[:3:]), np.array([-4.0, -1.0, np.nan]))
+        assert_equal(np.minimum(arr1[:6:2], arr2[::3], out=out[::3]), np.array([-4.0, 1.0, np.nan]))
+        assert_equal(out, out_mintrue)
+
 class TestFmax(_FilterInvalids):
     def test_reduce(self):
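
As a reading aid for the new tests (values copied from them, not new data): mixing different slices on the two inputs is what forces the strided gather path, e.g. the arr1[:6:2] / arr2[::3] assertion:

    import numpy as np

    arr1 = np.array([-4.0, 1.0, 10.0, 0.0, np.nan, -np.nan, np.inf, -np.inf])
    arr2 = np.array([-2.0, -1.0, np.nan, 1.0, 0.0, np.nan, 1.0, -3.0])
    print(arr1[:6:2])    # [-4. 10. nan]   elements 0, 2, 4
    print(arr2[::3])     # [-2.  1.  1.]   elements 0, 3, 6
    print(np.maximum(arr1[:6:2], arr2[::3]))   # [-2. 10. nan]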