#ifndef _NPY_SIMD_H_
#define _NPY_SIMD_H_
/**
 * The NumPy C SIMD vectorization interface "NPYV" is a set of types and functions
 * intended to simplify vectorization of code across different platforms. It currently
 * supports the following SIMD extensions: SSE, AVX2, AVX512, NEON, VSX and VX.
 *
 * TODO: Add an independent sphinx doc.
 */
#include "numpy/npy_common.h"
#ifndef __cplusplus
    #include <stdbool.h>
#endif
#include "npy_cpu_dispatch.h"
#include "simd_utils.h"

#ifdef __cplusplus
extern "C" {
#endif

// lane type by intrin suffix
typedef npy_uint8  npyv_lanetype_u8;
typedef npy_int8   npyv_lanetype_s8;
typedef npy_uint16 npyv_lanetype_u16;
typedef npy_int16  npyv_lanetype_s16;
typedef npy_uint32 npyv_lanetype_u32;
typedef npy_int32  npyv_lanetype_s32;
typedef npy_uint64 npyv_lanetype_u64;
typedef npy_int64  npyv_lanetype_s64;
typedef float      npyv_lanetype_f32;
typedef double     npyv_lanetype_f64;

#if defined(_MSC_VER) && defined(_M_IX86)
/*
 * Avoid using any of the following intrinsics with MSVC 32-bit,
 * even if they appear to work on newer versions.
 * They have a bad impact on the generated instructions:
 * sometimes the compiler handles them without respecting 32-bit mode,
 * which leads to crashes due to executing 64-bit instructions,
 * and other times it generates bad emulated instructions.
 */
    #undef _mm512_set1_epi64
    #undef _mm256_set1_epi64x
    #undef _mm_set1_epi64x
    #undef _mm512_setr_epi64x
    #undef _mm256_setr_epi64x
    #undef _mm_setr_epi64x
    #undef _mm512_set_epi64x
    #undef _mm256_set_epi64x
    #undef _mm_set_epi64x
#endif

#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
    #include "avx512/avx512.h"
#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
    #include "avx2/avx2.h"
#elif defined(NPY_HAVE_SSE2)
    #include "sse/sse.h"
#endif

// TODO: Add support for VSX(2.06) and BE Mode for VSX
#if defined(NPY_HAVE_VX) || (defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__))
    #include "vec/vec.h"
#endif

#ifdef NPY_HAVE_NEON
    #include "neon/neon.h"
#endif

#ifndef NPY_SIMD
    /// SIMD width in bits or 0 if there's no SIMD extension available.
    #define NPY_SIMD 0
    /// SIMD width in bytes or 0 if there's no SIMD extension available.
    #define NPY_SIMD_WIDTH 0
    /// 1 if the enabled SIMD extension supports single-precision otherwise 0.
    #define NPY_SIMD_F32 0
    /// 1 if the enabled SIMD extension supports double-precision otherwise 0.
    #define NPY_SIMD_F64 0
    /// 1 if the enabled SIMD extension supports native FMA otherwise 0.
    /// note: fast emulated FMA intrinsics are still provided even when native FMA
    /// isn't supported, but they shouldn't be used when precision matters.
    #define NPY_SIMD_FMA3 0
    /// 1 if the enabled SIMD extension is running in big-endian mode otherwise 0.
    #define NPY_SIMD_BIGENDIAN 0
    /// 1 if the supported comparison intrinsics (lt, le, gt, ge)
    /// raise an FP invalid exception for quiet NaNs.
    #define NPY_SIMD_CMPSIGNAL 0
#endif

// enable emulated mask operations for all SIMD extensions except for AVX512
#if !defined(NPY_HAVE_AVX512F) && NPY_SIMD && NPY_SIMD < 512
    #include "emulate_maskop.h"
#endif

// enable integer divisor generator for all SIMD extensions
#if NPY_SIMD
    #include "intdiv.h"
#endif

/**
 * Some SIMD extensions (currently AVX2 and AVX512F) impose, de facto,
 * a maximum stride size when dealing with non-contiguous memory access.
 *
 * Therefore, the following functions must be used to check that strides are
 * within the acceptable limit before using any of the non-contiguous
 * load/store intrinsics.
 *
 * For instance:
 *  npy_intp ld_stride = step[0] / sizeof(float);
 *  npy_intp st_stride = step[1] / sizeof(float);
 *
 *  if (npyv_loadable_stride_f32(ld_stride) && npyv_storable_stride_f32(st_stride)) {
 *      for (;;)
 *          npyv_f32 a = npyv_loadn_f32(ld_pointer, ld_stride);
 *          // ...
 *          npyv_storen_f32(st_pointer, st_stride, a);
 *  }
 *  else {
 *      for (;;)
 *          // C scalars
 *  }
 */
#ifndef NPY_SIMD_MAXLOAD_STRIDE32
    #define NPY_SIMD_MAXLOAD_STRIDE32 0
#endif
#ifndef NPY_SIMD_MAXSTORE_STRIDE32
    #define NPY_SIMD_MAXSTORE_STRIDE32 0
#endif
#ifndef NPY_SIMD_MAXLOAD_STRIDE64
    #define NPY_SIMD_MAXLOAD_STRIDE64 0
#endif
#ifndef NPY_SIMD_MAXSTORE_STRIDE64
    #define NPY_SIMD_MAXSTORE_STRIDE64 0
#endif
#define NPYV_IMPL_MAXSTRIDE(SFX, MAXLOAD, MAXSTORE)                \
    NPY_FINLINE int npyv_loadable_stride_##SFX(npy_intp stride)    \
    { return MAXLOAD > 0 ? llabs(stride) <= MAXLOAD : 1; }         \
    NPY_FINLINE int npyv_storable_stride_##SFX(npy_intp stride)    \
    { return MAXSTORE > 0 ? llabs(stride) <= MAXSTORE : 1; }
#if NPY_SIMD
    NPYV_IMPL_MAXSTRIDE(u32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(s32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(f32, NPY_SIMD_MAXLOAD_STRIDE32, NPY_SIMD_MAXSTORE_STRIDE32)
    NPYV_IMPL_MAXSTRIDE(u64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
    NPYV_IMPL_MAXSTRIDE(s64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
#endif
#if NPY_SIMD_F64
    NPYV_IMPL_MAXSTRIDE(f64, NPY_SIMD_MAXLOAD_STRIDE64, NPY_SIMD_MAXSTORE_STRIDE64)
#endif

#ifdef __cplusplus
}
#endif
#endif // _NPY_SIMD_H_
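
/*
 * Usage sketch (illustrative only, not part of the interface itself): a unary
 * float32 kernel built on the declarations above. The function name
 * `hypothetical_square_f32` and its arguments are assumptions made for this
 * example; strides are given in element units, as in the "For instance" block.
 *
 *  static void hypothetical_square_f32(const npyv_lanetype_f32 *src, npy_intp sstride,
 *                                      npyv_lanetype_f32 *dst, npy_intp dstride,
 *                                      npy_intp len)
 *  {
 *  #if NPY_SIMD_F32
 *      // only take the SIMD path when both strides fit within the supported limit
 *      if (npyv_loadable_stride_f32(sstride) && npyv_storable_stride_f32(dstride)) {
 *          const int vstep = npyv_nlanes_f32;
 *          for (; len >= vstep; len -= vstep, src += sstride*vstep, dst += dstride*vstep) {
 *              npyv_f32 a = npyv_loadn_f32(src, sstride);
 *              npyv_storen_f32(dst, dstride, npyv_mul_f32(a, a));
 *          }
 *      }
 *  #endif
 *      // scalar fallback: covers SIMD-disabled builds, unsupported strides,
 *      // and the remaining tail elements.
 *      for (; len > 0; --len, src += sstride, dst += dstride) {
 *          *dst = *src * *src;
 *      }
 *  }
 */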