diff options
author | Matti Picus <matti.picus@gmail.com> | 2020-07-10 13:50:34 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-07-10 13:50:34 +0300 |
commit | 58da484abf6f466a9e8bf55c188cb5b501d31ceb (patch) | |
tree | 67631e36868543fb1d2baa75c9f8a4358825351e | |
parent | c43b2bfb15281e170cc5006829ebc559bd1915e3 (diff) | |
parent | 18d0fe5c1b2c4a95e0eee73de9aa086b2d64cf88 (diff) | |
download | numpy-58da484abf6f466a9e8bf55c188cb5b501d31ceb.tar.gz |
Merge pull request #16397 from seiko2plus/implement_npyv
ENH: Implement the NumPy C SIMD vectorization interface
41 files changed, 4459 insertions, 0 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h index 3cec0c6ff..5706e0576 100644 --- a/numpy/core/include/numpy/npy_common.h +++ b/numpy/core/include/numpy/npy_common.h @@ -141,6 +141,14 @@ #define NPY_INLINE #endif +#ifdef _MSC_VER + #define NPY_FINLINE static __forceinline +#elif defined(__GNUC__) + #define NPY_FINLINE static NPY_INLINE __attribute__((always_inline)) +#else + #define NPY_FINLINE static +#endif + #ifdef HAVE___THREAD #define NPY_TLS __thread #else diff --git a/numpy/core/setup.py b/numpy/core/setup.py index 8e00e4392..aede12080 100644 --- a/numpy/core/setup.py +++ b/numpy/core/setup.py @@ -739,6 +739,7 @@ def configuration(parent_package='',top_path=None): join('src', 'common', 'umathmodule.h'), join('src', 'common', 'numpyos.h'), join('src', 'common', 'npy_cpu_dispatch.h'), + join('src', 'common', 'simd', 'simd.h'), ] common_src = [ diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h new file mode 100644 index 000000000..9d8b4ab5e --- /dev/null +++ b/numpy/core/src/common/simd/avx2/arithmetic.h @@ -0,0 +1,75 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_ARITHMETIC_H +#define _NPY_SIMD_AVX2_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +#define npyv_add_u8 _mm256_add_epi8 +#define npyv_add_s8 _mm256_add_epi8 +#define npyv_add_u16 _mm256_add_epi16 +#define npyv_add_s16 _mm256_add_epi16 +#define npyv_add_u32 _mm256_add_epi32 +#define npyv_add_s32 _mm256_add_epi32 +#define npyv_add_u64 _mm256_add_epi64 +#define npyv_add_s64 _mm256_add_epi64 +#define npyv_add_f32 _mm256_add_ps +#define npyv_add_f64 _mm256_add_pd + +// saturated +#define npyv_adds_u8 _mm256_adds_epu8 +#define npyv_adds_s8 _mm256_adds_epi8 +#define npyv_adds_u16 _mm256_adds_epu16 +#define npyv_adds_s16 _mm256_adds_epi16 +// TODO: rest, after implment Packs intrins + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#define npyv_sub_u8 _mm256_sub_epi8 +#define npyv_sub_s8 _mm256_sub_epi8 +#define npyv_sub_u16 _mm256_sub_epi16 +#define npyv_sub_s16 _mm256_sub_epi16 +#define npyv_sub_u32 _mm256_sub_epi32 +#define npyv_sub_s32 _mm256_sub_epi32 +#define npyv_sub_u64 _mm256_sub_epi64 +#define npyv_sub_s64 _mm256_sub_epi64 +#define npyv_sub_f32 _mm256_sub_ps +#define npyv_sub_f64 _mm256_sub_pd + +// saturated +#define npyv_subs_u8 _mm256_subs_epu8 +#define npyv_subs_s8 _mm256_subs_epi8 +#define npyv_subs_u16 _mm256_subs_epu16 +#define npyv_subs_s16 _mm256_subs_epi16 +// TODO: rest, after implment Packs intrins + +/*************************** + * Multiplication + ***************************/ +// non-saturated +#define npyv_mul_u8 npyv256_mul_u8 +#define npyv_mul_s8 npyv_mul_u8 +#define npyv_mul_u16 _mm256_mullo_epi16 +#define npyv_mul_s16 _mm256_mullo_epi16 +#define npyv_mul_u32 _mm256_mullo_epi32 +#define npyv_mul_s32 _mm256_mullo_epi32 +#define npyv_mul_f32 _mm256_mul_ps +#define npyv_mul_f64 _mm256_mul_pd + +// saturated +// TODO: after implment Packs intrins + +/*************************** + * Division + ***************************/ +// TODO: emulate integer division +#define npyv_div_f32 _mm256_div_ps +#define npyv_div_f64 _mm256_div_pd + +#endif // _NPY_SIMD_AVX2_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h new file mode 100644 index 000000000..c99d628ee --- /dev/null +++ b/numpy/core/src/common/simd/avx2/avx2.h @@ -0,0 +1,67 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#define NPY_SIMD 256 +#define NPY_SIMD_WIDTH 32 +#define NPY_SIMD_F64 1 + +typedef __m256i npyv_u8; +typedef __m256i npyv_s8; +typedef __m256i npyv_u16; +typedef __m256i npyv_s16; +typedef __m256i npyv_u32; +typedef __m256i npyv_s32; +typedef __m256i npyv_u64; +typedef __m256i npyv_s64; +typedef __m256 npyv_f32; +typedef __m256d npyv_f64; + +typedef __m256i npyv_b8; +typedef __m256i npyv_b16; +typedef __m256i npyv_b32; +typedef __m256i npyv_b64; + +typedef struct { __m256i val[2]; } npyv_m256ix2; +typedef npyv_m256ix2 npyv_u8x2; +typedef npyv_m256ix2 npyv_s8x2; +typedef npyv_m256ix2 npyv_u16x2; +typedef npyv_m256ix2 npyv_s16x2; +typedef npyv_m256ix2 npyv_u32x2; +typedef npyv_m256ix2 npyv_s32x2; +typedef npyv_m256ix2 npyv_u64x2; +typedef npyv_m256ix2 npyv_s64x2; + +typedef struct { __m256i val[3]; } npyv_m256ix3; +typedef npyv_m256ix3 npyv_u8x3; +typedef npyv_m256ix3 npyv_s8x3; +typedef npyv_m256ix3 npyv_u16x3; +typedef npyv_m256ix3 npyv_s16x3; +typedef npyv_m256ix3 npyv_u32x3; +typedef npyv_m256ix3 npyv_s32x3; +typedef npyv_m256ix3 npyv_u64x3; +typedef npyv_m256ix3 npyv_s64x3; + +typedef struct { __m256 val[2]; } npyv_f32x2; +typedef struct { __m256d val[2]; } npyv_f64x2; +typedef struct { __m256 val[3]; } npyv_f32x3; +typedef struct { __m256d val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 32 +#define npyv_nlanes_s8 32 +#define npyv_nlanes_u16 16 +#define npyv_nlanes_s16 16 +#define npyv_nlanes_u32 8 +#define npyv_nlanes_s32 8 +#define npyv_nlanes_u64 4 +#define npyv_nlanes_s64 4 +#define npyv_nlanes_f32 8 +#define npyv_nlanes_f64 4 + +#include "utils.h" +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h new file mode 100644 index 000000000..9fd86016d --- /dev/null +++ b/numpy/core/src/common/simd/avx2/conversion.h @@ -0,0 +1,32 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_CVT_H +#define _NPY_SIMD_AVX2_CVT_H + +// convert mask types to integer types +#define npyv_cvt_u8_b8(A) A +#define npyv_cvt_s8_b8(A) A +#define npyv_cvt_u16_b16(A) A +#define npyv_cvt_s16_b16(A) A +#define npyv_cvt_u32_b32(A) A +#define npyv_cvt_s32_b32(A) A +#define npyv_cvt_u64_b64(A) A +#define npyv_cvt_s64_b64(A) A +#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A) +#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A) + +// convert integer types to mask types +#define npyv_cvt_b8_u8(BL) BL +#define npyv_cvt_b8_s8(BL) BL +#define npyv_cvt_b16_u16(BL) BL +#define npyv_cvt_b16_s16(BL) BL +#define npyv_cvt_b32_u32(BL) BL +#define npyv_cvt_b32_s32(BL) BL +#define npyv_cvt_b64_u64(BL) BL +#define npyv_cvt_b64_s64(BL) BL +#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL) +#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL) + +#endif // _NPY_SIMD_AVX2_CVT_H diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h new file mode 100644 index 000000000..5ea7414fd --- /dev/null +++ b/numpy/core/src/common/simd/avx2/memory.h @@ -0,0 +1,70 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_MEMORY_H +#define _NPY_SIMD_AVX2_MEMORY_H + +/*************************** + * load/store + ***************************/ +#define NPYV_IMPL_AVX2_MEM_INT(CTYPE, SFX) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \ + { return _mm256_loadu_si256((const __m256i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \ + { return _mm256_load_si256((const __m256i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \ + { return _mm256_stream_load_si256((const __m256i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \ + { return _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)ptr)); } \ + NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm256_storeu_si256((__m256i*)ptr, vec); } \ + NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm256_store_si256((__m256i*)ptr, vec); } \ + NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm256_stream_si256((__m256i*)ptr, vec); } \ + NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_storeu_si128((__m128i*)(ptr), _mm256_castsi256_si128(vec)); } \ + NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_storeu_si128((__m128i*)(ptr), _mm256_extracti128_si256(vec, 1)); } + +NPYV_IMPL_AVX2_MEM_INT(npy_uint8, u8) +NPYV_IMPL_AVX2_MEM_INT(npy_int8, s8) +NPYV_IMPL_AVX2_MEM_INT(npy_uint16, u16) +NPYV_IMPL_AVX2_MEM_INT(npy_int16, s16) +NPYV_IMPL_AVX2_MEM_INT(npy_uint32, u32) +NPYV_IMPL_AVX2_MEM_INT(npy_int32, s32) +NPYV_IMPL_AVX2_MEM_INT(npy_uint64, u64) +NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64) + +// unaligned load +#define npyv_load_f32 _mm256_loadu_ps +#define npyv_load_f64 _mm256_loadu_pd +// aligned load +#define npyv_loada_f32 _mm256_load_ps +#define npyv_loada_f64 _mm256_load_pd +// stream load +#define npyv_loads_f32(PTR) \ + _mm256_castsi256_ps(_mm256_stream_load_si256((const __m256i*)(PTR))) +#define npyv_loads_f64(PTR) \ + _mm256_castsi256_pd(_mm256_stream_load_si256((const __m256i*)(PTR))) +// load lower part +#define npyv_loadl_f32(PTR) _mm256_castps128_ps256(_mm_loadu_ps(PTR)) +#define npyv_loadl_f64(PTR) _mm256_castpd128_pd256(_mm_loadu_pd(PTR)) +// unaligned store +#define npyv_store_f32 _mm256_storeu_ps +#define npyv_store_f64 _mm256_storeu_pd +// aligned store +#define npyv_storea_f32 _mm256_store_ps +#define npyv_storea_f64 _mm256_store_pd +// stream store +#define npyv_stores_f32 _mm256_stream_ps +#define npyv_stores_f64 _mm256_stream_pd +// store lower part +#define npyv_storel_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_castps256_ps128(VEC)) +#define npyv_storel_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_castpd256_pd128(VEC)) +// store higher part +#define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1)) +#define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1)) + +#endif // _NPY_SIMD_AVX2_MEMORY_H diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h new file mode 100644 index 000000000..e96696dc9 --- /dev/null +++ b/numpy/core/src/common/simd/avx2/misc.h @@ -0,0 +1,223 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_MISC_H +#define _NPY_SIMD_AVX2_MISC_H + +// vector with zero lanes +#define npyv_zero_u8 _mm256_setzero_si256 +#define npyv_zero_s8 _mm256_setzero_si256 +#define npyv_zero_u16 _mm256_setzero_si256 +#define npyv_zero_s16 _mm256_setzero_si256 +#define npyv_zero_u32 _mm256_setzero_si256 +#define npyv_zero_s32 _mm256_setzero_si256 +#define npyv_zero_u64 _mm256_setzero_si256 +#define npyv_zero_s64 _mm256_setzero_si256 +#define npyv_zero_f32 _mm256_setzero_ps +#define npyv_zero_f64 _mm256_setzero_pd + +// vector with a specific value set to all lanes +#define npyv_setall_u8(VAL) _mm256_set1_epi8((char)VAL) +#define npyv_setall_s8(VAL) _mm256_set1_epi8((char)VAL) +#define npyv_setall_u16(VAL) _mm256_set1_epi16((short)VAL) +#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL) +#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL) +#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL) +#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL) +#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL) +#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL) +#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL) + +/* + * vector with specific values set to each lane and + * set a specific value to all remained lanes + * + * Args that generated by NPYV__SET_FILL_* not going to expand if + * _mm256_setr_* are defined as macros. +*/ +NPY_FINLINE __m256i npyv__setr_epi8( + char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7, + char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15, + char i16, char i17, char i18, char i19, char i20, char i21, char i22, char i23, + char i24, char i25, char i26, char i27, char i28, char i29, char i30, char i31) +{ + return _mm256_setr_epi8( + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 + ); +} +NPY_FINLINE __m256i npyv__setr_epi16( + short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7, + short i8, short i9, short i10, short i11, short i12, short i13, short i14, short i15) +{ + return _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); +} +NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7) +{ + return _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7); +} +NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3) +{ + return _mm256_setr_epi64x(i0, i1, i2, i3); +} + +NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5, + float i6, float i7) +{ + return _mm256_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7); +} +NPY_FINLINE __m256d npyv__setr_pd(double i0, double i1, double i2, double i3) +{ + return _mm256_setr_pd(i0, i1, i2, i3); +} +#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_32(char, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_32(char, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_16(short, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_16(short, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_8(int, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_8(int, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_4(npy_uint64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_4(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_8(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_4(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +#define npyv_select_u8(MASK, A, B) _mm256_blendv_epi8(B, A, MASK) +#define npyv_select_s8 npyv_select_u8 +#define npyv_select_u16 npyv_select_u8 +#define npyv_select_s16 npyv_select_u8 +#define npyv_select_u32 npyv_select_u8 +#define npyv_select_s32 npyv_select_u8 +#define npyv_select_u64 npyv_select_u8 +#define npyv_select_s64 npyv_select_u8 +#define npyv_select_f32(MASK, A, B) _mm256_blendv_ps(B, A, _mm256_castsi256_ps(MASK)) +#define npyv_select_f64(MASK, A, B) _mm256_blendv_pd(B, A, _mm256_castsi256_pd(MASK)) + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8(X) X +#define npyv_reinterpret_u8_u16(X) X +#define npyv_reinterpret_u8_s16(X) X +#define npyv_reinterpret_u8_u32(X) X +#define npyv_reinterpret_u8_s32(X) X +#define npyv_reinterpret_u8_u64(X) X +#define npyv_reinterpret_u8_s64(X) X +#define npyv_reinterpret_u8_f32 _mm256_castps_si256 +#define npyv_reinterpret_u8_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8(X) X +#define npyv_reinterpret_s8_u16(X) X +#define npyv_reinterpret_s8_s16(X) X +#define npyv_reinterpret_s8_u32(X) X +#define npyv_reinterpret_s8_s32(X) X +#define npyv_reinterpret_s8_u64(X) X +#define npyv_reinterpret_s8_s64(X) X +#define npyv_reinterpret_s8_f32 _mm256_castps_si256 +#define npyv_reinterpret_s8_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8(X) X +#define npyv_reinterpret_u16_s8(X) X +#define npyv_reinterpret_u16_s16(X) X +#define npyv_reinterpret_u16_u32(X) X +#define npyv_reinterpret_u16_s32(X) X +#define npyv_reinterpret_u16_u64(X) X +#define npyv_reinterpret_u16_s64(X) X +#define npyv_reinterpret_u16_f32 _mm256_castps_si256 +#define npyv_reinterpret_u16_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8(X) X +#define npyv_reinterpret_s16_s8(X) X +#define npyv_reinterpret_s16_u16(X) X +#define npyv_reinterpret_s16_u32(X) X +#define npyv_reinterpret_s16_s32(X) X +#define npyv_reinterpret_s16_u64(X) X +#define npyv_reinterpret_s16_s64(X) X +#define npyv_reinterpret_s16_f32 _mm256_castps_si256 +#define npyv_reinterpret_s16_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8(X) X +#define npyv_reinterpret_u32_s8(X) X +#define npyv_reinterpret_u32_u16(X) X +#define npyv_reinterpret_u32_s16(X) X +#define npyv_reinterpret_u32_s32(X) X +#define npyv_reinterpret_u32_u64(X) X +#define npyv_reinterpret_u32_s64(X) X +#define npyv_reinterpret_u32_f32 _mm256_castps_si256 +#define npyv_reinterpret_u32_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_s32_s32(X) X +#define npyv_reinterpret_s32_u8(X) X +#define npyv_reinterpret_s32_s8(X) X +#define npyv_reinterpret_s32_u16(X) X +#define npyv_reinterpret_s32_s16(X) X +#define npyv_reinterpret_s32_u32(X) X +#define npyv_reinterpret_s32_u64(X) X +#define npyv_reinterpret_s32_s64(X) X +#define npyv_reinterpret_s32_f32 _mm256_castps_si256 +#define npyv_reinterpret_s32_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8(X) X +#define npyv_reinterpret_u64_s8(X) X +#define npyv_reinterpret_u64_u16(X) X +#define npyv_reinterpret_u64_s16(X) X +#define npyv_reinterpret_u64_u32(X) X +#define npyv_reinterpret_u64_s32(X) X +#define npyv_reinterpret_u64_s64(X) X +#define npyv_reinterpret_u64_f32 _mm256_castps_si256 +#define npyv_reinterpret_u64_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8(X) X +#define npyv_reinterpret_s64_s8(X) X +#define npyv_reinterpret_s64_u16(X) X +#define npyv_reinterpret_s64_s16(X) X +#define npyv_reinterpret_s64_u32(X) X +#define npyv_reinterpret_s64_s32(X) X +#define npyv_reinterpret_s64_u64(X) X +#define npyv_reinterpret_s64_f32 _mm256_castps_si256 +#define npyv_reinterpret_s64_f64 _mm256_castpd_si256 + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8 _mm256_castsi256_ps +#define npyv_reinterpret_f32_s8 _mm256_castsi256_ps +#define npyv_reinterpret_f32_u16 _mm256_castsi256_ps +#define npyv_reinterpret_f32_s16 _mm256_castsi256_ps +#define npyv_reinterpret_f32_u32 _mm256_castsi256_ps +#define npyv_reinterpret_f32_s32 _mm256_castsi256_ps +#define npyv_reinterpret_f32_u64 _mm256_castsi256_ps +#define npyv_reinterpret_f32_s64 _mm256_castsi256_ps +#define npyv_reinterpret_f32_f64 _mm256_castpd_ps + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8 _mm256_castsi256_pd +#define npyv_reinterpret_f64_s8 _mm256_castsi256_pd +#define npyv_reinterpret_f64_u16 _mm256_castsi256_pd +#define npyv_reinterpret_f64_s16 _mm256_castsi256_pd +#define npyv_reinterpret_f64_u32 _mm256_castsi256_pd +#define npyv_reinterpret_f64_s32 _mm256_castsi256_pd +#define npyv_reinterpret_f64_u64 _mm256_castsi256_pd +#define npyv_reinterpret_f64_s64 _mm256_castsi256_pd +#define npyv_reinterpret_f64_f32 _mm256_castps_pd + +#define npyv_cleanup _mm256_zeroall + +#endif // _NPY_SIMD_SSE_MISC_H diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h new file mode 100644 index 000000000..c1d30413f --- /dev/null +++ b/numpy/core/src/common/simd/avx2/operators.h @@ -0,0 +1,200 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_OPERATORS_H +#define _NPY_SIMD_AVX2_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// left +#define npyv_shl_u16(A, C) _mm256_sll_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s16(A, C) _mm256_sll_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_u32(A, C) _mm256_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s32(A, C) _mm256_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_u64(A, C) _mm256_sll_epi64(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s64(A, C) _mm256_sll_epi64(A, _mm_cvtsi32_si128(C)) + +// left by an immediate constant +#define npyv_shli_u16 _mm256_slli_epi16 +#define npyv_shli_s16 _mm256_slli_epi16 +#define npyv_shli_u32 _mm256_slli_epi32 +#define npyv_shli_s32 _mm256_slli_epi32 +#define npyv_shli_u64 _mm256_slli_epi64 +#define npyv_shli_s64 _mm256_slli_epi64 + +// right +#define npyv_shr_u16(A, C) _mm256_srl_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s16(A, C) _mm256_sra_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_u32(A, C) _mm256_srl_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s32(A, C) _mm256_sra_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_u64(A, C) _mm256_srl_epi64(A, _mm_cvtsi32_si128(C)) +NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c) +{ + const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000); + const __m128i c64 = _mm_cvtsi32_si128(c); + __m256i r = _mm256_srl_epi64(_mm256_add_epi64(a, sbit), c64); + return _mm256_sub_epi64(r, _mm256_srl_epi64(sbit, c64)); +} + +// right by an immediate constant +#define npyv_shri_u16 _mm256_srli_epi16 +#define npyv_shri_s16 _mm256_srai_epi16 +#define npyv_shri_u32 _mm256_srli_epi32 +#define npyv_shri_s32 _mm256_srai_epi32 +#define npyv_shri_u64 _mm256_srli_epi64 +#define npyv_shri_s64 npyv_shr_s64 + +/*************************** + * Logical + ***************************/ +// AND +#define npyv_and_u8 _mm256_and_si256 +#define npyv_and_s8 _mm256_and_si256 +#define npyv_and_u16 _mm256_and_si256 +#define npyv_and_s16 _mm256_and_si256 +#define npyv_and_u32 _mm256_and_si256 +#define npyv_and_s32 _mm256_and_si256 +#define npyv_and_u64 _mm256_and_si256 +#define npyv_and_s64 _mm256_and_si256 +#define npyv_and_f32 _mm256_and_ps +#define npyv_and_f64 _mm256_and_pd + +// OR +#define npyv_or_u8 _mm256_or_si256 +#define npyv_or_s8 _mm256_or_si256 +#define npyv_or_u16 _mm256_or_si256 +#define npyv_or_s16 _mm256_or_si256 +#define npyv_or_u32 _mm256_or_si256 +#define npyv_or_s32 _mm256_or_si256 +#define npyv_or_u64 _mm256_or_si256 +#define npyv_or_s64 _mm256_or_si256 +#define npyv_or_f32 _mm256_or_ps +#define npyv_or_f64 _mm256_or_pd + +// XOR +#define npyv_xor_u8 _mm256_xor_si256 +#define npyv_xor_s8 _mm256_xor_si256 +#define npyv_xor_u16 _mm256_xor_si256 +#define npyv_xor_s16 _mm256_xor_si256 +#define npyv_xor_u32 _mm256_xor_si256 +#define npyv_xor_s32 _mm256_xor_si256 +#define npyv_xor_u64 _mm256_xor_si256 +#define npyv_xor_s64 _mm256_xor_si256 +#define npyv_xor_f32 _mm256_xor_ps +#define npyv_xor_f64 _mm256_xor_pd + +// NOT +#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1)) +#define npyv_not_s8 npyv_not_u8 +#define npyv_not_u16 npyv_not_u8 +#define npyv_not_s16 npyv_not_u8 +#define npyv_not_u32 npyv_not_u8 +#define npyv_not_s32 npyv_not_u8 +#define npyv_not_u64 npyv_not_u8 +#define npyv_not_s64 npyv_not_u8 +#define npyv_not_f32(A) _mm256_xor_ps(A, _mm256_castsi256_ps(_mm256_set1_epi32(-1))) +#define npyv_not_f64(A) _mm256_xor_pd(A, _mm256_castsi256_pd(_mm256_set1_epi32(-1))) + +/*************************** + * Comparison + ***************************/ + +// int Equal +#define npyv_cmpeq_u8 _mm256_cmpeq_epi8 +#define npyv_cmpeq_s8 _mm256_cmpeq_epi8 +#define npyv_cmpeq_u16 _mm256_cmpeq_epi16 +#define npyv_cmpeq_s16 _mm256_cmpeq_epi16 +#define npyv_cmpeq_u32 _mm256_cmpeq_epi32 +#define npyv_cmpeq_s32 _mm256_cmpeq_epi32 +#define npyv_cmpeq_u64 _mm256_cmpeq_epi64 +#define npyv_cmpeq_s64 _mm256_cmpeq_epi64 + +// int Not Equal +#define npyv_cmpneq_u8(A, B) npyv_not_u8(_mm256_cmpeq_epi8(A, B)) +#define npyv_cmpneq_s8 npyv_cmpneq_u8 +#define npyv_cmpneq_u16(A, B) npyv_not_u16(_mm256_cmpeq_epi16(A, B)) +#define npyv_cmpneq_s16 npyv_cmpneq_u16 +#define npyv_cmpneq_u32(A, B) npyv_not_u32(_mm256_cmpeq_epi32(A, B)) +#define npyv_cmpneq_s32 npyv_cmpneq_u32 +#define npyv_cmpneq_u64(A, B) npyv_not_u64(_mm256_cmpeq_epi64(A, B)) +#define npyv_cmpneq_s64 npyv_cmpneq_u64 + +// signed greater than +#define npyv_cmpgt_s8 _mm256_cmpgt_epi8 +#define npyv_cmpgt_s16 _mm256_cmpgt_epi16 +#define npyv_cmpgt_s32 _mm256_cmpgt_epi32 +#define npyv_cmpgt_s64 _mm256_cmpgt_epi64 + +// signed greater than or equal +#define npyv_cmpge_s8(A, B) npyv_not_s8(_mm256_cmpgt_epi8(B, A)) +#define npyv_cmpge_s16(A, B) npyv_not_s16(_mm256_cmpgt_epi16(B, A)) +#define npyv_cmpge_s32(A, B) npyv_not_s32(_mm256_cmpgt_epi32(B, A)) +#define npyv_cmpge_s64(A, B) npyv_not_s64(_mm256_cmpgt_epi64(B, A)) + +// unsigned greater than +#define NPYV_IMPL_AVX2_UNSIGNED_GT(LEN, SIGN) \ + NPY_FINLINE __m256i npyv_cmpgt_u##LEN(__m256i a, __m256i b) \ + { \ + const __m256i sbit = _mm256_set1_epi32(SIGN); \ + return _mm256_cmpgt_epi##LEN( \ + _mm256_xor_si256(a, sbit), _mm256_xor_si256(b, sbit) \ + ); \ + } + +NPYV_IMPL_AVX2_UNSIGNED_GT(8, 0x80808080) +NPYV_IMPL_AVX2_UNSIGNED_GT(16, 0x80008000) +NPYV_IMPL_AVX2_UNSIGNED_GT(32, 0x80000000) + +NPY_FINLINE __m256i npyv_cmpgt_u64(__m256i a, __m256i b) +{ + const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000); + return _mm256_cmpgt_epi64(_mm256_xor_si256(a, sbit), _mm256_xor_si256(b, sbit)); +} + +// unsigned greater than or equal +NPY_FINLINE __m256i npyv_cmpge_u8(__m256i a, __m256i b) +{ return _mm256_cmpeq_epi8(a, _mm256_max_epu8(a, b)); } +NPY_FINLINE __m256i npyv_cmpge_u16(__m256i a, __m256i b) +{ return _mm256_cmpeq_epi16(a, _mm256_max_epu16(a, b)); } +NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b) +{ return _mm256_cmpeq_epi32(a, _mm256_max_epu32(a, b)); } +#define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A)) + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) + +// precision comparison +#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ)) +#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ)) +#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_OQ)) +#define npyv_cmpneq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_NEQ_OQ)) +#define npyv_cmplt_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_LT_OQ)) +#define npyv_cmplt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_LT_OQ)) +#define npyv_cmple_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_LE_OQ)) +#define npyv_cmple_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_LE_OQ)) +#define npyv_cmpgt_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GT_OQ)) +#define npyv_cmpgt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GT_OQ)) +#define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ)) +#define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ)) + +#endif // _NPY_SIMD_AVX2_OPERATORS_H diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h new file mode 100644 index 000000000..5a9e68e32 --- /dev/null +++ b/numpy/core/src/common/simd/avx2/reorder.h @@ -0,0 +1,97 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_REORDER_H +#define _NPY_SIMD_AVX2_REORDER_H + +// combine lower part of two vectors +#define npyv_combinel_u8(A, B) _mm256_permute2x128_si256(A, B, 0x20) +#define npyv_combinel_s8 npyv_combinel_u8 +#define npyv_combinel_u16 npyv_combinel_u8 +#define npyv_combinel_s16 npyv_combinel_u8 +#define npyv_combinel_u32 npyv_combinel_u8 +#define npyv_combinel_s32 npyv_combinel_u8 +#define npyv_combinel_u64 npyv_combinel_u8 +#define npyv_combinel_s64 npyv_combinel_u8 +#define npyv_combinel_f32(A, B) _mm256_permute2f128_ps(A, B, 0x20) +#define npyv_combinel_f64(A, B) _mm256_permute2f128_pd(A, B, 0x20) + +// combine higher part of two vectors +#define npyv_combineh_u8(A, B) _mm256_permute2x128_si256(A, B, 0x31) +#define npyv_combineh_s8 npyv_combineh_u8 +#define npyv_combineh_u16 npyv_combineh_u8 +#define npyv_combineh_s16 npyv_combineh_u8 +#define npyv_combineh_u32 npyv_combineh_u8 +#define npyv_combineh_s32 npyv_combineh_u8 +#define npyv_combineh_u64 npyv_combineh_u8 +#define npyv_combineh_s64 npyv_combineh_u8 +#define npyv_combineh_f32(A, B) _mm256_permute2f128_ps(A, B, 0x31) +#define npyv_combineh_f64(A, B) _mm256_permute2f128_pd(A, B, 0x31) + +// combine two vectors from lower and higher parts of two other vectors +NPY_FINLINE npyv_m256ix2 npyv__combine(__m256i a, __m256i b) +{ + npyv_m256ix2 r; + __m256i a1b0 = _mm256_permute2x128_si256(a, b, 0x21); + r.val[0] = _mm256_blend_epi32(a, a1b0, 0xF0); + r.val[1] = _mm256_blend_epi32(b, a1b0, 0xF); + return r; +} +NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m256 a, __m256 b) +{ + npyv_f32x2 r; + __m256 a1b0 = _mm256_permute2f128_ps(a, b, 0x21); + r.val[0] = _mm256_blend_ps(a, a1b0, 0xF0); + r.val[1] = _mm256_blend_ps(b, a1b0, 0xF); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m256d a, __m256d b) +{ + npyv_f64x2 r; + __m256d a1b0 = _mm256_permute2f128_pd(a, b, 0x21); + r.val[0] = _mm256_blend_pd(a, a1b0, 0xC); + r.val[1] = _mm256_blend_pd(b, a1b0, 0x3); + return r; +} +#define npyv_combine_u8 npyv__combine +#define npyv_combine_s8 npyv__combine +#define npyv_combine_u16 npyv__combine +#define npyv_combine_s16 npyv__combine +#define npyv_combine_u32 npyv__combine +#define npyv_combine_s32 npyv__combine +#define npyv_combine_u64 npyv__combine +#define npyv_combine_s64 npyv__combine + +// interleave two vectors +#define NPYV_IMPL_AVX2_ZIP_U(T_VEC, LEN) \ + NPY_FINLINE T_VEC##x2 npyv_zip_u##LEN(T_VEC a, T_VEC b) \ + { \ + __m256i ab0 = _mm256_unpacklo_epi##LEN(a, b); \ + __m256i ab1 = _mm256_unpackhi_epi##LEN(a, b); \ + return npyv__combine(ab0, ab1); \ + } + +NPYV_IMPL_AVX2_ZIP_U(npyv_u8, 8) +NPYV_IMPL_AVX2_ZIP_U(npyv_u16, 16) +NPYV_IMPL_AVX2_ZIP_U(npyv_u32, 32) +NPYV_IMPL_AVX2_ZIP_U(npyv_u64, 64) +#define npyv_zip_s8 npyv_zip_u8 +#define npyv_zip_s16 npyv_zip_u16 +#define npyv_zip_s32 npyv_zip_u32 +#define npyv_zip_s64 npyv_zip_u64 + +NPY_FINLINE npyv_f32x2 npyv_zip_f32(__m256 a, __m256 b) +{ + __m256 ab0 = _mm256_unpacklo_ps(a, b); + __m256 ab1 = _mm256_unpackhi_ps(a, b); + return npyv_combine_f32(ab0, ab1); +} +NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b) +{ + __m256d ab0 = _mm256_unpacklo_pd(a, b); + __m256d ab1 = _mm256_unpackhi_pd(a, b); + return npyv_combine_f64(ab0, ab1); +} + +#endif // _NPY_SIMD_AVX2_REORDER_H diff --git a/numpy/core/src/common/simd/avx2/utils.h b/numpy/core/src/common/simd/avx2/utils.h new file mode 100644 index 000000000..24f1af5d1 --- /dev/null +++ b/numpy/core/src/common/simd/avx2/utils.h @@ -0,0 +1,21 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX2_UTILS_H +#define _NPY_SIMD_AVX2_UTILS_H + +#define npyv256_shuffle_odd(A) _mm256_permute4x64_epi64(A, _MM_SHUFFLE(3, 1, 2, 0)) +#define npyv256_shuffle_odd_ps(A) _mm256_castsi256_ps(npyv256_shuffle_odd(_mm256_castps_si256(A))) +#define npyv256_shuffle_odd_pd(A) _mm256_permute4x64_pd(A, _MM_SHUFFLE(3, 1, 2, 0)) + +NPY_FINLINE __m256i npyv256_mul_u8(__m256i a, __m256i b) +{ + const __m256i mask = _mm256_set1_epi32(0xFF00FF00); + __m256i even = _mm256_mullo_epi16(a, b); + __m256i odd = _mm256_mullo_epi16(_mm256_srai_epi16(a, 8), _mm256_srai_epi16(b, 8)); + odd = _mm256_slli_epi16(odd, 8); + return _mm256_blendv_epi8(even, odd, mask); +} + +#endif // _NPY_SIMD_AVX2_UTILS_H diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h new file mode 100644 index 000000000..fcaef0efd --- /dev/null +++ b/numpy/core/src/common/simd/avx512/arithmetic.h @@ -0,0 +1,116 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_ARITHMETIC_H +#define _NPY_SIMD_AVX512_ARITHMETIC_H + +#include "../avx2/utils.h" + +/*************************** + * Addition + ***************************/ +// non-saturated +#ifdef NPY_HAVE_AVX512BW + #define npyv_add_u8 _mm512_add_epi8 + #define npyv_add_u16 _mm512_add_epi16 +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_add_u8, _mm256_add_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_add_u16, _mm256_add_epi16) +#endif +#define npyv_add_s8 npyv_add_u8 +#define npyv_add_s16 npyv_add_u16 +#define npyv_add_u32 _mm512_add_epi32 +#define npyv_add_s32 _mm512_add_epi32 +#define npyv_add_u64 _mm512_add_epi64 +#define npyv_add_s64 _mm512_add_epi64 +#define npyv_add_f32 _mm512_add_ps +#define npyv_add_f64 _mm512_add_pd + +// saturated +#ifdef NPY_HAVE_AVX512BW + #define npyv_adds_u8 _mm512_adds_epu8 + #define npyv_adds_s8 _mm512_adds_epi8 + #define npyv_adds_u16 _mm512_adds_epu16 + #define npyv_adds_s16 _mm512_adds_epi16 +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_u8, _mm256_adds_epu8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_s8, _mm256_adds_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_u16, _mm256_adds_epu16) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_s16, _mm256_adds_epi16) +#endif +// TODO: rest, after implment Packs intrins + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#ifdef NPY_HAVE_AVX512BW + #define npyv_sub_u8 _mm512_sub_epi8 + #define npyv_sub_u16 _mm512_sub_epi16 +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_sub_u8, _mm256_sub_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_sub_u16, _mm256_sub_epi16) +#endif +#define npyv_sub_s8 npyv_sub_u8 +#define npyv_sub_s16 npyv_sub_u16 +#define npyv_sub_u32 _mm512_sub_epi32 +#define npyv_sub_s32 _mm512_sub_epi32 +#define npyv_sub_u64 _mm512_sub_epi64 +#define npyv_sub_s64 _mm512_sub_epi64 +#define npyv_sub_f32 _mm512_sub_ps +#define npyv_sub_f64 _mm512_sub_pd + +// saturated +#ifdef NPY_HAVE_AVX512BW + #define npyv_subs_u8 _mm512_subs_epu8 + #define npyv_subs_s8 _mm512_subs_epi8 + #define npyv_subs_u16 _mm512_subs_epu16 + #define npyv_subs_s16 _mm512_subs_epi16 +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_u8, _mm256_subs_epu8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_s8, _mm256_subs_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_u16, _mm256_subs_epu16) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_s16, _mm256_subs_epi16) +#endif +// TODO: rest, after implment Packs intrins + +/*************************** + * Multiplication + ***************************/ +// non-saturated +#ifdef NPY_HAVE_AVX512BW +NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b) +{ + __m512i even = _mm512_mullo_epi16(a, b); + __m512i odd = _mm512_mullo_epi16(_mm512_srai_epi16(a, 8), _mm512_srai_epi16(b, 8)); + odd = _mm512_slli_epi16(odd, 8); + return _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, even, odd); +} +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_mul_u8, npyv256_mul_u8) +#endif + +#ifdef NPY_HAVE_AVX512BW + #define npyv_mul_u16 _mm512_mullo_epi16 +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_mul_u16, _mm256_mullo_epi16) +#endif +#define npyv_mul_s8 npyv_mul_u8 +#define npyv_mul_s16 npyv_mul_u16 +#define npyv_mul_u32 _mm512_mullo_epi32 +#define npyv_mul_s32 _mm512_mullo_epi32 +#define npyv_mul_f32 _mm512_mul_ps +#define npyv_mul_f64 _mm512_mul_pd + +// saturated +// TODO: after implment Packs intrins + +/*************************** + * Division + ***************************/ +// TODO: emulate integer division +#define npyv_div_f32 _mm512_div_ps +#define npyv_div_f64 _mm512_div_pd + +#endif // _NPY_SIMD_AVX512_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h new file mode 100644 index 000000000..96fdf72b9 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/avx512.h @@ -0,0 +1,71 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif +#define NPY_SIMD 512 +#define NPY_SIMD_WIDTH 64 +#define NPY_SIMD_F64 1 + +typedef __m512i npyv_u8; +typedef __m512i npyv_s8; +typedef __m512i npyv_u16; +typedef __m512i npyv_s16; +typedef __m512i npyv_u32; +typedef __m512i npyv_s32; +typedef __m512i npyv_u64; +typedef __m512i npyv_s64; +typedef __m512 npyv_f32; +typedef __m512d npyv_f64; + +#ifdef NPY_HAVE_AVX512BW +typedef __mmask64 npyv_b8; +typedef __mmask32 npyv_b16; +#else +typedef __m512i npyv_b8; +typedef __m512i npyv_b16; +#endif +typedef __mmask16 npyv_b32; +typedef __mmask8 npyv_b64; + +typedef struct { __m512i val[2]; } npyv_m512ix2; +typedef npyv_m512ix2 npyv_u8x2; +typedef npyv_m512ix2 npyv_s8x2; +typedef npyv_m512ix2 npyv_u16x2; +typedef npyv_m512ix2 npyv_s16x2; +typedef npyv_m512ix2 npyv_u32x2; +typedef npyv_m512ix2 npyv_s32x2; +typedef npyv_m512ix2 npyv_u64x2; +typedef npyv_m512ix2 npyv_s64x2; + +typedef struct { __m512i val[3]; } npyv_m512ix3; +typedef npyv_m512ix3 npyv_u8x3; +typedef npyv_m512ix3 npyv_s8x3; +typedef npyv_m512ix3 npyv_u16x3; +typedef npyv_m512ix3 npyv_s16x3; +typedef npyv_m512ix3 npyv_u32x3; +typedef npyv_m512ix3 npyv_s32x3; +typedef npyv_m512ix3 npyv_u64x3; +typedef npyv_m512ix3 npyv_s64x3; + +typedef struct { __m512 val[2]; } npyv_f32x2; +typedef struct { __m512d val[2]; } npyv_f64x2; +typedef struct { __m512 val[3]; } npyv_f32x3; +typedef struct { __m512d val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 64 +#define npyv_nlanes_s8 64 +#define npyv_nlanes_u16 32 +#define npyv_nlanes_s16 32 +#define npyv_nlanes_u32 16 +#define npyv_nlanes_s32 16 +#define npyv_nlanes_u64 8 +#define npyv_nlanes_s64 8 +#define npyv_nlanes_f32 16 +#define npyv_nlanes_f64 8 + +#include "utils.h" +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h new file mode 100644 index 000000000..0f7e27de3 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/conversion.h @@ -0,0 +1,54 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_CVT_H +#define _NPY_SIMD_AVX512_CVT_H + +// convert mask to integer vectors +#ifdef NPY_HAVE_AVX512BW + #define npyv_cvt_u8_b8 _mm512_movm_epi8 + #define npyv_cvt_u16_b16 _mm512_movm_epi16 +#else + #define npyv_cvt_u8_b8(BL) BL + #define npyv_cvt_u16_b16(BL) BL +#endif +#define npyv_cvt_s8_b8 npyv_cvt_u8_b8 +#define npyv_cvt_s16_b16 npyv_cvt_u16_b16 + +#ifdef NPY_HAVE_AVX512DQ + #define npyv_cvt_u32_b32 _mm512_movm_epi32 + #define npyv_cvt_u64_b64 _mm512_movm_epi64 +#else + #define npyv_cvt_u32_b32(BL) _mm512_maskz_set1_epi32(BL, (int)-1) + #define npyv_cvt_u64_b64(BL) _mm512_maskz_set1_epi64(BL, (npy_int64)-1) +#endif +#define npyv_cvt_s32_b32 npyv_cvt_u32_b32 +#define npyv_cvt_s64_b64 npyv_cvt_u64_b64 +#define npyv_cvt_f32_b32(BL) _mm512_castsi512_ps(npyv_cvt_u32_b32(BL)) +#define npyv_cvt_f64_b64(BL) _mm512_castsi512_pd(npyv_cvt_u64_b64(BL)) + +// convert integer vectors to mask +#ifdef NPY_HAVE_AVX512BW + #define npyv_cvt_b8_u8 _mm512_movepi8_mask + #define npyv_cvt_b16_u16 _mm512_movepi16_mask +#else + #define npyv_cvt_b8_u8(A) A + #define npyv_cvt_b16_u16(A) A +#endif +#define npyv_cvt_b8_s8 npyv_cvt_b8_u8 +#define npyv_cvt_b16_s16 npyv_cvt_b16_u16 + +#ifdef NPY_HAVE_AVX512DQ + #define npyv_cvt_b32_u32 _mm512_movepi32_mask + #define npyv_cvt_b64_u64 _mm512_movepi64_mask +#else + #define npyv_cvt_b32_u32(A) _mm512_cmpneq_epu32_mask(A, _mm512_setzero_si512()) + #define npyv_cvt_b64_u64(A) _mm512_cmpneq_epu64_mask(A, _mm512_setzero_si512()) +#endif +#define npyv_cvt_b32_s32 npyv_cvt_b32_u32 +#define npyv_cvt_b64_s64 npyv_cvt_b64_u64 +#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A)) +#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A)) + +#endif // _NPY_SIMD_AVX512_CVT_H diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h new file mode 100644 index 000000000..e212c4555 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/memory.h @@ -0,0 +1,94 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_MEMORY_H +#define _NPY_SIMD_AVX512_MEMORY_H + +#include "misc.h" + +/*************************** + * load/store + ***************************/ +#if defined(__GNUC__) + // GCC expect pointer argument type to be `void*` instead of `const void *`, + // which cause a massive warning. + #define npyv__loads(PTR) _mm512_stream_load_si512((__m512i*)(PTR)) +#else + #define npyv__loads(PTR) _mm512_stream_load_si512((const __m512i*)(PTR)) +#endif +#if defined(_MSC_VER) && defined(_M_IX86) + // workaround msvc(32bit) overflow bug, reported at + // https://developercommunity.visualstudio.com/content/problem/911872/u.html + NPY_FINLINE __m512i npyv__loadl(const __m256i *ptr) + { + __m256i a = _mm256_loadu_si256(ptr); + return _mm512_inserti64x4(_mm512_castsi256_si512(a), a, 0); + } +#else + #define npyv__loadl(PTR) \ + _mm512_castsi256_si512(_mm256_loadu_si256(PTR)) +#endif +#define NPYV_IMPL_AVX512_MEM_INT(CTYPE, SFX) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \ + { return _mm512_loadu_si512((const __m512i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \ + { return _mm512_load_si512((const __m512i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \ + { return npyv__loads(ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \ + { return npyv__loadl((const __m256i *)ptr); } \ + NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm512_storeu_si512((__m512i*)ptr, vec); } \ + NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm512_store_si512((__m512i*)ptr, vec); } \ + NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm512_stream_si512((__m512i*)ptr, vec); } \ + NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm256_storeu_si256((__m256i*)ptr, npyv512_lower_si256(vec)); } \ + NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm256_storeu_si256((__m256i*)(ptr), npyv512_higher_si256(vec)); } + +NPYV_IMPL_AVX512_MEM_INT(npy_uint8, u8) +NPYV_IMPL_AVX512_MEM_INT(npy_int8, s8) +NPYV_IMPL_AVX512_MEM_INT(npy_uint16, u16) +NPYV_IMPL_AVX512_MEM_INT(npy_int16, s16) +NPYV_IMPL_AVX512_MEM_INT(npy_uint32, u32) +NPYV_IMPL_AVX512_MEM_INT(npy_int32, s32) +NPYV_IMPL_AVX512_MEM_INT(npy_uint64, u64) +NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64) + +// unaligned load +#define npyv_load_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR)) +#define npyv_load_f64(PTR) _mm512_loadu_pd((const __m512d*)(PTR)) +// aligned load +#define npyv_loada_f32(PTR) _mm512_load_ps((const __m512*)(PTR)) +#define npyv_loada_f64(PTR) _mm512_load_pd((const __m512d*)(PTR)) +// load lower part +#if defined(_MSC_VER) && defined(_M_IX86) + #define npyv_loadl_f32(PTR) _mm512_castsi512_ps(npyv__loadl((const __m256i *)(PTR))) + #define npyv_loadl_f64(PTR) _mm512_castsi512_pd(npyv__loadl((const __m256i *)(PTR))) +#else + #define npyv_loadl_f32(PTR) _mm512_castps256_ps512(_mm256_loadu_ps(PTR)) + #define npyv_loadl_f64(PTR) _mm512_castpd256_pd512(_mm256_loadu_pd(PTR)) +#endif +// stream load +#define npyv_loads_f32(PTR) _mm512_castsi512_ps(npyv__loads(PTR)) +#define npyv_loads_f64(PTR) _mm512_castsi512_pd(npyv__loads(PTR)) +// unaligned store +#define npyv_store_f32 _mm512_storeu_ps +#define npyv_store_f64 _mm512_storeu_pd +// aligned store +#define npyv_storea_f32 _mm512_store_ps +#define npyv_storea_f64 _mm512_store_pd +// stream store +#define npyv_stores_f32 _mm512_stream_ps +#define npyv_stores_f64 _mm512_stream_pd +// store lower part +#define npyv_storel_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_lower_ps256(VEC)) +#define npyv_storel_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_lower_pd256(VEC)) +// store higher part +#define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC)) +#define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC)) + +#endif // _NPY_SIMD_AVX512_MEMORY_H diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h new file mode 100644 index 000000000..4b6729b05 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/misc.h @@ -0,0 +1,252 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_MISC_H +#define _NPY_SIMD_AVX512_MISC_H + +// set all lanes to zero +#define npyv_zero_u8 _mm512_setzero_si512 +#define npyv_zero_s8 _mm512_setzero_si512 +#define npyv_zero_u16 _mm512_setzero_si512 +#define npyv_zero_s16 _mm512_setzero_si512 +#define npyv_zero_u32 _mm512_setzero_si512 +#define npyv_zero_s32 _mm512_setzero_si512 +#define npyv_zero_u64 _mm512_setzero_si512 +#define npyv_zero_s64 _mm512_setzero_si512 +#define npyv_zero_f32 _mm512_setzero_ps +#define npyv_zero_f64 _mm512_setzero_pd + +// set all lanes to same value +#define npyv_setall_u8(VAL) _mm512_set1_epi8((char)VAL) +#define npyv_setall_s8(VAL) _mm512_set1_epi8((char)VAL) +#define npyv_setall_u16(VAL) _mm512_set1_epi16((short)VAL) +#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL) +#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL) +#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL) +#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL) +#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL) +#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL) +#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL) + +/** + * vector with specific values set to each lane and + * set a specific value to all remained lanes + * + * _mm512_set_epi8 and _mm512_set_epi16 are missing in many compilers + */ +NPY_FINLINE __m512i npyv__setr_epi8( + char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7, + char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15, + char i16, char i17, char i18, char i19, char i20, char i21, char i22, char i23, + char i24, char i25, char i26, char i27, char i28, char i29, char i30, char i31, + char i32, char i33, char i34, char i35, char i36, char i37, char i38, char i39, + char i40, char i41, char i42, char i43, char i44, char i45, char i46, char i47, + char i48, char i49, char i50, char i51, char i52, char i53, char i54, char i55, + char i56, char i57, char i58, char i59, char i60, char i61, char i62, char i63) +{ + const char NPY_DECL_ALIGNED(64) data[64] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31, + i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47, + i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63 + }; + return _mm512_load_si512((const void*)data); +} +NPY_FINLINE __m512i npyv__setr_epi16( + short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7, + short i8, short i9, short i10, short i11, short i12, short i13, short i14, short i15, + short i16, short i17, short i18, short i19, short i20, short i21, short i22, short i23, + short i24, short i25, short i26, short i27, short i28, short i29, short i30, short i31) +{ + const short NPY_DECL_ALIGNED(64) data[32] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, + i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31 + }; + return _mm512_load_si512((const void*)data); +} +// args that generated by NPYV__SET_FILL_* not going to expand if +// _mm512_setr_* are defined as macros. +NPY_FINLINE __m512i npyv__setr_epi32( + int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7, + int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15) +{ + return _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); +} +NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3, + npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7) +{ + return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7); +} + +NPY_FINLINE __m512 npyv__setr_ps( + float i0, float i1, float i2, float i3, float i4, float i5, float i6, float i7, + float i8, float i9, float i10, float i11, float i12, float i13, float i14, float i15) +{ + return _mm512_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); +} +NPY_FINLINE __m512d npyv__setr_pd(double i0, double i1, double i2, double i3, + double i4, double i5, double i6, double i7) +{ + return _mm512_setr_pd(i0, i1, i2, i3, i4, i5, i6, i7); +} +#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_16(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_8(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// per lane select +#ifdef NPY_HAVE_AVX512BW + #define npyv_select_u8(MASK, A, B) _mm512_mask_blend_epi8(MASK, B, A) + #define npyv_select_u16(MASK, A, B) _mm512_mask_blend_epi16(MASK, B, A) +#else + NPY_FINLINE __m512i npyv_select_u8(__m512i mask, __m512i a, __m512i b) + { return _mm512_xor_si512(b, _mm512_and_si512(_mm512_xor_si512(b, a), mask)); } + #define npyv_select_u16 npyv_select_u8 +#endif +#define npyv_select_s8 npyv_select_u8 +#define npyv_select_s16 npyv_select_u16 +#define npyv_select_u32(MASK, A, B) _mm512_mask_blend_epi32(MASK, B, A) +#define npyv_select_s32 npyv_select_u32 +#define npyv_select_u64(MASK, A, B) _mm512_mask_blend_epi64(MASK, B, A) +#define npyv_select_s64 npyv_select_u64 +#define npyv_select_f32(MASK, A, B) _mm512_mask_blend_ps(MASK, B, A) +#define npyv_select_f64(MASK, A, B) _mm512_mask_blend_pd(MASK, B, A) + +// reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8(X) X +#define npyv_reinterpret_u8_u16(X) X +#define npyv_reinterpret_u8_s16(X) X +#define npyv_reinterpret_u8_u32(X) X +#define npyv_reinterpret_u8_s32(X) X +#define npyv_reinterpret_u8_u64(X) X +#define npyv_reinterpret_u8_s64(X) X +#define npyv_reinterpret_u8_f32 _mm512_castps_si512 +#define npyv_reinterpret_u8_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8(X) X +#define npyv_reinterpret_s8_u16(X) X +#define npyv_reinterpret_s8_s16(X) X +#define npyv_reinterpret_s8_u32(X) X +#define npyv_reinterpret_s8_s32(X) X +#define npyv_reinterpret_s8_u64(X) X +#define npyv_reinterpret_s8_s64(X) X +#define npyv_reinterpret_s8_f32 _mm512_castps_si512 +#define npyv_reinterpret_s8_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8(X) X +#define npyv_reinterpret_u16_s8(X) X +#define npyv_reinterpret_u16_s16(X) X +#define npyv_reinterpret_u16_u32(X) X +#define npyv_reinterpret_u16_s32(X) X +#define npyv_reinterpret_u16_u64(X) X +#define npyv_reinterpret_u16_s64(X) X +#define npyv_reinterpret_u16_f32 _mm512_castps_si512 +#define npyv_reinterpret_u16_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8(X) X +#define npyv_reinterpret_s16_s8(X) X +#define npyv_reinterpret_s16_u16(X) X +#define npyv_reinterpret_s16_u32(X) X +#define npyv_reinterpret_s16_s32(X) X +#define npyv_reinterpret_s16_u64(X) X +#define npyv_reinterpret_s16_s64(X) X +#define npyv_reinterpret_s16_f32 _mm512_castps_si512 +#define npyv_reinterpret_s16_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8(X) X +#define npyv_reinterpret_u32_s8(X) X +#define npyv_reinterpret_u32_u16(X) X +#define npyv_reinterpret_u32_s16(X) X +#define npyv_reinterpret_u32_s32(X) X +#define npyv_reinterpret_u32_u64(X) X +#define npyv_reinterpret_u32_s64(X) X +#define npyv_reinterpret_u32_f32 _mm512_castps_si512 +#define npyv_reinterpret_u32_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_s32_s32(X) X +#define npyv_reinterpret_s32_u8(X) X +#define npyv_reinterpret_s32_s8(X) X +#define npyv_reinterpret_s32_u16(X) X +#define npyv_reinterpret_s32_s16(X) X +#define npyv_reinterpret_s32_u32(X) X +#define npyv_reinterpret_s32_u64(X) X +#define npyv_reinterpret_s32_s64(X) X +#define npyv_reinterpret_s32_f32 _mm512_castps_si512 +#define npyv_reinterpret_s32_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8(X) X +#define npyv_reinterpret_u64_s8(X) X +#define npyv_reinterpret_u64_u16(X) X +#define npyv_reinterpret_u64_s16(X) X +#define npyv_reinterpret_u64_u32(X) X +#define npyv_reinterpret_u64_s32(X) X +#define npyv_reinterpret_u64_s64(X) X +#define npyv_reinterpret_u64_f32 _mm512_castps_si512 +#define npyv_reinterpret_u64_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8(X) X +#define npyv_reinterpret_s64_s8(X) X +#define npyv_reinterpret_s64_u16(X) X +#define npyv_reinterpret_s64_s16(X) X +#define npyv_reinterpret_s64_u32(X) X +#define npyv_reinterpret_s64_s32(X) X +#define npyv_reinterpret_s64_u64(X) X +#define npyv_reinterpret_s64_f32 _mm512_castps_si512 +#define npyv_reinterpret_s64_f64 _mm512_castpd_si512 + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8 _mm512_castsi512_ps +#define npyv_reinterpret_f32_s8 _mm512_castsi512_ps +#define npyv_reinterpret_f32_u16 _mm512_castsi512_ps +#define npyv_reinterpret_f32_s16 _mm512_castsi512_ps +#define npyv_reinterpret_f32_u32 _mm512_castsi512_ps +#define npyv_reinterpret_f32_s32 _mm512_castsi512_ps +#define npyv_reinterpret_f32_u64 _mm512_castsi512_ps +#define npyv_reinterpret_f32_s64 _mm512_castsi512_ps +#define npyv_reinterpret_f32_f64 _mm512_castpd_ps + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8 _mm512_castsi512_pd +#define npyv_reinterpret_f64_s8 _mm512_castsi512_pd +#define npyv_reinterpret_f64_u16 _mm512_castsi512_pd +#define npyv_reinterpret_f64_s16 _mm512_castsi512_pd +#define npyv_reinterpret_f64_u32 _mm512_castsi512_pd +#define npyv_reinterpret_f64_s32 _mm512_castsi512_pd +#define npyv_reinterpret_f64_u64 _mm512_castsi512_pd +#define npyv_reinterpret_f64_s64 _mm512_castsi512_pd +#define npyv_reinterpret_f64_f32 _mm512_castps_pd + +#ifdef NPY_HAVE_AVX512_KNL + #define npyv_cleanup() ((void)0) +#else + #define npyv_cleanup _mm256_zeroall +#endif + +#endif // _NPY_SIMD_AVX512_MISC_H diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h new file mode 100644 index 000000000..f76ea5e2d --- /dev/null +++ b/numpy/core/src/common/simd/avx512/operators.h @@ -0,0 +1,259 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_OPERATORS_H +#define _NPY_SIMD_AVX512_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// left +#ifdef NPY_HAVE_AVX512BW + #define npyv_shl_u16(A, C) _mm512_sll_epi16(A, _mm_cvtsi32_si128(C)) +#else + #define NPYV_IMPL_AVX512_SHIFT(FN, INTRIN) \ + NPY_FINLINE __m512i npyv_##FN(__m512i a, int c) \ + { \ + __m256i l = npyv512_lower_si256(a); \ + __m256i h = npyv512_higher_si256(a); \ + __m128i cv = _mm_cvtsi32_si128(c); \ + l = _mm256_##INTRIN(l, cv); \ + h = _mm256_##INTRIN(h, cv); \ + return npyv512_combine_si256(l, h); \ + } + + NPYV_IMPL_AVX512_SHIFT(shl_u16, sll_epi16) +#endif +#define npyv_shl_s16 npyv_shl_u16 +#define npyv_shl_u32(A, C) _mm512_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s32(A, C) _mm512_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_u64(A, C) _mm512_sll_epi64(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s64(A, C) _mm512_sll_epi64(A, _mm_cvtsi32_si128(C)) + +// left by an immediate constant +#ifdef NPY_HAVE_AVX512BW + #define npyv_shli_u16 _mm512_slli_epi16 +#else + #define npyv_shli_u16 npyv_shl_u16 +#endif +#define npyv_shli_s16 npyv_shl_u16 +#define npyv_shli_u32 _mm512_slli_epi32 +#define npyv_shli_s32 _mm512_slli_epi32 +#define npyv_shli_u64 _mm512_slli_epi64 +#define npyv_shli_s64 _mm512_slli_epi64 + +// right +#ifdef NPY_HAVE_AVX512BW + #define npyv_shr_u16(A, C) _mm512_srl_epi16(A, _mm_cvtsi32_si128(C)) + #define npyv_shr_s16(A, C) _mm512_sra_epi16(A, _mm_cvtsi32_si128(C)) +#else + NPYV_IMPL_AVX512_SHIFT(shr_u16, srl_epi16) + NPYV_IMPL_AVX512_SHIFT(shr_s16, sra_epi16) +#endif +#define npyv_shr_u32(A, C) _mm512_srl_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s32(A, C) _mm512_sra_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_u64(A, C) _mm512_srl_epi64(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s64(A, C) _mm512_sra_epi64(A, _mm_cvtsi32_si128(C)) + +// right by an immediate constant +#ifdef NPY_HAVE_AVX512BW + #define npyv_shri_u16 _mm512_srli_epi16 + #define npyv_shri_s16 _mm512_srai_epi16 +#else + #define npyv_shri_u16 npyv_shr_u16 + #define npyv_shri_s16 npyv_shr_s16 +#endif +#define npyv_shri_u32 _mm512_srli_epi32 +#define npyv_shri_s32 _mm512_srai_epi32 +#define npyv_shri_u64 _mm512_srli_epi64 +#define npyv_shri_s64 _mm512_srai_epi64 + +/*************************** + * Logical + ***************************/ + +// AND +#define npyv_and_u8 _mm512_and_si512 +#define npyv_and_s8 _mm512_and_si512 +#define npyv_and_u16 _mm512_and_si512 +#define npyv_and_s16 _mm512_and_si512 +#define npyv_and_u32 _mm512_and_si512 +#define npyv_and_s32 _mm512_and_si512 +#define npyv_and_u64 _mm512_and_si512 +#define npyv_and_s64 _mm512_and_si512 +#ifdef NPY_HAVE_AVX512DQ + #define npyv_and_f32 _mm512_and_ps + #define npyv_and_f64 _mm512_and_pd +#else + NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512) + NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512) +#endif + +// OR +#define npyv_or_u8 _mm512_or_si512 +#define npyv_or_s8 _mm512_or_si512 +#define npyv_or_u16 _mm512_or_si512 +#define npyv_or_s16 _mm512_or_si512 +#define npyv_or_u32 _mm512_or_si512 +#define npyv_or_s32 _mm512_or_si512 +#define npyv_or_u64 _mm512_or_si512 +#define npyv_or_s64 _mm512_or_si512 +#ifdef NPY_HAVE_AVX512DQ + #define npyv_or_f32 _mm512_or_ps + #define npyv_or_f64 _mm512_or_pd +#else + NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512) + NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512) +#endif + +// XOR +#define npyv_xor_u8 _mm512_xor_si512 +#define npyv_xor_s8 _mm512_xor_si512 +#define npyv_xor_u16 _mm512_xor_si512 +#define npyv_xor_s16 _mm512_xor_si512 +#define npyv_xor_u32 _mm512_xor_si512 +#define npyv_xor_s32 _mm512_xor_si512 +#define npyv_xor_u64 _mm512_xor_si512 +#define npyv_xor_s64 _mm512_xor_si512 +#ifdef NPY_HAVE_AVX512DQ + #define npyv_xor_f32 _mm512_xor_ps + #define npyv_xor_f64 _mm512_xor_pd +#else + NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512) + NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512) +#endif + +// NOT +#define npyv_not_u8(A) _mm512_xor_si512(A, _mm512_set1_epi32(-1)) +#define npyv_not_s8 npyv_not_u8 +#define npyv_not_u16 npyv_not_u8 +#define npyv_not_s16 npyv_not_u8 +#define npyv_not_u32 npyv_not_u8 +#define npyv_not_s32 npyv_not_u8 +#define npyv_not_u64 npyv_not_u8 +#define npyv_not_s64 npyv_not_u8 +#ifdef NPY_HAVE_AVX512DQ + #define npyv_not_f32(A) _mm512_xor_ps(A, _mm512_castsi512_ps(_mm512_set1_epi32(-1))) + #define npyv_not_f64(A) _mm512_xor_pd(A, _mm512_castsi512_pd(_mm512_set1_epi32(-1))) +#else + #define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A))) + #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A))) +#endif + +/*************************** + * Comparison + ***************************/ + +// int Equal +#ifdef NPY_HAVE_AVX512BW + #define npyv_cmpeq_u8 _mm512_cmpeq_epu8_mask + #define npyv_cmpeq_s8 _mm512_cmpeq_epi8_mask + #define npyv_cmpeq_u16 _mm512_cmpeq_epu16_mask + #define npyv_cmpeq_s16 _mm512_cmpeq_epi16_mask +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpeq_u8, _mm256_cmpeq_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpeq_u16, _mm256_cmpeq_epi16) + #define npyv_cmpeq_s8 npyv_cmpeq_u8 + #define npyv_cmpeq_s16 npyv_cmpeq_u16 +#endif +#define npyv_cmpeq_u32 _mm512_cmpeq_epu32_mask +#define npyv_cmpeq_s32 _mm512_cmpeq_epi32_mask +#define npyv_cmpeq_u64 _mm512_cmpeq_epu64_mask +#define npyv_cmpeq_s64 _mm512_cmpeq_epi64_mask + +// int not equal +#ifdef NPY_HAVE_AVX512BW + #define npyv_cmpneq_u8 _mm512_cmpneq_epu8_mask + #define npyv_cmpneq_s8 _mm512_cmpneq_epi8_mask + #define npyv_cmpneq_u16 _mm512_cmpneq_epu16_mask + #define npyv_cmpneq_s16 _mm512_cmpneq_epi16_mask +#else + #define npyv_cmpneq_u8(A, B) npyv_not_u8(npyv_cmpeq_u8(A, B)) + #define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B)) + #define npyv_cmpneq_s8 npyv_cmpneq_u8 + #define npyv_cmpneq_s16 npyv_cmpneq_u16 +#endif +#define npyv_cmpneq_u32 _mm512_cmpneq_epu32_mask +#define npyv_cmpneq_s32 _mm512_cmpneq_epi32_mask +#define npyv_cmpneq_u64 _mm512_cmpneq_epu64_mask +#define npyv_cmpneq_s64 _mm512_cmpneq_epi64_mask + +// greater than +#ifdef NPY_HAVE_AVX512BW + #define npyv_cmpgt_u8 _mm512_cmpgt_epu8_mask + #define npyv_cmpgt_s8 _mm512_cmpgt_epi8_mask + #define npyv_cmpgt_u16 _mm512_cmpgt_epu16_mask + #define npyv_cmpgt_s16 _mm512_cmpgt_epi16_mask +#else + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpgt_s8, _mm256_cmpgt_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpgt_s16, _mm256_cmpgt_epi16) + NPY_FINLINE __m512i npyv_cmpgt_u8(__m512i a, __m512i b) + { + const __m512i sbit = _mm512_set1_epi32(0x80808080); + return npyv_cmpgt_s8(_mm512_xor_si512(a, sbit), _mm512_xor_si512(b, sbit)); + } + NPY_FINLINE __m512i npyv_cmpgt_u16(__m512i a, __m512i b) + { + const __m512i sbit = _mm512_set1_epi32(0x80008000); + return npyv_cmpgt_s16(_mm512_xor_si512(a, sbit), _mm512_xor_si512(b, sbit)); + } +#endif +#define npyv_cmpgt_u32 _mm512_cmpgt_epu32_mask +#define npyv_cmpgt_s32 _mm512_cmpgt_epi32_mask +#define npyv_cmpgt_u64 _mm512_cmpgt_epu64_mask +#define npyv_cmpgt_s64 _mm512_cmpgt_epi64_mask + +// greater than or equal +#ifdef NPY_HAVE_AVX512BW + #define npyv_cmpge_u8 _mm512_cmpge_epu8_mask + #define npyv_cmpge_s8 _mm512_cmpge_epi8_mask + #define npyv_cmpge_u16 _mm512_cmpge_epu16_mask + #define npyv_cmpge_s16 _mm512_cmpge_epi16_mask +#else + #define npyv_cmpge_u8(A, B) npyv_not_u8(npyv_cmpgt_u8(B, A)) + #define npyv_cmpge_s8(A, B) npyv_not_s8(npyv_cmpgt_s8(B, A)) + #define npyv_cmpge_u16(A, B) npyv_not_u16(npyv_cmpgt_u16(B, A)) + #define npyv_cmpge_s16(A, B) npyv_not_s16(npyv_cmpgt_s16(B, A)) +#endif +#define npyv_cmpge_u32 _mm512_cmpge_epu32_mask +#define npyv_cmpge_s32 _mm512_cmpge_epi32_mask +#define npyv_cmpge_u64 _mm512_cmpge_epu64_mask +#define npyv_cmpge_s64 _mm512_cmpge_epi64_mask + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) + +// precision comparison +#define npyv_cmpeq_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_EQ_OQ) +#define npyv_cmpeq_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_EQ_OQ) +#define npyv_cmpneq_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_NEQ_OQ) +#define npyv_cmpneq_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_NEQ_OQ) +#define npyv_cmplt_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ) +#define npyv_cmplt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_LT_OQ) +#define npyv_cmple_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ) +#define npyv_cmple_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_LE_OQ) +#define npyv_cmpgt_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ) +#define npyv_cmpgt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GT_OQ) +#define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ) +#define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ) + +#endif // _NPY_SIMD_AVX512_OPERATORS_H diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h new file mode 100644 index 000000000..cdbae7aac --- /dev/null +++ b/numpy/core/src/common/simd/avx512/reorder.h @@ -0,0 +1,170 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_REORDER_H +#define _NPY_SIMD_AVX512_REORDER_H + +// combine lower part of two vectors +#define npyv_combinel_u8(A, B) _mm512_inserti64x4(A, _mm512_castsi512_si256(B), 1) +#define npyv_combinel_s8 npyv_combinel_u8 +#define npyv_combinel_u16 npyv_combinel_u8 +#define npyv_combinel_s16 npyv_combinel_u8 +#define npyv_combinel_u32 npyv_combinel_u8 +#define npyv_combinel_s32 npyv_combinel_u8 +#define npyv_combinel_u64 npyv_combinel_u8 +#define npyv_combinel_s64 npyv_combinel_u8 +#define npyv_combinel_f64(A, B) _mm512_insertf64x4(A, _mm512_castpd512_pd256(B), 1) +#ifdef NPY_HAVE_AVX512DQ + #define npyv_combinel_f32(A, B) \ + _mm512_insertf32x8(A, _mm512_castps512_ps256(B), 1) +#else + #define npyv_combinel_f32(A, B) \ + _mm512_castsi512_ps(npyv_combinel_u8(_mm512_castps_si512(A), _mm512_castps_si512(B))) +#endif + +// combine higher part of two vectors +#define npyv_combineh_u8(A, B) _mm512_inserti64x4(B, _mm512_extracti64x4_epi64(A, 1), 0) +#define npyv_combineh_s8 npyv_combineh_u8 +#define npyv_combineh_u16 npyv_combineh_u8 +#define npyv_combineh_s16 npyv_combineh_u8 +#define npyv_combineh_u32 npyv_combineh_u8 +#define npyv_combineh_s32 npyv_combineh_u8 +#define npyv_combineh_u64 npyv_combineh_u8 +#define npyv_combineh_s64 npyv_combineh_u8 +#define npyv_combineh_f64(A, B) _mm512_insertf64x4(B, _mm512_extractf64x4_pd(A, 1), 0) +#ifdef NPY_HAVE_AVX512DQ + #define npyv_combineh_f32(A, B) \ + _mm512_insertf32x8(B, _mm512_extractf32x8_ps(A, 1), 0) +#else + #define npyv_combineh_f32(A, B) \ + _mm512_castsi512_ps(npyv_combineh_u8(_mm512_castps_si512(A), _mm512_castps_si512(B))) +#endif + +// combine two vectors from lower and higher parts of two other vectors +NPY_FINLINE npyv_m512ix2 npyv__combine(__m512i a, __m512i b) +{ + npyv_m512ix2 r; + r.val[0] = npyv_combinel_u8(a, b); + r.val[1] = npyv_combineh_u8(a, b); + return r; +} +NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m512 a, __m512 b) +{ + npyv_f32x2 r; + r.val[0] = npyv_combinel_f32(a, b); + r.val[1] = npyv_combineh_f32(a, b); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m512d a, __m512d b) +{ + npyv_f64x2 r; + r.val[0] = npyv_combinel_f64(a, b); + r.val[1] = npyv_combineh_f64(a, b); + return r; +} +#define npyv_combine_u8 npyv__combine +#define npyv_combine_s8 npyv__combine +#define npyv_combine_u16 npyv__combine +#define npyv_combine_s16 npyv__combine +#define npyv_combine_u32 npyv__combine +#define npyv_combine_s32 npyv__combine +#define npyv_combine_u64 npyv__combine +#define npyv_combine_s64 npyv__combine + +// interleave two vectors +#ifndef NPY_HAVE_AVX512BW + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpacklo_epi8, _mm256_unpacklo_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpackhi_epi8, _mm256_unpackhi_epi8) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpacklo_epi16, _mm256_unpacklo_epi16) + NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpackhi_epi16, _mm256_unpackhi_epi16) +#endif + +NPY_FINLINE npyv_u64x2 npyv_zip_u64(__m512i a, __m512i b) +{ + npyv_u64x2 r; + r.val[0] = _mm512_permutex2var_epi64(a, npyv_set_u64(0, 8, 1, 9, 2, 10, 3, 11), b); + r.val[1] = _mm512_permutex2var_epi64(a, npyv_set_u64(4, 12, 5, 13, 6, 14, 7, 15), b); + return r; +} +#define npyv_zip_s64 npyv_zip_u64 + +NPY_FINLINE npyv_u8x2 npyv_zip_u8(__m512i a, __m512i b) +{ + npyv_u8x2 r; +#ifdef NPY_HAVE_AVX512VBMI + r.val[0] = _mm512_permutex2var_epi8(a, + npyv_set_u8(0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71, + 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79, + 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87, + 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95), b); + r.val[1] = _mm512_permutex2var_epi8(a, + npyv_set_u8(32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103, + 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111, + 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119, + 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127), b); +#else + #ifdef NPY_HAVE_AVX512BW + __m512i ab0 = _mm512_unpacklo_epi8(a, b); + __m512i ab1 = _mm512_unpackhi_epi8(a, b); + #else + __m512i ab0 = npyv__unpacklo_epi8(a, b); + __m512i ab1 = npyv__unpackhi_epi8(a, b); + #endif + r.val[0] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(0, 1, 8, 9, 2, 3, 10, 11), ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(4, 5, 12, 13, 6, 7, 14, 15), ab1); +#endif + return r; +} +#define npyv_zip_s8 npyv_zip_u8 + +NPY_FINLINE npyv_u16x2 npyv_zip_u16(__m512i a, __m512i b) +{ + npyv_u16x2 r; +#ifdef NPY_HAVE_AVX512BW + r.val[0] = _mm512_permutex2var_epi16(a, + npyv_set_u16(0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47), b); + r.val[1] = _mm512_permutex2var_epi16(a, + npyv_set_u16(16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63), b); +#else + __m512i ab0 = npyv__unpacklo_epi16(a, b); + __m512i ab1 = npyv__unpackhi_epi16(a, b); + r.val[0] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(0, 1, 8, 9, 2, 3, 10, 11), ab1); + r.val[1] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(4, 5, 12, 13, 6, 7, 14, 15), ab1); +#endif + return r; +} +#define npyv_zip_s16 npyv_zip_u16 + +NPY_FINLINE npyv_u32x2 npyv_zip_u32(__m512i a, __m512i b) +{ + npyv_u32x2 r; + r.val[0] = _mm512_permutex2var_epi32(a, + npyv_set_u32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b); + r.val[1] = _mm512_permutex2var_epi32(a, + npyv_set_u32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b); + return r; +} +#define npyv_zip_s32 npyv_zip_u32 + +NPY_FINLINE npyv_f32x2 npyv_zip_f32(__m512 a, __m512 b) +{ + npyv_f32x2 r; + r.val[0] = _mm512_permutex2var_ps(a, + npyv_set_u32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b); + r.val[1] = _mm512_permutex2var_ps(a, + npyv_set_u32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b); + return r; +} + +NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b) +{ + npyv_f64x2 r; + r.val[0] = _mm512_permutex2var_pd(a, npyv_set_u64(0, 8, 1, 9, 2, 10, 3, 11), b); + r.val[1] = _mm512_permutex2var_pd(a, npyv_set_u64(4, 12, 5, 13, 6, 14, 7, 15), b); + return r; +} + +#endif // _NPY_SIMD_AVX512_REORDER_H diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h new file mode 100644 index 000000000..8066283c6 --- /dev/null +++ b/numpy/core/src/common/simd/avx512/utils.h @@ -0,0 +1,70 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_AVX512_UTILS_H +#define _NPY_SIMD_AVX512_UTILS_H + +#define npyv512_lower_si256 _mm512_castsi512_si256 +#define npyv512_lower_ps256 _mm512_castps512_ps256 +#define npyv512_lower_pd256 _mm512_castpd512_pd256 + +#define npyv512_higher_si256(A) _mm512_extracti64x4_epi64(A, 1) +#define npyv512_higher_pd256(A) _mm512_extractf64x4_pd(A, 1) + +#ifdef NPY_HAVE_AVX512DQ + #define npyv512_higher_ps256(A) _mm512_extractf32x8_ps(A, 1) +#else + #define npyv512_higher_ps256(A) \ + _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(A), 1)) +#endif + +#define npyv512_combine_si256(A, B) _mm512_inserti64x4(_mm512_castsi256_si512(A), B, 1) +#define npyv512_combine_pd256(A, B) _mm512_insertf64x4(_mm512_castpd256_pd512(A), B, 1) + +#ifdef NPY_HAVE_AVX512DQ + #define npyv512_combine_ps256(A, B) _mm512_insertf32x8(_mm512_castps256_ps512(A), B, 1) +#else + #define npyv512_combine_ps256(A, B) \ + _mm512_castsi512_ps(npyv512_combine_si256(_mm512_castps_si512(A), _mm512_castps_si512(B))) +#endif + +#define NPYV_IMPL_AVX512_FROM_AVX2_1ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512i FN_NAME(__m512i a) \ + { \ + __m256i l_a = npyv512_lower_si256(a); \ + __m256i h_a = npyv512_higher_si256(a); \ + l_a = INTRIN(l_a); \ + h_a = INTRIN(h_a); \ + return npyv512_combine_si256(l_a, h_a); \ + } + +#define NPYV_IMPL_AVX512_FROM_AVX2_2ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512i FN_NAME(__m512i a, __m512i b) \ + { \ + __m256i l_a = npyv512_lower_si256(a); \ + __m256i h_a = npyv512_higher_si256(a); \ + __m256i l_b = npyv512_lower_si256(b); \ + __m256i h_b = npyv512_higher_si256(b); \ + l_a = INTRIN(l_a, l_b); \ + h_a = INTRIN(h_a, h_b); \ + return npyv512_combine_si256(l_a, h_a); \ + } + +#define NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512 FN_NAME(__m512 a, __m512 b) \ + { \ + return _mm512_castsi512_ps(INTRIN( \ + _mm512_castps_si512(a), _mm512_castps_si512(b) \ + )); \ + } + +#define NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(FN_NAME, INTRIN) \ + NPY_FINLINE __m512d FN_NAME(__m512d a, __m512d b) \ + { \ + return _mm512_castsi512_pd(INTRIN( \ + _mm512_castpd_si512(a), _mm512_castpd_si512(b) \ + )); \ + } + +#endif // _NPY_SIMD_AVX512_UTILS_H diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h new file mode 100644 index 000000000..ec8b8ecd0 --- /dev/null +++ b/numpy/core/src/common/simd/neon/arithmetic.h @@ -0,0 +1,78 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_ARITHMETIC_H +#define _NPY_SIMD_NEON_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +#define npyv_add_u8 vaddq_u8 +#define npyv_add_s8 vaddq_s8 +#define npyv_add_u16 vaddq_u16 +#define npyv_add_s16 vaddq_s16 +#define npyv_add_u32 vaddq_u32 +#define npyv_add_s32 vaddq_s32 +#define npyv_add_u64 vaddq_u64 +#define npyv_add_s64 vaddq_s64 +#define npyv_add_f32 vaddq_f32 +#define npyv_add_f64 vaddq_f64 + +// saturated +#define npyv_adds_u8 vqaddq_u8 +#define npyv_adds_s8 vqaddq_s8 +#define npyv_adds_u16 vqaddq_u16 +#define npyv_adds_s16 vqaddq_s16 + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#define npyv_sub_u8 vsubq_u8 +#define npyv_sub_s8 vsubq_s8 +#define npyv_sub_u16 vsubq_u16 +#define npyv_sub_s16 vsubq_s16 +#define npyv_sub_u32 vsubq_u32 +#define npyv_sub_s32 vsubq_s32 +#define npyv_sub_u64 vsubq_u64 +#define npyv_sub_s64 vsubq_s64 +#define npyv_sub_f32 vsubq_f32 +#define npyv_sub_f64 vsubq_f64 + +// saturated +#define npyv_subs_u8 vqsubq_u8 +#define npyv_subs_s8 vqsubq_s8 +#define npyv_subs_u16 vqsubq_u16 +#define npyv_subs_s16 vqsubq_s16 + +/*************************** + * Multiplication + ***************************/ +// non-saturated +#define npyv_mul_u8 vmulq_u8 +#define npyv_mul_s8 vmulq_s8 +#define npyv_mul_u16 vmulq_u16 +#define npyv_mul_s16 vmulq_s16 +#define npyv_mul_u32 vmulq_u32 +#define npyv_mul_s32 vmulq_s32 +#define npyv_mul_f32 vmulq_f32 +#define npyv_mul_f64 vmulq_f64 + +/*************************** + * Division + ***************************/ +#ifdef __aarch64__ + #define npyv_div_f32 vdivq_f32 +#else + NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b) + { + float32x4_t recip = vrecpeq_f32(b); + recip = vmulq_f32(vrecpsq_f32(b, recip), recip); + return vmulq_f32(a, recip); + } +#endif +#define npyv_div_f64 vdivq_f64 + +#endif // _NPY_SIMD_NEON_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h new file mode 100644 index 000000000..b286931d1 --- /dev/null +++ b/numpy/core/src/common/simd/neon/conversion.h @@ -0,0 +1,32 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_CVT_H +#define _NPY_SIMD_NEON_CVT_H + +// convert boolean vectors to integer vectors +#define npyv_cvt_u8_b8(A) A +#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A) +#define npyv_cvt_u16_b16(A) A +#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A) +#define npyv_cvt_u32_b32(A) A +#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A) +#define npyv_cvt_u64_b64(A) A +#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A) +#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A) +#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A) + +// convert integer vectors to boolean vectors +#define npyv_cvt_b8_u8(BL) BL +#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL) +#define npyv_cvt_b16_u16(BL) BL +#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL) +#define npyv_cvt_b32_u32(BL) BL +#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL) +#define npyv_cvt_b64_u64(BL) BL +#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL) +#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL) +#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL) + +#endif // _NPY_SIMD_NEON_CVT_H diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h new file mode 100644 index 000000000..afa703584 --- /dev/null +++ b/numpy/core/src/common/simd/neon/memory.h @@ -0,0 +1,49 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_MEMORY_H +#define _NPY_SIMD_NEON_MEMORY_H + +/*************************** + * load/store + ***************************/ +// GCC requires literal type definitions for pointers types otherwise it causes ambiguous errors +#define NPYV_IMPL_NEON_MEM(SFX, CTYPE) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return vld1q_##SFX((const CTYPE*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return vld1q_##SFX((const CTYPE*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \ + { return vld1q_##SFX((const CTYPE*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \ + { \ + return vcombine_##SFX( \ + vld1_##SFX((const CTYPE*)ptr), vdup_n_##SFX(0) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vst1q_##SFX((CTYPE*)ptr, vec); } \ + NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vst1q_##SFX((CTYPE*)ptr, vec); } \ + NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vst1q_##SFX((CTYPE*)ptr, vec); } \ + NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vst1_##SFX((CTYPE*)ptr, vget_low_##SFX(vec)); } \ + NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \ + { vst1_##SFX((CTYPE*)ptr, vget_high_##SFX(vec)); } + +NPYV_IMPL_NEON_MEM(u8, uint8_t) +NPYV_IMPL_NEON_MEM(s8, int8_t) +NPYV_IMPL_NEON_MEM(u16, uint16_t) +NPYV_IMPL_NEON_MEM(s16, int16_t) +NPYV_IMPL_NEON_MEM(u32, uint32_t) +NPYV_IMPL_NEON_MEM(s32, int32_t) +NPYV_IMPL_NEON_MEM(u64, uint64_t) +NPYV_IMPL_NEON_MEM(s64, int64_t) +NPYV_IMPL_NEON_MEM(f32, float) +#if NPY_SIMD_F64 +NPYV_IMPL_NEON_MEM(f64, double) +#endif + +#endif // _NPY_SIMD_NEON_MEMORY_H diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h new file mode 100644 index 000000000..51b0c3858 --- /dev/null +++ b/numpy/core/src/common/simd/neon/misc.h @@ -0,0 +1,255 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_MISC_H +#define _NPY_SIMD_NEON_MISC_H + +// vector with zero lanes +#define npyv_zero_u8() vreinterpretq_u8_s32(npyv_zero_s32()) +#define npyv_zero_s8() vreinterpretq_s8_s32(npyv_zero_s32()) +#define npyv_zero_u16() vreinterpretq_u16_s32(npyv_zero_s32()) +#define npyv_zero_s16() vreinterpretq_s16_s32(npyv_zero_s32()) +#define npyv_zero_u32() vdupq_n_u32((unsigned)0) +#define npyv_zero_s32() vdupq_n_s32((int)0) +#define npyv_zero_u64() vreinterpretq_u64_s32(npyv_zero_s32()) +#define npyv_zero_s64() vreinterpretq_s64_s32(npyv_zero_s32()) +#define npyv_zero_f32() vdupq_n_f32(0.0f) +#define npyv_zero_f64() vdupq_n_f64(0.0) + +// vector with a specific value set to all lanes +#define npyv_setall_u8 vdupq_n_u8 +#define npyv_setall_s8 vdupq_n_s8 +#define npyv_setall_u16 vdupq_n_u16 +#define npyv_setall_s16 vdupq_n_s16 +#define npyv_setall_u32 vdupq_n_u32 +#define npyv_setall_s32 vdupq_n_s32 +#define npyv_setall_u64 vdupq_n_u64 +#define npyv_setall_s64 vdupq_n_s64 +#define npyv_setall_f32 vdupq_n_f32 +#define npyv_setall_f64 vdupq_n_f64 + +// vector with specific values set to each lane and +// set a specific value to all remained lanes +NPY_FINLINE uint8x16_t npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3, + npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9, + npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15) +{ + const uint8_t NPY_DECL_ALIGNED(16) data[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 + }; + return vld1q_u8(data); +} +#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__)) + +NPY_FINLINE int8x16_t npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3, + npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9, + npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15) +{ + const int8_t NPY_DECL_ALIGNED(16) data[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 + }; + return vld1q_s8(data); +} +#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__)) + +NPY_FINLINE uint16x8_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, + npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7) +{ + const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vld1q_u16(data); +} +#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__)) + +NPY_FINLINE int16x8_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, + npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7) +{ + const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vld1q_s16(data); +} +#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__)) + +NPY_FINLINE uint32x4_t npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3) +{ + const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return vld1q_u32(data); +} +#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__)) + +NPY_FINLINE int32x4_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3) +{ + const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return vld1q_s32(data); +} +#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__)) + +NPY_FINLINE uint64x2_t npyv__set_u64(npy_uint64 i0, npy_uint64 i1) +{ + const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return vld1q_u64(data); +} +#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) + +NPY_FINLINE int64x2_t npyv__set_s64(npy_int64 i0, npy_int64 i1) +{ + const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return vld1q_s64(data); +} +#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) + +NPY_FINLINE float32x4_t npyv__set_f32(float i0, float i1, float i2, float i3) +{ + const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return vld1q_f32(data); +} +#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) + +#ifdef __aarch64__ +NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1) +{ + const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return vld1q_f64(data); +} +#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) +#endif + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +#define npyv_select_u8 vbslq_u8 +#define npyv_select_s8 vbslq_s8 +#define npyv_select_u16 vbslq_u16 +#define npyv_select_s16 vbslq_s16 +#define npyv_select_u32 vbslq_u32 +#define npyv_select_s32 vbslq_s32 +#define npyv_select_u64 vbslq_u64 +#define npyv_select_s64 vbslq_s64 +#define npyv_select_f32 vbslq_f32 +#define npyv_select_f64 vbslq_f64 + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8 vreinterpretq_u8_s8 +#define npyv_reinterpret_u8_u16 vreinterpretq_u8_u16 +#define npyv_reinterpret_u8_s16 vreinterpretq_u8_s16 +#define npyv_reinterpret_u8_u32 vreinterpretq_u8_u32 +#define npyv_reinterpret_u8_s32 vreinterpretq_u8_s32 +#define npyv_reinterpret_u8_u64 vreinterpretq_u8_u64 +#define npyv_reinterpret_u8_s64 vreinterpretq_u8_s64 +#define npyv_reinterpret_u8_f32 vreinterpretq_u8_f32 +#define npyv_reinterpret_u8_f64 vreinterpretq_u8_f64 + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8 vreinterpretq_s8_u8 +#define npyv_reinterpret_s8_u16 vreinterpretq_s8_u16 +#define npyv_reinterpret_s8_s16 vreinterpretq_s8_s16 +#define npyv_reinterpret_s8_u32 vreinterpretq_s8_u32 +#define npyv_reinterpret_s8_s32 vreinterpretq_s8_s32 +#define npyv_reinterpret_s8_u64 vreinterpretq_s8_u64 +#define npyv_reinterpret_s8_s64 vreinterpretq_s8_s64 +#define npyv_reinterpret_s8_f32 vreinterpretq_s8_f32 +#define npyv_reinterpret_s8_f64 vreinterpretq_s8_f64 + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8 vreinterpretq_u16_u8 +#define npyv_reinterpret_u16_s8 vreinterpretq_u16_s8 +#define npyv_reinterpret_u16_s16 vreinterpretq_u16_s16 +#define npyv_reinterpret_u16_u32 vreinterpretq_u16_u32 +#define npyv_reinterpret_u16_s32 vreinterpretq_u16_s32 +#define npyv_reinterpret_u16_u64 vreinterpretq_u16_u64 +#define npyv_reinterpret_u16_s64 vreinterpretq_u16_s64 +#define npyv_reinterpret_u16_f32 vreinterpretq_u16_f32 +#define npyv_reinterpret_u16_f64 vreinterpretq_u16_f64 + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8 vreinterpretq_s16_u8 +#define npyv_reinterpret_s16_s8 vreinterpretq_s16_s8 +#define npyv_reinterpret_s16_u16 vreinterpretq_s16_u16 +#define npyv_reinterpret_s16_u32 vreinterpretq_s16_u32 +#define npyv_reinterpret_s16_s32 vreinterpretq_s16_s32 +#define npyv_reinterpret_s16_u64 vreinterpretq_s16_u64 +#define npyv_reinterpret_s16_s64 vreinterpretq_s16_s64 +#define npyv_reinterpret_s16_f32 vreinterpretq_s16_f32 +#define npyv_reinterpret_s16_f64 vreinterpretq_s16_f64 + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8 vreinterpretq_u32_u8 +#define npyv_reinterpret_u32_s8 vreinterpretq_u32_s8 +#define npyv_reinterpret_u32_u16 vreinterpretq_u32_u16 +#define npyv_reinterpret_u32_s16 vreinterpretq_u32_s16 +#define npyv_reinterpret_u32_s32 vreinterpretq_u32_s32 +#define npyv_reinterpret_u32_u64 vreinterpretq_u32_u64 +#define npyv_reinterpret_u32_s64 vreinterpretq_u32_s64 +#define npyv_reinterpret_u32_f32 vreinterpretq_u32_f32 +#define npyv_reinterpret_u32_f64 vreinterpretq_u32_f64 + +#define npyv_reinterpret_s32_s32(X) X +#define npyv_reinterpret_s32_u8 vreinterpretq_s32_u8 +#define npyv_reinterpret_s32_s8 vreinterpretq_s32_s8 +#define npyv_reinterpret_s32_u16 vreinterpretq_s32_u16 +#define npyv_reinterpret_s32_s16 vreinterpretq_s32_s16 +#define npyv_reinterpret_s32_u32 vreinterpretq_s32_u32 +#define npyv_reinterpret_s32_u64 vreinterpretq_s32_u64 +#define npyv_reinterpret_s32_s64 vreinterpretq_s32_s64 +#define npyv_reinterpret_s32_f32 vreinterpretq_s32_f32 +#define npyv_reinterpret_s32_f64 vreinterpretq_s32_f64 + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8 vreinterpretq_u64_u8 +#define npyv_reinterpret_u64_s8 vreinterpretq_u64_s8 +#define npyv_reinterpret_u64_u16 vreinterpretq_u64_u16 +#define npyv_reinterpret_u64_s16 vreinterpretq_u64_s16 +#define npyv_reinterpret_u64_u32 vreinterpretq_u64_u32 +#define npyv_reinterpret_u64_s32 vreinterpretq_u64_s32 +#define npyv_reinterpret_u64_s64 vreinterpretq_u64_s64 +#define npyv_reinterpret_u64_f32 vreinterpretq_u64_f32 +#define npyv_reinterpret_u64_f64 vreinterpretq_u64_f64 + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8 vreinterpretq_s64_u8 +#define npyv_reinterpret_s64_s8 vreinterpretq_s64_s8 +#define npyv_reinterpret_s64_u16 vreinterpretq_s64_u16 +#define npyv_reinterpret_s64_s16 vreinterpretq_s64_s16 +#define npyv_reinterpret_s64_u32 vreinterpretq_s64_u32 +#define npyv_reinterpret_s64_s32 vreinterpretq_s64_s32 +#define npyv_reinterpret_s64_u64 vreinterpretq_s64_u64 +#define npyv_reinterpret_s64_f32 vreinterpretq_s64_f32 +#define npyv_reinterpret_s64_f64 vreinterpretq_s64_f64 + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8 vreinterpretq_f32_u8 +#define npyv_reinterpret_f32_s8 vreinterpretq_f32_s8 +#define npyv_reinterpret_f32_u16 vreinterpretq_f32_u16 +#define npyv_reinterpret_f32_s16 vreinterpretq_f32_s16 +#define npyv_reinterpret_f32_u32 vreinterpretq_f32_u32 +#define npyv_reinterpret_f32_s32 vreinterpretq_f32_s32 +#define npyv_reinterpret_f32_u64 vreinterpretq_f32_u64 +#define npyv_reinterpret_f32_s64 vreinterpretq_f32_s64 +#define npyv_reinterpret_f32_f64 vreinterpretq_f32_f64 + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8 vreinterpretq_f64_u8 +#define npyv_reinterpret_f64_s8 vreinterpretq_f64_s8 +#define npyv_reinterpret_f64_u16 vreinterpretq_f64_u16 +#define npyv_reinterpret_f64_s16 vreinterpretq_f64_s16 +#define npyv_reinterpret_f64_u32 vreinterpretq_f64_u32 +#define npyv_reinterpret_f64_s32 vreinterpretq_f64_s32 +#define npyv_reinterpret_f64_u64 vreinterpretq_f64_u64 +#define npyv_reinterpret_f64_s64 vreinterpretq_f64_s64 +#define npyv_reinterpret_f64_f32 vreinterpretq_f64_f32 + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif // _NPY_SIMD_NEON_MISC_H diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h new file mode 100644 index 000000000..280a34297 --- /dev/null +++ b/numpy/core/src/common/simd/neon/neon.h @@ -0,0 +1,74 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#define NPY_SIMD 128 +#define NPY_SIMD_WIDTH 16 + +#ifdef __aarch64__ + #define NPY_SIMD_F64 1 +#else + #define NPY_SIMD_F64 0 +#endif + +typedef uint8x16_t npyv_u8; +typedef int8x16_t npyv_s8; +typedef uint16x8_t npyv_u16; +typedef int16x8_t npyv_s16; +typedef uint32x4_t npyv_u32; +typedef int32x4_t npyv_s32; +typedef uint64x2_t npyv_u64; +typedef int64x2_t npyv_s64; +typedef float32x4_t npyv_f32; +#if NPY_SIMD_F64 +typedef float64x2_t npyv_f64; +#endif + +typedef uint8x16_t npyv_b8; +typedef uint16x8_t npyv_b16; +typedef uint32x4_t npyv_b32; +typedef uint64x2_t npyv_b64; + +typedef uint8x16x2_t npyv_u8x2; +typedef int8x16x2_t npyv_s8x2; +typedef uint16x8x2_t npyv_u16x2; +typedef int16x8x2_t npyv_s16x2; +typedef uint32x4x2_t npyv_u32x2; +typedef int32x4x2_t npyv_s32x2; +typedef uint64x2x2_t npyv_u64x2; +typedef int64x2x2_t npyv_s64x2; +typedef float32x4x2_t npyv_f32x2; +#if NPY_SIMD_F64 +typedef float64x2x2_t npyv_f64x2; +#endif + +typedef uint8x16x3_t npyv_u8x3; +typedef int8x16x3_t npyv_s8x3; +typedef uint16x8x3_t npyv_u16x3; +typedef int16x8x3_t npyv_s16x3; +typedef uint32x4x3_t npyv_u32x3; +typedef int32x4x3_t npyv_s32x3; +typedef uint64x2x3_t npyv_u64x3; +typedef int64x2x3_t npyv_s64x3; +typedef float32x4x3_t npyv_f32x3; +#if NPY_SIMD_F64 +typedef float64x2x3_t npyv_f64x3; +#endif + +#define npyv_nlanes_u8 16 +#define npyv_nlanes_s8 16 +#define npyv_nlanes_u16 8 +#define npyv_nlanes_s16 8 +#define npyv_nlanes_u32 4 +#define npyv_nlanes_s32 4 +#define npyv_nlanes_u64 2 +#define npyv_nlanes_s64 2 +#define npyv_nlanes_f32 4 +#define npyv_nlanes_f64 2 + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h new file mode 100644 index 000000000..c1ad4ba12 --- /dev/null +++ b/numpy/core/src/common/simd/neon/operators.h @@ -0,0 +1,218 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_OPERATORS_H +#define _NPY_SIMD_NEON_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// left +#define npyv_shl_u16(A, C) vshlq_u16(A, npyv_setall_s16(C)) +#define npyv_shl_s16(A, C) vshlq_s16(A, npyv_setall_s16(C)) +#define npyv_shl_u32(A, C) vshlq_u32(A, npyv_setall_s32(C)) +#define npyv_shl_s32(A, C) vshlq_s32(A, npyv_setall_s32(C)) +#define npyv_shl_u64(A, C) vshlq_u64(A, npyv_setall_s64(C)) +#define npyv_shl_s64(A, C) vshlq_s64(A, npyv_setall_s64(C)) + +// left by an immediate constant +#define npyv_shli_u16 vshlq_n_u16 +#define npyv_shli_s16 vshlq_n_s16 +#define npyv_shli_u32 vshlq_n_u32 +#define npyv_shli_s32 vshlq_n_s32 +#define npyv_shli_u64 vshlq_n_u64 +#define npyv_shli_s64 vshlq_n_s64 + +// right +#define npyv_shr_u16(A, C) vshlq_u16(A, npyv_setall_s16(-(C))) +#define npyv_shr_s16(A, C) vshlq_s16(A, npyv_setall_s16(-(C))) +#define npyv_shr_u32(A, C) vshlq_u32(A, npyv_setall_s32(-(C))) +#define npyv_shr_s32(A, C) vshlq_s32(A, npyv_setall_s32(-(C))) +#define npyv_shr_u64(A, C) vshlq_u64(A, npyv_setall_s64(-(C))) +#define npyv_shr_s64(A, C) vshlq_s64(A, npyv_setall_s64(-(C))) + +// right by an immediate constant +#define npyv_shri_u16(VEC, C) ((C) == 0 ? VEC : vshrq_n_u16(VEC, C)) +#define npyv_shri_s16(VEC, C) ((C) == 0 ? VEC : vshrq_n_s16(VEC, C)) +#define npyv_shri_u32(VEC, C) ((C) == 0 ? VEC : vshrq_n_u32(VEC, C)) +#define npyv_shri_s32(VEC, C) ((C) == 0 ? VEC : vshrq_n_s32(VEC, C)) +#define npyv_shri_u64(VEC, C) ((C) == 0 ? VEC : vshrq_n_u64(VEC, C)) +#define npyv_shri_s64(VEC, C) ((C) == 0 ? VEC : vshrq_n_s64(VEC, C)) + +/*************************** + * Logical + ***************************/ + +// AND +#define npyv_and_u8 vandq_u8 +#define npyv_and_s8 vandq_s8 +#define npyv_and_u16 vandq_u16 +#define npyv_and_s16 vandq_s16 +#define npyv_and_u32 vandq_u32 +#define npyv_and_s32 vandq_s32 +#define npyv_and_u64 vandq_u64 +#define npyv_and_s64 vandq_s64 +#define npyv_and_f32(A, B) \ + vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B))) +#define npyv_and_f64(A, B) \ + vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B))) + +// OR +#define npyv_or_u8 vorrq_u8 +#define npyv_or_s8 vorrq_s8 +#define npyv_or_u16 vorrq_u16 +#define npyv_or_s16 vorrq_s16 +#define npyv_or_u32 vorrq_u32 +#define npyv_or_s32 vorrq_s32 +#define npyv_or_u64 vorrq_u64 +#define npyv_or_s64 vorrq_s64 +#define npyv_or_f32(A, B) \ + vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B))) +#define npyv_or_f64(A, B) \ + vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B))) + +// XOR +#define npyv_xor_u8 veorq_u8 +#define npyv_xor_s8 veorq_s8 +#define npyv_xor_u16 veorq_u16 +#define npyv_xor_s16 veorq_s16 +#define npyv_xor_u32 veorq_u32 +#define npyv_xor_s32 veorq_s32 +#define npyv_xor_u64 veorq_u64 +#define npyv_xor_s64 veorq_s64 +#define npyv_xor_f32(A, B) \ + vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B))) +#define npyv_xor_f64(A, B) \ + vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B))) + +// NOT +#define npyv_not_u8 vmvnq_u8 +#define npyv_not_s8 vmvnq_s8 +#define npyv_not_u16 vmvnq_u16 +#define npyv_not_s16 vmvnq_s16 +#define npyv_not_u32 vmvnq_u32 +#define npyv_not_s32 vmvnq_s32 +#define npyv_not_u64(A) vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(A))) +#define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A))) +#define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A))) +#define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A))) + +/*************************** + * Comparison + ***************************/ + +// equal +#define npyv_cmpeq_u8 vceqq_u8 +#define npyv_cmpeq_s8 vceqq_s8 +#define npyv_cmpeq_u16 vceqq_u16 +#define npyv_cmpeq_s16 vceqq_s16 +#define npyv_cmpeq_u32 vceqq_u32 +#define npyv_cmpeq_s32 vceqq_s32 +#define npyv_cmpeq_f32 vceqq_f32 +#define npyv_cmpeq_f64 vceqq_f64 + +#ifdef __aarch64__ + #define npyv_cmpeq_u64 vceqq_u64 + #define npyv_cmpeq_s64 vceqq_s64 +#else + NPY_FINLINE uint64x2_t npyv_cmpeq_u64(uint64x2_t a, uint64x2_t b) + { + uint64x2_t cmpeq = vreinterpretq_u64_u32(vceqq_u32( + vreinterpretq_u32_u64(a), vreinterpretq_u32_u64(b) + )); + uint64x2_t cmpeq_h = vshlq_n_u64(cmpeq, 32); + uint64x2_t test = vandq_u64(cmpeq, cmpeq_h); + return vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(test), 32)); + } + #define npyv_cmpeq_s64(A, B) \ + npyv_cmpeq_u64(vreinterpretq_u64_s64(A), vreinterpretq_u64_s64(B)) +#endif + +// not Equal +#define npyv_cmpneq_u8(A, B) vmvnq_u8(vceqq_u8(A, B)) +#define npyv_cmpneq_s8(A, B) vmvnq_u8(vceqq_s8(A, B)) +#define npyv_cmpneq_u16(A, B) vmvnq_u16(vceqq_u16(A, B)) +#define npyv_cmpneq_s16(A, B) vmvnq_u16(vceqq_s16(A, B)) +#define npyv_cmpneq_u32(A, B) vmvnq_u32(vceqq_u32(A, B)) +#define npyv_cmpneq_s32(A, B) vmvnq_u32(vceqq_s32(A, B)) +#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B)) +#define npyv_cmpneq_s64(A, B) npyv_not_u64(npyv_cmpeq_s64(A, B)) +#define npyv_cmpneq_f32(A, B) vmvnq_u32(vceqq_f32(A, B)) +#define npyv_cmpneq_f64(A, B) npyv_not_u64(vceqq_f64(A, B)) + +// greater than +#define npyv_cmpgt_u8 vcgtq_u8 +#define npyv_cmpgt_s8 vcgtq_s8 +#define npyv_cmpgt_u16 vcgtq_u16 +#define npyv_cmpgt_s16 vcgtq_s16 +#define npyv_cmpgt_u32 vcgtq_u32 +#define npyv_cmpgt_s32 vcgtq_s32 +#define npyv_cmpgt_f32 vcgtq_f32 +#define npyv_cmpgt_f64 vcgtq_f64 + +#ifdef __aarch64__ + #define npyv_cmpgt_u64 vcgtq_u64 + #define npyv_cmpgt_s64 vcgtq_s64 +#else + NPY_FINLINE uint64x2_t npyv_cmpgt_s64(int64x2_t a, int64x2_t b) + { + int64x2_t sub = vsubq_s64(b, a); + uint64x2_t nsame_sbit = vreinterpretq_u64_s64(veorq_s64(a, b)); + int64x2_t test = vbslq_s64(nsame_sbit, b, sub); + int64x2_t extend_sbit = vshrq_n_s64(test, 63); + return vreinterpretq_u64_s64(extend_sbit); + } + NPY_FINLINE uint64x2_t npyv_cmpgt_u64(uint64x2_t a, uint64x2_t b) + { + const uint64x2_t sbit = npyv_setall_u64(0x8000000000000000); + a = npyv_xor_u64(a, sbit); + b = npyv_xor_u64(b, sbit); + return npyv_cmpgt_s64(vreinterpretq_s64_u64(a), vreinterpretq_s64_u64(b)); + } +#endif + +// greater than or equal +#define npyv_cmpge_u8 vcgeq_u8 +#define npyv_cmpge_s8 vcgeq_s8 +#define npyv_cmpge_u16 vcgeq_u16 +#define npyv_cmpge_s16 vcgeq_s16 +#define npyv_cmpge_u32 vcgeq_u32 +#define npyv_cmpge_s32 vcgeq_s32 +#define npyv_cmpge_f32 vcgeq_f32 +#define npyv_cmpge_f64 vcgeq_f64 + +#ifdef __aarch64__ + #define npyv_cmpge_u64 vcgeq_u64 + #define npyv_cmpge_s64 vcgeq_s64 +#else + #define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A)) + #define npyv_cmpge_s64(A, B) npyv_not_u64(npyv_cmpgt_s64(B, A)) +#endif + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) +#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) +#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) + +#endif // _NPY_SIMD_NEON_OPERATORS_H diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h new file mode 100644 index 000000000..712a77982 --- /dev/null +++ b/numpy/core/src/common/simd/neon/reorder.h @@ -0,0 +1,110 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_NEON_REORDER_H +#define _NPY_SIMD_NEON_REORDER_H + +// combine lower part of two vectors +#ifdef __aarch64__ + #define npyv_combinel_u8(A, B) vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B))) + #define npyv_combinel_s8(A, B) vreinterpretq_s8_u64(vzip1q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B))) + #define npyv_combinel_u16(A, B) vreinterpretq_u16_u64(vzip1q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B))) + #define npyv_combinel_s16(A, B) vreinterpretq_s16_u64(vzip1q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B))) + #define npyv_combinel_u32(A, B) vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B))) + #define npyv_combinel_s32(A, B) vreinterpretq_s32_u64(vzip1q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B))) + #define npyv_combinel_u64 vzip1q_u64 + #define npyv_combinel_s64 vzip1q_s64 + #define npyv_combinel_f32(A, B) vreinterpretq_f32_u64(vzip1q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B))) + #define npyv_combinel_f64 vzip1q_f64 +#else + #define npyv_combinel_u8(A, B) vcombine_u8(vget_low_u8(A), vget_low_u8(B)) + #define npyv_combinel_s8(A, B) vcombine_s8(vget_low_s8(A), vget_low_s8(B)) + #define npyv_combinel_u16(A, B) vcombine_u16(vget_low_u16(A), vget_low_u16(B)) + #define npyv_combinel_s16(A, B) vcombine_s16(vget_low_s16(A), vget_low_s16(B)) + #define npyv_combinel_u32(A, B) vcombine_u32(vget_low_u32(A), vget_low_u32(B)) + #define npyv_combinel_s32(A, B) vcombine_s32(vget_low_s32(A), vget_low_s32(B)) + #define npyv_combinel_u64(A, B) vcombine_u64(vget_low_u64(A), vget_low_u64(B)) + #define npyv_combinel_s64(A, B) vcombine_s64(vget_low_s64(A), vget_low_s64(B)) + #define npyv_combinel_f32(A, B) vcombine_f32(vget_low_f32(A), vget_low_f32(B)) +#endif + +// combine higher part of two vectors +#ifdef __aarch64__ + #define npyv_combineh_u8(A, B) vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B))) + #define npyv_combineh_s8(A, B) vreinterpretq_s8_u64(vzip2q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B))) + #define npyv_combineh_u16(A, B) vreinterpretq_u16_u64(vzip2q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B))) + #define npyv_combineh_s16(A, B) vreinterpretq_s16_u64(vzip2q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B))) + #define npyv_combineh_u32(A, B) vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B))) + #define npyv_combineh_s32(A, B) vreinterpretq_s32_u64(vzip2q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B))) + #define npyv_combineh_u64 vzip2q_u64 + #define npyv_combineh_s64 vzip2q_s64 + #define npyv_combineh_f32(A, B) vreinterpretq_f32_u64(vzip2q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B))) + #define npyv_combineh_f64 vzip2q_f64 +#else + #define npyv_combineh_u8(A, B) vcombine_u8(vget_high_u8(A), vget_high_u8(B)) + #define npyv_combineh_s8(A, B) vcombine_s8(vget_high_s8(A), vget_high_s8(B)) + #define npyv_combineh_u16(A, B) vcombine_u16(vget_high_u16(A), vget_high_u16(B)) + #define npyv_combineh_s16(A, B) vcombine_s16(vget_high_s16(A), vget_high_s16(B)) + #define npyv_combineh_u32(A, B) vcombine_u32(vget_high_u32(A), vget_high_u32(B)) + #define npyv_combineh_s32(A, B) vcombine_s32(vget_high_s32(A), vget_high_s32(B)) + #define npyv_combineh_u64(A, B) vcombine_u64(vget_high_u64(A), vget_high_u64(B)) + #define npyv_combineh_s64(A, B) vcombine_s64(vget_high_s64(A), vget_high_s64(B)) + #define npyv_combineh_f32(A, B) vcombine_f32(vget_high_f32(A), vget_high_f32(B)) +#endif + +// combine two vectors from lower and higher parts of two other vectors +#define NPYV_IMPL_NEON_COMBINE(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \ + r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \ + return r; \ + } + +NPYV_IMPL_NEON_COMBINE(npyv_u8, u8) +NPYV_IMPL_NEON_COMBINE(npyv_s8, s8) +NPYV_IMPL_NEON_COMBINE(npyv_u16, u16) +NPYV_IMPL_NEON_COMBINE(npyv_s16, s16) +NPYV_IMPL_NEON_COMBINE(npyv_u32, u32) +NPYV_IMPL_NEON_COMBINE(npyv_s32, s32) +NPYV_IMPL_NEON_COMBINE(npyv_u64, u64) +NPYV_IMPL_NEON_COMBINE(npyv_s64, s64) +NPYV_IMPL_NEON_COMBINE(npyv_f32, f32) +#ifdef __aarch64__ +NPYV_IMPL_NEON_COMBINE(npyv_f64, f64) +#endif + +// interleave two vectors +#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } + +#ifdef __aarch64__ + NPYV_IMPL_NEON_ZIP(npyv_u8, u8) + NPYV_IMPL_NEON_ZIP(npyv_s8, s8) + NPYV_IMPL_NEON_ZIP(npyv_u16, u16) + NPYV_IMPL_NEON_ZIP(npyv_s16, s16) + NPYV_IMPL_NEON_ZIP(npyv_u32, u32) + NPYV_IMPL_NEON_ZIP(npyv_s32, s32) + NPYV_IMPL_NEON_ZIP(npyv_f32, f32) + NPYV_IMPL_NEON_ZIP(npyv_f64, f64) +#else + #define npyv_zip_u8 vzipq_u8 + #define npyv_zip_s8 vzipq_s8 + #define npyv_zip_u16 vzipq_u16 + #define npyv_zip_s16 vzipq_s16 + #define npyv_zip_u32 vzipq_u32 + #define npyv_zip_s32 vzipq_s32 + #define npyv_zip_f32 vzipq_f32 +#endif +#define npyv_zip_u64 npyv_combine_u64 +#define npyv_zip_s64 npyv_combine_s64 + +#endif // _NPY_SIMD_NEON_REORDER_H diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h new file mode 100644 index 000000000..2f39c8427 --- /dev/null +++ b/numpy/core/src/common/simd/simd.h @@ -0,0 +1,56 @@ +#ifndef _NPY_SIMD_H_ +#define _NPY_SIMD_H_ +/** + * the NumPy C SIMD vectorization interface "NPYV" are types and functions intended + * to simplify vectorization of code on different platforms, currently supports + * the following SIMD extensions SSE, AVX2, AVX512, VSX and NEON. + * + * TODO: Add an independent sphinx doc. +*/ +#include "numpy/npy_common.h" +#include "npy_cpu_dispatch.h" +#include "simd_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// lane type by intrin suffix +typedef npy_uint8 npyv_lanetype_u8; +typedef npy_int8 npyv_lanetype_s8; +typedef npy_uint16 npyv_lanetype_u16; +typedef npy_int16 npyv_lanetype_s16; +typedef npy_uint32 npyv_lanetype_u32; +typedef npy_int32 npyv_lanetype_s32; +typedef npy_uint64 npyv_lanetype_u64; +typedef npy_int64 npyv_lanetype_s64; +typedef float npyv_lanetype_f32; +typedef double npyv_lanetype_f64; + +#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128) + #include "avx512/avx512.h" +#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128) + #include "avx2/avx2.h" +#elif defined(NPY_HAVE_SSE2) + #include "sse/sse.h" +#endif + +// TODO: Add support for VSX(2.06) and BE Mode +#if defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__) + #include "vsx/vsx.h" +#endif + +#ifdef NPY_HAVE_NEON + #include "neon/neon.h" +#endif + +#ifndef NPY_SIMD + #define NPY_SIMD 0 + #define NPY_SIMD_WIDTH 0 + #define NPY_SIMD_F64 0 +#endif + +#ifdef __cplusplus +} +#endif +#endif // _NPY_SIMD_H_ diff --git a/numpy/core/src/common/simd/simd_utils.h b/numpy/core/src/common/simd/simd_utils.h new file mode 100644 index 000000000..06c2f16f7 --- /dev/null +++ b/numpy/core/src/common/simd/simd_utils.h @@ -0,0 +1,48 @@ +#ifndef _NPY_SIMD_UTILS_H +#define _NPY_SIMD_UTILS_H + +#define NPYV__SET_2(CAST, I0, I1, ...) (CAST)(I0), (CAST)(I1) + +#define NPYV__SET_4(CAST, I0, I1, I2, I3, ...) \ + (CAST)(I0), (CAST)(I1), (CAST)(I2), (CAST)(I3) + +#define NPYV__SET_8(CAST, I0, I1, I2, I3, I4, I5, I6, I7, ...) \ + (CAST)(I0), (CAST)(I1), (CAST)(I2), (CAST)(I3), (CAST)(I4), (CAST)(I5), (CAST)(I6), (CAST)(I7) + +#define NPYV__SET_16(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, ...) \ + NPYV__SET_8(CAST, I0, I1, I2, I3, I4, I5, I6, I7), \ + NPYV__SET_8(CAST, I8, I9, I10, I11, I12, I13, I14, I15) + +#define NPYV__SET_32(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \ +I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31, ...) \ + \ + NPYV__SET_16(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), \ + NPYV__SET_16(CAST, I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31) + +#define NPYV__SET_64(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \ +I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31, \ +I32, I33, I34, I35, I36, I37, I38, I39, I40, I41, I42, I43, I44, I45, I46, I47, \ +I48, I49, I50, I51, I52, I53, I54, I55, I56, I57, I58, I59, I60, I61, I62, I63, ...) \ + \ + NPYV__SET_32(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \ +I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31), \ + NPYV__SET_32(CAST, I32, I33, I34, I35, I36, I37, I38, I39, I40, I41, I42, I43, I44, I45, I46, I47, \ +I48, I49, I50, I51, I52, I53, I54, I55, I56, I57, I58, I59, I60, I61, I62, I63) + +#define NPYV__SET_FILL_2(CAST, F, ...) NPY_EXPAND(NPYV__SET_2(CAST, __VA_ARGS__, F, F)) + +#define NPYV__SET_FILL_4(CAST, F, ...) NPY_EXPAND(NPYV__SET_4(CAST, __VA_ARGS__, F, F, F, F)) + +#define NPYV__SET_FILL_8(CAST, F, ...) NPY_EXPAND(NPYV__SET_8(CAST, __VA_ARGS__, F, F, F, F, F, F, F, F)) + +#define NPYV__SET_FILL_16(CAST, F, ...) NPY_EXPAND(NPYV__SET_16(CAST, __VA_ARGS__, \ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F)) + +#define NPYV__SET_FILL_32(CAST, F, ...) NPY_EXPAND(NPYV__SET_32(CAST, __VA_ARGS__, \ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F)) + +#define NPYV__SET_FILL_64(CAST, F, ...) NPY_EXPAND(NPYV__SET_64(CAST, __VA_ARGS__, \ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, \ + F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F)) + +#endif // _NPY_SIMD_UTILS_H diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h new file mode 100644 index 000000000..12d0af05c --- /dev/null +++ b/numpy/core/src/common/simd/sse/arithmetic.h @@ -0,0 +1,95 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_ARITHMETIC_H +#define _NPY_SIMD_SSE_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +#define npyv_add_u8 _mm_add_epi8 +#define npyv_add_s8 _mm_add_epi8 +#define npyv_add_u16 _mm_add_epi16 +#define npyv_add_s16 _mm_add_epi16 +#define npyv_add_u32 _mm_add_epi32 +#define npyv_add_s32 _mm_add_epi32 +#define npyv_add_u64 _mm_add_epi64 +#define npyv_add_s64 _mm_add_epi64 +#define npyv_add_f32 _mm_add_ps +#define npyv_add_f64 _mm_add_pd + +// saturated +#define npyv_adds_u8 _mm_adds_epu8 +#define npyv_adds_s8 _mm_adds_epi8 +#define npyv_adds_u16 _mm_adds_epu16 +#define npyv_adds_s16 _mm_adds_epi16 +// TODO: rest, after implment Packs intrins + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#define npyv_sub_u8 _mm_sub_epi8 +#define npyv_sub_s8 _mm_sub_epi8 +#define npyv_sub_u16 _mm_sub_epi16 +#define npyv_sub_s16 _mm_sub_epi16 +#define npyv_sub_u32 _mm_sub_epi32 +#define npyv_sub_s32 _mm_sub_epi32 +#define npyv_sub_u64 _mm_sub_epi64 +#define npyv_sub_s64 _mm_sub_epi64 +#define npyv_sub_f32 _mm_sub_ps +#define npyv_sub_f64 _mm_sub_pd + +// saturated +#define npyv_subs_u8 _mm_subs_epu8 +#define npyv_subs_s8 _mm_subs_epi8 +#define npyv_subs_u16 _mm_subs_epu16 +#define npyv_subs_s16 _mm_subs_epi16 +// TODO: rest, after implment Packs intrins + +/*************************** + * Multiplication + ***************************/ +// non-saturated +NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b) +{ + const __m128i mask = _mm_set1_epi32(0xFF00FF00); + __m128i even = _mm_mullo_epi16(a, b); + __m128i odd = _mm_mullo_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(b, 8)); + odd = _mm_slli_epi16(odd, 8); + return npyv_select_u8(mask, odd, even); +} +#define npyv_mul_s8 npyv_mul_u8 +#define npyv_mul_u16 _mm_mullo_epi16 +#define npyv_mul_s16 _mm_mullo_epi16 + +#ifdef NPY_HAVE_SSE41 + #define npyv_mul_u32 _mm_mullo_epi32 +#else + NPY_FINLINE __m128i npyv_mul_u32(__m128i a, __m128i b) + { + __m128i even = _mm_mul_epu32(a, b); + __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32)); + __m128i low = _mm_unpacklo_epi32(even, odd); + __m128i high = _mm_unpackhi_epi32(even, odd); + return _mm_unpacklo_epi64(low, high); + } +#endif // NPY_HAVE_SSE41 +#define npyv_mul_s32 npyv_mul_u32 +// TODO: emulate 64-bit*/ +#define npyv_mul_f32 _mm_mul_ps +#define npyv_mul_f64 _mm_mul_pd + +// saturated +// TODO: after implment Packs intrins + +/*************************** + * Division + ***************************/ +// TODO: emulate integer division +#define npyv_div_f32 _mm_div_ps +#define npyv_div_f64 _mm_div_pd + +#endif // _NPY_SIMD_SSE_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h new file mode 100644 index 000000000..ea9660d13 --- /dev/null +++ b/numpy/core/src/common/simd/sse/conversion.h @@ -0,0 +1,32 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_CVT_H +#define _NPY_SIMD_SSE_CVT_H + +// convert mask types to integer types +#define npyv_cvt_u8_b8(BL) BL +#define npyv_cvt_s8_b8(BL) BL +#define npyv_cvt_u16_b16(BL) BL +#define npyv_cvt_s16_b16(BL) BL +#define npyv_cvt_u32_b32(BL) BL +#define npyv_cvt_s32_b32(BL) BL +#define npyv_cvt_u64_b64(BL) BL +#define npyv_cvt_s64_b64(BL) BL +#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL) +#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL) + +// convert integer types to mask types +#define npyv_cvt_b8_u8(A) A +#define npyv_cvt_b8_s8(A) A +#define npyv_cvt_b16_u16(A) A +#define npyv_cvt_b16_s16(A) A +#define npyv_cvt_b32_u32(A) A +#define npyv_cvt_b32_s32(A) A +#define npyv_cvt_b64_u64(A) A +#define npyv_cvt_b64_s64(A) A +#define npyv_cvt_b32_f32(A) _mm_castps_si128(A) +#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A) + +#endif // _NPY_SIMD_SSE_CVT_H diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h new file mode 100644 index 000000000..1a555d6f0 --- /dev/null +++ b/numpy/core/src/common/simd/sse/memory.h @@ -0,0 +1,74 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_MEMORY_H +#define _NPY_SIMD_SSE_MEMORY_H + +/*************************** + * load/store + ***************************/ +// stream load +#ifdef NPY_HAVE_SSE41 + #define npyv__loads(PTR) _mm_stream_load_si128((__m128i *)(PTR)) +#else + #define npyv__loads(PTR) _mm_load_si128((const __m128i *)(PTR)) +#endif +#define NPYV_IMPL_SSE_MEM_INT(CTYPE, SFX) \ + NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \ + { return _mm_loadu_si128((const __m128i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \ + { return _mm_load_si128((const __m128i*)ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \ + { return npyv__loads(ptr); } \ + NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \ + { return _mm_loadl_epi64((const __m128i*)ptr); } \ + NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_storeu_si128((__m128i*)ptr, vec); } \ + NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_store_si128((__m128i*)ptr, vec); } \ + NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_stream_si128((__m128i*)ptr, vec); } \ + NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_storel_epi64((__m128i *)ptr, vec); } \ + NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \ + { _mm_storel_epi64((__m128i *)ptr, _mm_unpackhi_epi64(vec, vec)); } + +NPYV_IMPL_SSE_MEM_INT(npy_uint8, u8) +NPYV_IMPL_SSE_MEM_INT(npy_int8, s8) +NPYV_IMPL_SSE_MEM_INT(npy_uint16, u16) +NPYV_IMPL_SSE_MEM_INT(npy_int16, s16) +NPYV_IMPL_SSE_MEM_INT(npy_uint32, u32) +NPYV_IMPL_SSE_MEM_INT(npy_int32, s32) +NPYV_IMPL_SSE_MEM_INT(npy_uint64, u64) +NPYV_IMPL_SSE_MEM_INT(npy_int64, s64) + +// unaligned load +#define npyv_load_f32 _mm_loadu_ps +#define npyv_load_f64 _mm_loadu_pd +// aligned load +#define npyv_loada_f32 _mm_load_ps +#define npyv_loada_f64 _mm_load_pd +// load lower part +#define npyv_loadl_f32(PTR) _mm_castsi128_ps(npyv_loadl_u32((const npy_uint32*)(PTR))) +#define npyv_loadl_f64(PTR) _mm_castsi128_pd(npyv_loadl_u32((const npy_uint32*)(PTR))) +// stream load +#define npyv_loads_f32(PTR) _mm_castsi128_ps(npyv__loads(PTR)) +#define npyv_loads_f64(PTR) _mm_castsi128_pd(npyv__loads(PTR)) +// unaligned store +#define npyv_store_f32 _mm_storeu_ps +#define npyv_store_f64 _mm_storeu_pd +// aligned store +#define npyv_storea_f32 _mm_store_ps +#define npyv_storea_f64 _mm_store_pd +// stream store +#define npyv_stores_f32 _mm_stream_ps +#define npyv_stores_f64 _mm_stream_pd +// store lower part +#define npyv_storel_f32(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castps_si128(VEC)); +#define npyv_storel_f64(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castpd_si128(VEC)); +// store higher part +#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC)) +#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC)) + +#endif // _NPY_SIMD_SSE_MEMORY_H diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h new file mode 100644 index 000000000..7ba47bc68 --- /dev/null +++ b/numpy/core/src/common/simd/sse/misc.h @@ -0,0 +1,230 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_MISC_H +#define _NPY_SIMD_SSE_MISC_H + +// vector with zero lanes +#define npyv_zero_u8 _mm_setzero_si128 +#define npyv_zero_s8 _mm_setzero_si128 +#define npyv_zero_u16 _mm_setzero_si128 +#define npyv_zero_s16 _mm_setzero_si128 +#define npyv_zero_u32 _mm_setzero_si128 +#define npyv_zero_s32 _mm_setzero_si128 +#define npyv_zero_u64 _mm_setzero_si128 +#define npyv_zero_s64 _mm_setzero_si128 +#define npyv_zero_f32 _mm_setzero_ps +#define npyv_zero_f64 _mm_setzero_pd + +// vector with a specific value set to all lanes +#define npyv_setall_u8(VAL) _mm_set1_epi8((char)VAL) +#define npyv_setall_s8(VAL) _mm_set1_epi8((char)VAL) +#define npyv_setall_u16(VAL) _mm_set1_epi16((short)VAL) +#define npyv_setall_s16(VAL) _mm_set1_epi16((short)VAL) +#define npyv_setall_u32(VAL) _mm_set1_epi32((int)VAL) +#define npyv_setall_s32(VAL) _mm_set1_epi32(VAL) +#if !defined(__x86_64__) && !defined(_M_X64) + #define npyv_setall_u64(VAL) _mm_set_epi32((int)(VAL >> 32), (int)VAL, (int)(VAL >> 32), (int)VAL) + #define npyv_setall_s64 npyv_setall_u64 +#else + #define npyv_setall_u64(VAL) _mm_set1_epi64x(VAL) + #define npyv_setall_s64(VAL) _mm_set1_epi64x(VAL) +#endif +#define npyv_setall_f32(VAL) _mm_set1_ps(VAL) +#define npyv_setall_f64(VAL) _mm_set1_pd(VAL) + +/** + * vector with specific values set to each lane and + * set a specific value to all remained lanes + * + * Args that generated by NPYV__SET_FILL_* not going to expand if + * _mm_setr_* are defined as macros. + */ +NPY_FINLINE __m128i npyv__setr_epi8( + char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7, + char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15) +{ + return _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15); +} +NPY_FINLINE __m128i npyv__setr_epi16(short i0, short i1, short i2, short i3, short i4, short i5, + short i6, short i7) +{ + return _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7); +} +NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3) +{ + return _mm_setr_epi32(i0, i1, i2, i3); +} +NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1) +{ + return _mm_set_epi64x(i1, i0); +} +NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3) +{ + return _mm_setr_ps(i0, i1, i2, i3); +} +NPY_FINLINE __m128d npyv__setr_pd(double i0, double i1) +{ + return _mm_setr_pd(i0, i1); +} +#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +#ifdef NPY_HAVE_SSE41 + #define npyv_select_u8(MASK, A, B) _mm_blendv_epi8(B, A, MASK) + #define npyv_select_f32(MASK, A, B) _mm_blendv_ps(B, A, _mm_castsi128_ps(MASK)) + #define npyv_select_f64(MASK, A, B) _mm_blendv_pd(B, A, _mm_castsi128_pd(MASK)) +#else + NPY_FINLINE __m128i npyv_select_u8(__m128i mask, __m128i a, __m128i b) + { return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(b, a), mask)); } + NPY_FINLINE __m128 npyv_select_f32(__m128i mask, __m128 a, __m128 b) + { return _mm_xor_ps(b, _mm_and_ps(_mm_xor_ps(b, a), _mm_castsi128_ps(mask))); } + NPY_FINLINE __m128d npyv_select_f64(__m128i mask, __m128d a, __m128d b) + { return _mm_xor_pd(b, _mm_and_pd(_mm_xor_pd(b, a), _mm_castsi128_pd(mask))); } +#endif +#define npyv_select_s8 npyv_select_u8 +#define npyv_select_u16 npyv_select_u8 +#define npyv_select_s16 npyv_select_u8 +#define npyv_select_u32 npyv_select_u8 +#define npyv_select_s32 npyv_select_u8 +#define npyv_select_u64 npyv_select_u8 +#define npyv_select_s64 npyv_select_u8 + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8(X) X +#define npyv_reinterpret_u8_u16(X) X +#define npyv_reinterpret_u8_s16(X) X +#define npyv_reinterpret_u8_u32(X) X +#define npyv_reinterpret_u8_s32(X) X +#define npyv_reinterpret_u8_u64(X) X +#define npyv_reinterpret_u8_s64(X) X +#define npyv_reinterpret_u8_f32 _mm_castps_si128 +#define npyv_reinterpret_u8_f64 _mm_castpd_si128 + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8(X) X +#define npyv_reinterpret_s8_u16(X) X +#define npyv_reinterpret_s8_s16(X) X +#define npyv_reinterpret_s8_u32(X) X +#define npyv_reinterpret_s8_s32(X) X +#define npyv_reinterpret_s8_u64(X) X +#define npyv_reinterpret_s8_s64(X) X +#define npyv_reinterpret_s8_f32 _mm_castps_si128 +#define npyv_reinterpret_s8_f64 _mm_castpd_si128 + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8(X) X +#define npyv_reinterpret_u16_s8(X) X +#define npyv_reinterpret_u16_s16(X) X +#define npyv_reinterpret_u16_u32(X) X +#define npyv_reinterpret_u16_s32(X) X +#define npyv_reinterpret_u16_u64(X) X +#define npyv_reinterpret_u16_s64(X) X +#define npyv_reinterpret_u16_f32 _mm_castps_si128 +#define npyv_reinterpret_u16_f64 _mm_castpd_si128 + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8(X) X +#define npyv_reinterpret_s16_s8(X) X +#define npyv_reinterpret_s16_u16(X) X +#define npyv_reinterpret_s16_u32(X) X +#define npyv_reinterpret_s16_s32(X) X +#define npyv_reinterpret_s16_u64(X) X +#define npyv_reinterpret_s16_s64(X) X +#define npyv_reinterpret_s16_f32 _mm_castps_si128 +#define npyv_reinterpret_s16_f64 _mm_castpd_si128 + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8(X) X +#define npyv_reinterpret_u32_s8(X) X +#define npyv_reinterpret_u32_u16(X) X +#define npyv_reinterpret_u32_s16(X) X +#define npyv_reinterpret_u32_s32(X) X +#define npyv_reinterpret_u32_u64(X) X +#define npyv_reinterpret_u32_s64(X) X +#define npyv_reinterpret_u32_f32 _mm_castps_si128 +#define npyv_reinterpret_u32_f64 _mm_castpd_si128 + +#define npyv_reinterpret_s32_s32(X) X +#define npyv_reinterpret_s32_u8(X) X +#define npyv_reinterpret_s32_s8(X) X +#define npyv_reinterpret_s32_u16(X) X +#define npyv_reinterpret_s32_s16(X) X +#define npyv_reinterpret_s32_u32(X) X +#define npyv_reinterpret_s32_u64(X) X +#define npyv_reinterpret_s32_s64(X) X +#define npyv_reinterpret_s32_f32 _mm_castps_si128 +#define npyv_reinterpret_s32_f64 _mm_castpd_si128 + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8(X) X +#define npyv_reinterpret_u64_s8(X) X +#define npyv_reinterpret_u64_u16(X) X +#define npyv_reinterpret_u64_s16(X) X +#define npyv_reinterpret_u64_u32(X) X +#define npyv_reinterpret_u64_s32(X) X +#define npyv_reinterpret_u64_s64(X) X +#define npyv_reinterpret_u64_f32 _mm_castps_si128 +#define npyv_reinterpret_u64_f64 _mm_castpd_si128 + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8(X) X +#define npyv_reinterpret_s64_s8(X) X +#define npyv_reinterpret_s64_u16(X) X +#define npyv_reinterpret_s64_s16(X) X +#define npyv_reinterpret_s64_u32(X) X +#define npyv_reinterpret_s64_s32(X) X +#define npyv_reinterpret_s64_u64(X) X +#define npyv_reinterpret_s64_f32 _mm_castps_si128 +#define npyv_reinterpret_s64_f64 _mm_castpd_si128 + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8 _mm_castsi128_ps +#define npyv_reinterpret_f32_s8 _mm_castsi128_ps +#define npyv_reinterpret_f32_u16 _mm_castsi128_ps +#define npyv_reinterpret_f32_s16 _mm_castsi128_ps +#define npyv_reinterpret_f32_u32 _mm_castsi128_ps +#define npyv_reinterpret_f32_s32 _mm_castsi128_ps +#define npyv_reinterpret_f32_u64 _mm_castsi128_ps +#define npyv_reinterpret_f32_s64 _mm_castsi128_ps +#define npyv_reinterpret_f32_f64 _mm_castpd_ps + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8 _mm_castsi128_pd +#define npyv_reinterpret_f64_s8 _mm_castsi128_pd +#define npyv_reinterpret_f64_u16 _mm_castsi128_pd +#define npyv_reinterpret_f64_s16 _mm_castsi128_pd +#define npyv_reinterpret_f64_u32 _mm_castsi128_pd +#define npyv_reinterpret_f64_s32 _mm_castsi128_pd +#define npyv_reinterpret_f64_u64 _mm_castsi128_pd +#define npyv_reinterpret_f64_s64 _mm_castsi128_pd +#define npyv_reinterpret_f64_f32 _mm_castps_pd + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif // _NPY_SIMD_SSE_MISC_H diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h new file mode 100644 index 000000000..6e32ca4fd --- /dev/null +++ b/numpy/core/src/common/simd/sse/operators.h @@ -0,0 +1,258 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_OPERATORS_H +#define _NPY_SIMD_SSE_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// left +#define npyv_shl_u16(A, C) _mm_sll_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s16(A, C) _mm_sll_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_u32(A, C) _mm_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s32(A, C) _mm_sll_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_u64(A, C) _mm_sll_epi64(A, _mm_cvtsi32_si128(C)) +#define npyv_shl_s64(A, C) _mm_sll_epi64(A, _mm_cvtsi32_si128(C)) + +// left by an immediate constant +#define npyv_shli_u16 _mm_slli_epi16 +#define npyv_shli_s16 _mm_slli_epi16 +#define npyv_shli_u32 _mm_slli_epi32 +#define npyv_shli_s32 _mm_slli_epi32 +#define npyv_shli_u64 _mm_slli_epi64 +#define npyv_shli_s64 _mm_slli_epi64 + +// right +#define npyv_shr_u16(A, C) _mm_srl_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s16(A, C) _mm_sra_epi16(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_u32(A, C) _mm_srl_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_s32(A, C) _mm_sra_epi32(A, _mm_cvtsi32_si128(C)) +#define npyv_shr_u64(A, C) _mm_srl_epi64(A, _mm_cvtsi32_si128(C)) +NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c) +{ + const __m128i sbit = npyv_setall_s64(0x8000000000000000); + const __m128i cv = _mm_cvtsi32_si128(c); + __m128i r = _mm_srl_epi64(_mm_add_epi64(a, sbit), cv); + return _mm_sub_epi64(r, _mm_srl_epi64(sbit, cv)); +} + +// Right by an immediate constant +#define npyv_shri_u16 _mm_srli_epi16 +#define npyv_shri_s16 _mm_srai_epi16 +#define npyv_shri_u32 _mm_srli_epi32 +#define npyv_shri_s32 _mm_srai_epi32 +#define npyv_shri_u64 _mm_srli_epi64 +#define npyv_shri_s64 npyv_shr_s64 + +/*************************** + * Logical + ***************************/ + +// AND +#define npyv_and_u8 _mm_and_si128 +#define npyv_and_s8 _mm_and_si128 +#define npyv_and_u16 _mm_and_si128 +#define npyv_and_s16 _mm_and_si128 +#define npyv_and_u32 _mm_and_si128 +#define npyv_and_s32 _mm_and_si128 +#define npyv_and_u64 _mm_and_si128 +#define npyv_and_s64 _mm_and_si128 +#define npyv_and_f32 _mm_and_ps +#define npyv_and_f64 _mm_and_pd + +// OR +#define npyv_or_u8 _mm_or_si128 +#define npyv_or_s8 _mm_or_si128 +#define npyv_or_u16 _mm_or_si128 +#define npyv_or_s16 _mm_or_si128 +#define npyv_or_u32 _mm_or_si128 +#define npyv_or_s32 _mm_or_si128 +#define npyv_or_u64 _mm_or_si128 +#define npyv_or_s64 _mm_or_si128 +#define npyv_or_f32 _mm_or_ps +#define npyv_or_f64 _mm_or_pd + +// XOR +#define npyv_xor_u8 _mm_xor_si128 +#define npyv_xor_s8 _mm_xor_si128 +#define npyv_xor_u16 _mm_xor_si128 +#define npyv_xor_s16 _mm_xor_si128 +#define npyv_xor_u32 _mm_xor_si128 +#define npyv_xor_s32 _mm_xor_si128 +#define npyv_xor_u64 _mm_xor_si128 +#define npyv_xor_s64 _mm_xor_si128 +#define npyv_xor_f32 _mm_xor_ps +#define npyv_xor_f64 _mm_xor_pd + +// NOT +#define npyv_not_u8(A) _mm_xor_si128(A, _mm_set1_epi32(-1)) +#define npyv_not_s8 npyv_not_u8 +#define npyv_not_u16 npyv_not_u8 +#define npyv_not_s16 npyv_not_u8 +#define npyv_not_u32 npyv_not_u8 +#define npyv_not_s32 npyv_not_u8 +#define npyv_not_u64 npyv_not_u8 +#define npyv_not_s64 npyv_not_u8 +#define npyv_not_f32(A) _mm_xor_ps(A, _mm_castsi128_ps(_mm_set1_epi32(-1))) +#define npyv_not_f64(A) _mm_xor_pd(A, _mm_castsi128_pd(_mm_set1_epi32(-1))) + +/*************************** + * Comparison + ***************************/ + +// Int Equal +#define npyv_cmpeq_u8 _mm_cmpeq_epi8 +#define npyv_cmpeq_s8 _mm_cmpeq_epi8 +#define npyv_cmpeq_u16 _mm_cmpeq_epi16 +#define npyv_cmpeq_s16 _mm_cmpeq_epi16 +#define npyv_cmpeq_u32 _mm_cmpeq_epi32 +#define npyv_cmpeq_s32 _mm_cmpeq_epi32 +#define npyv_cmpeq_s64 npyv_cmpeq_u64 + +#ifdef NPY_HAVE_SSE41 + #define npyv_cmpeq_u64 _mm_cmpeq_epi64 +#else + NPY_FINLINE __m128i npyv_cmpeq_u64(__m128i a, __m128i b) + { + __m128i cmpeq = _mm_cmpeq_epi32(a, b); + __m128i cmpeq_h = _mm_srli_epi64(cmpeq, 32); + __m128i test = _mm_and_si128(cmpeq, cmpeq_h); + return _mm_shuffle_epi32(test, _MM_SHUFFLE(2, 2, 0, 0)); + } +#endif + +// Int Not Equal +#ifdef NPY_HAVE_XOP + #define npyv_cmpneq_u8 _mm_comneq_epi8 + #define npyv_cmpneq_u16 _mm_comneq_epi16 + #define npyv_cmpneq_u32 _mm_comneq_epi32 + #define npyv_cmpneq_u64 _mm_comneq_epi64 +#else + #define npyv_cmpneq_u8(A, B) npyv_not_u8(npyv_cmpeq_u8(A, B)) + #define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B)) + #define npyv_cmpneq_u32(A, B) npyv_not_u32(npyv_cmpeq_u32(A, B)) + #define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B)) +#endif +#define npyv_cmpneq_s8 npyv_cmpneq_u8 +#define npyv_cmpneq_s16 npyv_cmpneq_u16 +#define npyv_cmpneq_s32 npyv_cmpneq_u32 +#define npyv_cmpneq_s64 npyv_cmpneq_u64 + +// signed greater than +#define npyv_cmpgt_s8 _mm_cmpgt_epi8 +#define npyv_cmpgt_s16 _mm_cmpgt_epi16 +#define npyv_cmpgt_s32 _mm_cmpgt_epi32 + +#ifdef NPY_HAVE_SSE42 + #define npyv_cmpgt_s64 _mm_cmpgt_epi64 +#else + NPY_FINLINE __m128i npyv_cmpgt_s64(__m128i a, __m128i b) + { + __m128i sub = _mm_sub_epi64(b, a); + __m128i nsame_sbit = _mm_xor_si128(a, b); + // nsame_sbit ? b : sub + __m128i test = _mm_xor_si128(sub, _mm_and_si128(_mm_xor_si128(sub, b), nsame_sbit)); + __m128i extend_sbit = _mm_shuffle_epi32(_mm_srai_epi32(test, 31), _MM_SHUFFLE(3, 3, 1, 1)); + return extend_sbit; + } +#endif + +// signed greater than or equal +#ifdef NPY_HAVE_XOP + #define npyv_cmpge_s8 _mm_comge_epi8 + #define npyv_cmpge_s16 _mm_comge_epi16 + #define npyv_cmpge_s32 _mm_comge_epi32 + #define npyv_cmpge_s64 _mm_comge_epi64 +#else + #define npyv_cmpge_s8(A, B) npyv_not_s8(_mm_cmpgt_epi8(B, A)) + #define npyv_cmpge_s16(A, B) npyv_not_s16(_mm_cmpgt_epi16(B, A)) + #define npyv_cmpge_s32(A, B) npyv_not_s32(_mm_cmpgt_epi32(B, A)) + #define npyv_cmpge_s64(A, B) npyv_not_s64(npyv_cmpgt_s64(B, A)) +#endif + +// unsigned greater than +#ifdef NPY_HAVE_XOP + #define npyv_cmpgt_u8 _mm_comgt_epu8 + #define npyv_cmpgt_u16 _mm_comgt_epu16 + #define npyv_cmpgt_u32 _mm_comgt_epu32 + #define npyv_cmpgt_u64 _mm_comgt_epu64 +#else + #define NPYV_IMPL_SSE_UNSIGNED_GT(LEN, SIGN) \ + NPY_FINLINE __m128i npyv_cmpgt_u##LEN(__m128i a, __m128i b) \ + { \ + const __m128i sbit = _mm_set1_epi32(SIGN); \ + return _mm_cmpgt_epi##LEN( \ + _mm_xor_si128(a, sbit), _mm_xor_si128(b, sbit) \ + ); \ + } + + NPYV_IMPL_SSE_UNSIGNED_GT(8, 0x80808080) + NPYV_IMPL_SSE_UNSIGNED_GT(16, 0x80008000) + NPYV_IMPL_SSE_UNSIGNED_GT(32, 0x80000000) + + NPY_FINLINE __m128i npyv_cmpgt_u64(__m128i a, __m128i b) + { + const __m128i sbit = npyv_setall_s64(0x8000000000000000); + return npyv_cmpgt_s64(_mm_xor_si128(a, sbit), _mm_xor_si128(b, sbit)); + } +#endif + +// unsigned greater than or equal +#ifdef NPY_HAVE_XOP + #define npyv_cmpge_u8 _mm_comge_epu8 + #define npyv_cmpge_u16 _mm_comge_epu16 + #define npyv_cmpge_u32 _mm_comge_epu32 + #define npyv_cmpge_u64 _mm_comge_epu64 +#else + NPY_FINLINE __m128i npyv_cmpge_u8(__m128i a, __m128i b) + { return _mm_cmpeq_epi8(a, _mm_max_epu8(a, b)); } + #ifdef NPY_HAVE_SSE41 + NPY_FINLINE __m128i npyv_cmpge_u16(__m128i a, __m128i b) + { return _mm_cmpeq_epi16(a, _mm_max_epu16(a, b)); } + NPY_FINLINE __m128i npyv_cmpge_u32(__m128i a, __m128i b) + { return _mm_cmpeq_epi32(a, _mm_max_epu32(a, b)); } + #else + #define npyv_cmpge_u16(A, B) _mm_cmpeq_epi16(_mm_subs_epu16(B, A), _mm_setzero_si128()) + #define npyv_cmpge_u32(A, B) npyv_not_u32(npyv_cmpgt_u32(B, A)) + #endif + #define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A)) +#endif + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) + +// precision comparison +#define npyv_cmpeq_f32(a, b) _mm_castps_si128(_mm_cmpeq_ps(a, b)) +#define npyv_cmpeq_f64(a, b) _mm_castpd_si128(_mm_cmpeq_pd(a, b)) +#define npyv_cmpneq_f32(a, b) _mm_castps_si128(_mm_cmpneq_ps(a, b)) +#define npyv_cmpneq_f64(a, b) _mm_castpd_si128(_mm_cmpneq_pd(a, b)) +#define npyv_cmplt_f32(a, b) _mm_castps_si128(_mm_cmplt_ps(a, b)) +#define npyv_cmplt_f64(a, b) _mm_castpd_si128(_mm_cmplt_pd(a, b)) +#define npyv_cmple_f32(a, b) _mm_castps_si128(_mm_cmple_ps(a, b)) +#define npyv_cmple_f64(a, b) _mm_castpd_si128(_mm_cmple_pd(a, b)) +#define npyv_cmpgt_f32(a, b) _mm_castps_si128(_mm_cmpgt_ps(a, b)) +#define npyv_cmpgt_f64(a, b) _mm_castpd_si128(_mm_cmpgt_pd(a, b)) +#define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b)) +#define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b)) + +#endif // _NPY_SIMD_SSE_OPERATORS_H diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h new file mode 100644 index 000000000..3f68b4ad7 --- /dev/null +++ b/numpy/core/src/common/simd/sse/reorder.h @@ -0,0 +1,84 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_SSE_REORDER_H +#define _NPY_SIMD_SSE_REORDER_H + +// combine lower part of two vectors +#define npyv_combinel_u8 _mm_unpacklo_epi64 +#define npyv_combinel_s8 _mm_unpacklo_epi64 +#define npyv_combinel_u16 _mm_unpacklo_epi64 +#define npyv_combinel_s16 _mm_unpacklo_epi64 +#define npyv_combinel_u32 _mm_unpacklo_epi64 +#define npyv_combinel_s32 _mm_unpacklo_epi64 +#define npyv_combinel_u64 _mm_unpacklo_epi64 +#define npyv_combinel_s64 _mm_unpacklo_epi64 +#define npyv_combinel_f32(A, B) _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(A), _mm_castps_si128(B))) +#define npyv_combinel_f64 _mm_unpacklo_pd + +// combine higher part of two vectors +#define npyv_combineh_u8 _mm_unpackhi_epi64 +#define npyv_combineh_s8 _mm_unpackhi_epi64 +#define npyv_combineh_u16 _mm_unpackhi_epi64 +#define npyv_combineh_s16 _mm_unpackhi_epi64 +#define npyv_combineh_u32 _mm_unpackhi_epi64 +#define npyv_combineh_s32 _mm_unpackhi_epi64 +#define npyv_combineh_u64 _mm_unpackhi_epi64 +#define npyv_combineh_s64 _mm_unpackhi_epi64 +#define npyv_combineh_f32(A, B) _mm_castsi128_ps(_mm_unpackhi_epi64(_mm_castps_si128(A), _mm_castps_si128(B))) +#define npyv_combineh_f64 _mm_unpackhi_pd + +// combine two vectors from lower and higher parts of two other vectors +NPY_FINLINE npyv_m128ix2 npyv__combine(__m128i a, __m128i b) +{ + npyv_m128ix2 r; + r.val[0] = npyv_combinel_u8(a, b); + r.val[1] = npyv_combineh_u8(a, b); + return r; +} +NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m128 a, __m128 b) +{ + npyv_f32x2 r; + r.val[0] = npyv_combinel_f32(a, b); + r.val[1] = npyv_combineh_f32(a, b); + return r; +} +NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m128d a, __m128d b) +{ + npyv_f64x2 r; + r.val[0] = npyv_combinel_f64(a, b); + r.val[1] = npyv_combineh_f64(a, b); + return r; +} +#define npyv_combine_u8 npyv__combine +#define npyv_combine_s8 npyv__combine +#define npyv_combine_u16 npyv__combine +#define npyv_combine_s16 npyv__combine +#define npyv_combine_u32 npyv__combine +#define npyv_combine_s32 npyv__combine +#define npyv_combine_u64 npyv__combine +#define npyv_combine_s64 npyv__combine + +// interleave two vectors +#define NPYV_IMPL_SSE_ZIP(T_VEC, SFX, INTR_SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = _mm_unpacklo_##INTR_SFX(a, b); \ + r.val[1] = _mm_unpackhi_##INTR_SFX(a, b); \ + return r; \ + } + +NPYV_IMPL_SSE_ZIP(npyv_u8, u8, epi8) +NPYV_IMPL_SSE_ZIP(npyv_s8, s8, epi8) +NPYV_IMPL_SSE_ZIP(npyv_u16, u16, epi16) +NPYV_IMPL_SSE_ZIP(npyv_s16, s16, epi16) +NPYV_IMPL_SSE_ZIP(npyv_u32, u32, epi32) +NPYV_IMPL_SSE_ZIP(npyv_s32, s32, epi32) +NPYV_IMPL_SSE_ZIP(npyv_u64, u64, epi64) +NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64) +NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps) +NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd) + +#endif // _NPY_SIMD_SSE_REORDER_H diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h new file mode 100644 index 000000000..364b4baf1 --- /dev/null +++ b/numpy/core/src/common/simd/sse/sse.h @@ -0,0 +1,66 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#define NPY_SIMD 128 +#define NPY_SIMD_WIDTH 16 +#define NPY_SIMD_F64 1 + +typedef __m128i npyv_u8; +typedef __m128i npyv_s8; +typedef __m128i npyv_u16; +typedef __m128i npyv_s16; +typedef __m128i npyv_u32; +typedef __m128i npyv_s32; +typedef __m128i npyv_u64; +typedef __m128i npyv_s64; +typedef __m128 npyv_f32; +typedef __m128d npyv_f64; + +typedef __m128i npyv_b8; +typedef __m128i npyv_b16; +typedef __m128i npyv_b32; +typedef __m128i npyv_b64; + +typedef struct { __m128i val[2]; } npyv_m128ix2; +typedef npyv_m128ix2 npyv_u8x2; +typedef npyv_m128ix2 npyv_s8x2; +typedef npyv_m128ix2 npyv_u16x2; +typedef npyv_m128ix2 npyv_s16x2; +typedef npyv_m128ix2 npyv_u32x2; +typedef npyv_m128ix2 npyv_s32x2; +typedef npyv_m128ix2 npyv_u64x2; +typedef npyv_m128ix2 npyv_s64x2; + +typedef struct { __m128i val[3]; } npyv_m128ix3; +typedef npyv_m128ix3 npyv_u8x3; +typedef npyv_m128ix3 npyv_s8x3; +typedef npyv_m128ix3 npyv_u16x3; +typedef npyv_m128ix3 npyv_s16x3; +typedef npyv_m128ix3 npyv_u32x3; +typedef npyv_m128ix3 npyv_s32x3; +typedef npyv_m128ix3 npyv_u64x3; +typedef npyv_m128ix3 npyv_s64x3; + +typedef struct { __m128 val[2]; } npyv_f32x2; +typedef struct { __m128d val[2]; } npyv_f64x2; +typedef struct { __m128 val[3]; } npyv_f32x3; +typedef struct { __m128d val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 16 +#define npyv_nlanes_s8 16 +#define npyv_nlanes_u16 8 +#define npyv_nlanes_s16 8 +#define npyv_nlanes_u32 4 +#define npyv_nlanes_s32 4 +#define npyv_nlanes_u64 2 +#define npyv_nlanes_s64 2 +#define npyv_nlanes_f32 4 +#define npyv_nlanes_f64 2 + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h new file mode 100644 index 000000000..dd23b5b11 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/arithmetic.h @@ -0,0 +1,103 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_ARITHMETIC_H +#define _NPY_SIMD_VSX_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +#define npyv_add_u8 vec_add +#define npyv_add_s8 vec_add +#define npyv_add_u16 vec_add +#define npyv_add_s16 vec_add +#define npyv_add_u32 vec_add +#define npyv_add_s32 vec_add +#define npyv_add_u64 vec_add +#define npyv_add_s64 vec_add +#define npyv_add_f32 vec_add +#define npyv_add_f64 vec_add + +// saturated +#define npyv_adds_u8 vec_adds +#define npyv_adds_s8 vec_adds +#define npyv_adds_u16 vec_adds +#define npyv_adds_s16 vec_adds + +/*************************** + * Subtraction + ***************************/ +// non-saturated +#define npyv_sub_u8 vec_sub +#define npyv_sub_s8 vec_sub +#define npyv_sub_u16 vec_sub +#define npyv_sub_s16 vec_sub +#define npyv_sub_u32 vec_sub +#define npyv_sub_s32 vec_sub +#define npyv_sub_u64 vec_sub +#define npyv_sub_s64 vec_sub +#define npyv_sub_f32 vec_sub +#define npyv_sub_f64 vec_sub + +// saturated +#define npyv_subs_u8 vec_subs +#define npyv_subs_s8 vec_subs +#define npyv_subs_u16 vec_subs +#define npyv_subs_s16 vec_subs + +/*************************** + * Multiplication + ***************************/ +// non-saturated +// up to GCC 6 vec_mul only supports precisions and llong +#if defined(__GNUC__) && __GNUC__ < 7 + #define NPYV_IMPL_VSX_MUL(T_VEC, SFX, ...) \ + NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \ + { \ + const npyv_u8 ev_od = {__VA_ARGS__}; \ + return vec_perm( \ + (T_VEC)vec_mule(a, b), \ + (T_VEC)vec_mulo(a, b), ev_od \ + ); \ + } + + NPYV_IMPL_VSX_MUL(npyv_u8, u8, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30) + NPYV_IMPL_VSX_MUL(npyv_s8, s8, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30) + NPYV_IMPL_VSX_MUL(npyv_u16, u16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29) + NPYV_IMPL_VSX_MUL(npyv_s16, s16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29) + + // vmuluwm can be used for unsigned or signed 32-bit integers + #define NPYV_IMPL_VSX_MUL_32(T_VEC, SFX) \ + NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC ret; \ + __asm__ __volatile__( \ + "vmuluwm %0,%1,%2" : \ + "=v" (ret) : "v" (a), "v" (b) \ + ); \ + return ret; \ + } + + NPYV_IMPL_VSX_MUL_32(npyv_u32, u32) + NPYV_IMPL_VSX_MUL_32(npyv_s32, s32) + +#else + #define npyv_mul_u8 vec_mul + #define npyv_mul_s8 vec_mul + #define npyv_mul_u16 vec_mul + #define npyv_mul_s16 vec_mul + #define npyv_mul_u32 vec_mul + #define npyv_mul_s32 vec_mul +#endif +#define npyv_mul_f32 vec_mul +#define npyv_mul_f64 vec_mul + +/*************************** + * Division + ***************************/ +#define npyv_div_f32 vec_div +#define npyv_div_f64 vec_div + +#endif // _NPY_SIMD_VSX_ARITHMETIC_H diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h new file mode 100644 index 000000000..6ed135990 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/conversion.h @@ -0,0 +1,32 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_CVT_H +#define _NPY_SIMD_VSX_CVT_H + +// convert boolean vectors to integer vectors +#define npyv_cvt_u8_b8(BL) ((npyv_u8) BL) +#define npyv_cvt_s8_b8(BL) ((npyv_s8) BL) +#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL) +#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL) +#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL) +#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL) +#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL) +#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL) +#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL) +#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL) + +// convert integer vectors to boolean vectors +#define npyv_cvt_b8_u8(A) ((npyv_b8) A) +#define npyv_cvt_b8_s8(A) ((npyv_b8) A) +#define npyv_cvt_b16_u16(A) ((npyv_b16) A) +#define npyv_cvt_b16_s16(A) ((npyv_b16) A) +#define npyv_cvt_b32_u32(A) ((npyv_b32) A) +#define npyv_cvt_b32_s32(A) ((npyv_b32) A) +#define npyv_cvt_b64_u64(A) ((npyv_b64) A) +#define npyv_cvt_b64_s64(A) ((npyv_b64) A) +#define npyv_cvt_b32_f32(A) ((npyv_b32) A) +#define npyv_cvt_b64_f64(A) ((npyv_b64) A) + +#endif // _NPY_SIMD_VSX_CVT_H diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h new file mode 100644 index 000000000..e0d908bf9 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/memory.h @@ -0,0 +1,150 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_MEMORY_H +#define _NPY_SIMD_VSX_MEMORY_H +/**************************** + * load/store + ****************************/ +// TODO: test load by cast +#define VSX__CAST_lOAD 0 +#if VSX__CAST_lOAD + #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR))) +#else + /** + * CLANG fails to load unaligned addresses via vec_xl, vec_xst + * so we failback to vec_vsx_ld, vec_vsx_st + */ + #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) + #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR) + #else + #define npyv__load(PTR, T_VEC) vec_xl(0, PTR) + #endif +#endif +// unaligned load +#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8) +#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8) +#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16) +#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16) +#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32) +#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32) +#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32) +#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64) +#if VSX__CAST_lOAD + #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64) + #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64) +#else + #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR)) + #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR)) +#endif +// aligned load +#define npyv_loada_u8(PTR) vec_ld(0, PTR) +#define npyv_loada_s8 npyv_loada_u8 +#define npyv_loada_u16 npyv_loada_u8 +#define npyv_loada_s16 npyv_loada_u8 +#define npyv_loada_u32 npyv_loada_u8 +#define npyv_loada_s32 npyv_loada_u8 +#define npyv_loada_u64 npyv_load_u64 +#define npyv_loada_s64 npyv_load_s64 +#define npyv_loada_f32 npyv_loada_u8 +#define npyv_loada_f64 npyv_load_f64 +// stream load +#define npyv_loads_u8 npyv_loada_u8 +#define npyv_loads_s8 npyv_loada_s8 +#define npyv_loads_u16 npyv_loada_u16 +#define npyv_loads_s16 npyv_loada_s16 +#define npyv_loads_u32 npyv_loada_u32 +#define npyv_loads_s32 npyv_loada_s32 +#define npyv_loads_u64 npyv_loada_u64 +#define npyv_loads_s64 npyv_loada_s64 +#define npyv_loads_f32 npyv_loada_f32 +#define npyv_loads_f64 npyv_loada_f64 +// load lower part +// avoid aliasing rules +#ifdef __cplusplus + template<typename T_PTR> + NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr) + { return npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr; } +#else + NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr) + { npy_uint64 *ptr64 = ptr; return ptr64; } +#endif // __cplusplus +#if defined(__clang__) && !defined(__IBMC__) + // vec_promote doesn't support doubleword on clang + #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR)) +#else + #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0) +#endif +#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR)) +#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR)) +#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR)) +#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR)) +#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR)) +#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR)) +#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR)) +#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR)) +#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR)) +// unaligned store +#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__)) + #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR) +#else + #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR) +#endif +#define npyv_store_s8 npyv_store_u8 +#define npyv_store_u16 npyv_store_u8 +#define npyv_store_s16 npyv_store_u8 +#define npyv_store_u32 npyv_store_u8 +#define npyv_store_s32 npyv_store_u8 +#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC) +#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC) +#define npyv_store_f32 npyv_store_u8 +#define npyv_store_f64 npyv_store_u8 +// aligned store +#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR) +#define npyv_storea_s8 npyv_storea_u8 +#define npyv_storea_u16 npyv_storea_u8 +#define npyv_storea_s16 npyv_storea_u8 +#define npyv_storea_u32 npyv_storea_u8 +#define npyv_storea_s32 npyv_storea_u8 +#define npyv_storea_u64 npyv_store_u64 +#define npyv_storea_s64 npyv_store_s64 +#define npyv_storea_f32 npyv_storea_u8 +#define npyv_storea_f64 npyv_store_f64 +// stream store +#define npyv_stores_u8 npyv_storea_u8 +#define npyv_stores_s8 npyv_storea_s8 +#define npyv_stores_u16 npyv_storea_u16 +#define npyv_stores_s16 npyv_storea_s16 +#define npyv_stores_u32 npyv_storea_u32 +#define npyv_stores_s32 npyv_storea_s32 +#define npyv_stores_u64 npyv_storea_u64 +#define npyv_stores_s64 npyv_storea_s64 +#define npyv_stores_f32 npyv_storea_f32 +#define npyv_stores_f64 npyv_storea_f64 +// store lower part +#define npyv_storel_u8(PTR, VEC) \ + *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0) +#define npyv_storel_s8 npyv_storel_u8 +#define npyv_storel_u16 npyv_storel_u8 +#define npyv_storel_s16 npyv_storel_u8 +#define npyv_storel_u32 npyv_storel_u8 +#define npyv_storel_s32 npyv_storel_u8 +#define npyv_storel_s64 npyv_storel_u8 +#define npyv_storel_u64 npyv_storel_u8 +#define npyv_storel_f32 npyv_storel_u8 +#define npyv_storel_f64 npyv_storel_u8 +// store higher part +#define npyv_storeh_u8(PTR, VEC) \ + *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1) +#define npyv_storeh_s8 npyv_storeh_u8 +#define npyv_storeh_u16 npyv_storeh_u8 +#define npyv_storeh_s16 npyv_storeh_u8 +#define npyv_storeh_u32 npyv_storeh_u8 +#define npyv_storeh_s32 npyv_storeh_u8 +#define npyv_storeh_s64 npyv_storeh_u8 +#define npyv_storeh_u64 npyv_storeh_u8 +#define npyv_storeh_f32 npyv_storeh_u8 +#define npyv_storeh_f64 npyv_storeh_u8 + +#endif // _NPY_SIMD_VSX_MEMORY_H diff --git a/numpy/core/src/common/simd/vsx/misc.h b/numpy/core/src/common/simd/vsx/misc.h new file mode 100644 index 000000000..f7a0cdd5c --- /dev/null +++ b/numpy/core/src/common/simd/vsx/misc.h @@ -0,0 +1,190 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_MISC_H +#define _NPY_SIMD_VSX_MISC_H + +// vector with zero lanes +#define npyv_zero_u8() ((npyv_u8) npyv_setall_s32(0)) +#define npyv_zero_s8() ((npyv_s8) npyv_setall_s32(0)) +#define npyv_zero_u16() ((npyv_u16) npyv_setall_s32(0)) +#define npyv_zero_s16() ((npyv_s16) npyv_setall_s32(0)) +#define npyv_zero_u32() npyv_setall_u32(0) +#define npyv_zero_s32() npyv_setall_s32(0) +#define npyv_zero_u64() ((npyv_u64) npyv_setall_s32(0)) +#define npyv_zero_s64() ((npyv_s64) npyv_setall_s32(0)) +#define npyv_zero_f32() npyv_setall_f32(0.0f) +#define npyv_zero_f64() npyv_setall_f64(0.0) + +// vector with a specific value set to all lanes +// the safest way to generate vsplti* and vsplt* instructions +#define NPYV_IMPL_VSX_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V}) +#define NPYV_IMPL_VSX_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V}) +#define NPYV_IMPL_VSX_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V}) +#define NPYV_IMPL_VSX_SPLTD(T_VEC, V) ((T_VEC){V, V}) + +#define npyv_setall_u8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_u8, (unsigned char)VAL) +#define npyv_setall_s8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_s8, (signed char)VAL) +#define npyv_setall_u16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_u16, (unsigned short)VAL) +#define npyv_setall_s16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_s16, (short)VAL) +#define npyv_setall_u32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_u32, (unsigned int)VAL) +#define npyv_setall_s32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_s32, (int)VAL) +#define npyv_setall_f32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_f32, VAL) +#define npyv_setall_u64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_u64, (npy_uint64)VAL) +#define npyv_setall_s64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_s64, (npy_int64)VAL) +#define npyv_setall_f64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_f64, VAL) + +// vector with specific values set to each lane and +// set a specific value to all remained lanes +#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) +#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)}) +#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) +#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)}) +#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)}) +#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)}) +#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)}) +#define npyv_setf_f64(FILL, ...) ((npyv_f64){NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)}) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +#define npyv_select_u8(MASK, A, B) vec_sel(B, A, MASK) +#define npyv_select_s8 npyv_select_u8 +#define npyv_select_u16 npyv_select_u8 +#define npyv_select_s16 npyv_select_u8 +#define npyv_select_u32 npyv_select_u8 +#define npyv_select_s32 npyv_select_u8 +#define npyv_select_u64 npyv_select_u8 +#define npyv_select_s64 npyv_select_u8 +#define npyv_select_f32 npyv_select_u8 +#define npyv_select_f64 npyv_select_u8 + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +#define npyv_reinterpret_u8_s8(X) ((npyv_u8)X) +#define npyv_reinterpret_u8_u16 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_s16 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_u32 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_s32 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_u64 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_s64 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8 +#define npyv_reinterpret_u8_f64 npyv_reinterpret_u8_s8 + +#define npyv_reinterpret_s8_s8(X) X +#define npyv_reinterpret_s8_u8(X) ((npyv_s8)X) +#define npyv_reinterpret_s8_u16 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_s16 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_u32 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_s32 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_u64 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_s64 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8 +#define npyv_reinterpret_s8_f64 npyv_reinterpret_s8_u8 + +#define npyv_reinterpret_u16_u16(X) X +#define npyv_reinterpret_u16_u8(X) ((npyv_u16)X) +#define npyv_reinterpret_u16_s8 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_s16 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_u32 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_s32 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_u64 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_s64 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8 +#define npyv_reinterpret_u16_f64 npyv_reinterpret_u16_u8 + +#define npyv_reinterpret_s16_s16(X) X +#define npyv_reinterpret_s16_u8(X) ((npyv_s16)X) +#define npyv_reinterpret_s16_s8 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_u16 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_u32 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_s32 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_u64 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_s64 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8 +#define npyv_reinterpret_s16_f64 npyv_reinterpret_s16_u8 + +#define npyv_reinterpret_u32_u32(X) X +#define npyv_reinterpret_u32_u8(X) ((npyv_u32)X) +#define npyv_reinterpret_u32_s8 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_u16 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_s16 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_s32 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_u64 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_s64 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8 +#define npyv_reinterpret_u32_f64 npyv_reinterpret_u32_u8 + +#define npyv_reinterpret_s32_s32(X) X +#define npyv_reinterpret_s32_u8(X) ((npyv_s32)X) +#define npyv_reinterpret_s32_s8 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_u16 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_s16 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_u32 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_u64 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_s64 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8 +#define npyv_reinterpret_s32_f64 npyv_reinterpret_s32_u8 + +#define npyv_reinterpret_u64_u64(X) X +#define npyv_reinterpret_u64_u8(X) ((npyv_u64)X) +#define npyv_reinterpret_u64_s8 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_u16 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_s16 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_u32 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_s32 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_s64 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8 +#define npyv_reinterpret_u64_f64 npyv_reinterpret_u64_u8 + +#define npyv_reinterpret_s64_s64(X) X +#define npyv_reinterpret_s64_u8(X) ((npyv_s64)X) +#define npyv_reinterpret_s64_s8 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_u16 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_s16 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_u32 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_s32 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_u64 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8 +#define npyv_reinterpret_s64_f64 npyv_reinterpret_s64_u8 + +#define npyv_reinterpret_f32_f32(X) X +#define npyv_reinterpret_f32_u8(X) ((npyv_f32)X) +#define npyv_reinterpret_f32_s8 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8 +#define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8 + +#define npyv_reinterpret_f64_f64(X) X +#define npyv_reinterpret_f64_u8(X) ((npyv_f64)X) +#define npyv_reinterpret_f64_s8 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_u16 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_s16 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_u32 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_s32 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_u64 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_s64 npyv_reinterpret_f64_u8 +#define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8 + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif // _NPY_SIMD_VSX_MISC_H diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h new file mode 100644 index 000000000..ca020d9e0 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/operators.h @@ -0,0 +1,216 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_OPERATORS_H +#define _NPY_SIMD_VSX_OPERATORS_H + +/*************************** + * Shifting + ***************************/ + +// Left +#define npyv_shl_u16(A, C) vec_sl(A, npyv_setall_u16(C)) +#define npyv_shl_s16(A, C) vec_sl(A, npyv_setall_u16(C)) +#define npyv_shl_u32(A, C) vec_sl(A, npyv_setall_u32(C)) +#define npyv_shl_s32(A, C) vec_sl(A, npyv_setall_u32(C)) +#define npyv_shl_u64(A, C) vec_sl(A, npyv_setall_u64(C)) +#define npyv_shl_s64(A, C) vec_sl(A, npyv_setall_u64(C)) + +// Left by an immediate constant +#define npyv_shli_u16 npyv_shl_u16 +#define npyv_shli_s16 npyv_shl_s16 +#define npyv_shli_u32 npyv_shl_u32 +#define npyv_shli_s32 npyv_shl_s32 +#define npyv_shli_u64 npyv_shl_u64 +#define npyv_shli_s64 npyv_shl_s64 + +// Right +#define npyv_shr_u16(A, C) vec_sr(A, npyv_setall_u16(C)) +#define npyv_shr_s16(A, C) vec_sra(A, npyv_setall_u16(C)) +#define npyv_shr_u32(A, C) vec_sr(A, npyv_setall_u32(C)) +#define npyv_shr_s32(A, C) vec_sra(A, npyv_setall_u32(C)) +#define npyv_shr_u64(A, C) vec_sr(A, npyv_setall_u64(C)) +#define npyv_shr_s64(A, C) vec_sra(A, npyv_setall_u64(C)) + +// Right by an immediate constant +#define npyv_shri_u16 npyv_shr_u16 +#define npyv_shri_s16 npyv_shr_s16 +#define npyv_shri_u32 npyv_shr_u32 +#define npyv_shri_s32 npyv_shr_s32 +#define npyv_shri_u64 npyv_shr_u64 +#define npyv_shri_s64 npyv_shr_s64 + +/*************************** + * Logical + ***************************/ + +// AND +#define npyv_and_u8 vec_and +#define npyv_and_s8 vec_and +#define npyv_and_u16 vec_and +#define npyv_and_s16 vec_and +#define npyv_and_u32 vec_and +#define npyv_and_s32 vec_and +#define npyv_and_u64 vec_and +#define npyv_and_s64 vec_and +#define npyv_and_f32 vec_and +#define npyv_and_f64 vec_and + +// OR +#define npyv_or_u8 vec_or +#define npyv_or_s8 vec_or +#define npyv_or_u16 vec_or +#define npyv_or_s16 vec_or +#define npyv_or_u32 vec_or +#define npyv_or_s32 vec_or +#define npyv_or_u64 vec_or +#define npyv_or_s64 vec_or +#define npyv_or_f32 vec_or +#define npyv_or_f64 vec_or + +// XOR +#define npyv_xor_u8 vec_xor +#define npyv_xor_s8 vec_xor +#define npyv_xor_u16 vec_xor +#define npyv_xor_s16 vec_xor +#define npyv_xor_u32 vec_xor +#define npyv_xor_s32 vec_xor +#define npyv_xor_u64 vec_xor +#define npyv_xor_s64 vec_xor +#define npyv_xor_f32 vec_xor +#define npyv_xor_f64 vec_xor + +// NOT +// note: we implement npyv_not_b*(boolen types) for internal use*/ +#define NPYV_IMPL_VSX_NOT_INT(VEC_LEN) \ + NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \ + { return vec_nor(a, a); } \ + NPY_FINLINE npyv_s##VEC_LEN npyv_not_s##VEC_LEN(npyv_s##VEC_LEN a) \ + { return vec_nor(a, a); } \ + NPY_FINLINE npyv_b##VEC_LEN npyv_not_b##VEC_LEN(npyv_b##VEC_LEN a) \ + { return vec_nor(a, a); } + +NPYV_IMPL_VSX_NOT_INT(8) +NPYV_IMPL_VSX_NOT_INT(16) +NPYV_IMPL_VSX_NOT_INT(32) + +// up to gcc5 vec_nor doesn't support bool long long +#if defined(__GNUC__) && __GNUC__ > 5 + NPYV_IMPL_VSX_NOT_INT(64) +#else + NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a) + { return vec_nor(a, a); } + NPY_FINLINE npyv_s64 npyv_not_s64(npyv_s64 a) + { return vec_nor(a, a); } + NPY_FINLINE npyv_b64 npyv_not_b64(npyv_b64 a) + { return (npyv_b64)vec_nor((npyv_u64)a, (npyv_u64)a); } +#endif + +NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) +{ return vec_nor(a, a); } +NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) +{ return vec_nor(a, a); } + +/*************************** + * Comparison + ***************************/ + +// Int Equal +#define npyv_cmpeq_u8 vec_cmpeq +#define npyv_cmpeq_s8 vec_cmpeq +#define npyv_cmpeq_u16 vec_cmpeq +#define npyv_cmpeq_s16 vec_cmpeq +#define npyv_cmpeq_u32 vec_cmpeq +#define npyv_cmpeq_s32 vec_cmpeq +#define npyv_cmpeq_u64 vec_cmpeq +#define npyv_cmpeq_s64 vec_cmpeq +#define npyv_cmpeq_f32 vec_cmpeq +#define npyv_cmpeq_f64 vec_cmpeq + +// Int Not Equal +#ifdef NPY_HAVE_VSX3 + #define npyv_cmpneq_u8 vec_cmpne + #define npyv_cmpneq_s8 vec_cmpne + #define npyv_cmpneq_u16 vec_cmpne + #define npyv_cmpneq_s16 vec_cmpne + #define npyv_cmpneq_u32 vec_cmpne + #define npyv_cmpneq_s32 vec_cmpne + #define npyv_cmpneq_u64 vec_cmpne + #define npyv_cmpneq_s64 vec_cmpne + #define npyv_cmpneq_f32 vec_cmpne + #define npyv_cmpneq_f64 vec_cmpne +#else + #define npyv_cmpneq_u8(A, B) npyv_not_b8(vec_cmpeq(A, B)) + #define npyv_cmpneq_s8(A, B) npyv_not_b8(vec_cmpeq(A, B)) + #define npyv_cmpneq_u16(A, B) npyv_not_b16(vec_cmpeq(A, B)) + #define npyv_cmpneq_s16(A, B) npyv_not_b16(vec_cmpeq(A, B)) + #define npyv_cmpneq_u32(A, B) npyv_not_b32(vec_cmpeq(A, B)) + #define npyv_cmpneq_s32(A, B) npyv_not_b32(vec_cmpeq(A, B)) + #define npyv_cmpneq_u64(A, B) npyv_not_b64(vec_cmpeq(A, B)) + #define npyv_cmpneq_s64(A, B) npyv_not_b64(vec_cmpeq(A, B)) + #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B)) + #define npyv_cmpneq_f64(A, B) npyv_not_b64(vec_cmpeq(A, B)) +#endif + +// Greater than +#define npyv_cmpgt_u8 vec_cmpgt +#define npyv_cmpgt_s8 vec_cmpgt +#define npyv_cmpgt_u16 vec_cmpgt +#define npyv_cmpgt_s16 vec_cmpgt +#define npyv_cmpgt_u32 vec_cmpgt +#define npyv_cmpgt_s32 vec_cmpgt +#define npyv_cmpgt_u64 vec_cmpgt +#define npyv_cmpgt_s64 vec_cmpgt +#define npyv_cmpgt_f32 vec_cmpgt +#define npyv_cmpgt_f64 vec_cmpgt + +// Greater than or equal +// up to gcc5 vec_cmpge only supports single and double precision +#if defined(__GNUC__) && __GNUC__ > 5 + #define npyv_cmpge_u8 vec_cmpge + #define npyv_cmpge_s8 vec_cmpge + #define npyv_cmpge_u16 vec_cmpge + #define npyv_cmpge_s16 vec_cmpge + #define npyv_cmpge_u32 vec_cmpge + #define npyv_cmpge_s32 vec_cmpge + #define npyv_cmpge_u64 vec_cmpge + #define npyv_cmpge_s64 vec_cmpge +#else + #define npyv_cmpge_u8(A, B) npyv_not_b8(vec_cmpgt(B, A)) + #define npyv_cmpge_s8(A, B) npyv_not_b8(vec_cmpgt(B, A)) + #define npyv_cmpge_u16(A, B) npyv_not_b16(vec_cmpgt(B, A)) + #define npyv_cmpge_s16(A, B) npyv_not_b16(vec_cmpgt(B, A)) + #define npyv_cmpge_u32(A, B) npyv_not_b32(vec_cmpgt(B, A)) + #define npyv_cmpge_s32(A, B) npyv_not_b32(vec_cmpgt(B, A)) + #define npyv_cmpge_u64(A, B) npyv_not_b64(vec_cmpgt(B, A)) + #define npyv_cmpge_s64(A, B) npyv_not_b64(vec_cmpgt(B, A)) +#endif +#define npyv_cmpge_f32 vec_cmpge +#define npyv_cmpge_f64 vec_cmpge + +// Less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) +#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A) + +// Less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) +#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) + +#endif // _NPY_SIMD_VSX_OPERATORS_H diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h new file mode 100644 index 000000000..bfb9115fa --- /dev/null +++ b/numpy/core/src/common/simd/vsx/reorder.h @@ -0,0 +1,65 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_VSX_REORDER_H +#define _NPY_SIMD_VSX_REORDER_H + +// combine lower part of two vectors +#define npyv__combinel(A, B) vec_mergeh((npyv_u64)(A), (npyv_u64)(B)) +#define npyv_combinel_u8(A, B) ((npyv_u8) npyv__combinel(A, B)) +#define npyv_combinel_s8(A, B) ((npyv_s8) npyv__combinel(A, B)) +#define npyv_combinel_u16(A, B) ((npyv_u16)npyv__combinel(A, B)) +#define npyv_combinel_s16(A, B) ((npyv_s16)npyv__combinel(A, B)) +#define npyv_combinel_u32(A, B) ((npyv_u32)npyv__combinel(A, B)) +#define npyv_combinel_s32(A, B) ((npyv_s32)npyv__combinel(A, B)) +#define npyv_combinel_u64 vec_mergeh +#define npyv_combinel_s64 vec_mergeh +#define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B)) +#define npyv_combinel_f64 vec_mergeh + +// combine higher part of two vectors +#define npyv__combineh(A, B) vec_mergel((npyv_u64)(A), (npyv_u64)(B)) +#define npyv_combineh_u8(A, B) ((npyv_u8) npyv__combineh(A, B)) +#define npyv_combineh_s8(A, B) ((npyv_s8) npyv__combineh(A, B)) +#define npyv_combineh_u16(A, B) ((npyv_u16)npyv__combineh(A, B)) +#define npyv_combineh_s16(A, B) ((npyv_s16)npyv__combineh(A, B)) +#define npyv_combineh_u32(A, B) ((npyv_u32)npyv__combineh(A, B)) +#define npyv_combineh_s32(A, B) ((npyv_s32)npyv__combineh(A, B)) +#define npyv_combineh_u64 vec_mergel +#define npyv_combineh_s64 vec_mergel +#define npyv_combineh_f32(A, B) ((npyv_f32)npyv__combineh(A, B)) +#define npyv_combineh_f64 vec_mergel + +/* + * combine: combine two vectors from lower and higher parts of two other vectors + * zip: interleave two vectors +*/ +#define NPYV_IMPL_VSX_COMBINE_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \ + r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vec_mergeh(a, b); \ + r.val[1] = vec_mergel(a, b); \ + return r; \ + } + +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u8, u8) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s8, s8) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u16, u16) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s16, s16) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u32, u32) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s32, s32) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u64, u64) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32) +NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64) + +#endif // _NPY_SIMD_VSX_REORDER_H diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h new file mode 100644 index 000000000..5525dc1e6 --- /dev/null +++ b/numpy/core/src/common/simd/vsx/vsx.h @@ -0,0 +1,64 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#define NPY_SIMD 128 +#define NPY_SIMD_WIDTH 16 +#define NPY_SIMD_F64 1 + +typedef __vector unsigned char npyv_u8; +typedef __vector signed char npyv_s8; +typedef __vector unsigned short npyv_u16; +typedef __vector signed short npyv_s16; +typedef __vector unsigned int npyv_u32; +typedef __vector signed int npyv_s32; +typedef __vector unsigned long long npyv_u64; +typedef __vector signed long long npyv_s64; +typedef __vector float npyv_f32; +typedef __vector double npyv_f64; + +typedef struct { npyv_u8 val[2]; } npyv_u8x2; +typedef struct { npyv_s8 val[2]; } npyv_s8x2; +typedef struct { npyv_u16 val[2]; } npyv_u16x2; +typedef struct { npyv_s16 val[2]; } npyv_s16x2; +typedef struct { npyv_u32 val[2]; } npyv_u32x2; +typedef struct { npyv_s32 val[2]; } npyv_s32x2; +typedef struct { npyv_u64 val[2]; } npyv_u64x2; +typedef struct { npyv_s64 val[2]; } npyv_s64x2; +typedef struct { npyv_f32 val[2]; } npyv_f32x2; +typedef struct { npyv_f64 val[2]; } npyv_f64x2; + +typedef struct { npyv_u8 val[3]; } npyv_u8x3; +typedef struct { npyv_s8 val[3]; } npyv_s8x3; +typedef struct { npyv_u16 val[3]; } npyv_u16x3; +typedef struct { npyv_s16 val[3]; } npyv_s16x3; +typedef struct { npyv_u32 val[3]; } npyv_u32x3; +typedef struct { npyv_s32 val[3]; } npyv_s32x3; +typedef struct { npyv_u64 val[3]; } npyv_u64x3; +typedef struct { npyv_s64 val[3]; } npyv_s64x3; +typedef struct { npyv_f32 val[3]; } npyv_f32x3; +typedef struct { npyv_f64 val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 16 +#define npyv_nlanes_s8 16 +#define npyv_nlanes_u16 8 +#define npyv_nlanes_s16 8 +#define npyv_nlanes_u32 4 +#define npyv_nlanes_s32 4 +#define npyv_nlanes_u64 2 +#define npyv_nlanes_s64 2 +#define npyv_nlanes_f32 4 +#define npyv_nlanes_f64 2 + +// using __bool with typdef cause ambiguous errors +#define npyv_b8 __vector __bool char +#define npyv_b16 __vector __bool short +#define npyv_b32 __vector __bool int +#define npyv_b64 __vector __bool long long + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" |