author     Matti Picus <matti.picus@gmail.com>  2020-07-10 13:50:34 +0300
committer  GitHub <noreply@github.com>  2020-07-10 13:50:34 +0300
commit     58da484abf6f466a9e8bf55c188cb5b501d31ceb (patch)
tree       67631e36868543fb1d2baa75c9f8a4358825351e
parent     c43b2bfb15281e170cc5006829ebc559bd1915e3 (diff)
parent     18d0fe5c1b2c4a95e0eee73de9aa086b2d64cf88 (diff)
download   numpy-58da484abf6f466a9e8bf55c188cb5b501d31ceb.tar.gz
Merge pull request #16397 from seiko2plus/implement_npyv
ENH: Implement the NumPy C SIMD vectorization interface
-rw-r--r--  numpy/core/include/numpy/npy_common.h           8
-rw-r--r--  numpy/core/setup.py                             1
-rw-r--r--  numpy/core/src/common/simd/avx2/arithmetic.h    75
-rw-r--r--  numpy/core/src/common/simd/avx2/avx2.h          67
-rw-r--r--  numpy/core/src/common/simd/avx2/conversion.h    32
-rw-r--r--  numpy/core/src/common/simd/avx2/memory.h        70
-rw-r--r--  numpy/core/src/common/simd/avx2/misc.h          223
-rw-r--r--  numpy/core/src/common/simd/avx2/operators.h     200
-rw-r--r--  numpy/core/src/common/simd/avx2/reorder.h       97
-rw-r--r--  numpy/core/src/common/simd/avx2/utils.h         21
-rw-r--r--  numpy/core/src/common/simd/avx512/arithmetic.h  116
-rw-r--r--  numpy/core/src/common/simd/avx512/avx512.h      71
-rw-r--r--  numpy/core/src/common/simd/avx512/conversion.h  54
-rw-r--r--  numpy/core/src/common/simd/avx512/memory.h      94
-rw-r--r--  numpy/core/src/common/simd/avx512/misc.h        252
-rw-r--r--  numpy/core/src/common/simd/avx512/operators.h   259
-rw-r--r--  numpy/core/src/common/simd/avx512/reorder.h     170
-rw-r--r--  numpy/core/src/common/simd/avx512/utils.h       70
-rw-r--r--  numpy/core/src/common/simd/neon/arithmetic.h    78
-rw-r--r--  numpy/core/src/common/simd/neon/conversion.h    32
-rw-r--r--  numpy/core/src/common/simd/neon/memory.h        49
-rw-r--r--  numpy/core/src/common/simd/neon/misc.h          255
-rw-r--r--  numpy/core/src/common/simd/neon/neon.h          74
-rw-r--r--  numpy/core/src/common/simd/neon/operators.h     218
-rw-r--r--  numpy/core/src/common/simd/neon/reorder.h       110
-rw-r--r--  numpy/core/src/common/simd/simd.h               56
-rw-r--r--  numpy/core/src/common/simd/simd_utils.h         48
-rw-r--r--  numpy/core/src/common/simd/sse/arithmetic.h     95
-rw-r--r--  numpy/core/src/common/simd/sse/conversion.h     32
-rw-r--r--  numpy/core/src/common/simd/sse/memory.h         74
-rw-r--r--  numpy/core/src/common/simd/sse/misc.h           230
-rw-r--r--  numpy/core/src/common/simd/sse/operators.h      258
-rw-r--r--  numpy/core/src/common/simd/sse/reorder.h        84
-rw-r--r--  numpy/core/src/common/simd/sse/sse.h            66
-rw-r--r--  numpy/core/src/common/simd/vsx/arithmetic.h     103
-rw-r--r--  numpy/core/src/common/simd/vsx/conversion.h     32
-rw-r--r--  numpy/core/src/common/simd/vsx/memory.h         150
-rw-r--r--  numpy/core/src/common/simd/vsx/misc.h           190
-rw-r--r--  numpy/core/src/common/simd/vsx/operators.h      216
-rw-r--r--  numpy/core/src/common/simd/vsx/reorder.h        65
-rw-r--r--  numpy/core/src/common/simd/vsx/vsx.h            64
41 files changed, 4459 insertions, 0 deletions
diff --git a/numpy/core/include/numpy/npy_common.h b/numpy/core/include/numpy/npy_common.h
index 3cec0c6ff..5706e0576 100644
--- a/numpy/core/include/numpy/npy_common.h
+++ b/numpy/core/include/numpy/npy_common.h
@@ -141,6 +141,14 @@
#define NPY_INLINE
#endif
+#ifdef _MSC_VER
+ #define NPY_FINLINE static __forceinline
+#elif defined(__GNUC__)
+ #define NPY_FINLINE static NPY_INLINE __attribute__((always_inline))
+#else
+ #define NPY_FINLINE static
+#endif
+
#ifdef HAVE___THREAD
#define NPY_TLS __thread
#else
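Editor's note: the new NPY_FINLINE qualifier marks small header-only helpers as static and force-inlined on MSVC and GCC, falling back to plain static elsewhere. A minimal illustrative sketch of its intended use (not part of this diff; the helper name is made up):
    NPY_FINLINE int npyv__example_min_s32(int a, int b)
    { return a < b ? a : b; }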
diff --git a/numpy/core/setup.py b/numpy/core/setup.py
index 8e00e4392..aede12080 100644
--- a/numpy/core/setup.py
+++ b/numpy/core/setup.py
@@ -739,6 +739,7 @@ def configuration(parent_package='',top_path=None):
join('src', 'common', 'umathmodule.h'),
join('src', 'common', 'numpyos.h'),
join('src', 'common', 'npy_cpu_dispatch.h'),
+ join('src', 'common', 'simd', 'simd.h'),
]
common_src = [
diff --git a/numpy/core/src/common/simd/avx2/arithmetic.h b/numpy/core/src/common/simd/avx2/arithmetic.h
new file mode 100644
index 000000000..9d8b4ab5e
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/arithmetic.h
@@ -0,0 +1,75 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_ARITHMETIC_H
+#define _NPY_SIMD_AVX2_ARITHMETIC_H
+
+/***************************
+ * Addition
+ ***************************/
+// non-saturated
+#define npyv_add_u8 _mm256_add_epi8
+#define npyv_add_s8 _mm256_add_epi8
+#define npyv_add_u16 _mm256_add_epi16
+#define npyv_add_s16 _mm256_add_epi16
+#define npyv_add_u32 _mm256_add_epi32
+#define npyv_add_s32 _mm256_add_epi32
+#define npyv_add_u64 _mm256_add_epi64
+#define npyv_add_s64 _mm256_add_epi64
+#define npyv_add_f32 _mm256_add_ps
+#define npyv_add_f64 _mm256_add_pd
+
+// saturated
+#define npyv_adds_u8 _mm256_adds_epu8
+#define npyv_adds_s8 _mm256_adds_epi8
+#define npyv_adds_u16 _mm256_adds_epu16
+#define npyv_adds_s16 _mm256_adds_epi16
+// TODO: the rest, after implementing the Packs intrinsics
+
+/***************************
+ * Subtraction
+ ***************************/
+// non-saturated
+#define npyv_sub_u8 _mm256_sub_epi8
+#define npyv_sub_s8 _mm256_sub_epi8
+#define npyv_sub_u16 _mm256_sub_epi16
+#define npyv_sub_s16 _mm256_sub_epi16
+#define npyv_sub_u32 _mm256_sub_epi32
+#define npyv_sub_s32 _mm256_sub_epi32
+#define npyv_sub_u64 _mm256_sub_epi64
+#define npyv_sub_s64 _mm256_sub_epi64
+#define npyv_sub_f32 _mm256_sub_ps
+#define npyv_sub_f64 _mm256_sub_pd
+
+// saturated
+#define npyv_subs_u8 _mm256_subs_epu8
+#define npyv_subs_s8 _mm256_subs_epi8
+#define npyv_subs_u16 _mm256_subs_epu16
+#define npyv_subs_s16 _mm256_subs_epi16
+// TODO: the rest, after implementing the Packs intrinsics
+
+/***************************
+ * Multiplication
+ ***************************/
+// non-saturated
+#define npyv_mul_u8 npyv256_mul_u8
+#define npyv_mul_s8 npyv_mul_u8
+#define npyv_mul_u16 _mm256_mullo_epi16
+#define npyv_mul_s16 _mm256_mullo_epi16
+#define npyv_mul_u32 _mm256_mullo_epi32
+#define npyv_mul_s32 _mm256_mullo_epi32
+#define npyv_mul_f32 _mm256_mul_ps
+#define npyv_mul_f64 _mm256_mul_pd
+
+// saturated
+// TODO: after implementing the Packs intrinsics
+
+/***************************
+ * Division
+ ***************************/
+// TODO: emulate integer division
+#define npyv_div_f32 _mm256_div_ps
+#define npyv_div_f64 _mm256_div_pd
+
+#endif // _NPY_SIMD_AVX2_ARITHMETIC_H
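Editor's note: the saturated forms above clamp each lane to the type's range instead of wrapping. An illustrative scalar equivalent of what npyv_adds_u8 computes per lane (a sketch, not part of this diff):
    NPY_FINLINE npy_uint8 adds_u8_lane(npy_uint8 a, npy_uint8 b)
    {
        unsigned int s = (unsigned int)a + (unsigned int)b;
        return (npy_uint8)(s > 255 ? 255 : s);  // clamp at 255 instead of wrapping modulo 256
    }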
diff --git a/numpy/core/src/common/simd/avx2/avx2.h b/numpy/core/src/common/simd/avx2/avx2.h
new file mode 100644
index 000000000..c99d628ee
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/avx2.h
@@ -0,0 +1,67 @@
+#ifndef _NPY_SIMD_H_
+ #error "Not a standalone header"
+#endif
+
+#define NPY_SIMD 256
+#define NPY_SIMD_WIDTH 32
+#define NPY_SIMD_F64 1
+
+typedef __m256i npyv_u8;
+typedef __m256i npyv_s8;
+typedef __m256i npyv_u16;
+typedef __m256i npyv_s16;
+typedef __m256i npyv_u32;
+typedef __m256i npyv_s32;
+typedef __m256i npyv_u64;
+typedef __m256i npyv_s64;
+typedef __m256 npyv_f32;
+typedef __m256d npyv_f64;
+
+typedef __m256i npyv_b8;
+typedef __m256i npyv_b16;
+typedef __m256i npyv_b32;
+typedef __m256i npyv_b64;
+
+typedef struct { __m256i val[2]; } npyv_m256ix2;
+typedef npyv_m256ix2 npyv_u8x2;
+typedef npyv_m256ix2 npyv_s8x2;
+typedef npyv_m256ix2 npyv_u16x2;
+typedef npyv_m256ix2 npyv_s16x2;
+typedef npyv_m256ix2 npyv_u32x2;
+typedef npyv_m256ix2 npyv_s32x2;
+typedef npyv_m256ix2 npyv_u64x2;
+typedef npyv_m256ix2 npyv_s64x2;
+
+typedef struct { __m256i val[3]; } npyv_m256ix3;
+typedef npyv_m256ix3 npyv_u8x3;
+typedef npyv_m256ix3 npyv_s8x3;
+typedef npyv_m256ix3 npyv_u16x3;
+typedef npyv_m256ix3 npyv_s16x3;
+typedef npyv_m256ix3 npyv_u32x3;
+typedef npyv_m256ix3 npyv_s32x3;
+typedef npyv_m256ix3 npyv_u64x3;
+typedef npyv_m256ix3 npyv_s64x3;
+
+typedef struct { __m256 val[2]; } npyv_f32x2;
+typedef struct { __m256d val[2]; } npyv_f64x2;
+typedef struct { __m256 val[3]; } npyv_f32x3;
+typedef struct { __m256d val[3]; } npyv_f64x3;
+
+#define npyv_nlanes_u8 32
+#define npyv_nlanes_s8 32
+#define npyv_nlanes_u16 16
+#define npyv_nlanes_s16 16
+#define npyv_nlanes_u32 8
+#define npyv_nlanes_s32 8
+#define npyv_nlanes_u64 4
+#define npyv_nlanes_s64 4
+#define npyv_nlanes_f32 8
+#define npyv_nlanes_f64 4
+
+#include "utils.h"
+#include "memory.h"
+#include "misc.h"
+#include "reorder.h"
+#include "operators.h"
+#include "conversion.h"
+#include "arithmetic.h"
diff --git a/numpy/core/src/common/simd/avx2/conversion.h b/numpy/core/src/common/simd/avx2/conversion.h
new file mode 100644
index 000000000..9fd86016d
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/conversion.h
@@ -0,0 +1,32 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_CVT_H
+#define _NPY_SIMD_AVX2_CVT_H
+
+// convert mask types to integer types
+#define npyv_cvt_u8_b8(A) A
+#define npyv_cvt_s8_b8(A) A
+#define npyv_cvt_u16_b16(A) A
+#define npyv_cvt_s16_b16(A) A
+#define npyv_cvt_u32_b32(A) A
+#define npyv_cvt_s32_b32(A) A
+#define npyv_cvt_u64_b64(A) A
+#define npyv_cvt_s64_b64(A) A
+#define npyv_cvt_f32_b32(A) _mm256_castsi256_ps(A)
+#define npyv_cvt_f64_b64(A) _mm256_castsi256_pd(A)
+
+// convert integer types to mask types
+#define npyv_cvt_b8_u8(BL) BL
+#define npyv_cvt_b8_s8(BL) BL
+#define npyv_cvt_b16_u16(BL) BL
+#define npyv_cvt_b16_s16(BL) BL
+#define npyv_cvt_b32_u32(BL) BL
+#define npyv_cvt_b32_s32(BL) BL
+#define npyv_cvt_b64_u64(BL) BL
+#define npyv_cvt_b64_s64(BL) BL
+#define npyv_cvt_b32_f32(BL) _mm256_castps_si256(BL)
+#define npyv_cvt_b64_f64(BL) _mm256_castpd_si256(BL)
+
+#endif // _NPY_SIMD_AVX2_CVT_H
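Editor's note: on AVX2 the boolean types share the integer register type, so these conversions compile away except for the float/double bit-casts. Illustrative sketch (not part of this diff):
    NPY_FINLINE npyv_u32 mask_as_vector(npyv_b32 m)
    { return npyv_cvt_u32_b32(m); }  // expands to plain `m` on AVX2; non-trivial on AVX512, see further down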
diff --git a/numpy/core/src/common/simd/avx2/memory.h b/numpy/core/src/common/simd/avx2/memory.h
new file mode 100644
index 000000000..5ea7414fd
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/memory.h
@@ -0,0 +1,70 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MEMORY_H
+#define _NPY_SIMD_AVX2_MEMORY_H
+
+/***************************
+ * load/store
+ ***************************/
+#define NPYV_IMPL_AVX2_MEM_INT(CTYPE, SFX) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \
+ { return _mm256_loadu_si256((const __m256i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \
+ { return _mm256_load_si256((const __m256i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \
+ { return _mm256_stream_load_si256((const __m256i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \
+ { return _mm256_castsi128_si256(_mm_loadu_si128((const __m128i*)ptr)); } \
+ NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm256_storeu_si256((__m256i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm256_store_si256((__m256i*)ptr, vec); } \
+ NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm256_stream_si256((__m256i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_storeu_si128((__m128i*)(ptr), _mm256_castsi256_si128(vec)); } \
+ NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_storeu_si128((__m128i*)(ptr), _mm256_extracti128_si256(vec, 1)); }
+
+NPYV_IMPL_AVX2_MEM_INT(npy_uint8, u8)
+NPYV_IMPL_AVX2_MEM_INT(npy_int8, s8)
+NPYV_IMPL_AVX2_MEM_INT(npy_uint16, u16)
+NPYV_IMPL_AVX2_MEM_INT(npy_int16, s16)
+NPYV_IMPL_AVX2_MEM_INT(npy_uint32, u32)
+NPYV_IMPL_AVX2_MEM_INT(npy_int32, s32)
+NPYV_IMPL_AVX2_MEM_INT(npy_uint64, u64)
+NPYV_IMPL_AVX2_MEM_INT(npy_int64, s64)
+
+// unaligned load
+#define npyv_load_f32 _mm256_loadu_ps
+#define npyv_load_f64 _mm256_loadu_pd
+// aligned load
+#define npyv_loada_f32 _mm256_load_ps
+#define npyv_loada_f64 _mm256_load_pd
+// stream load
+#define npyv_loads_f32(PTR) \
+ _mm256_castsi256_ps(_mm256_stream_load_si256((const __m256i*)(PTR)))
+#define npyv_loads_f64(PTR) \
+ _mm256_castsi256_pd(_mm256_stream_load_si256((const __m256i*)(PTR)))
+// load lower part
+#define npyv_loadl_f32(PTR) _mm256_castps128_ps256(_mm_loadu_ps(PTR))
+#define npyv_loadl_f64(PTR) _mm256_castpd128_pd256(_mm_loadu_pd(PTR))
+// unaligned store
+#define npyv_store_f32 _mm256_storeu_ps
+#define npyv_store_f64 _mm256_storeu_pd
+// aligned store
+#define npyv_storea_f32 _mm256_store_ps
+#define npyv_storea_f64 _mm256_store_pd
+// stream store
+#define npyv_stores_f32 _mm256_stream_ps
+#define npyv_stores_f64 _mm256_stream_pd
+// store lower part
+#define npyv_storel_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_castps256_ps128(VEC))
+#define npyv_storel_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_castpd256_pd128(VEC))
+// store higher part
+#define npyv_storeh_f32(PTR, VEC) _mm_storeu_ps(PTR, _mm256_extractf128_ps(VEC, 1))
+#define npyv_storeh_f64(PTR, VEC) _mm_storeu_pd(PTR, _mm256_extractf128_pd(VEC, 1))
+
+#endif // _NPY_SIMD_AVX2_MEMORY_H
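Editor's note: putting the load/store and arithmetic pieces together, a minimal usage sketch of the interface (illustrative, not part of this diff; assumes `n` is already a multiple of npyv_nlanes_f32 and the remainder is handled elsewhere):
    static void add_f32_sketch(float *dst, const float *a, const float *b, npy_intp n)
    {
        for (npy_intp i = 0; i < n; i += npyv_nlanes_f32) {
            npyv_f32 va = npyv_load_f32(a + i);   // unaligned load
            npyv_f32 vb = npyv_load_f32(b + i);
            npyv_store_f32(dst + i, npyv_add_f32(va, vb));
        }
    }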
diff --git a/numpy/core/src/common/simd/avx2/misc.h b/numpy/core/src/common/simd/avx2/misc.h
new file mode 100644
index 000000000..e96696dc9
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/misc.h
@@ -0,0 +1,223 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_MISC_H
+#define _NPY_SIMD_AVX2_MISC_H
+
+// vector with zero lanes
+#define npyv_zero_u8 _mm256_setzero_si256
+#define npyv_zero_s8 _mm256_setzero_si256
+#define npyv_zero_u16 _mm256_setzero_si256
+#define npyv_zero_s16 _mm256_setzero_si256
+#define npyv_zero_u32 _mm256_setzero_si256
+#define npyv_zero_s32 _mm256_setzero_si256
+#define npyv_zero_u64 _mm256_setzero_si256
+#define npyv_zero_s64 _mm256_setzero_si256
+#define npyv_zero_f32 _mm256_setzero_ps
+#define npyv_zero_f64 _mm256_setzero_pd
+
+// vector with a specific value set to all lanes
+#define npyv_setall_u8(VAL) _mm256_set1_epi8((char)VAL)
+#define npyv_setall_s8(VAL) _mm256_set1_epi8((char)VAL)
+#define npyv_setall_u16(VAL) _mm256_set1_epi16((short)VAL)
+#define npyv_setall_s16(VAL) _mm256_set1_epi16((short)VAL)
+#define npyv_setall_u32(VAL) _mm256_set1_epi32((int)VAL)
+#define npyv_setall_s32(VAL) _mm256_set1_epi32(VAL)
+#define npyv_setall_u64(VAL) _mm256_set1_epi64x(VAL)
+#define npyv_setall_s64(VAL) _mm256_set1_epi64x(VAL)
+#define npyv_setall_f32(VAL) _mm256_set1_ps(VAL)
+#define npyv_setall_f64(VAL) _mm256_set1_pd(VAL)
+
+/*
+ * vector with specific values set to each lane and
+ * a specific value filled into all remaining lanes
+ *
+ * Arguments generated by NPYV__SET_FILL_* will not expand if
+ * _mm256_setr_* are defined as macros.
+ */
+NPY_FINLINE __m256i npyv__setr_epi8(
+ char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7,
+ char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15,
+ char i16, char i17, char i18, char i19, char i20, char i21, char i22, char i23,
+ char i24, char i25, char i26, char i27, char i28, char i29, char i30, char i31)
+{
+ return _mm256_setr_epi8(
+ i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
+ i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31
+ );
+}
+NPY_FINLINE __m256i npyv__setr_epi16(
+ short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7,
+ short i8, short i9, short i10, short i11, short i12, short i13, short i14, short i15)
+{
+ return _mm256_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+}
+NPY_FINLINE __m256i npyv__setr_epi32(int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7)
+{
+ return _mm256_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7);
+}
+NPY_FINLINE __m256i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3)
+{
+ return _mm256_setr_epi64x(i0, i1, i2, i3);
+}
+
+NPY_FINLINE __m256 npyv__setr_ps(float i0, float i1, float i2, float i3, float i4, float i5,
+ float i6, float i7)
+{
+ return _mm256_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7);
+}
+NPY_FINLINE __m256d npyv__setr_pd(double i0, double i1, double i2, double i3)
+{
+ return _mm256_setr_pd(i0, i1, i2, i3);
+}
+#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_32(char, FILL, __VA_ARGS__))
+#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_32(char, FILL, __VA_ARGS__))
+#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_16(short, FILL, __VA_ARGS__))
+#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_16(short, FILL, __VA_ARGS__))
+#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_8(int, FILL, __VA_ARGS__))
+#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_8(int, FILL, __VA_ARGS__))
+#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_4(npy_uint64, FILL, __VA_ARGS__))
+#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_4(npy_int64, FILL, __VA_ARGS__))
+#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_8(float, FILL, __VA_ARGS__))
+#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_4(double, FILL, __VA_ARGS__))
+
+// vector with specific values set to each lane and
+// set zero to all remaining lanes
+#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__)
+#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__)
+#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
+#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
+#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
+#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
+#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
+#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
+#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
+
+// Per lane select
+#define npyv_select_u8(MASK, A, B) _mm256_blendv_epi8(B, A, MASK)
+#define npyv_select_s8 npyv_select_u8
+#define npyv_select_u16 npyv_select_u8
+#define npyv_select_s16 npyv_select_u8
+#define npyv_select_u32 npyv_select_u8
+#define npyv_select_s32 npyv_select_u8
+#define npyv_select_u64 npyv_select_u8
+#define npyv_select_s64 npyv_select_u8
+#define npyv_select_f32(MASK, A, B) _mm256_blendv_ps(B, A, _mm256_castsi256_ps(MASK))
+#define npyv_select_f64(MASK, A, B) _mm256_blendv_pd(B, A, _mm256_castsi256_pd(MASK))
+
+// Reinterpret
+#define npyv_reinterpret_u8_u8(X) X
+#define npyv_reinterpret_u8_s8(X) X
+#define npyv_reinterpret_u8_u16(X) X
+#define npyv_reinterpret_u8_s16(X) X
+#define npyv_reinterpret_u8_u32(X) X
+#define npyv_reinterpret_u8_s32(X) X
+#define npyv_reinterpret_u8_u64(X) X
+#define npyv_reinterpret_u8_s64(X) X
+#define npyv_reinterpret_u8_f32 _mm256_castps_si256
+#define npyv_reinterpret_u8_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_s8_s8(X) X
+#define npyv_reinterpret_s8_u8(X) X
+#define npyv_reinterpret_s8_u16(X) X
+#define npyv_reinterpret_s8_s16(X) X
+#define npyv_reinterpret_s8_u32(X) X
+#define npyv_reinterpret_s8_s32(X) X
+#define npyv_reinterpret_s8_u64(X) X
+#define npyv_reinterpret_s8_s64(X) X
+#define npyv_reinterpret_s8_f32 _mm256_castps_si256
+#define npyv_reinterpret_s8_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_u16_u16(X) X
+#define npyv_reinterpret_u16_u8(X) X
+#define npyv_reinterpret_u16_s8(X) X
+#define npyv_reinterpret_u16_s16(X) X
+#define npyv_reinterpret_u16_u32(X) X
+#define npyv_reinterpret_u16_s32(X) X
+#define npyv_reinterpret_u16_u64(X) X
+#define npyv_reinterpret_u16_s64(X) X
+#define npyv_reinterpret_u16_f32 _mm256_castps_si256
+#define npyv_reinterpret_u16_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_s16_s16(X) X
+#define npyv_reinterpret_s16_u8(X) X
+#define npyv_reinterpret_s16_s8(X) X
+#define npyv_reinterpret_s16_u16(X) X
+#define npyv_reinterpret_s16_u32(X) X
+#define npyv_reinterpret_s16_s32(X) X
+#define npyv_reinterpret_s16_u64(X) X
+#define npyv_reinterpret_s16_s64(X) X
+#define npyv_reinterpret_s16_f32 _mm256_castps_si256
+#define npyv_reinterpret_s16_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_u32_u32(X) X
+#define npyv_reinterpret_u32_u8(X) X
+#define npyv_reinterpret_u32_s8(X) X
+#define npyv_reinterpret_u32_u16(X) X
+#define npyv_reinterpret_u32_s16(X) X
+#define npyv_reinterpret_u32_s32(X) X
+#define npyv_reinterpret_u32_u64(X) X
+#define npyv_reinterpret_u32_s64(X) X
+#define npyv_reinterpret_u32_f32 _mm256_castps_si256
+#define npyv_reinterpret_u32_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_s32_s32(X) X
+#define npyv_reinterpret_s32_u8(X) X
+#define npyv_reinterpret_s32_s8(X) X
+#define npyv_reinterpret_s32_u16(X) X
+#define npyv_reinterpret_s32_s16(X) X
+#define npyv_reinterpret_s32_u32(X) X
+#define npyv_reinterpret_s32_u64(X) X
+#define npyv_reinterpret_s32_s64(X) X
+#define npyv_reinterpret_s32_f32 _mm256_castps_si256
+#define npyv_reinterpret_s32_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_u64_u64(X) X
+#define npyv_reinterpret_u64_u8(X) X
+#define npyv_reinterpret_u64_s8(X) X
+#define npyv_reinterpret_u64_u16(X) X
+#define npyv_reinterpret_u64_s16(X) X
+#define npyv_reinterpret_u64_u32(X) X
+#define npyv_reinterpret_u64_s32(X) X
+#define npyv_reinterpret_u64_s64(X) X
+#define npyv_reinterpret_u64_f32 _mm256_castps_si256
+#define npyv_reinterpret_u64_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_s64_s64(X) X
+#define npyv_reinterpret_s64_u8(X) X
+#define npyv_reinterpret_s64_s8(X) X
+#define npyv_reinterpret_s64_u16(X) X
+#define npyv_reinterpret_s64_s16(X) X
+#define npyv_reinterpret_s64_u32(X) X
+#define npyv_reinterpret_s64_s32(X) X
+#define npyv_reinterpret_s64_u64(X) X
+#define npyv_reinterpret_s64_f32 _mm256_castps_si256
+#define npyv_reinterpret_s64_f64 _mm256_castpd_si256
+
+#define npyv_reinterpret_f32_f32(X) X
+#define npyv_reinterpret_f32_u8 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_s8 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_u16 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_s16 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_u32 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_s32 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_u64 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_s64 _mm256_castsi256_ps
+#define npyv_reinterpret_f32_f64 _mm256_castpd_ps
+
+#define npyv_reinterpret_f64_f64(X) X
+#define npyv_reinterpret_f64_u8 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_s8 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_u16 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_s16 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_u32 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_s32 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_u64 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_s64 _mm256_castsi256_pd
+#define npyv_reinterpret_f64_f32 _mm256_castps_pd
+
+#define npyv_cleanup _mm256_zeroall
+
+#endif // _NPY_SIMD_AVX2_MISC_H
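Editor's note: the difference between npyv_set_* and npyv_setf_* is what the trailing lanes receive. Illustrative sketch (not part of this diff):
    NPY_FINLINE void set_sketch(void)
    {
        // lanes 0..2 are 1,2,3; the remaining lanes are zero-filled
        npyv_u32 a = npyv_set_u32(1, 2, 3);
        // lanes 0..2 are 1,2,3; the remaining lanes are filled with 9
        npyv_u32 b = npyv_setf_u32(9, 1, 2, 3);
        (void)a; (void)b;
    }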
diff --git a/numpy/core/src/common/simd/avx2/operators.h b/numpy/core/src/common/simd/avx2/operators.h
new file mode 100644
index 000000000..c1d30413f
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/operators.h
@@ -0,0 +1,200 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_OPERATORS_H
+#define _NPY_SIMD_AVX2_OPERATORS_H
+
+/***************************
+ * Shifting
+ ***************************/
+
+// left
+#define npyv_shl_u16(A, C) _mm256_sll_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s16(A, C) _mm256_sll_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_u32(A, C) _mm256_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s32(A, C) _mm256_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_u64(A, C) _mm256_sll_epi64(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s64(A, C) _mm256_sll_epi64(A, _mm_cvtsi32_si128(C))
+
+// left by an immediate constant
+#define npyv_shli_u16 _mm256_slli_epi16
+#define npyv_shli_s16 _mm256_slli_epi16
+#define npyv_shli_u32 _mm256_slli_epi32
+#define npyv_shli_s32 _mm256_slli_epi32
+#define npyv_shli_u64 _mm256_slli_epi64
+#define npyv_shli_s64 _mm256_slli_epi64
+
+// right
+#define npyv_shr_u16(A, C) _mm256_srl_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s16(A, C) _mm256_sra_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_u32(A, C) _mm256_srl_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s32(A, C) _mm256_sra_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_u64(A, C) _mm256_srl_epi64(A, _mm_cvtsi32_si128(C))
+NPY_FINLINE __m256i npyv_shr_s64(__m256i a, int c)
+{
+ const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000);
+ const __m128i c64 = _mm_cvtsi32_si128(c);
+ __m256i r = _mm256_srl_epi64(_mm256_add_epi64(a, sbit), c64);
+ return _mm256_sub_epi64(r, _mm256_srl_epi64(sbit, c64));
+}
+
+// right by an immediate constant
+#define npyv_shri_u16 _mm256_srli_epi16
+#define npyv_shri_s16 _mm256_srai_epi16
+#define npyv_shri_u32 _mm256_srli_epi32
+#define npyv_shri_s32 _mm256_srai_epi32
+#define npyv_shri_u64 _mm256_srli_epi64
+#define npyv_shri_s64 npyv_shr_s64
+
+/***************************
+ * Logical
+ ***************************/
+// AND
+#define npyv_and_u8 _mm256_and_si256
+#define npyv_and_s8 _mm256_and_si256
+#define npyv_and_u16 _mm256_and_si256
+#define npyv_and_s16 _mm256_and_si256
+#define npyv_and_u32 _mm256_and_si256
+#define npyv_and_s32 _mm256_and_si256
+#define npyv_and_u64 _mm256_and_si256
+#define npyv_and_s64 _mm256_and_si256
+#define npyv_and_f32 _mm256_and_ps
+#define npyv_and_f64 _mm256_and_pd
+
+// OR
+#define npyv_or_u8 _mm256_or_si256
+#define npyv_or_s8 _mm256_or_si256
+#define npyv_or_u16 _mm256_or_si256
+#define npyv_or_s16 _mm256_or_si256
+#define npyv_or_u32 _mm256_or_si256
+#define npyv_or_s32 _mm256_or_si256
+#define npyv_or_u64 _mm256_or_si256
+#define npyv_or_s64 _mm256_or_si256
+#define npyv_or_f32 _mm256_or_ps
+#define npyv_or_f64 _mm256_or_pd
+
+// XOR
+#define npyv_xor_u8 _mm256_xor_si256
+#define npyv_xor_s8 _mm256_xor_si256
+#define npyv_xor_u16 _mm256_xor_si256
+#define npyv_xor_s16 _mm256_xor_si256
+#define npyv_xor_u32 _mm256_xor_si256
+#define npyv_xor_s32 _mm256_xor_si256
+#define npyv_xor_u64 _mm256_xor_si256
+#define npyv_xor_s64 _mm256_xor_si256
+#define npyv_xor_f32 _mm256_xor_ps
+#define npyv_xor_f64 _mm256_xor_pd
+
+// NOT
+#define npyv_not_u8(A) _mm256_xor_si256(A, _mm256_set1_epi32(-1))
+#define npyv_not_s8 npyv_not_u8
+#define npyv_not_u16 npyv_not_u8
+#define npyv_not_s16 npyv_not_u8
+#define npyv_not_u32 npyv_not_u8
+#define npyv_not_s32 npyv_not_u8
+#define npyv_not_u64 npyv_not_u8
+#define npyv_not_s64 npyv_not_u8
+#define npyv_not_f32(A) _mm256_xor_ps(A, _mm256_castsi256_ps(_mm256_set1_epi32(-1)))
+#define npyv_not_f64(A) _mm256_xor_pd(A, _mm256_castsi256_pd(_mm256_set1_epi32(-1)))
+
+/***************************
+ * Comparison
+ ***************************/
+
+// int Equal
+#define npyv_cmpeq_u8 _mm256_cmpeq_epi8
+#define npyv_cmpeq_s8 _mm256_cmpeq_epi8
+#define npyv_cmpeq_u16 _mm256_cmpeq_epi16
+#define npyv_cmpeq_s16 _mm256_cmpeq_epi16
+#define npyv_cmpeq_u32 _mm256_cmpeq_epi32
+#define npyv_cmpeq_s32 _mm256_cmpeq_epi32
+#define npyv_cmpeq_u64 _mm256_cmpeq_epi64
+#define npyv_cmpeq_s64 _mm256_cmpeq_epi64
+
+// int Not Equal
+#define npyv_cmpneq_u8(A, B) npyv_not_u8(_mm256_cmpeq_epi8(A, B))
+#define npyv_cmpneq_s8 npyv_cmpneq_u8
+#define npyv_cmpneq_u16(A, B) npyv_not_u16(_mm256_cmpeq_epi16(A, B))
+#define npyv_cmpneq_s16 npyv_cmpneq_u16
+#define npyv_cmpneq_u32(A, B) npyv_not_u32(_mm256_cmpeq_epi32(A, B))
+#define npyv_cmpneq_s32 npyv_cmpneq_u32
+#define npyv_cmpneq_u64(A, B) npyv_not_u64(_mm256_cmpeq_epi64(A, B))
+#define npyv_cmpneq_s64 npyv_cmpneq_u64
+
+// signed greater than
+#define npyv_cmpgt_s8 _mm256_cmpgt_epi8
+#define npyv_cmpgt_s16 _mm256_cmpgt_epi16
+#define npyv_cmpgt_s32 _mm256_cmpgt_epi32
+#define npyv_cmpgt_s64 _mm256_cmpgt_epi64
+
+// signed greater than or equal
+#define npyv_cmpge_s8(A, B) npyv_not_s8(_mm256_cmpgt_epi8(B, A))
+#define npyv_cmpge_s16(A, B) npyv_not_s16(_mm256_cmpgt_epi16(B, A))
+#define npyv_cmpge_s32(A, B) npyv_not_s32(_mm256_cmpgt_epi32(B, A))
+#define npyv_cmpge_s64(A, B) npyv_not_s64(_mm256_cmpgt_epi64(B, A))
+
+// unsigned greater than
+#define NPYV_IMPL_AVX2_UNSIGNED_GT(LEN, SIGN) \
+ NPY_FINLINE __m256i npyv_cmpgt_u##LEN(__m256i a, __m256i b) \
+ { \
+ const __m256i sbit = _mm256_set1_epi32(SIGN); \
+ return _mm256_cmpgt_epi##LEN( \
+ _mm256_xor_si256(a, sbit), _mm256_xor_si256(b, sbit) \
+ ); \
+ }
+
+NPYV_IMPL_AVX2_UNSIGNED_GT(8, 0x80808080)
+NPYV_IMPL_AVX2_UNSIGNED_GT(16, 0x80008000)
+NPYV_IMPL_AVX2_UNSIGNED_GT(32, 0x80000000)
+
+NPY_FINLINE __m256i npyv_cmpgt_u64(__m256i a, __m256i b)
+{
+ const __m256i sbit = _mm256_set1_epi64x(0x8000000000000000);
+ return _mm256_cmpgt_epi64(_mm256_xor_si256(a, sbit), _mm256_xor_si256(b, sbit));
+}
+
+// unsigned greater than or equal
+NPY_FINLINE __m256i npyv_cmpge_u8(__m256i a, __m256i b)
+{ return _mm256_cmpeq_epi8(a, _mm256_max_epu8(a, b)); }
+NPY_FINLINE __m256i npyv_cmpge_u16(__m256i a, __m256i b)
+{ return _mm256_cmpeq_epi16(a, _mm256_max_epu16(a, b)); }
+NPY_FINLINE __m256i npyv_cmpge_u32(__m256i a, __m256i b)
+{ return _mm256_cmpeq_epi32(a, _mm256_max_epu32(a, b)); }
+#define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A))
+
+// less than
+#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A)
+#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A)
+#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
+#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
+#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
+#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
+#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
+#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
+
+// less than or equal
+#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A)
+#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A)
+#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
+#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
+#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
+#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
+#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
+#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
+
+// precision comparison
+#define npyv_cmpeq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_EQ_OQ))
+#define npyv_cmpeq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_EQ_OQ))
+#define npyv_cmpneq_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_NEQ_OQ))
+#define npyv_cmpneq_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_NEQ_OQ))
+#define npyv_cmplt_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_LT_OQ))
+#define npyv_cmplt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_LT_OQ))
+#define npyv_cmple_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_LE_OQ))
+#define npyv_cmple_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_LE_OQ))
+#define npyv_cmpgt_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GT_OQ))
+#define npyv_cmpgt_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GT_OQ))
+#define npyv_cmpge_f32(A, B) _mm256_castps_si256(_mm256_cmp_ps(A, B, _CMP_GE_OQ))
+#define npyv_cmpge_f64(A, B) _mm256_castpd_si256(_mm256_cmp_pd(A, B, _CMP_GE_OQ))
+
+#endif // _NPY_SIMD_AVX2_OPERATORS_H
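Editor's note: the unsigned greater-than helpers above emulate the missing unsigned compares by flipping the sign bit of both operands and reusing the signed compare. A comparison combined with a per-lane select covers the common branch-free pattern; an illustrative sketch that zeroes negative lanes (not part of this diff):
    NPY_FINLINE npyv_f32 clamp_negatives_to_zero(npyv_f32 x)
    {
        npyv_b32 positive = npyv_cmpgt_f32(x, npyv_zero_f32());
        return npyv_select_f32(positive, x, npyv_zero_f32());  // keep x where positive, else 0
    }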
diff --git a/numpy/core/src/common/simd/avx2/reorder.h b/numpy/core/src/common/simd/avx2/reorder.h
new file mode 100644
index 000000000..5a9e68e32
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/reorder.h
@@ -0,0 +1,97 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_REORDER_H
+#define _NPY_SIMD_AVX2_REORDER_H
+
+// combine lower part of two vectors
+#define npyv_combinel_u8(A, B) _mm256_permute2x128_si256(A, B, 0x20)
+#define npyv_combinel_s8 npyv_combinel_u8
+#define npyv_combinel_u16 npyv_combinel_u8
+#define npyv_combinel_s16 npyv_combinel_u8
+#define npyv_combinel_u32 npyv_combinel_u8
+#define npyv_combinel_s32 npyv_combinel_u8
+#define npyv_combinel_u64 npyv_combinel_u8
+#define npyv_combinel_s64 npyv_combinel_u8
+#define npyv_combinel_f32(A, B) _mm256_permute2f128_ps(A, B, 0x20)
+#define npyv_combinel_f64(A, B) _mm256_permute2f128_pd(A, B, 0x20)
+
+// combine higher part of two vectors
+#define npyv_combineh_u8(A, B) _mm256_permute2x128_si256(A, B, 0x31)
+#define npyv_combineh_s8 npyv_combineh_u8
+#define npyv_combineh_u16 npyv_combineh_u8
+#define npyv_combineh_s16 npyv_combineh_u8
+#define npyv_combineh_u32 npyv_combineh_u8
+#define npyv_combineh_s32 npyv_combineh_u8
+#define npyv_combineh_u64 npyv_combineh_u8
+#define npyv_combineh_s64 npyv_combineh_u8
+#define npyv_combineh_f32(A, B) _mm256_permute2f128_ps(A, B, 0x31)
+#define npyv_combineh_f64(A, B) _mm256_permute2f128_pd(A, B, 0x31)
+
+// combine two vectors from lower and higher parts of two other vectors
+NPY_FINLINE npyv_m256ix2 npyv__combine(__m256i a, __m256i b)
+{
+ npyv_m256ix2 r;
+ __m256i a1b0 = _mm256_permute2x128_si256(a, b, 0x21);
+ r.val[0] = _mm256_blend_epi32(a, a1b0, 0xF0);
+ r.val[1] = _mm256_blend_epi32(b, a1b0, 0xF);
+ return r;
+}
+NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m256 a, __m256 b)
+{
+ npyv_f32x2 r;
+ __m256 a1b0 = _mm256_permute2f128_ps(a, b, 0x21);
+ r.val[0] = _mm256_blend_ps(a, a1b0, 0xF0);
+ r.val[1] = _mm256_blend_ps(b, a1b0, 0xF);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m256d a, __m256d b)
+{
+ npyv_f64x2 r;
+ __m256d a1b0 = _mm256_permute2f128_pd(a, b, 0x21);
+ r.val[0] = _mm256_blend_pd(a, a1b0, 0xC);
+ r.val[1] = _mm256_blend_pd(b, a1b0, 0x3);
+ return r;
+}
+#define npyv_combine_u8 npyv__combine
+#define npyv_combine_s8 npyv__combine
+#define npyv_combine_u16 npyv__combine
+#define npyv_combine_s16 npyv__combine
+#define npyv_combine_u32 npyv__combine
+#define npyv_combine_s32 npyv__combine
+#define npyv_combine_u64 npyv__combine
+#define npyv_combine_s64 npyv__combine
+
+// interleave two vectors
+#define NPYV_IMPL_AVX2_ZIP_U(T_VEC, LEN) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_u##LEN(T_VEC a, T_VEC b) \
+ { \
+ __m256i ab0 = _mm256_unpacklo_epi##LEN(a, b); \
+ __m256i ab1 = _mm256_unpackhi_epi##LEN(a, b); \
+ return npyv__combine(ab0, ab1); \
+ }
+
+NPYV_IMPL_AVX2_ZIP_U(npyv_u8, 8)
+NPYV_IMPL_AVX2_ZIP_U(npyv_u16, 16)
+NPYV_IMPL_AVX2_ZIP_U(npyv_u32, 32)
+NPYV_IMPL_AVX2_ZIP_U(npyv_u64, 64)
+#define npyv_zip_s8 npyv_zip_u8
+#define npyv_zip_s16 npyv_zip_u16
+#define npyv_zip_s32 npyv_zip_u32
+#define npyv_zip_s64 npyv_zip_u64
+
+NPY_FINLINE npyv_f32x2 npyv_zip_f32(__m256 a, __m256 b)
+{
+ __m256 ab0 = _mm256_unpacklo_ps(a, b);
+ __m256 ab1 = _mm256_unpackhi_ps(a, b);
+ return npyv_combine_f32(ab0, ab1);
+}
+NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m256d a, __m256d b)
+{
+ __m256d ab0 = _mm256_unpacklo_pd(a, b);
+ __m256d ab1 = _mm256_unpackhi_pd(a, b);
+ return npyv_combine_f64(ab0, ab1);
+}
+
+#endif // _NPY_SIMD_AVX2_REORDER_H
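Editor's note: the zip helpers interleave whole vectors; the extra npyv__combine pass compensates for the per-128-bit-lane behaviour of the AVX2 unpack intrinsics. Expected lane layout, as an illustrative sketch (not part of this diff):
    NPY_FINLINE npyv_f32x2 interleave_sketch(npyv_f32 a, npyv_f32 b)
    {
        // with a = {a0..a7} and b = {b0..b7}:
        //   result.val[0] = {a0,b0,a1,b1,a2,b2,a3,b3}
        //   result.val[1] = {a4,b4,a5,b5,a6,b6,a7,b7}
        return npyv_zip_f32(a, b);
    }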
diff --git a/numpy/core/src/common/simd/avx2/utils.h b/numpy/core/src/common/simd/avx2/utils.h
new file mode 100644
index 000000000..24f1af5d1
--- /dev/null
+++ b/numpy/core/src/common/simd/avx2/utils.h
@@ -0,0 +1,21 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX2_UTILS_H
+#define _NPY_SIMD_AVX2_UTILS_H
+
+#define npyv256_shuffle_odd(A) _mm256_permute4x64_epi64(A, _MM_SHUFFLE(3, 1, 2, 0))
+#define npyv256_shuffle_odd_ps(A) _mm256_castsi256_ps(npyv256_shuffle_odd(_mm256_castps_si256(A)))
+#define npyv256_shuffle_odd_pd(A) _mm256_permute4x64_pd(A, _MM_SHUFFLE(3, 1, 2, 0))
+
+NPY_FINLINE __m256i npyv256_mul_u8(__m256i a, __m256i b)
+{
+ const __m256i mask = _mm256_set1_epi32(0xFF00FF00);
+ __m256i even = _mm256_mullo_epi16(a, b);
+ __m256i odd = _mm256_mullo_epi16(_mm256_srai_epi16(a, 8), _mm256_srai_epi16(b, 8));
+ odd = _mm256_slli_epi16(odd, 8);
+ return _mm256_blendv_epi8(even, odd, mask);
+}
+
+#endif // _NPY_SIMD_AVX2_UTILS_H
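Editor's note: AVX2 has no 8-bit multiply, so npyv256_mul_u8 builds one out of two 16-bit multiplies and a byte blend; per lane it yields the low 8 bits of the product. A scalar equivalent, for reference only (not part of this diff):
    NPY_FINLINE npy_uint8 mul_u8_lane(npy_uint8 a, npy_uint8 b)
    { return (npy_uint8)((unsigned int)a * (unsigned int)b); }  // low 8 bits of the product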
diff --git a/numpy/core/src/common/simd/avx512/arithmetic.h b/numpy/core/src/common/simd/avx512/arithmetic.h
new file mode 100644
index 000000000..fcaef0efd
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/arithmetic.h
@@ -0,0 +1,116 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_ARITHMETIC_H
+#define _NPY_SIMD_AVX512_ARITHMETIC_H
+
+#include "../avx2/utils.h"
+
+/***************************
+ * Addition
+ ***************************/
+// non-saturated
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_add_u8 _mm512_add_epi8
+ #define npyv_add_u16 _mm512_add_epi16
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_add_u8, _mm256_add_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_add_u16, _mm256_add_epi16)
+#endif
+#define npyv_add_s8 npyv_add_u8
+#define npyv_add_s16 npyv_add_u16
+#define npyv_add_u32 _mm512_add_epi32
+#define npyv_add_s32 _mm512_add_epi32
+#define npyv_add_u64 _mm512_add_epi64
+#define npyv_add_s64 _mm512_add_epi64
+#define npyv_add_f32 _mm512_add_ps
+#define npyv_add_f64 _mm512_add_pd
+
+// saturated
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_adds_u8 _mm512_adds_epu8
+ #define npyv_adds_s8 _mm512_adds_epi8
+ #define npyv_adds_u16 _mm512_adds_epu16
+ #define npyv_adds_s16 _mm512_adds_epi16
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_u8, _mm256_adds_epu8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_s8, _mm256_adds_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_u16, _mm256_adds_epu16)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_adds_s16, _mm256_adds_epi16)
+#endif
+// TODO: the rest, after implementing the Packs intrinsics
+
+/***************************
+ * Subtraction
+ ***************************/
+// non-saturated
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_sub_u8 _mm512_sub_epi8
+ #define npyv_sub_u16 _mm512_sub_epi16
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_sub_u8, _mm256_sub_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_sub_u16, _mm256_sub_epi16)
+#endif
+#define npyv_sub_s8 npyv_sub_u8
+#define npyv_sub_s16 npyv_sub_u16
+#define npyv_sub_u32 _mm512_sub_epi32
+#define npyv_sub_s32 _mm512_sub_epi32
+#define npyv_sub_u64 _mm512_sub_epi64
+#define npyv_sub_s64 _mm512_sub_epi64
+#define npyv_sub_f32 _mm512_sub_ps
+#define npyv_sub_f64 _mm512_sub_pd
+
+// saturated
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_subs_u8 _mm512_subs_epu8
+ #define npyv_subs_s8 _mm512_subs_epi8
+ #define npyv_subs_u16 _mm512_subs_epu16
+ #define npyv_subs_s16 _mm512_subs_epi16
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_u8, _mm256_subs_epu8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_s8, _mm256_subs_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_u16, _mm256_subs_epu16)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_subs_s16, _mm256_subs_epi16)
+#endif
+// TODO: the rest, after implementing the Packs intrinsics
+
+/***************************
+ * Multiplication
+ ***************************/
+// non-saturated
+#ifdef NPY_HAVE_AVX512BW
+NPY_FINLINE __m512i npyv_mul_u8(__m512i a, __m512i b)
+{
+ __m512i even = _mm512_mullo_epi16(a, b);
+ __m512i odd = _mm512_mullo_epi16(_mm512_srai_epi16(a, 8), _mm512_srai_epi16(b, 8));
+ odd = _mm512_slli_epi16(odd, 8);
+ return _mm512_mask_blend_epi8(0xAAAAAAAAAAAAAAAA, even, odd);
+}
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_mul_u8, npyv256_mul_u8)
+#endif
+
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_mul_u16 _mm512_mullo_epi16
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_mul_u16, _mm256_mullo_epi16)
+#endif
+#define npyv_mul_s8 npyv_mul_u8
+#define npyv_mul_s16 npyv_mul_u16
+#define npyv_mul_u32 _mm512_mullo_epi32
+#define npyv_mul_s32 _mm512_mullo_epi32
+#define npyv_mul_f32 _mm512_mul_ps
+#define npyv_mul_f64 _mm512_mul_pd
+
+// saturated
+// TODO: after implementing the Packs intrinsics
+
+/***************************
+ * Division
+ ***************************/
+// TODO: emulate integer division
+#define npyv_div_f32 _mm512_div_ps
+#define npyv_div_f64 _mm512_div_pd
+
+#endif // _NPY_SIMD_AVX512_ARITHMETIC_H
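Editor's note: NPYV_IMPL_AVX512_FROM_AVX2_2ARG comes from avx512/utils.h, which is not shown in this excerpt. Judging by the shift fallbacks further down, it presumably wraps a two-argument AVX2 intrinsic by splitting the 512-bit register into two 256-bit halves and recombining, roughly like the sketch below (an assumption, not the actual macro expansion):
    NPY_FINLINE __m512i example_add_u8_no_bw(__m512i a, __m512i b)
    {
        __m256i lo = _mm256_add_epi8(npyv512_lower_si256(a),  npyv512_lower_si256(b));
        __m256i hi = _mm256_add_epi8(npyv512_higher_si256(a), npyv512_higher_si256(b));
        return npyv512_combine_si256(lo, hi);
    }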
diff --git a/numpy/core/src/common/simd/avx512/avx512.h b/numpy/core/src/common/simd/avx512/avx512.h
new file mode 100644
index 000000000..96fdf72b9
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/avx512.h
@@ -0,0 +1,71 @@
+#ifndef _NPY_SIMD_H_
+ #error "Not a standalone header"
+#endif
+#define NPY_SIMD 512
+#define NPY_SIMD_WIDTH 64
+#define NPY_SIMD_F64 1
+
+typedef __m512i npyv_u8;
+typedef __m512i npyv_s8;
+typedef __m512i npyv_u16;
+typedef __m512i npyv_s16;
+typedef __m512i npyv_u32;
+typedef __m512i npyv_s32;
+typedef __m512i npyv_u64;
+typedef __m512i npyv_s64;
+typedef __m512 npyv_f32;
+typedef __m512d npyv_f64;
+
+#ifdef NPY_HAVE_AVX512BW
+typedef __mmask64 npyv_b8;
+typedef __mmask32 npyv_b16;
+#else
+typedef __m512i npyv_b8;
+typedef __m512i npyv_b16;
+#endif
+typedef __mmask16 npyv_b32;
+typedef __mmask8 npyv_b64;
+
+typedef struct { __m512i val[2]; } npyv_m512ix2;
+typedef npyv_m512ix2 npyv_u8x2;
+typedef npyv_m512ix2 npyv_s8x2;
+typedef npyv_m512ix2 npyv_u16x2;
+typedef npyv_m512ix2 npyv_s16x2;
+typedef npyv_m512ix2 npyv_u32x2;
+typedef npyv_m512ix2 npyv_s32x2;
+typedef npyv_m512ix2 npyv_u64x2;
+typedef npyv_m512ix2 npyv_s64x2;
+
+typedef struct { __m512i val[3]; } npyv_m512ix3;
+typedef npyv_m512ix3 npyv_u8x3;
+typedef npyv_m512ix3 npyv_s8x3;
+typedef npyv_m512ix3 npyv_u16x3;
+typedef npyv_m512ix3 npyv_s16x3;
+typedef npyv_m512ix3 npyv_u32x3;
+typedef npyv_m512ix3 npyv_s32x3;
+typedef npyv_m512ix3 npyv_u64x3;
+typedef npyv_m512ix3 npyv_s64x3;
+
+typedef struct { __m512 val[2]; } npyv_f32x2;
+typedef struct { __m512d val[2]; } npyv_f64x2;
+typedef struct { __m512 val[3]; } npyv_f32x3;
+typedef struct { __m512d val[3]; } npyv_f64x3;
+
+#define npyv_nlanes_u8 64
+#define npyv_nlanes_s8 64
+#define npyv_nlanes_u16 32
+#define npyv_nlanes_s16 32
+#define npyv_nlanes_u32 16
+#define npyv_nlanes_s32 16
+#define npyv_nlanes_u64 8
+#define npyv_nlanes_s64 8
+#define npyv_nlanes_f32 16
+#define npyv_nlanes_f64 8
+
+#include "utils.h"
+#include "memory.h"
+#include "misc.h"
+#include "reorder.h"
+#include "operators.h"
+#include "conversion.h"
+#include "arithmetic.h"
diff --git a/numpy/core/src/common/simd/avx512/conversion.h b/numpy/core/src/common/simd/avx512/conversion.h
new file mode 100644
index 000000000..0f7e27de3
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/conversion.h
@@ -0,0 +1,54 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_CVT_H
+#define _NPY_SIMD_AVX512_CVT_H
+
+// convert mask to integer vectors
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cvt_u8_b8 _mm512_movm_epi8
+ #define npyv_cvt_u16_b16 _mm512_movm_epi16
+#else
+ #define npyv_cvt_u8_b8(BL) BL
+ #define npyv_cvt_u16_b16(BL) BL
+#endif
+#define npyv_cvt_s8_b8 npyv_cvt_u8_b8
+#define npyv_cvt_s16_b16 npyv_cvt_u16_b16
+
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_cvt_u32_b32 _mm512_movm_epi32
+ #define npyv_cvt_u64_b64 _mm512_movm_epi64
+#else
+ #define npyv_cvt_u32_b32(BL) _mm512_maskz_set1_epi32(BL, (int)-1)
+ #define npyv_cvt_u64_b64(BL) _mm512_maskz_set1_epi64(BL, (npy_int64)-1)
+#endif
+#define npyv_cvt_s32_b32 npyv_cvt_u32_b32
+#define npyv_cvt_s64_b64 npyv_cvt_u64_b64
+#define npyv_cvt_f32_b32(BL) _mm512_castsi512_ps(npyv_cvt_u32_b32(BL))
+#define npyv_cvt_f64_b64(BL) _mm512_castsi512_pd(npyv_cvt_u64_b64(BL))
+
+// convert integer vectors to mask
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cvt_b8_u8 _mm512_movepi8_mask
+ #define npyv_cvt_b16_u16 _mm512_movepi16_mask
+#else
+ #define npyv_cvt_b8_u8(A) A
+ #define npyv_cvt_b16_u16(A) A
+#endif
+#define npyv_cvt_b8_s8 npyv_cvt_b8_u8
+#define npyv_cvt_b16_s16 npyv_cvt_b16_u16
+
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_cvt_b32_u32 _mm512_movepi32_mask
+ #define npyv_cvt_b64_u64 _mm512_movepi64_mask
+#else
+ #define npyv_cvt_b32_u32(A) _mm512_cmpneq_epu32_mask(A, _mm512_setzero_si512())
+ #define npyv_cvt_b64_u64(A) _mm512_cmpneq_epu64_mask(A, _mm512_setzero_si512())
+#endif
+#define npyv_cvt_b32_s32 npyv_cvt_b32_u32
+#define npyv_cvt_b64_s64 npyv_cvt_b64_u64
+#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(_mm512_castps_si512(A))
+#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(_mm512_castpd_si512(A))
+
+#endif // _NPY_SIMD_AVX512_CVT_H
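Editor's note: unlike AVX2, the 32/64-bit boolean types are k-mask registers here (and the 8/16-bit ones are too when AVX512BW is available), so these conversions do real work. Illustrative sketch (not part of this diff):
    NPY_FINLINE npyv_u32 mask_as_vector_512(npyv_b32 m)
    {
        // m is an __mmask16; this expands it into an all-ones/all-zeros lane vector,
        // via _mm512_movm_epi32 (AVX512DQ) or the maskz_set1 fallback above
        return npyv_cvt_u32_b32(m);
    }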
diff --git a/numpy/core/src/common/simd/avx512/memory.h b/numpy/core/src/common/simd/avx512/memory.h
new file mode 100644
index 000000000..e212c4555
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/memory.h
@@ -0,0 +1,94 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MEMORY_H
+#define _NPY_SIMD_AVX512_MEMORY_H
+
+#include "misc.h"
+
+/***************************
+ * load/store
+ ***************************/
+#if defined(__GNUC__)
+    // GCC expects the pointer argument type to be `void*` instead of `const void *`,
+    // which otherwise causes a flood of warnings.
+ #define npyv__loads(PTR) _mm512_stream_load_si512((__m512i*)(PTR))
+#else
+ #define npyv__loads(PTR) _mm512_stream_load_si512((const __m512i*)(PTR))
+#endif
+#if defined(_MSC_VER) && defined(_M_IX86)
+    // work around an MSVC (32-bit) overflow bug, reported at
+ // https://developercommunity.visualstudio.com/content/problem/911872/u.html
+ NPY_FINLINE __m512i npyv__loadl(const __m256i *ptr)
+ {
+ __m256i a = _mm256_loadu_si256(ptr);
+ return _mm512_inserti64x4(_mm512_castsi256_si512(a), a, 0);
+ }
+#else
+ #define npyv__loadl(PTR) \
+ _mm512_castsi256_si512(_mm256_loadu_si256(PTR))
+#endif
+#define NPYV_IMPL_AVX512_MEM_INT(CTYPE, SFX) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \
+ { return _mm512_loadu_si512((const __m512i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \
+ { return _mm512_load_si512((const __m512i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \
+ { return npyv__loads(ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \
+ { return npyv__loadl((const __m256i *)ptr); } \
+ NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm512_storeu_si512((__m512i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm512_store_si512((__m512i*)ptr, vec); } \
+ NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm512_stream_si512((__m512i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm256_storeu_si256((__m256i*)ptr, npyv512_lower_si256(vec)); } \
+ NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm256_storeu_si256((__m256i*)(ptr), npyv512_higher_si256(vec)); }
+
+NPYV_IMPL_AVX512_MEM_INT(npy_uint8, u8)
+NPYV_IMPL_AVX512_MEM_INT(npy_int8, s8)
+NPYV_IMPL_AVX512_MEM_INT(npy_uint16, u16)
+NPYV_IMPL_AVX512_MEM_INT(npy_int16, s16)
+NPYV_IMPL_AVX512_MEM_INT(npy_uint32, u32)
+NPYV_IMPL_AVX512_MEM_INT(npy_int32, s32)
+NPYV_IMPL_AVX512_MEM_INT(npy_uint64, u64)
+NPYV_IMPL_AVX512_MEM_INT(npy_int64, s64)
+
+// unaligned load
+#define npyv_load_f32(PTR) _mm512_loadu_ps((const __m512*)(PTR))
+#define npyv_load_f64(PTR) _mm512_loadu_pd((const __m512d*)(PTR))
+// aligned load
+#define npyv_loada_f32(PTR) _mm512_load_ps((const __m512*)(PTR))
+#define npyv_loada_f64(PTR) _mm512_load_pd((const __m512d*)(PTR))
+// load lower part
+#if defined(_MSC_VER) && defined(_M_IX86)
+ #define npyv_loadl_f32(PTR) _mm512_castsi512_ps(npyv__loadl((const __m256i *)(PTR)))
+ #define npyv_loadl_f64(PTR) _mm512_castsi512_pd(npyv__loadl((const __m256i *)(PTR)))
+#else
+ #define npyv_loadl_f32(PTR) _mm512_castps256_ps512(_mm256_loadu_ps(PTR))
+ #define npyv_loadl_f64(PTR) _mm512_castpd256_pd512(_mm256_loadu_pd(PTR))
+#endif
+// stream load
+#define npyv_loads_f32(PTR) _mm512_castsi512_ps(npyv__loads(PTR))
+#define npyv_loads_f64(PTR) _mm512_castsi512_pd(npyv__loads(PTR))
+// unaligned store
+#define npyv_store_f32 _mm512_storeu_ps
+#define npyv_store_f64 _mm512_storeu_pd
+// aligned store
+#define npyv_storea_f32 _mm512_store_ps
+#define npyv_storea_f64 _mm512_store_pd
+// stream store
+#define npyv_stores_f32 _mm512_stream_ps
+#define npyv_stores_f64 _mm512_stream_pd
+// store lower part
+#define npyv_storel_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_lower_ps256(VEC))
+#define npyv_storel_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_lower_pd256(VEC))
+// store higher part
+#define npyv_storeh_f32(PTR, VEC) _mm256_storeu_ps(PTR, npyv512_higher_ps256(VEC))
+#define npyv_storeh_f64(PTR, VEC) _mm256_storeu_pd(PTR, npyv512_higher_pd256(VEC))
+
+#endif // _NPY_SIMD_AVX512_MEMORY_H
diff --git a/numpy/core/src/common/simd/avx512/misc.h b/numpy/core/src/common/simd/avx512/misc.h
new file mode 100644
index 000000000..4b6729b05
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/misc.h
@@ -0,0 +1,252 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_MISC_H
+#define _NPY_SIMD_AVX512_MISC_H
+
+// set all lanes to zero
+#define npyv_zero_u8 _mm512_setzero_si512
+#define npyv_zero_s8 _mm512_setzero_si512
+#define npyv_zero_u16 _mm512_setzero_si512
+#define npyv_zero_s16 _mm512_setzero_si512
+#define npyv_zero_u32 _mm512_setzero_si512
+#define npyv_zero_s32 _mm512_setzero_si512
+#define npyv_zero_u64 _mm512_setzero_si512
+#define npyv_zero_s64 _mm512_setzero_si512
+#define npyv_zero_f32 _mm512_setzero_ps
+#define npyv_zero_f64 _mm512_setzero_pd
+
+// set all lanes to same value
+#define npyv_setall_u8(VAL) _mm512_set1_epi8((char)VAL)
+#define npyv_setall_s8(VAL) _mm512_set1_epi8((char)VAL)
+#define npyv_setall_u16(VAL) _mm512_set1_epi16((short)VAL)
+#define npyv_setall_s16(VAL) _mm512_set1_epi16((short)VAL)
+#define npyv_setall_u32(VAL) _mm512_set1_epi32((int)VAL)
+#define npyv_setall_s32(VAL) _mm512_set1_epi32(VAL)
+#define npyv_setall_u64(VAL) _mm512_set1_epi64(VAL)
+#define npyv_setall_s64(VAL) _mm512_set1_epi64(VAL)
+#define npyv_setall_f32(VAL) _mm512_set1_ps(VAL)
+#define npyv_setall_f64(VAL) _mm512_set1_pd(VAL)
+
+/**
+ * vector with specific values set to each lane and
+ * set a specific value to all remaining lanes
+ *
+ * _mm512_set_epi8 and _mm512_set_epi16 are missing in many compilers
+ */
+NPY_FINLINE __m512i npyv__setr_epi8(
+ char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7,
+ char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15,
+ char i16, char i17, char i18, char i19, char i20, char i21, char i22, char i23,
+ char i24, char i25, char i26, char i27, char i28, char i29, char i30, char i31,
+ char i32, char i33, char i34, char i35, char i36, char i37, char i38, char i39,
+ char i40, char i41, char i42, char i43, char i44, char i45, char i46, char i47,
+ char i48, char i49, char i50, char i51, char i52, char i53, char i54, char i55,
+ char i56, char i57, char i58, char i59, char i60, char i61, char i62, char i63)
+{
+ const char NPY_DECL_ALIGNED(64) data[64] = {
+ i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
+ i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31,
+ i32, i33, i34, i35, i36, i37, i38, i39, i40, i41, i42, i43, i44, i45, i46, i47,
+ i48, i49, i50, i51, i52, i53, i54, i55, i56, i57, i58, i59, i60, i61, i62, i63
+ };
+ return _mm512_load_si512((const void*)data);
+}
+NPY_FINLINE __m512i npyv__setr_epi16(
+ short i0, short i1, short i2, short i3, short i4, short i5, short i6, short i7,
+ short i8, short i9, short i10, short i11, short i12, short i13, short i14, short i15,
+ short i16, short i17, short i18, short i19, short i20, short i21, short i22, short i23,
+ short i24, short i25, short i26, short i27, short i28, short i29, short i30, short i31)
+{
+ const short NPY_DECL_ALIGNED(64) data[32] = {
+ i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15,
+ i16, i17, i18, i19, i20, i21, i22, i23, i24, i25, i26, i27, i28, i29, i30, i31
+ };
+ return _mm512_load_si512((const void*)data);
+}
+// Arguments generated by NPYV__SET_FILL_* will not expand if
+// _mm512_setr_* are defined as macros.
+NPY_FINLINE __m512i npyv__setr_epi32(
+ int i0, int i1, int i2, int i3, int i4, int i5, int i6, int i7,
+ int i8, int i9, int i10, int i11, int i12, int i13, int i14, int i15)
+{
+ return _mm512_setr_epi32(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+}
+NPY_FINLINE __m512i npyv__setr_epi64(npy_int64 i0, npy_int64 i1, npy_int64 i2, npy_int64 i3,
+ npy_int64 i4, npy_int64 i5, npy_int64 i6, npy_int64 i7)
+{
+ return _mm512_setr_epi64(i0, i1, i2, i3, i4, i5, i6, i7);
+}
+
+NPY_FINLINE __m512 npyv__setr_ps(
+ float i0, float i1, float i2, float i3, float i4, float i5, float i6, float i7,
+ float i8, float i9, float i10, float i11, float i12, float i13, float i14, float i15)
+{
+ return _mm512_setr_ps(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+}
+NPY_FINLINE __m512d npyv__setr_pd(double i0, double i1, double i2, double i3,
+ double i4, double i5, double i6, double i7)
+{
+ return _mm512_setr_pd(i0, i1, i2, i3, i4, i5, i6, i7);
+}
+#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__))
+#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_64(char, FILL, __VA_ARGS__))
+#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__))
+#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_32(short, FILL, __VA_ARGS__))
+#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__))
+#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_16(int, FILL, __VA_ARGS__))
+#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__))
+#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_8(npy_int64, FILL, __VA_ARGS__))
+#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_16(float, FILL, __VA_ARGS__))
+#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_8(double, FILL, __VA_ARGS__))
+
+// vector with specific values set to each lane and
+// set zero to all remaining lanes
+#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__)
+#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__)
+#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
+#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
+#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
+#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
+#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
+#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
+#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
+
+// per lane select
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_select_u8(MASK, A, B) _mm512_mask_blend_epi8(MASK, B, A)
+ #define npyv_select_u16(MASK, A, B) _mm512_mask_blend_epi16(MASK, B, A)
+#else
+ NPY_FINLINE __m512i npyv_select_u8(__m512i mask, __m512i a, __m512i b)
+ { return _mm512_xor_si512(b, _mm512_and_si512(_mm512_xor_si512(b, a), mask)); }
+ #define npyv_select_u16 npyv_select_u8
+#endif
+#define npyv_select_s8 npyv_select_u8
+#define npyv_select_s16 npyv_select_u16
+#define npyv_select_u32(MASK, A, B) _mm512_mask_blend_epi32(MASK, B, A)
+#define npyv_select_s32 npyv_select_u32
+#define npyv_select_u64(MASK, A, B) _mm512_mask_blend_epi64(MASK, B, A)
+#define npyv_select_s64 npyv_select_u64
+#define npyv_select_f32(MASK, A, B) _mm512_mask_blend_ps(MASK, B, A)
+#define npyv_select_f64(MASK, A, B) _mm512_mask_blend_pd(MASK, B, A)
+
+// reinterpret
+#define npyv_reinterpret_u8_u8(X) X
+#define npyv_reinterpret_u8_s8(X) X
+#define npyv_reinterpret_u8_u16(X) X
+#define npyv_reinterpret_u8_s16(X) X
+#define npyv_reinterpret_u8_u32(X) X
+#define npyv_reinterpret_u8_s32(X) X
+#define npyv_reinterpret_u8_u64(X) X
+#define npyv_reinterpret_u8_s64(X) X
+#define npyv_reinterpret_u8_f32 _mm512_castps_si512
+#define npyv_reinterpret_u8_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_s8_s8(X) X
+#define npyv_reinterpret_s8_u8(X) X
+#define npyv_reinterpret_s8_u16(X) X
+#define npyv_reinterpret_s8_s16(X) X
+#define npyv_reinterpret_s8_u32(X) X
+#define npyv_reinterpret_s8_s32(X) X
+#define npyv_reinterpret_s8_u64(X) X
+#define npyv_reinterpret_s8_s64(X) X
+#define npyv_reinterpret_s8_f32 _mm512_castps_si512
+#define npyv_reinterpret_s8_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_u16_u16(X) X
+#define npyv_reinterpret_u16_u8(X) X
+#define npyv_reinterpret_u16_s8(X) X
+#define npyv_reinterpret_u16_s16(X) X
+#define npyv_reinterpret_u16_u32(X) X
+#define npyv_reinterpret_u16_s32(X) X
+#define npyv_reinterpret_u16_u64(X) X
+#define npyv_reinterpret_u16_s64(X) X
+#define npyv_reinterpret_u16_f32 _mm512_castps_si512
+#define npyv_reinterpret_u16_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_s16_s16(X) X
+#define npyv_reinterpret_s16_u8(X) X
+#define npyv_reinterpret_s16_s8(X) X
+#define npyv_reinterpret_s16_u16(X) X
+#define npyv_reinterpret_s16_u32(X) X
+#define npyv_reinterpret_s16_s32(X) X
+#define npyv_reinterpret_s16_u64(X) X
+#define npyv_reinterpret_s16_s64(X) X
+#define npyv_reinterpret_s16_f32 _mm512_castps_si512
+#define npyv_reinterpret_s16_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_u32_u32(X) X
+#define npyv_reinterpret_u32_u8(X) X
+#define npyv_reinterpret_u32_s8(X) X
+#define npyv_reinterpret_u32_u16(X) X
+#define npyv_reinterpret_u32_s16(X) X
+#define npyv_reinterpret_u32_s32(X) X
+#define npyv_reinterpret_u32_u64(X) X
+#define npyv_reinterpret_u32_s64(X) X
+#define npyv_reinterpret_u32_f32 _mm512_castps_si512
+#define npyv_reinterpret_u32_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_s32_s32(X) X
+#define npyv_reinterpret_s32_u8(X) X
+#define npyv_reinterpret_s32_s8(X) X
+#define npyv_reinterpret_s32_u16(X) X
+#define npyv_reinterpret_s32_s16(X) X
+#define npyv_reinterpret_s32_u32(X) X
+#define npyv_reinterpret_s32_u64(X) X
+#define npyv_reinterpret_s32_s64(X) X
+#define npyv_reinterpret_s32_f32 _mm512_castps_si512
+#define npyv_reinterpret_s32_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_u64_u64(X) X
+#define npyv_reinterpret_u64_u8(X) X
+#define npyv_reinterpret_u64_s8(X) X
+#define npyv_reinterpret_u64_u16(X) X
+#define npyv_reinterpret_u64_s16(X) X
+#define npyv_reinterpret_u64_u32(X) X
+#define npyv_reinterpret_u64_s32(X) X
+#define npyv_reinterpret_u64_s64(X) X
+#define npyv_reinterpret_u64_f32 _mm512_castps_si512
+#define npyv_reinterpret_u64_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_s64_s64(X) X
+#define npyv_reinterpret_s64_u8(X) X
+#define npyv_reinterpret_s64_s8(X) X
+#define npyv_reinterpret_s64_u16(X) X
+#define npyv_reinterpret_s64_s16(X) X
+#define npyv_reinterpret_s64_u32(X) X
+#define npyv_reinterpret_s64_s32(X) X
+#define npyv_reinterpret_s64_u64(X) X
+#define npyv_reinterpret_s64_f32 _mm512_castps_si512
+#define npyv_reinterpret_s64_f64 _mm512_castpd_si512
+
+#define npyv_reinterpret_f32_f32(X) X
+#define npyv_reinterpret_f32_u8 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_s8 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_u16 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_s16 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_u32 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_s32 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_u64 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_s64 _mm512_castsi512_ps
+#define npyv_reinterpret_f32_f64 _mm512_castpd_ps
+
+#define npyv_reinterpret_f64_f64(X) X
+#define npyv_reinterpret_f64_u8 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_s8 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_u16 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_s16 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_u32 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_s32 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_u64 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_s64 _mm512_castsi512_pd
+#define npyv_reinterpret_f64_f32 _mm512_castps_pd
+
+#ifdef NPY_HAVE_AVX512_KNL
+ #define npyv_cleanup() ((void)0)
+#else
+ #define npyv_cleanup _mm256_zeroall
+#endif
+
+#endif // _NPY_SIMD_AVX512_MISC_H
diff --git a/numpy/core/src/common/simd/avx512/operators.h b/numpy/core/src/common/simd/avx512/operators.h
new file mode 100644
index 000000000..f76ea5e2d
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/operators.h
@@ -0,0 +1,259 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_OPERATORS_H
+#define _NPY_SIMD_AVX512_OPERATORS_H
+
+/***************************
+ * Shifting
+ ***************************/
+
+// left
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_shl_u16(A, C) _mm512_sll_epi16(A, _mm_cvtsi32_si128(C))
+#else
+ #define NPYV_IMPL_AVX512_SHIFT(FN, INTRIN) \
+ NPY_FINLINE __m512i npyv_##FN(__m512i a, int c) \
+ { \
+ __m256i l = npyv512_lower_si256(a); \
+ __m256i h = npyv512_higher_si256(a); \
+ __m128i cv = _mm_cvtsi32_si128(c); \
+ l = _mm256_##INTRIN(l, cv); \
+ h = _mm256_##INTRIN(h, cv); \
+ return npyv512_combine_si256(l, h); \
+ }
+
+ NPYV_IMPL_AVX512_SHIFT(shl_u16, sll_epi16)
+#endif
+#define npyv_shl_s16 npyv_shl_u16
+#define npyv_shl_u32(A, C) _mm512_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s32(A, C) _mm512_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_u64(A, C) _mm512_sll_epi64(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s64(A, C) _mm512_sll_epi64(A, _mm_cvtsi32_si128(C))
+
+// left by an immediate constant
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_shli_u16 _mm512_slli_epi16
+#else
+ #define npyv_shli_u16 npyv_shl_u16
+#endif
+#define npyv_shli_s16 npyv_shli_u16
+#define npyv_shli_u32 _mm512_slli_epi32
+#define npyv_shli_s32 _mm512_slli_epi32
+#define npyv_shli_u64 _mm512_slli_epi64
+#define npyv_shli_s64 _mm512_slli_epi64
+
+// right
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_shr_u16(A, C) _mm512_srl_epi16(A, _mm_cvtsi32_si128(C))
+ #define npyv_shr_s16(A, C) _mm512_sra_epi16(A, _mm_cvtsi32_si128(C))
+#else
+ NPYV_IMPL_AVX512_SHIFT(shr_u16, srl_epi16)
+ NPYV_IMPL_AVX512_SHIFT(shr_s16, sra_epi16)
+#endif
+#define npyv_shr_u32(A, C) _mm512_srl_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s32(A, C) _mm512_sra_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_u64(A, C) _mm512_srl_epi64(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s64(A, C) _mm512_sra_epi64(A, _mm_cvtsi32_si128(C))
+
+// right by an immediate constant
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_shri_u16 _mm512_srli_epi16
+ #define npyv_shri_s16 _mm512_srai_epi16
+#else
+ #define npyv_shri_u16 npyv_shr_u16
+ #define npyv_shri_s16 npyv_shr_s16
+#endif
+#define npyv_shri_u32 _mm512_srli_epi32
+#define npyv_shri_s32 _mm512_srai_epi32
+#define npyv_shri_u64 _mm512_srli_epi64
+#define npyv_shri_s64 _mm512_srai_epi64
+
+/***************************
+ * Logical
+ ***************************/
+
+// AND
+#define npyv_and_u8 _mm512_and_si512
+#define npyv_and_s8 _mm512_and_si512
+#define npyv_and_u16 _mm512_and_si512
+#define npyv_and_s16 _mm512_and_si512
+#define npyv_and_u32 _mm512_and_si512
+#define npyv_and_s32 _mm512_and_si512
+#define npyv_and_u64 _mm512_and_si512
+#define npyv_and_s64 _mm512_and_si512
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_and_f32 _mm512_and_ps
+ #define npyv_and_f64 _mm512_and_pd
+#else
+ NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_and_f32, _mm512_and_si512)
+ NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_and_f64, _mm512_and_si512)
+#endif
+
+// OR
+#define npyv_or_u8 _mm512_or_si512
+#define npyv_or_s8 _mm512_or_si512
+#define npyv_or_u16 _mm512_or_si512
+#define npyv_or_s16 _mm512_or_si512
+#define npyv_or_u32 _mm512_or_si512
+#define npyv_or_s32 _mm512_or_si512
+#define npyv_or_u64 _mm512_or_si512
+#define npyv_or_s64 _mm512_or_si512
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_or_f32 _mm512_or_ps
+ #define npyv_or_f64 _mm512_or_pd
+#else
+ NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_or_f32, _mm512_or_si512)
+ NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_or_f64, _mm512_or_si512)
+#endif
+
+// XOR
+#define npyv_xor_u8 _mm512_xor_si512
+#define npyv_xor_s8 _mm512_xor_si512
+#define npyv_xor_u16 _mm512_xor_si512
+#define npyv_xor_s16 _mm512_xor_si512
+#define npyv_xor_u32 _mm512_xor_si512
+#define npyv_xor_s32 _mm512_xor_si512
+#define npyv_xor_u64 _mm512_xor_si512
+#define npyv_xor_s64 _mm512_xor_si512
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_xor_f32 _mm512_xor_ps
+ #define npyv_xor_f64 _mm512_xor_pd
+#else
+ NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(npyv_xor_f32, _mm512_xor_si512)
+ NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(npyv_xor_f64, _mm512_xor_si512)
+#endif
+
+// NOT
+#define npyv_not_u8(A) _mm512_xor_si512(A, _mm512_set1_epi32(-1))
+#define npyv_not_s8 npyv_not_u8
+#define npyv_not_u16 npyv_not_u8
+#define npyv_not_s16 npyv_not_u8
+#define npyv_not_u32 npyv_not_u8
+#define npyv_not_s32 npyv_not_u8
+#define npyv_not_u64 npyv_not_u8
+#define npyv_not_s64 npyv_not_u8
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_not_f32(A) _mm512_xor_ps(A, _mm512_castsi512_ps(_mm512_set1_epi32(-1)))
+ #define npyv_not_f64(A) _mm512_xor_pd(A, _mm512_castsi512_pd(_mm512_set1_epi32(-1)))
+#else
+ #define npyv_not_f32(A) _mm512_castsi512_ps(npyv_not_u32(_mm512_castps_si512(A)))
+ #define npyv_not_f64(A) _mm512_castsi512_pd(npyv_not_u64(_mm512_castpd_si512(A)))
+#endif
+
+/***************************
+ * Comparison
+ ***************************/
+
+// int equal
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cmpeq_u8 _mm512_cmpeq_epu8_mask
+ #define npyv_cmpeq_s8 _mm512_cmpeq_epi8_mask
+ #define npyv_cmpeq_u16 _mm512_cmpeq_epu16_mask
+ #define npyv_cmpeq_s16 _mm512_cmpeq_epi16_mask
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpeq_u8, _mm256_cmpeq_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpeq_u16, _mm256_cmpeq_epi16)
+ #define npyv_cmpeq_s8 npyv_cmpeq_u8
+ #define npyv_cmpeq_s16 npyv_cmpeq_u16
+#endif
+#define npyv_cmpeq_u32 _mm512_cmpeq_epu32_mask
+#define npyv_cmpeq_s32 _mm512_cmpeq_epi32_mask
+#define npyv_cmpeq_u64 _mm512_cmpeq_epu64_mask
+#define npyv_cmpeq_s64 _mm512_cmpeq_epi64_mask
+
+// int not equal
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cmpneq_u8 _mm512_cmpneq_epu8_mask
+ #define npyv_cmpneq_s8 _mm512_cmpneq_epi8_mask
+ #define npyv_cmpneq_u16 _mm512_cmpneq_epu16_mask
+ #define npyv_cmpneq_s16 _mm512_cmpneq_epi16_mask
+#else
+ #define npyv_cmpneq_u8(A, B) npyv_not_u8(npyv_cmpeq_u8(A, B))
+ #define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B))
+ #define npyv_cmpneq_s8 npyv_cmpneq_u8
+ #define npyv_cmpneq_s16 npyv_cmpneq_u16
+#endif
+#define npyv_cmpneq_u32 _mm512_cmpneq_epu32_mask
+#define npyv_cmpneq_s32 _mm512_cmpneq_epi32_mask
+#define npyv_cmpneq_u64 _mm512_cmpneq_epu64_mask
+#define npyv_cmpneq_s64 _mm512_cmpneq_epi64_mask
+
+// greater than
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cmpgt_u8 _mm512_cmpgt_epu8_mask
+ #define npyv_cmpgt_s8 _mm512_cmpgt_epi8_mask
+ #define npyv_cmpgt_u16 _mm512_cmpgt_epu16_mask
+ #define npyv_cmpgt_s16 _mm512_cmpgt_epi16_mask
+#else
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpgt_s8, _mm256_cmpgt_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv_cmpgt_s16, _mm256_cmpgt_epi16)
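+    // Without AVX512BW there are no unsigned byte/word compares; flip the sign
+    // bit of both operands so the signed compare yields the unsigned ordering.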
+ NPY_FINLINE __m512i npyv_cmpgt_u8(__m512i a, __m512i b)
+ {
+ const __m512i sbit = _mm512_set1_epi32(0x80808080);
+ return npyv_cmpgt_s8(_mm512_xor_si512(a, sbit), _mm512_xor_si512(b, sbit));
+ }
+ NPY_FINLINE __m512i npyv_cmpgt_u16(__m512i a, __m512i b)
+ {
+ const __m512i sbit = _mm512_set1_epi32(0x80008000);
+ return npyv_cmpgt_s16(_mm512_xor_si512(a, sbit), _mm512_xor_si512(b, sbit));
+ }
+#endif
+#define npyv_cmpgt_u32 _mm512_cmpgt_epu32_mask
+#define npyv_cmpgt_s32 _mm512_cmpgt_epi32_mask
+#define npyv_cmpgt_u64 _mm512_cmpgt_epu64_mask
+#define npyv_cmpgt_s64 _mm512_cmpgt_epi64_mask
+
+// greater than or equal
+#ifdef NPY_HAVE_AVX512BW
+ #define npyv_cmpge_u8 _mm512_cmpge_epu8_mask
+ #define npyv_cmpge_s8 _mm512_cmpge_epi8_mask
+ #define npyv_cmpge_u16 _mm512_cmpge_epu16_mask
+ #define npyv_cmpge_s16 _mm512_cmpge_epi16_mask
+#else
+ #define npyv_cmpge_u8(A, B) npyv_not_u8(npyv_cmpgt_u8(B, A))
+ #define npyv_cmpge_s8(A, B) npyv_not_s8(npyv_cmpgt_s8(B, A))
+ #define npyv_cmpge_u16(A, B) npyv_not_u16(npyv_cmpgt_u16(B, A))
+ #define npyv_cmpge_s16(A, B) npyv_not_s16(npyv_cmpgt_s16(B, A))
+#endif
+#define npyv_cmpge_u32 _mm512_cmpge_epu32_mask
+#define npyv_cmpge_s32 _mm512_cmpge_epi32_mask
+#define npyv_cmpge_u64 _mm512_cmpge_epu64_mask
+#define npyv_cmpge_s64 _mm512_cmpge_epi64_mask
+
+// less than
+#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A)
+#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A)
+#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
+#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
+#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
+#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
+#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
+#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
+
+// less than or equal
+#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A)
+#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A)
+#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
+#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
+#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
+#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
+#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
+#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
+
+// precision comparison
+#define npyv_cmpeq_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_EQ_OQ)
+#define npyv_cmpeq_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_EQ_OQ)
+#define npyv_cmpneq_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_NEQ_OQ)
+#define npyv_cmpneq_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_NEQ_OQ)
+#define npyv_cmplt_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_LT_OQ)
+#define npyv_cmplt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_LT_OQ)
+#define npyv_cmple_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_LE_OQ)
+#define npyv_cmple_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_LE_OQ)
+#define npyv_cmpgt_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GT_OQ)
+#define npyv_cmpgt_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GT_OQ)
+#define npyv_cmpge_f32(A, B) _mm512_cmp_ps_mask(A, B, _CMP_GE_OQ)
+#define npyv_cmpge_f64(A, B) _mm512_cmp_pd_mask(A, B, _CMP_GE_OQ)
+
+#endif // _NPY_SIMD_AVX512_OPERATORS_H
diff --git a/numpy/core/src/common/simd/avx512/reorder.h b/numpy/core/src/common/simd/avx512/reorder.h
new file mode 100644
index 000000000..cdbae7aac
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/reorder.h
@@ -0,0 +1,170 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_REORDER_H
+#define _NPY_SIMD_AVX512_REORDER_H
+
+// combine lower part of two vectors
+#define npyv_combinel_u8(A, B) _mm512_inserti64x4(A, _mm512_castsi512_si256(B), 1)
+#define npyv_combinel_s8 npyv_combinel_u8
+#define npyv_combinel_u16 npyv_combinel_u8
+#define npyv_combinel_s16 npyv_combinel_u8
+#define npyv_combinel_u32 npyv_combinel_u8
+#define npyv_combinel_s32 npyv_combinel_u8
+#define npyv_combinel_u64 npyv_combinel_u8
+#define npyv_combinel_s64 npyv_combinel_u8
+#define npyv_combinel_f64(A, B) _mm512_insertf64x4(A, _mm512_castpd512_pd256(B), 1)
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_combinel_f32(A, B) \
+ _mm512_insertf32x8(A, _mm512_castps512_ps256(B), 1)
+#else
+ #define npyv_combinel_f32(A, B) \
+ _mm512_castsi512_ps(npyv_combinel_u8(_mm512_castps_si512(A), _mm512_castps_si512(B)))
+#endif
+
+// combine higher part of two vectors
+#define npyv_combineh_u8(A, B) _mm512_inserti64x4(B, _mm512_extracti64x4_epi64(A, 1), 0)
+#define npyv_combineh_s8 npyv_combineh_u8
+#define npyv_combineh_u16 npyv_combineh_u8
+#define npyv_combineh_s16 npyv_combineh_u8
+#define npyv_combineh_u32 npyv_combineh_u8
+#define npyv_combineh_s32 npyv_combineh_u8
+#define npyv_combineh_u64 npyv_combineh_u8
+#define npyv_combineh_s64 npyv_combineh_u8
+#define npyv_combineh_f64(A, B) _mm512_insertf64x4(B, _mm512_extractf64x4_pd(A, 1), 0)
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv_combineh_f32(A, B) \
+ _mm512_insertf32x8(B, _mm512_extractf32x8_ps(A, 1), 0)
+#else
+ #define npyv_combineh_f32(A, B) \
+ _mm512_castsi512_ps(npyv_combineh_u8(_mm512_castps_si512(A), _mm512_castps_si512(B)))
+#endif
+
+// combine two vectors from lower and higher parts of two other vectors
+NPY_FINLINE npyv_m512ix2 npyv__combine(__m512i a, __m512i b)
+{
+ npyv_m512ix2 r;
+ r.val[0] = npyv_combinel_u8(a, b);
+ r.val[1] = npyv_combineh_u8(a, b);
+ return r;
+}
+NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m512 a, __m512 b)
+{
+ npyv_f32x2 r;
+ r.val[0] = npyv_combinel_f32(a, b);
+ r.val[1] = npyv_combineh_f32(a, b);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m512d a, __m512d b)
+{
+ npyv_f64x2 r;
+ r.val[0] = npyv_combinel_f64(a, b);
+ r.val[1] = npyv_combineh_f64(a, b);
+ return r;
+}
+#define npyv_combine_u8 npyv__combine
+#define npyv_combine_s8 npyv__combine
+#define npyv_combine_u16 npyv__combine
+#define npyv_combine_s16 npyv__combine
+#define npyv_combine_u32 npyv__combine
+#define npyv_combine_s32 npyv__combine
+#define npyv_combine_u64 npyv__combine
+#define npyv_combine_s64 npyv__combine
+
+// interleave two vectors
+#ifndef NPY_HAVE_AVX512BW
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpacklo_epi8, _mm256_unpacklo_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpackhi_epi8, _mm256_unpackhi_epi8)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpacklo_epi16, _mm256_unpacklo_epi16)
+ NPYV_IMPL_AVX512_FROM_AVX2_2ARG(npyv__unpackhi_epi16, _mm256_unpackhi_epi16)
+#endif
+
+NPY_FINLINE npyv_u64x2 npyv_zip_u64(__m512i a, __m512i b)
+{
+ npyv_u64x2 r;
+ r.val[0] = _mm512_permutex2var_epi64(a, npyv_set_u64(0, 8, 1, 9, 2, 10, 3, 11), b);
+ r.val[1] = _mm512_permutex2var_epi64(a, npyv_set_u64(4, 12, 5, 13, 6, 14, 7, 15), b);
+ return r;
+}
+#define npyv_zip_s64 npyv_zip_u64
+
+NPY_FINLINE npyv_u8x2 npyv_zip_u8(__m512i a, __m512i b)
+{
+ npyv_u8x2 r;
+#ifdef NPY_HAVE_AVX512VBMI
+ r.val[0] = _mm512_permutex2var_epi8(a,
+ npyv_set_u8(0, 64, 1, 65, 2, 66, 3, 67, 4, 68, 5, 69, 6, 70, 7, 71,
+ 8, 72, 9, 73, 10, 74, 11, 75, 12, 76, 13, 77, 14, 78, 15, 79,
+ 16, 80, 17, 81, 18, 82, 19, 83, 20, 84, 21, 85, 22, 86, 23, 87,
+ 24, 88, 25, 89, 26, 90, 27, 91, 28, 92, 29, 93, 30, 94, 31, 95), b);
+ r.val[1] = _mm512_permutex2var_epi8(a,
+ npyv_set_u8(32, 96, 33, 97, 34, 98, 35, 99, 36, 100, 37, 101, 38, 102, 39, 103,
+ 40, 104, 41, 105, 42, 106, 43, 107, 44, 108, 45, 109, 46, 110, 47, 111,
+ 48, 112, 49, 113, 50, 114, 51, 115, 52, 116, 53, 117, 54, 118, 55, 119,
+ 56, 120, 57, 121, 58, 122, 59, 123, 60, 124, 61, 125, 62, 126, 63, 127), b);
+#else
+ #ifdef NPY_HAVE_AVX512BW
+ __m512i ab0 = _mm512_unpacklo_epi8(a, b);
+ __m512i ab1 = _mm512_unpackhi_epi8(a, b);
+ #else
+ __m512i ab0 = npyv__unpacklo_epi8(a, b);
+ __m512i ab1 = npyv__unpackhi_epi8(a, b);
+ #endif
+ r.val[0] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(0, 1, 8, 9, 2, 3, 10, 11), ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(4, 5, 12, 13, 6, 7, 14, 15), ab1);
+#endif
+ return r;
+}
+#define npyv_zip_s8 npyv_zip_u8
+
+NPY_FINLINE npyv_u16x2 npyv_zip_u16(__m512i a, __m512i b)
+{
+ npyv_u16x2 r;
+#ifdef NPY_HAVE_AVX512BW
+ r.val[0] = _mm512_permutex2var_epi16(a,
+ npyv_set_u16(0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39,
+ 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47), b);
+ r.val[1] = _mm512_permutex2var_epi16(a,
+ npyv_set_u16(16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
+ 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63), b);
+#else
+ __m512i ab0 = npyv__unpacklo_epi16(a, b);
+ __m512i ab1 = npyv__unpackhi_epi16(a, b);
+ r.val[0] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(0, 1, 8, 9, 2, 3, 10, 11), ab1);
+ r.val[1] = _mm512_permutex2var_epi64(ab0, npyv_set_u64(4, 5, 12, 13, 6, 7, 14, 15), ab1);
+#endif
+ return r;
+}
+#define npyv_zip_s16 npyv_zip_u16
+
+NPY_FINLINE npyv_u32x2 npyv_zip_u32(__m512i a, __m512i b)
+{
+ npyv_u32x2 r;
+ r.val[0] = _mm512_permutex2var_epi32(a,
+ npyv_set_u32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b);
+ r.val[1] = _mm512_permutex2var_epi32(a,
+ npyv_set_u32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b);
+ return r;
+}
+#define npyv_zip_s32 npyv_zip_u32
+
+NPY_FINLINE npyv_f32x2 npyv_zip_f32(__m512 a, __m512 b)
+{
+ npyv_f32x2 r;
+ r.val[0] = _mm512_permutex2var_ps(a,
+ npyv_set_u32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23), b);
+ r.val[1] = _mm512_permutex2var_ps(a,
+ npyv_set_u32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31), b);
+ return r;
+}
+
+NPY_FINLINE npyv_f64x2 npyv_zip_f64(__m512d a, __m512d b)
+{
+ npyv_f64x2 r;
+ r.val[0] = _mm512_permutex2var_pd(a, npyv_set_u64(0, 8, 1, 9, 2, 10, 3, 11), b);
+ r.val[1] = _mm512_permutex2var_pd(a, npyv_set_u64(4, 12, 5, 13, 6, 14, 7, 15), b);
+ return r;
+}
+
+#endif // _NPY_SIMD_AVX512_REORDER_H
diff --git a/numpy/core/src/common/simd/avx512/utils.h b/numpy/core/src/common/simd/avx512/utils.h
new file mode 100644
index 000000000..8066283c6
--- /dev/null
+++ b/numpy/core/src/common/simd/avx512/utils.h
@@ -0,0 +1,70 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_AVX512_UTILS_H
+#define _NPY_SIMD_AVX512_UTILS_H
+
+#define npyv512_lower_si256 _mm512_castsi512_si256
+#define npyv512_lower_ps256 _mm512_castps512_ps256
+#define npyv512_lower_pd256 _mm512_castpd512_pd256
+
+#define npyv512_higher_si256(A) _mm512_extracti64x4_epi64(A, 1)
+#define npyv512_higher_pd256(A) _mm512_extractf64x4_pd(A, 1)
+
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv512_higher_ps256(A) _mm512_extractf32x8_ps(A, 1)
+#else
+ #define npyv512_higher_ps256(A) \
+ _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(A), 1))
+#endif
+
+#define npyv512_combine_si256(A, B) _mm512_inserti64x4(_mm512_castsi256_si512(A), B, 1)
+#define npyv512_combine_pd256(A, B) _mm512_insertf64x4(_mm512_castpd256_pd512(A), B, 1)
+
+#ifdef NPY_HAVE_AVX512DQ
+ #define npyv512_combine_ps256(A, B) _mm512_insertf32x8(_mm512_castps256_ps512(A), B, 1)
+#else
+ #define npyv512_combine_ps256(A, B) \
+ _mm512_castsi512_ps(npyv512_combine_si256(_mm512_castps_si512(A), _mm512_castps_si512(B)))
+#endif
+
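+// Emulate a full 512-bit operation by applying an AVX2 intrinsic to the lower
+// and upper 256-bit halves and recombining the results.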
+#define NPYV_IMPL_AVX512_FROM_AVX2_1ARG(FN_NAME, INTRIN) \
+ NPY_FINLINE __m512i FN_NAME(__m512i a) \
+ { \
+ __m256i l_a = npyv512_lower_si256(a); \
+ __m256i h_a = npyv512_higher_si256(a); \
+ l_a = INTRIN(l_a); \
+ h_a = INTRIN(h_a); \
+ return npyv512_combine_si256(l_a, h_a); \
+ }
+
+#define NPYV_IMPL_AVX512_FROM_AVX2_2ARG(FN_NAME, INTRIN) \
+ NPY_FINLINE __m512i FN_NAME(__m512i a, __m512i b) \
+ { \
+ __m256i l_a = npyv512_lower_si256(a); \
+ __m256i h_a = npyv512_higher_si256(a); \
+ __m256i l_b = npyv512_lower_si256(b); \
+ __m256i h_b = npyv512_higher_si256(b); \
+ l_a = INTRIN(l_a, l_b); \
+ h_a = INTRIN(h_a, h_b); \
+ return npyv512_combine_si256(l_a, h_a); \
+ }
+
+#define NPYV_IMPL_AVX512_FROM_SI512_PS_2ARG(FN_NAME, INTRIN) \
+ NPY_FINLINE __m512 FN_NAME(__m512 a, __m512 b) \
+ { \
+ return _mm512_castsi512_ps(INTRIN( \
+ _mm512_castps_si512(a), _mm512_castps_si512(b) \
+ )); \
+ }
+
+#define NPYV_IMPL_AVX512_FROM_SI512_PD_2ARG(FN_NAME, INTRIN) \
+ NPY_FINLINE __m512d FN_NAME(__m512d a, __m512d b) \
+ { \
+ return _mm512_castsi512_pd(INTRIN( \
+ _mm512_castpd_si512(a), _mm512_castpd_si512(b) \
+ )); \
+ }
+
+#endif // _NPY_SIMD_AVX512_UTILS_H
diff --git a/numpy/core/src/common/simd/neon/arithmetic.h b/numpy/core/src/common/simd/neon/arithmetic.h
new file mode 100644
index 000000000..ec8b8ecd0
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/arithmetic.h
@@ -0,0 +1,78 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_ARITHMETIC_H
+#define _NPY_SIMD_NEON_ARITHMETIC_H
+
+/***************************
+ * Addition
+ ***************************/
+// non-saturated
+#define npyv_add_u8 vaddq_u8
+#define npyv_add_s8 vaddq_s8
+#define npyv_add_u16 vaddq_u16
+#define npyv_add_s16 vaddq_s16
+#define npyv_add_u32 vaddq_u32
+#define npyv_add_s32 vaddq_s32
+#define npyv_add_u64 vaddq_u64
+#define npyv_add_s64 vaddq_s64
+#define npyv_add_f32 vaddq_f32
+#define npyv_add_f64 vaddq_f64
+
+// saturated
+#define npyv_adds_u8 vqaddq_u8
+#define npyv_adds_s8 vqaddq_s8
+#define npyv_adds_u16 vqaddq_u16
+#define npyv_adds_s16 vqaddq_s16
+
+/***************************
+ * Subtraction
+ ***************************/
+// non-saturated
+#define npyv_sub_u8 vsubq_u8
+#define npyv_sub_s8 vsubq_s8
+#define npyv_sub_u16 vsubq_u16
+#define npyv_sub_s16 vsubq_s16
+#define npyv_sub_u32 vsubq_u32
+#define npyv_sub_s32 vsubq_s32
+#define npyv_sub_u64 vsubq_u64
+#define npyv_sub_s64 vsubq_s64
+#define npyv_sub_f32 vsubq_f32
+#define npyv_sub_f64 vsubq_f64
+
+// saturated
+#define npyv_subs_u8 vqsubq_u8
+#define npyv_subs_s8 vqsubq_s8
+#define npyv_subs_u16 vqsubq_u16
+#define npyv_subs_s16 vqsubq_s16
+
+/***************************
+ * Multiplication
+ ***************************/
+// non-saturated
+#define npyv_mul_u8 vmulq_u8
+#define npyv_mul_s8 vmulq_s8
+#define npyv_mul_u16 vmulq_u16
+#define npyv_mul_s16 vmulq_s16
+#define npyv_mul_u32 vmulq_u32
+#define npyv_mul_s32 vmulq_s32
+#define npyv_mul_f32 vmulq_f32
+#define npyv_mul_f64 vmulq_f64
+
+/***************************
+ * Division
+ ***************************/
+#ifdef __aarch64__
+ #define npyv_div_f32 vdivq_f32
+#else
+ NPY_FINLINE float32x4_t npyv_div_f32(float32x4_t a, float32x4_t b)
+ {
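+        // vrecpeq_f32 gives only an ~8-bit accurate reciprocal estimate; one
+        // vrecpsq_f32 Newton-Raphson step refines it before the final multiply.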
+ float32x4_t recip = vrecpeq_f32(b);
+ recip = vmulq_f32(vrecpsq_f32(b, recip), recip);
+ return vmulq_f32(a, recip);
+ }
+#endif
+#define npyv_div_f64 vdivq_f64
+
+#endif // _NPY_SIMD_NEON_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/neon/conversion.h b/numpy/core/src/common/simd/neon/conversion.h
new file mode 100644
index 000000000..b286931d1
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/conversion.h
@@ -0,0 +1,32 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_CVT_H
+#define _NPY_SIMD_NEON_CVT_H
+
+// convert boolean vectors to integer vectors
+#define npyv_cvt_u8_b8(A) A
+#define npyv_cvt_s8_b8(A) vreinterpretq_s8_u8(A)
+#define npyv_cvt_u16_b16(A) A
+#define npyv_cvt_s16_b16(A) vreinterpretq_s16_u16(A)
+#define npyv_cvt_u32_b32(A) A
+#define npyv_cvt_s32_b32(A) vreinterpretq_s32_u32(A)
+#define npyv_cvt_u64_b64(A) A
+#define npyv_cvt_s64_b64(A) vreinterpretq_s64_u64(A)
+#define npyv_cvt_f32_b32(A) vreinterpretq_f32_u32(A)
+#define npyv_cvt_f64_b64(A) vreinterpretq_f64_u64(A)
+
+// convert integer vectors to boolean vectors
+#define npyv_cvt_b8_u8(BL) BL
+#define npyv_cvt_b8_s8(BL) vreinterpretq_u8_s8(BL)
+#define npyv_cvt_b16_u16(BL) BL
+#define npyv_cvt_b16_s16(BL) vreinterpretq_u16_s16(BL)
+#define npyv_cvt_b32_u32(BL) BL
+#define npyv_cvt_b32_s32(BL) vreinterpretq_u32_s32(BL)
+#define npyv_cvt_b64_u64(BL) BL
+#define npyv_cvt_b64_s64(BL) vreinterpretq_u64_s64(BL)
+#define npyv_cvt_b32_f32(BL) vreinterpretq_u32_f32(BL)
+#define npyv_cvt_b64_f64(BL) vreinterpretq_u64_f64(BL)
+
+#endif // _NPY_SIMD_NEON_CVT_H
diff --git a/numpy/core/src/common/simd/neon/memory.h b/numpy/core/src/common/simd/neon/memory.h
new file mode 100644
index 000000000..afa703584
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/memory.h
@@ -0,0 +1,49 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MEMORY_H
+#define _NPY_SIMD_NEON_MEMORY_H
+
+/***************************
+ * load/store
+ ***************************/
+// GCC requires casts to the literal pointer types, otherwise it raises ambiguity errors
+#define NPYV_IMPL_NEON_MEM(SFX, CTYPE) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return vld1q_##SFX((const CTYPE*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return vld1q_##SFX((const CTYPE*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { return vld1q_##SFX((const CTYPE*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const npyv_lanetype_##SFX *ptr) \
+ { \
+ return vcombine_##SFX( \
+ vld1_##SFX((const CTYPE*)ptr), vdup_n_##SFX(0) \
+ ); \
+ } \
+ NPY_FINLINE void npyv_store_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vst1q_##SFX((CTYPE*)ptr, vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vst1q_##SFX((CTYPE*)ptr, vec); } \
+ NPY_FINLINE void npyv_stores_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vst1q_##SFX((CTYPE*)ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vst1_##SFX((CTYPE*)ptr, vget_low_##SFX(vec)); } \
+ NPY_FINLINE void npyv_storeh_##SFX(npyv_lanetype_##SFX *ptr, npyv_##SFX vec) \
+ { vst1_##SFX((CTYPE*)ptr, vget_high_##SFX(vec)); }
+
+NPYV_IMPL_NEON_MEM(u8, uint8_t)
+NPYV_IMPL_NEON_MEM(s8, int8_t)
+NPYV_IMPL_NEON_MEM(u16, uint16_t)
+NPYV_IMPL_NEON_MEM(s16, int16_t)
+NPYV_IMPL_NEON_MEM(u32, uint32_t)
+NPYV_IMPL_NEON_MEM(s32, int32_t)
+NPYV_IMPL_NEON_MEM(u64, uint64_t)
+NPYV_IMPL_NEON_MEM(s64, int64_t)
+NPYV_IMPL_NEON_MEM(f32, float)
+#if NPY_SIMD_F64
+NPYV_IMPL_NEON_MEM(f64, double)
+#endif
+
+#endif // _NPY_SIMD_NEON_MEMORY_H
diff --git a/numpy/core/src/common/simd/neon/misc.h b/numpy/core/src/common/simd/neon/misc.h
new file mode 100644
index 000000000..51b0c3858
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/misc.h
@@ -0,0 +1,255 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_MISC_H
+#define _NPY_SIMD_NEON_MISC_H
+
+// vector with zero lanes
+#define npyv_zero_u8() vreinterpretq_u8_s32(npyv_zero_s32())
+#define npyv_zero_s8() vreinterpretq_s8_s32(npyv_zero_s32())
+#define npyv_zero_u16() vreinterpretq_u16_s32(npyv_zero_s32())
+#define npyv_zero_s16() vreinterpretq_s16_s32(npyv_zero_s32())
+#define npyv_zero_u32() vdupq_n_u32((unsigned)0)
+#define npyv_zero_s32() vdupq_n_s32((int)0)
+#define npyv_zero_u64() vreinterpretq_u64_s32(npyv_zero_s32())
+#define npyv_zero_s64() vreinterpretq_s64_s32(npyv_zero_s32())
+#define npyv_zero_f32() vdupq_n_f32(0.0f)
+#define npyv_zero_f64() vdupq_n_f64(0.0)
+
+// vector with a specific value set to all lanes
+#define npyv_setall_u8 vdupq_n_u8
+#define npyv_setall_s8 vdupq_n_s8
+#define npyv_setall_u16 vdupq_n_u16
+#define npyv_setall_s16 vdupq_n_s16
+#define npyv_setall_u32 vdupq_n_u32
+#define npyv_setall_s32 vdupq_n_s32
+#define npyv_setall_u64 vdupq_n_u64
+#define npyv_setall_s64 vdupq_n_s64
+#define npyv_setall_f32 vdupq_n_f32
+#define npyv_setall_f64 vdupq_n_f64
+
+// vector with specific values set to each lane and
+// a specific value set to all remaining lanes
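+// e.g. npyv_setf_u8(0xFF, 1, 2, 3) gives the lanes {1, 2, 3, 0xFF, 0xFF, ..., 0xFF}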
+NPY_FINLINE uint8x16_t npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3,
+ npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9,
+ npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15)
+{
+ const uint8_t NPY_DECL_ALIGNED(16) data[16] = {
+ i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
+ };
+ return vld1q_u8(data);
+}
+#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__))
+
+NPY_FINLINE int8x16_t npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3,
+ npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9,
+ npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15)
+{
+ const int8_t NPY_DECL_ALIGNED(16) data[16] = {
+ i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15
+ };
+ return vld1q_s8(data);
+}
+#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__))
+
+NPY_FINLINE uint16x8_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3,
+ npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7)
+{
+ const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+ return vld1q_u16(data);
+}
+#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__))
+
+NPY_FINLINE int16x8_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3,
+ npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7)
+{
+ const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
+ return vld1q_s16(data);
+}
+#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__))
+
+NPY_FINLINE uint32x4_t npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3)
+{
+ const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+ return vld1q_u32(data);
+}
+#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__))
+
+NPY_FINLINE int32x4_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3)
+{
+ const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+ return vld1q_s32(data);
+}
+#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__))
+
+NPY_FINLINE uint64x2_t npyv__set_u64(npy_uint64 i0, npy_uint64 i1)
+{
+ const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+ return vld1q_u64(data);
+}
+#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__))
+
+NPY_FINLINE int64x2_t npyv__set_s64(npy_int64 i0, npy_int64 i1)
+{
+ const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+ return vld1q_s64(data);
+}
+#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
+
+NPY_FINLINE float32x4_t npyv__set_f32(float i0, float i1, float i2, float i3)
+{
+ const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3};
+ return vld1q_f32(data);
+}
+#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__))
+
+#ifdef __aarch64__
+NPY_FINLINE float64x2_t npyv__set_f64(double i0, double i1)
+{
+ const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1};
+ return vld1q_f64(data);
+}
+#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__))
+#endif
+
+// vector with specific values set to each lane and
+// zero set to all remaining lanes
+#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__)
+#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__)
+#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
+#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
+#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
+#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
+#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
+#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
+#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
+
+// Per lane select
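+// (vbslq_* selects the bits of its second operand where the mask bits are set, so
+//  npyv_select(mask, a, b) yields the lanes of `a` where the boolean mask is all
+//  ones and the lanes of `b` otherwise)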
+#define npyv_select_u8 vbslq_u8
+#define npyv_select_s8 vbslq_s8
+#define npyv_select_u16 vbslq_u16
+#define npyv_select_s16 vbslq_s16
+#define npyv_select_u32 vbslq_u32
+#define npyv_select_s32 vbslq_s32
+#define npyv_select_u64 vbslq_u64
+#define npyv_select_s64 vbslq_s64
+#define npyv_select_f32 vbslq_f32
+#define npyv_select_f64 vbslq_f64
+
+// Reinterpret
+#define npyv_reinterpret_u8_u8(X) X
+#define npyv_reinterpret_u8_s8 vreinterpretq_u8_s8
+#define npyv_reinterpret_u8_u16 vreinterpretq_u8_u16
+#define npyv_reinterpret_u8_s16 vreinterpretq_u8_s16
+#define npyv_reinterpret_u8_u32 vreinterpretq_u8_u32
+#define npyv_reinterpret_u8_s32 vreinterpretq_u8_s32
+#define npyv_reinterpret_u8_u64 vreinterpretq_u8_u64
+#define npyv_reinterpret_u8_s64 vreinterpretq_u8_s64
+#define npyv_reinterpret_u8_f32 vreinterpretq_u8_f32
+#define npyv_reinterpret_u8_f64 vreinterpretq_u8_f64
+
+#define npyv_reinterpret_s8_s8(X) X
+#define npyv_reinterpret_s8_u8 vreinterpretq_s8_u8
+#define npyv_reinterpret_s8_u16 vreinterpretq_s8_u16
+#define npyv_reinterpret_s8_s16 vreinterpretq_s8_s16
+#define npyv_reinterpret_s8_u32 vreinterpretq_s8_u32
+#define npyv_reinterpret_s8_s32 vreinterpretq_s8_s32
+#define npyv_reinterpret_s8_u64 vreinterpretq_s8_u64
+#define npyv_reinterpret_s8_s64 vreinterpretq_s8_s64
+#define npyv_reinterpret_s8_f32 vreinterpretq_s8_f32
+#define npyv_reinterpret_s8_f64 vreinterpretq_s8_f64
+
+#define npyv_reinterpret_u16_u16(X) X
+#define npyv_reinterpret_u16_u8 vreinterpretq_u16_u8
+#define npyv_reinterpret_u16_s8 vreinterpretq_u16_s8
+#define npyv_reinterpret_u16_s16 vreinterpretq_u16_s16
+#define npyv_reinterpret_u16_u32 vreinterpretq_u16_u32
+#define npyv_reinterpret_u16_s32 vreinterpretq_u16_s32
+#define npyv_reinterpret_u16_u64 vreinterpretq_u16_u64
+#define npyv_reinterpret_u16_s64 vreinterpretq_u16_s64
+#define npyv_reinterpret_u16_f32 vreinterpretq_u16_f32
+#define npyv_reinterpret_u16_f64 vreinterpretq_u16_f64
+
+#define npyv_reinterpret_s16_s16(X) X
+#define npyv_reinterpret_s16_u8 vreinterpretq_s16_u8
+#define npyv_reinterpret_s16_s8 vreinterpretq_s16_s8
+#define npyv_reinterpret_s16_u16 vreinterpretq_s16_u16
+#define npyv_reinterpret_s16_u32 vreinterpretq_s16_u32
+#define npyv_reinterpret_s16_s32 vreinterpretq_s16_s32
+#define npyv_reinterpret_s16_u64 vreinterpretq_s16_u64
+#define npyv_reinterpret_s16_s64 vreinterpretq_s16_s64
+#define npyv_reinterpret_s16_f32 vreinterpretq_s16_f32
+#define npyv_reinterpret_s16_f64 vreinterpretq_s16_f64
+
+#define npyv_reinterpret_u32_u32(X) X
+#define npyv_reinterpret_u32_u8 vreinterpretq_u32_u8
+#define npyv_reinterpret_u32_s8 vreinterpretq_u32_s8
+#define npyv_reinterpret_u32_u16 vreinterpretq_u32_u16
+#define npyv_reinterpret_u32_s16 vreinterpretq_u32_s16
+#define npyv_reinterpret_u32_s32 vreinterpretq_u32_s32
+#define npyv_reinterpret_u32_u64 vreinterpretq_u32_u64
+#define npyv_reinterpret_u32_s64 vreinterpretq_u32_s64
+#define npyv_reinterpret_u32_f32 vreinterpretq_u32_f32
+#define npyv_reinterpret_u32_f64 vreinterpretq_u32_f64
+
+#define npyv_reinterpret_s32_s32(X) X
+#define npyv_reinterpret_s32_u8 vreinterpretq_s32_u8
+#define npyv_reinterpret_s32_s8 vreinterpretq_s32_s8
+#define npyv_reinterpret_s32_u16 vreinterpretq_s32_u16
+#define npyv_reinterpret_s32_s16 vreinterpretq_s32_s16
+#define npyv_reinterpret_s32_u32 vreinterpretq_s32_u32
+#define npyv_reinterpret_s32_u64 vreinterpretq_s32_u64
+#define npyv_reinterpret_s32_s64 vreinterpretq_s32_s64
+#define npyv_reinterpret_s32_f32 vreinterpretq_s32_f32
+#define npyv_reinterpret_s32_f64 vreinterpretq_s32_f64
+
+#define npyv_reinterpret_u64_u64(X) X
+#define npyv_reinterpret_u64_u8 vreinterpretq_u64_u8
+#define npyv_reinterpret_u64_s8 vreinterpretq_u64_s8
+#define npyv_reinterpret_u64_u16 vreinterpretq_u64_u16
+#define npyv_reinterpret_u64_s16 vreinterpretq_u64_s16
+#define npyv_reinterpret_u64_u32 vreinterpretq_u64_u32
+#define npyv_reinterpret_u64_s32 vreinterpretq_u64_s32
+#define npyv_reinterpret_u64_s64 vreinterpretq_u64_s64
+#define npyv_reinterpret_u64_f32 vreinterpretq_u64_f32
+#define npyv_reinterpret_u64_f64 vreinterpretq_u64_f64
+
+#define npyv_reinterpret_s64_s64(X) X
+#define npyv_reinterpret_s64_u8 vreinterpretq_s64_u8
+#define npyv_reinterpret_s64_s8 vreinterpretq_s64_s8
+#define npyv_reinterpret_s64_u16 vreinterpretq_s64_u16
+#define npyv_reinterpret_s64_s16 vreinterpretq_s64_s16
+#define npyv_reinterpret_s64_u32 vreinterpretq_s64_u32
+#define npyv_reinterpret_s64_s32 vreinterpretq_s64_s32
+#define npyv_reinterpret_s64_u64 vreinterpretq_s64_u64
+#define npyv_reinterpret_s64_f32 vreinterpretq_s64_f32
+#define npyv_reinterpret_s64_f64 vreinterpretq_s64_f64
+
+#define npyv_reinterpret_f32_f32(X) X
+#define npyv_reinterpret_f32_u8 vreinterpretq_f32_u8
+#define npyv_reinterpret_f32_s8 vreinterpretq_f32_s8
+#define npyv_reinterpret_f32_u16 vreinterpretq_f32_u16
+#define npyv_reinterpret_f32_s16 vreinterpretq_f32_s16
+#define npyv_reinterpret_f32_u32 vreinterpretq_f32_u32
+#define npyv_reinterpret_f32_s32 vreinterpretq_f32_s32
+#define npyv_reinterpret_f32_u64 vreinterpretq_f32_u64
+#define npyv_reinterpret_f32_s64 vreinterpretq_f32_s64
+#define npyv_reinterpret_f32_f64 vreinterpretq_f32_f64
+
+#define npyv_reinterpret_f64_f64(X) X
+#define npyv_reinterpret_f64_u8 vreinterpretq_f64_u8
+#define npyv_reinterpret_f64_s8 vreinterpretq_f64_s8
+#define npyv_reinterpret_f64_u16 vreinterpretq_f64_u16
+#define npyv_reinterpret_f64_s16 vreinterpretq_f64_s16
+#define npyv_reinterpret_f64_u32 vreinterpretq_f64_u32
+#define npyv_reinterpret_f64_s32 vreinterpretq_f64_s32
+#define npyv_reinterpret_f64_u64 vreinterpretq_f64_u64
+#define npyv_reinterpret_f64_s64 vreinterpretq_f64_s64
+#define npyv_reinterpret_f64_f32 vreinterpretq_f64_f32
+
+// npyv_cleanup is only required by AVX2/AVX512 (to avoid AVX-SSE transition penalties); a no-op on NEON
+#define npyv_cleanup() ((void)0)
+
+#endif // _NPY_SIMD_NEON_MISC_H
diff --git a/numpy/core/src/common/simd/neon/neon.h b/numpy/core/src/common/simd/neon/neon.h
new file mode 100644
index 000000000..280a34297
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/neon.h
@@ -0,0 +1,74 @@
+#ifndef _NPY_SIMD_H_
+ #error "Not a standalone header"
+#endif
+
+#define NPY_SIMD 128
+#define NPY_SIMD_WIDTH 16
+
+#ifdef __aarch64__
+ #define NPY_SIMD_F64 1
+#else
+ #define NPY_SIMD_F64 0
+#endif
+
+typedef uint8x16_t npyv_u8;
+typedef int8x16_t npyv_s8;
+typedef uint16x8_t npyv_u16;
+typedef int16x8_t npyv_s16;
+typedef uint32x4_t npyv_u32;
+typedef int32x4_t npyv_s32;
+typedef uint64x2_t npyv_u64;
+typedef int64x2_t npyv_s64;
+typedef float32x4_t npyv_f32;
+#if NPY_SIMD_F64
+typedef float64x2_t npyv_f64;
+#endif
+
+typedef uint8x16_t npyv_b8;
+typedef uint16x8_t npyv_b16;
+typedef uint32x4_t npyv_b32;
+typedef uint64x2_t npyv_b64;
+
+typedef uint8x16x2_t npyv_u8x2;
+typedef int8x16x2_t npyv_s8x2;
+typedef uint16x8x2_t npyv_u16x2;
+typedef int16x8x2_t npyv_s16x2;
+typedef uint32x4x2_t npyv_u32x2;
+typedef int32x4x2_t npyv_s32x2;
+typedef uint64x2x2_t npyv_u64x2;
+typedef int64x2x2_t npyv_s64x2;
+typedef float32x4x2_t npyv_f32x2;
+#if NPY_SIMD_F64
+typedef float64x2x2_t npyv_f64x2;
+#endif
+
+typedef uint8x16x3_t npyv_u8x3;
+typedef int8x16x3_t npyv_s8x3;
+typedef uint16x8x3_t npyv_u16x3;
+typedef int16x8x3_t npyv_s16x3;
+typedef uint32x4x3_t npyv_u32x3;
+typedef int32x4x3_t npyv_s32x3;
+typedef uint64x2x3_t npyv_u64x3;
+typedef int64x2x3_t npyv_s64x3;
+typedef float32x4x3_t npyv_f32x3;
+#if NPY_SIMD_F64
+typedef float64x2x3_t npyv_f64x3;
+#endif
+
+#define npyv_nlanes_u8 16
+#define npyv_nlanes_s8 16
+#define npyv_nlanes_u16 8
+#define npyv_nlanes_s16 8
+#define npyv_nlanes_u32 4
+#define npyv_nlanes_s32 4
+#define npyv_nlanes_u64 2
+#define npyv_nlanes_s64 2
+#define npyv_nlanes_f32 4
+#define npyv_nlanes_f64 2
+
+#include "memory.h"
+#include "misc.h"
+#include "reorder.h"
+#include "operators.h"
+#include "conversion.h"
+#include "arithmetic.h"
diff --git a/numpy/core/src/common/simd/neon/operators.h b/numpy/core/src/common/simd/neon/operators.h
new file mode 100644
index 000000000..c1ad4ba12
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/operators.h
@@ -0,0 +1,218 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_OPERATORS_H
+#define _NPY_SIMD_NEON_OPERATORS_H
+
+/***************************
+ * Shifting
+ ***************************/
+
+// left
+#define npyv_shl_u16(A, C) vshlq_u16(A, npyv_setall_s16(C))
+#define npyv_shl_s16(A, C) vshlq_s16(A, npyv_setall_s16(C))
+#define npyv_shl_u32(A, C) vshlq_u32(A, npyv_setall_s32(C))
+#define npyv_shl_s32(A, C) vshlq_s32(A, npyv_setall_s32(C))
+#define npyv_shl_u64(A, C) vshlq_u64(A, npyv_setall_s64(C))
+#define npyv_shl_s64(A, C) vshlq_s64(A, npyv_setall_s64(C))
+
+// left by an immediate constant
+#define npyv_shli_u16 vshlq_n_u16
+#define npyv_shli_s16 vshlq_n_s16
+#define npyv_shli_u32 vshlq_n_u32
+#define npyv_shli_s32 vshlq_n_s32
+#define npyv_shli_u64 vshlq_n_u64
+#define npyv_shli_s64 vshlq_n_s64
+
+// right
+#define npyv_shr_u16(A, C) vshlq_u16(A, npyv_setall_s16(-(C)))
+#define npyv_shr_s16(A, C) vshlq_s16(A, npyv_setall_s16(-(C)))
+#define npyv_shr_u32(A, C) vshlq_u32(A, npyv_setall_s32(-(C)))
+#define npyv_shr_s32(A, C) vshlq_s32(A, npyv_setall_s32(-(C)))
+#define npyv_shr_u64(A, C) vshlq_u64(A, npyv_setall_s64(-(C)))
+#define npyv_shr_s64(A, C) vshlq_s64(A, npyv_setall_s64(-(C)))
+
+// right by an immediate constant
+#define npyv_shri_u16(VEC, C) ((C) == 0 ? VEC : vshrq_n_u16(VEC, C))
+#define npyv_shri_s16(VEC, C) ((C) == 0 ? VEC : vshrq_n_s16(VEC, C))
+#define npyv_shri_u32(VEC, C) ((C) == 0 ? VEC : vshrq_n_u32(VEC, C))
+#define npyv_shri_s32(VEC, C) ((C) == 0 ? VEC : vshrq_n_s32(VEC, C))
+#define npyv_shri_u64(VEC, C) ((C) == 0 ? VEC : vshrq_n_u64(VEC, C))
+#define npyv_shri_s64(VEC, C) ((C) == 0 ? VEC : vshrq_n_s64(VEC, C))
+
+/***************************
+ * Logical
+ ***************************/
+
+// AND
+#define npyv_and_u8 vandq_u8
+#define npyv_and_s8 vandq_s8
+#define npyv_and_u16 vandq_u16
+#define npyv_and_s16 vandq_s16
+#define npyv_and_u32 vandq_u32
+#define npyv_and_s32 vandq_s32
+#define npyv_and_u64 vandq_u64
+#define npyv_and_s64 vandq_s64
+#define npyv_and_f32(A, B) \
+ vreinterpretq_f32_u8(vandq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
+#define npyv_and_f64(A, B) \
+ vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+
+// OR
+#define npyv_or_u8 vorrq_u8
+#define npyv_or_s8 vorrq_s8
+#define npyv_or_u16 vorrq_u16
+#define npyv_or_s16 vorrq_s16
+#define npyv_or_u32 vorrq_u32
+#define npyv_or_s32 vorrq_s32
+#define npyv_or_u64 vorrq_u64
+#define npyv_or_s64 vorrq_s64
+#define npyv_or_f32(A, B) \
+ vreinterpretq_f32_u8(vorrq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
+#define npyv_or_f64(A, B) \
+ vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+
+// XOR
+#define npyv_xor_u8 veorq_u8
+#define npyv_xor_s8 veorq_s8
+#define npyv_xor_u16 veorq_u16
+#define npyv_xor_s16 veorq_s16
+#define npyv_xor_u32 veorq_u32
+#define npyv_xor_s32 veorq_s32
+#define npyv_xor_u64 veorq_u64
+#define npyv_xor_s64 veorq_s64
+#define npyv_xor_f32(A, B) \
+ vreinterpretq_f32_u8(veorq_u8(vreinterpretq_u8_f32(A), vreinterpretq_u8_f32(B)))
+#define npyv_xor_f64(A, B) \
+ vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(A), vreinterpretq_u8_f64(B)))
+
+// NOT
+#define npyv_not_u8 vmvnq_u8
+#define npyv_not_s8 vmvnq_s8
+#define npyv_not_u16 vmvnq_u16
+#define npyv_not_s16 vmvnq_s16
+#define npyv_not_u32 vmvnq_u32
+#define npyv_not_s32 vmvnq_s32
+#define npyv_not_u64(A) vreinterpretq_u64_u8(vmvnq_u8(vreinterpretq_u8_u64(A)))
+#define npyv_not_s64(A) vreinterpretq_s64_u8(vmvnq_u8(vreinterpretq_u8_s64(A)))
+#define npyv_not_f32(A) vreinterpretq_f32_u8(vmvnq_u8(vreinterpretq_u8_f32(A)))
+#define npyv_not_f64(A) vreinterpretq_f64_u8(vmvnq_u8(vreinterpretq_u8_f64(A)))
+
+/***************************
+ * Comparison
+ ***************************/
+
+// equal
+#define npyv_cmpeq_u8 vceqq_u8
+#define npyv_cmpeq_s8 vceqq_s8
+#define npyv_cmpeq_u16 vceqq_u16
+#define npyv_cmpeq_s16 vceqq_s16
+#define npyv_cmpeq_u32 vceqq_u32
+#define npyv_cmpeq_s32 vceqq_s32
+#define npyv_cmpeq_f32 vceqq_f32
+#define npyv_cmpeq_f64 vceqq_f64
+
+#ifdef __aarch64__
+ #define npyv_cmpeq_u64 vceqq_u64
+ #define npyv_cmpeq_s64 vceqq_s64
+#else
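+    // ARMv7 has no 64-bit compares: compare the 32-bit halves, require both
+    // halves of a lane to match, then sign-extend the result across the lane.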
+ NPY_FINLINE uint64x2_t npyv_cmpeq_u64(uint64x2_t a, uint64x2_t b)
+ {
+ uint64x2_t cmpeq = vreinterpretq_u64_u32(vceqq_u32(
+ vreinterpretq_u32_u64(a), vreinterpretq_u32_u64(b)
+ ));
+ uint64x2_t cmpeq_h = vshlq_n_u64(cmpeq, 32);
+ uint64x2_t test = vandq_u64(cmpeq, cmpeq_h);
+ return vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_u64(test), 32));
+ }
+ #define npyv_cmpeq_s64(A, B) \
+ npyv_cmpeq_u64(vreinterpretq_u64_s64(A), vreinterpretq_u64_s64(B))
+#endif
+
+// not equal
+#define npyv_cmpneq_u8(A, B) vmvnq_u8(vceqq_u8(A, B))
+#define npyv_cmpneq_s8(A, B) vmvnq_u8(vceqq_s8(A, B))
+#define npyv_cmpneq_u16(A, B) vmvnq_u16(vceqq_u16(A, B))
+#define npyv_cmpneq_s16(A, B) vmvnq_u16(vceqq_s16(A, B))
+#define npyv_cmpneq_u32(A, B) vmvnq_u32(vceqq_u32(A, B))
+#define npyv_cmpneq_s32(A, B) vmvnq_u32(vceqq_s32(A, B))
+#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B))
+#define npyv_cmpneq_s64(A, B) npyv_not_u64(npyv_cmpeq_s64(A, B))
+#define npyv_cmpneq_f32(A, B) vmvnq_u32(vceqq_f32(A, B))
+#define npyv_cmpneq_f64(A, B) npyv_not_u64(vceqq_f64(A, B))
+
+// greater than
+#define npyv_cmpgt_u8 vcgtq_u8
+#define npyv_cmpgt_s8 vcgtq_s8
+#define npyv_cmpgt_u16 vcgtq_u16
+#define npyv_cmpgt_s16 vcgtq_s16
+#define npyv_cmpgt_u32 vcgtq_u32
+#define npyv_cmpgt_s32 vcgtq_s32
+#define npyv_cmpgt_f32 vcgtq_f32
+#define npyv_cmpgt_f64 vcgtq_f64
+
+#ifdef __aarch64__
+ #define npyv_cmpgt_u64 vcgtq_u64
+ #define npyv_cmpgt_s64 vcgtq_s64
+#else
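+    // ARMv7 has no 64-bit compares. For a > b: when a and b share a sign,
+    // (b - a) is negative exactly when a > b; when the signs differ, a > b
+    // exactly when b is negative. The sign bit of the selected value is then
+    // broadcast across the lane with an arithmetic shift to form the mask.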
+ NPY_FINLINE uint64x2_t npyv_cmpgt_s64(int64x2_t a, int64x2_t b)
+ {
+ int64x2_t sub = vsubq_s64(b, a);
+ uint64x2_t nsame_sbit = vreinterpretq_u64_s64(veorq_s64(a, b));
+ int64x2_t test = vbslq_s64(nsame_sbit, b, sub);
+ int64x2_t extend_sbit = vshrq_n_s64(test, 63);
+ return vreinterpretq_u64_s64(extend_sbit);
+ }
+ NPY_FINLINE uint64x2_t npyv_cmpgt_u64(uint64x2_t a, uint64x2_t b)
+ {
+ const uint64x2_t sbit = npyv_setall_u64(0x8000000000000000);
+ a = npyv_xor_u64(a, sbit);
+ b = npyv_xor_u64(b, sbit);
+ return npyv_cmpgt_s64(vreinterpretq_s64_u64(a), vreinterpretq_s64_u64(b));
+ }
+#endif
+
+// greater than or equal
+#define npyv_cmpge_u8 vcgeq_u8
+#define npyv_cmpge_s8 vcgeq_s8
+#define npyv_cmpge_u16 vcgeq_u16
+#define npyv_cmpge_s16 vcgeq_s16
+#define npyv_cmpge_u32 vcgeq_u32
+#define npyv_cmpge_s32 vcgeq_s32
+#define npyv_cmpge_f32 vcgeq_f32
+#define npyv_cmpge_f64 vcgeq_f64
+
+#ifdef __aarch64__
+ #define npyv_cmpge_u64 vcgeq_u64
+ #define npyv_cmpge_s64 vcgeq_s64
+#else
+ #define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A))
+ #define npyv_cmpge_s64(A, B) npyv_not_u64(npyv_cmpgt_s64(B, A))
+#endif
+
+// less than
+#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A)
+#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A)
+#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
+#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
+#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
+#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
+#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
+#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
+#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A)
+#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A)
+
+// less than or equal
+#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A)
+#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A)
+#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
+#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
+#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
+#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
+#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
+#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
+#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
+#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
+
+#endif // _NPY_SIMD_NEON_OPERATORS_H
diff --git a/numpy/core/src/common/simd/neon/reorder.h b/numpy/core/src/common/simd/neon/reorder.h
new file mode 100644
index 000000000..712a77982
--- /dev/null
+++ b/numpy/core/src/common/simd/neon/reorder.h
@@ -0,0 +1,110 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_NEON_REORDER_H
+#define _NPY_SIMD_NEON_REORDER_H
+
+// combine lower part of two vectors
+#ifdef __aarch64__
+ #define npyv_combinel_u8(A, B) vreinterpretq_u8_u64(vzip1q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
+ #define npyv_combinel_s8(A, B) vreinterpretq_s8_u64(vzip1q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
+ #define npyv_combinel_u16(A, B) vreinterpretq_u16_u64(vzip1q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
+ #define npyv_combinel_s16(A, B) vreinterpretq_s16_u64(vzip1q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
+ #define npyv_combinel_u32(A, B) vreinterpretq_u32_u64(vzip1q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
+ #define npyv_combinel_s32(A, B) vreinterpretq_s32_u64(vzip1q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
+ #define npyv_combinel_u64 vzip1q_u64
+ #define npyv_combinel_s64 vzip1q_s64
+ #define npyv_combinel_f32(A, B) vreinterpretq_f32_u64(vzip1q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
+ #define npyv_combinel_f64 vzip1q_f64
+#else
+ #define npyv_combinel_u8(A, B) vcombine_u8(vget_low_u8(A), vget_low_u8(B))
+ #define npyv_combinel_s8(A, B) vcombine_s8(vget_low_s8(A), vget_low_s8(B))
+ #define npyv_combinel_u16(A, B) vcombine_u16(vget_low_u16(A), vget_low_u16(B))
+ #define npyv_combinel_s16(A, B) vcombine_s16(vget_low_s16(A), vget_low_s16(B))
+ #define npyv_combinel_u32(A, B) vcombine_u32(vget_low_u32(A), vget_low_u32(B))
+ #define npyv_combinel_s32(A, B) vcombine_s32(vget_low_s32(A), vget_low_s32(B))
+ #define npyv_combinel_u64(A, B) vcombine_u64(vget_low_u64(A), vget_low_u64(B))
+ #define npyv_combinel_s64(A, B) vcombine_s64(vget_low_s64(A), vget_low_s64(B))
+ #define npyv_combinel_f32(A, B) vcombine_f32(vget_low_f32(A), vget_low_f32(B))
+#endif
+
+// combine higher part of two vectors
+#ifdef __aarch64__
+ #define npyv_combineh_u8(A, B) vreinterpretq_u8_u64(vzip2q_u64(vreinterpretq_u64_u8(A), vreinterpretq_u64_u8(B)))
+ #define npyv_combineh_s8(A, B) vreinterpretq_s8_u64(vzip2q_u64(vreinterpretq_u64_s8(A), vreinterpretq_u64_s8(B)))
+ #define npyv_combineh_u16(A, B) vreinterpretq_u16_u64(vzip2q_u64(vreinterpretq_u64_u16(A), vreinterpretq_u64_u16(B)))
+ #define npyv_combineh_s16(A, B) vreinterpretq_s16_u64(vzip2q_u64(vreinterpretq_u64_s16(A), vreinterpretq_u64_s16(B)))
+ #define npyv_combineh_u32(A, B) vreinterpretq_u32_u64(vzip2q_u64(vreinterpretq_u64_u32(A), vreinterpretq_u64_u32(B)))
+ #define npyv_combineh_s32(A, B) vreinterpretq_s32_u64(vzip2q_u64(vreinterpretq_u64_s32(A), vreinterpretq_u64_s32(B)))
+ #define npyv_combineh_u64 vzip2q_u64
+ #define npyv_combineh_s64 vzip2q_s64
+ #define npyv_combineh_f32(A, B) vreinterpretq_f32_u64(vzip2q_u64(vreinterpretq_u64_f32(A), vreinterpretq_u64_f32(B)))
+ #define npyv_combineh_f64 vzip2q_f64
+#else
+ #define npyv_combineh_u8(A, B) vcombine_u8(vget_high_u8(A), vget_high_u8(B))
+ #define npyv_combineh_s8(A, B) vcombine_s8(vget_high_s8(A), vget_high_s8(B))
+ #define npyv_combineh_u16(A, B) vcombine_u16(vget_high_u16(A), vget_high_u16(B))
+ #define npyv_combineh_s16(A, B) vcombine_s16(vget_high_s16(A), vget_high_s16(B))
+ #define npyv_combineh_u32(A, B) vcombine_u32(vget_high_u32(A), vget_high_u32(B))
+ #define npyv_combineh_s32(A, B) vcombine_s32(vget_high_s32(A), vget_high_s32(B))
+ #define npyv_combineh_u64(A, B) vcombine_u64(vget_high_u64(A), vget_high_u64(B))
+ #define npyv_combineh_s64(A, B) vcombine_s64(vget_high_s64(A), vget_high_s64(B))
+ #define npyv_combineh_f32(A, B) vcombine_f32(vget_high_f32(A), vget_high_f32(B))
+#endif
+
+// combine two vectors from lower and higher parts of two other vectors
+#define NPYV_IMPL_NEON_COMBINE(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \
+ r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \
+ return r; \
+ }
+
+NPYV_IMPL_NEON_COMBINE(npyv_u8, u8)
+NPYV_IMPL_NEON_COMBINE(npyv_s8, s8)
+NPYV_IMPL_NEON_COMBINE(npyv_u16, u16)
+NPYV_IMPL_NEON_COMBINE(npyv_s16, s16)
+NPYV_IMPL_NEON_COMBINE(npyv_u32, u32)
+NPYV_IMPL_NEON_COMBINE(npyv_s32, s32)
+NPYV_IMPL_NEON_COMBINE(npyv_u64, u64)
+NPYV_IMPL_NEON_COMBINE(npyv_s64, s64)
+NPYV_IMPL_NEON_COMBINE(npyv_f32, f32)
+#ifdef __aarch64__
+NPYV_IMPL_NEON_COMBINE(npyv_f64, f64)
+#endif
+
+// interleave two vectors
+#define NPYV_IMPL_NEON_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vzip1q_##SFX(a, b); \
+ r.val[1] = vzip2q_##SFX(a, b); \
+ return r; \
+ }
+
+#ifdef __aarch64__
+ NPYV_IMPL_NEON_ZIP(npyv_u8, u8)
+ NPYV_IMPL_NEON_ZIP(npyv_s8, s8)
+ NPYV_IMPL_NEON_ZIP(npyv_u16, u16)
+ NPYV_IMPL_NEON_ZIP(npyv_s16, s16)
+ NPYV_IMPL_NEON_ZIP(npyv_u32, u32)
+ NPYV_IMPL_NEON_ZIP(npyv_s32, s32)
+ NPYV_IMPL_NEON_ZIP(npyv_f32, f32)
+ NPYV_IMPL_NEON_ZIP(npyv_f64, f64)
+#else
+ #define npyv_zip_u8 vzipq_u8
+ #define npyv_zip_s8 vzipq_s8
+ #define npyv_zip_u16 vzipq_u16
+ #define npyv_zip_s16 vzipq_s16
+ #define npyv_zip_u32 vzipq_u32
+ #define npyv_zip_s32 vzipq_s32
+ #define npyv_zip_f32 vzipq_f32
+#endif
+#define npyv_zip_u64 npyv_combine_u64
+#define npyv_zip_s64 npyv_combine_s64
+
+#endif // _NPY_SIMD_NEON_REORDER_H
diff --git a/numpy/core/src/common/simd/simd.h b/numpy/core/src/common/simd/simd.h
new file mode 100644
index 000000000..2f39c8427
--- /dev/null
+++ b/numpy/core/src/common/simd/simd.h
@@ -0,0 +1,56 @@
+#ifndef _NPY_SIMD_H_
+#define _NPY_SIMD_H_
+/**
+ * The NumPy C SIMD vectorization interface "NPYV" is a set of types and functions
+ * intended to simplify the vectorization of code across different platforms. It
+ * currently supports the following SIMD extensions: SSE, AVX2, AVX512, VSX and NEON.
+ *
+ * TODO: Add an independent sphinx doc.
+*/
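+
+// Example (an illustrative sketch only, not a shipped kernel): code written against
+// NPYV compiles to the widest enabled extension, with npyv_nlanes_* giving the lane
+// count of the selected backend, e.g. adding two float32 buffers whose length is
+// assumed to be a multiple of npyv_nlanes_f32:
+//
+//     void add_f32(const float *a, const float *b, float *dst, npy_intp len)
+//     {
+//     #if NPY_SIMD
+//         for (npy_intp i = 0; i < len; i += npyv_nlanes_f32) {
+//             npyv_f32 r = npyv_add_f32(npyv_load_f32(a + i), npyv_load_f32(b + i));
+//             npyv_store_f32(dst + i, r);
+//         }
+//     #else
+//         for (npy_intp i = 0; i < len; ++i)
+//             dst[i] = a[i] + b[i];
+//     #endif
+//     }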
+#include "numpy/npy_common.h"
+#include "npy_cpu_dispatch.h"
+#include "simd_utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// lane type by intrin suffix
+typedef npy_uint8 npyv_lanetype_u8;
+typedef npy_int8 npyv_lanetype_s8;
+typedef npy_uint16 npyv_lanetype_u16;
+typedef npy_int16 npyv_lanetype_s16;
+typedef npy_uint32 npyv_lanetype_u32;
+typedef npy_int32 npyv_lanetype_s32;
+typedef npy_uint64 npyv_lanetype_u64;
+typedef npy_int64 npyv_lanetype_s64;
+typedef float npyv_lanetype_f32;
+typedef double npyv_lanetype_f64;
+
+#if defined(NPY_HAVE_AVX512F) && !defined(NPY_SIMD_FORCE_256) && !defined(NPY_SIMD_FORCE_128)
+ #include "avx512/avx512.h"
+#elif defined(NPY_HAVE_AVX2) && !defined(NPY_SIMD_FORCE_128)
+ #include "avx2/avx2.h"
+#elif defined(NPY_HAVE_SSE2)
+ #include "sse/sse.h"
+#endif
+
+// TODO: Add support for VSX(2.06) and BE Mode
+#if defined(NPY_HAVE_VSX2) && defined(__LITTLE_ENDIAN__)
+ #include "vsx/vsx.h"
+#endif
+
+#ifdef NPY_HAVE_NEON
+ #include "neon/neon.h"
+#endif
+
+#ifndef NPY_SIMD
+ #define NPY_SIMD 0
+ #define NPY_SIMD_WIDTH 0
+ #define NPY_SIMD_F64 0
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif // _NPY_SIMD_H_
diff --git a/numpy/core/src/common/simd/simd_utils.h b/numpy/core/src/common/simd/simd_utils.h
new file mode 100644
index 000000000..06c2f16f7
--- /dev/null
+++ b/numpy/core/src/common/simd/simd_utils.h
@@ -0,0 +1,48 @@
+#ifndef _NPY_SIMD_UTILS_H
+#define _NPY_SIMD_UTILS_H
+
+#define NPYV__SET_2(CAST, I0, I1, ...) (CAST)(I0), (CAST)(I1)
+
+#define NPYV__SET_4(CAST, I0, I1, I2, I3, ...) \
+ (CAST)(I0), (CAST)(I1), (CAST)(I2), (CAST)(I3)
+
+#define NPYV__SET_8(CAST, I0, I1, I2, I3, I4, I5, I6, I7, ...) \
+ (CAST)(I0), (CAST)(I1), (CAST)(I2), (CAST)(I3), (CAST)(I4), (CAST)(I5), (CAST)(I6), (CAST)(I7)
+
+#define NPYV__SET_16(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, ...) \
+ NPYV__SET_8(CAST, I0, I1, I2, I3, I4, I5, I6, I7), \
+ NPYV__SET_8(CAST, I8, I9, I10, I11, I12, I13, I14, I15)
+
+#define NPYV__SET_32(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \
+I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31, ...) \
+ \
+ NPYV__SET_16(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), \
+ NPYV__SET_16(CAST, I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31)
+
+#define NPYV__SET_64(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \
+I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31, \
+I32, I33, I34, I35, I36, I37, I38, I39, I40, I41, I42, I43, I44, I45, I46, I47, \
+I48, I49, I50, I51, I52, I53, I54, I55, I56, I57, I58, I59, I60, I61, I62, I63, ...) \
+ \
+ NPYV__SET_32(CAST, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15, \
+I16, I17, I18, I19, I20, I21, I22, I23, I24, I25, I26, I27, I28, I29, I30, I31), \
+ NPYV__SET_32(CAST, I32, I33, I34, I35, I36, I37, I38, I39, I40, I41, I42, I43, I44, I45, I46, I47, \
+I48, I49, I50, I51, I52, I53, I54, I55, I56, I57, I58, I59, I60, I61, I62, I63)
+
+#define NPYV__SET_FILL_2(CAST, F, ...) NPY_EXPAND(NPYV__SET_2(CAST, __VA_ARGS__, F, F))
+
+#define NPYV__SET_FILL_4(CAST, F, ...) NPY_EXPAND(NPYV__SET_4(CAST, __VA_ARGS__, F, F, F, F))
+
+#define NPYV__SET_FILL_8(CAST, F, ...) NPY_EXPAND(NPYV__SET_8(CAST, __VA_ARGS__, F, F, F, F, F, F, F, F))
+
+#define NPYV__SET_FILL_16(CAST, F, ...) NPY_EXPAND(NPYV__SET_16(CAST, __VA_ARGS__, \
+ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F))
+
+#define NPYV__SET_FILL_32(CAST, F, ...) NPY_EXPAND(NPYV__SET_32(CAST, __VA_ARGS__, \
+ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F))
+
+#define NPYV__SET_FILL_64(CAST, F, ...) NPY_EXPAND(NPYV__SET_64(CAST, __VA_ARGS__, \
+ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, \
+ F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F, F))
+
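+/**
+ * Expansion sketch (illustrative only): the FILL variants pad the variadic
+ * arguments with the fill value up to the expected lane count, e.g.
+ *
+ *   NPYV__SET_FILL_4(int, 0, 7, 9)
+ *   // expands to: (int)(7), (int)(9), (int)(0), (int)(0)
+ *
+ * which lets the npyv_setf_* and npyv_set_* wrappers accept fewer
+ * initializers than the number of lanes.
+ */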
+#endif // _NPY_SIMD_UTILS_H
diff --git a/numpy/core/src/common/simd/sse/arithmetic.h b/numpy/core/src/common/simd/sse/arithmetic.h
new file mode 100644
index 000000000..12d0af05c
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/arithmetic.h
@@ -0,0 +1,95 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_ARITHMETIC_H
+#define _NPY_SIMD_SSE_ARITHMETIC_H
+
+/***************************
+ * Addition
+ ***************************/
+// non-saturated
+#define npyv_add_u8 _mm_add_epi8
+#define npyv_add_s8 _mm_add_epi8
+#define npyv_add_u16 _mm_add_epi16
+#define npyv_add_s16 _mm_add_epi16
+#define npyv_add_u32 _mm_add_epi32
+#define npyv_add_s32 _mm_add_epi32
+#define npyv_add_u64 _mm_add_epi64
+#define npyv_add_s64 _mm_add_epi64
+#define npyv_add_f32 _mm_add_ps
+#define npyv_add_f64 _mm_add_pd
+
+// saturated
+#define npyv_adds_u8 _mm_adds_epu8
+#define npyv_adds_s8 _mm_adds_epi8
+#define npyv_adds_u16 _mm_adds_epu16
+#define npyv_adds_s16 _mm_adds_epi16
+// TODO: the rest, after implementing the pack intrinsics
+
+/***************************
+ * Subtraction
+ ***************************/
+// non-saturated
+#define npyv_sub_u8 _mm_sub_epi8
+#define npyv_sub_s8 _mm_sub_epi8
+#define npyv_sub_u16 _mm_sub_epi16
+#define npyv_sub_s16 _mm_sub_epi16
+#define npyv_sub_u32 _mm_sub_epi32
+#define npyv_sub_s32 _mm_sub_epi32
+#define npyv_sub_u64 _mm_sub_epi64
+#define npyv_sub_s64 _mm_sub_epi64
+#define npyv_sub_f32 _mm_sub_ps
+#define npyv_sub_f64 _mm_sub_pd
+
+// saturated
+#define npyv_subs_u8 _mm_subs_epu8
+#define npyv_subs_s8 _mm_subs_epi8
+#define npyv_subs_u16 _mm_subs_epu16
+#define npyv_subs_s16 _mm_subs_epi16
+// TODO: the rest, after implementing the pack intrinsics
+
+/***************************
+ * Multiplication
+ ***************************/
+// non-saturated
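+// SSE has no 8-bit multiply, so emulate it with two 16-bit multiplies:
+// 'even' keeps each even-indexed byte product in the low byte of its 16-bit
+// lane, 'odd' computes the odd-indexed byte products (shifted down first and
+// back up afterwards), and the byte-wise blend merges both halves.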
+NPY_FINLINE __m128i npyv_mul_u8(__m128i a, __m128i b)
+{
+ const __m128i mask = _mm_set1_epi32(0xFF00FF00);
+ __m128i even = _mm_mullo_epi16(a, b);
+ __m128i odd = _mm_mullo_epi16(_mm_srai_epi16(a, 8), _mm_srai_epi16(b, 8));
+ odd = _mm_slli_epi16(odd, 8);
+ return npyv_select_u8(mask, odd, even);
+}
+#define npyv_mul_s8 npyv_mul_u8
+#define npyv_mul_u16 _mm_mullo_epi16
+#define npyv_mul_s16 _mm_mullo_epi16
+
+#ifdef NPY_HAVE_SSE41
+ #define npyv_mul_u32 _mm_mullo_epi32
+#else
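+ // SSE2 has no 32-bit low multiply: emulate it with two 32x32->64-bit
+ // multiplies on the even/odd lanes and re-interleave the low halves.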
+ NPY_FINLINE __m128i npyv_mul_u32(__m128i a, __m128i b)
+ {
+ __m128i even = _mm_mul_epu32(a, b);
+ __m128i odd = _mm_mul_epu32(_mm_srli_epi64(a, 32), _mm_srli_epi64(b, 32));
+ __m128i low = _mm_unpacklo_epi32(even, odd);
+ __m128i high = _mm_unpackhi_epi32(even, odd);
+ return _mm_unpacklo_epi64(low, high);
+ }
+#endif // NPY_HAVE_SSE41
+#define npyv_mul_s32 npyv_mul_u32
+// TODO: emulate 64-bit multiplication
+#define npyv_mul_f32 _mm_mul_ps
+#define npyv_mul_f64 _mm_mul_pd
+
+// saturated
+// TODO: implement after the pack intrinsics are available
+
+/***************************
+ * Division
+ ***************************/
+// TODO: emulate integer division
+#define npyv_div_f32 _mm_div_ps
+#define npyv_div_f64 _mm_div_pd
+
+#endif // _NPY_SIMD_SSE_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/sse/conversion.h b/numpy/core/src/common/simd/sse/conversion.h
new file mode 100644
index 000000000..ea9660d13
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/conversion.h
@@ -0,0 +1,32 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_CVT_H
+#define _NPY_SIMD_SSE_CVT_H
+
+// convert mask types to integer types
+#define npyv_cvt_u8_b8(BL) BL
+#define npyv_cvt_s8_b8(BL) BL
+#define npyv_cvt_u16_b16(BL) BL
+#define npyv_cvt_s16_b16(BL) BL
+#define npyv_cvt_u32_b32(BL) BL
+#define npyv_cvt_s32_b32(BL) BL
+#define npyv_cvt_u64_b64(BL) BL
+#define npyv_cvt_s64_b64(BL) BL
+#define npyv_cvt_f32_b32(BL) _mm_castsi128_ps(BL)
+#define npyv_cvt_f64_b64(BL) _mm_castsi128_pd(BL)
+
+// convert integer types to mask types
+#define npyv_cvt_b8_u8(A) A
+#define npyv_cvt_b8_s8(A) A
+#define npyv_cvt_b16_u16(A) A
+#define npyv_cvt_b16_s16(A) A
+#define npyv_cvt_b32_u32(A) A
+#define npyv_cvt_b32_s32(A) A
+#define npyv_cvt_b64_u64(A) A
+#define npyv_cvt_b64_s64(A) A
+#define npyv_cvt_b32_f32(A) _mm_castps_si128(A)
+#define npyv_cvt_b64_f64(A) _mm_castpd_si128(A)
+
+#endif // _NPY_SIMD_SSE_CVT_H
diff --git a/numpy/core/src/common/simd/sse/memory.h b/numpy/core/src/common/simd/sse/memory.h
new file mode 100644
index 000000000..1a555d6f0
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/memory.h
@@ -0,0 +1,74 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MEMORY_H
+#define _NPY_SIMD_SSE_MEMORY_H
+
+/***************************
+ * load/store
+ ***************************/
+// stream load
+#ifdef NPY_HAVE_SSE41
+ #define npyv__loads(PTR) _mm_stream_load_si128((__m128i *)(PTR))
+#else
+ #define npyv__loads(PTR) _mm_load_si128((const __m128i *)(PTR))
+#endif
+#define NPYV_IMPL_SSE_MEM_INT(CTYPE, SFX) \
+ NPY_FINLINE npyv_##SFX npyv_load_##SFX(const CTYPE *ptr) \
+ { return _mm_loadu_si128((const __m128i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loada_##SFX(const CTYPE *ptr) \
+ { return _mm_load_si128((const __m128i*)ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loads_##SFX(const CTYPE *ptr) \
+ { return npyv__loads(ptr); } \
+ NPY_FINLINE npyv_##SFX npyv_loadl_##SFX(const CTYPE *ptr) \
+ { return _mm_loadl_epi64((const __m128i*)ptr); } \
+ NPY_FINLINE void npyv_store_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_storeu_si128((__m128i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storea_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_store_si128((__m128i*)ptr, vec); } \
+ NPY_FINLINE void npyv_stores_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_stream_si128((__m128i*)ptr, vec); } \
+ NPY_FINLINE void npyv_storel_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_storel_epi64((__m128i *)ptr, vec); } \
+ NPY_FINLINE void npyv_storeh_##SFX(CTYPE *ptr, npyv_##SFX vec) \
+ { _mm_storel_epi64((__m128i *)ptr, _mm_unpackhi_epi64(vec, vec)); }
+
+NPYV_IMPL_SSE_MEM_INT(npy_uint8, u8)
+NPYV_IMPL_SSE_MEM_INT(npy_int8, s8)
+NPYV_IMPL_SSE_MEM_INT(npy_uint16, u16)
+NPYV_IMPL_SSE_MEM_INT(npy_int16, s16)
+NPYV_IMPL_SSE_MEM_INT(npy_uint32, u32)
+NPYV_IMPL_SSE_MEM_INT(npy_int32, s32)
+NPYV_IMPL_SSE_MEM_INT(npy_uint64, u64)
+NPYV_IMPL_SSE_MEM_INT(npy_int64, s64)
+
+// unaligned load
+#define npyv_load_f32 _mm_loadu_ps
+#define npyv_load_f64 _mm_loadu_pd
+// aligned load
+#define npyv_loada_f32 _mm_load_ps
+#define npyv_loada_f64 _mm_load_pd
+// load lower part
+#define npyv_loadl_f32(PTR) _mm_castsi128_ps(npyv_loadl_u32((const npy_uint32*)(PTR)))
+#define npyv_loadl_f64(PTR) _mm_castsi128_pd(npyv_loadl_u32((const npy_uint32*)(PTR)))
+// stream load
+#define npyv_loads_f32(PTR) _mm_castsi128_ps(npyv__loads(PTR))
+#define npyv_loads_f64(PTR) _mm_castsi128_pd(npyv__loads(PTR))
+// unaligned store
+#define npyv_store_f32 _mm_storeu_ps
+#define npyv_store_f64 _mm_storeu_pd
+// aligned store
+#define npyv_storea_f32 _mm_store_ps
+#define npyv_storea_f64 _mm_store_pd
+// stream store
+#define npyv_stores_f32 _mm_stream_ps
+#define npyv_stores_f64 _mm_stream_pd
+// store lower part
+#define npyv_storel_f32(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castps_si128(VEC))
+#define npyv_storel_f64(PTR, VEC) _mm_storel_epi64((__m128i*)(PTR), _mm_castpd_si128(VEC))
+// store higher part
+#define npyv_storeh_f32(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castps_si128(VEC))
+#define npyv_storeh_f64(PTR, VEC) npyv_storeh_u32((npy_uint32*)(PTR), _mm_castpd_si128(VEC))
+
+#endif // _NPY_SIMD_SSE_MEMORY_H
diff --git a/numpy/core/src/common/simd/sse/misc.h b/numpy/core/src/common/simd/sse/misc.h
new file mode 100644
index 000000000..7ba47bc68
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/misc.h
@@ -0,0 +1,230 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_MISC_H
+#define _NPY_SIMD_SSE_MISC_H
+
+// vector with all lanes set to zero
+#define npyv_zero_u8 _mm_setzero_si128
+#define npyv_zero_s8 _mm_setzero_si128
+#define npyv_zero_u16 _mm_setzero_si128
+#define npyv_zero_s16 _mm_setzero_si128
+#define npyv_zero_u32 _mm_setzero_si128
+#define npyv_zero_s32 _mm_setzero_si128
+#define npyv_zero_u64 _mm_setzero_si128
+#define npyv_zero_s64 _mm_setzero_si128
+#define npyv_zero_f32 _mm_setzero_ps
+#define npyv_zero_f64 _mm_setzero_pd
+
+// vector with a specific value set to all lanes
+#define npyv_setall_u8(VAL) _mm_set1_epi8((char)VAL)
+#define npyv_setall_s8(VAL) _mm_set1_epi8((char)VAL)
+#define npyv_setall_u16(VAL) _mm_set1_epi16((short)VAL)
+#define npyv_setall_s16(VAL) _mm_set1_epi16((short)VAL)
+#define npyv_setall_u32(VAL) _mm_set1_epi32((int)VAL)
+#define npyv_setall_s32(VAL) _mm_set1_epi32(VAL)
+#if !defined(__x86_64__) && !defined(_M_X64)
+ #define npyv_setall_u64(VAL) _mm_set_epi32((int)(VAL >> 32), (int)VAL, (int)(VAL >> 32), (int)VAL)
+ #define npyv_setall_s64 npyv_setall_u64
+#else
+ #define npyv_setall_u64(VAL) _mm_set1_epi64x(VAL)
+ #define npyv_setall_s64(VAL) _mm_set1_epi64x(VAL)
+#endif
+#define npyv_setall_f32(VAL) _mm_set1_ps(VAL)
+#define npyv_setall_f64(VAL) _mm_set1_pd(VAL)
+
+/**
+ * vector with specific values set to each lane and
+ * a specific value set to all remaining lanes
+ *
+ * Arguments generated by NPYV__SET_FILL_* will not be expanded if
+ * _mm_setr_* are defined as macros, hence the inline wrappers below.
+ */
+NPY_FINLINE __m128i npyv__setr_epi8(
+ char i0, char i1, char i2, char i3, char i4, char i5, char i6, char i7,
+ char i8, char i9, char i10, char i11, char i12, char i13, char i14, char i15)
+{
+ return _mm_setr_epi8(i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15);
+}
+NPY_FINLINE __m128i npyv__setr_epi16(short i0, short i1, short i2, short i3, short i4, short i5,
+ short i6, short i7)
+{
+ return _mm_setr_epi16(i0, i1, i2, i3, i4, i5, i6, i7);
+}
+NPY_FINLINE __m128i npyv__setr_epi32(int i0, int i1, int i2, int i3)
+{
+ return _mm_setr_epi32(i0, i1, i2, i3);
+}
+NPY_FINLINE __m128i npyv__setr_epi64(npy_int64 i0, npy_int64 i1)
+{
+ return _mm_set_epi64x(i1, i0);
+}
+NPY_FINLINE __m128 npyv__setr_ps(float i0, float i1, float i2, float i3)
+{
+ return _mm_setr_ps(i0, i1, i2, i3);
+}
+NPY_FINLINE __m128d npyv__setr_pd(double i0, double i1)
+{
+ return _mm_setr_pd(i0, i1);
+}
+#define npyv_setf_u8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__))
+#define npyv_setf_s8(FILL, ...) npyv__setr_epi8(NPYV__SET_FILL_16(char, FILL, __VA_ARGS__))
+#define npyv_setf_u16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__))
+#define npyv_setf_s16(FILL, ...) npyv__setr_epi16(NPYV__SET_FILL_8(short, FILL, __VA_ARGS__))
+#define npyv_setf_u32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__))
+#define npyv_setf_s32(FILL, ...) npyv__setr_epi32(NPYV__SET_FILL_4(int, FILL, __VA_ARGS__))
+#define npyv_setf_u64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
+#define npyv_setf_s64(FILL, ...) npyv__setr_epi64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__))
+#define npyv_setf_f32(FILL, ...) npyv__setr_ps(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__))
+#define npyv_setf_f64(FILL, ...) npyv__setr_pd(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__))
+
+// vector with specific values set to each lane and
+// zero set to all remaining lanes
+#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__)
+#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__)
+#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
+#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
+#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
+#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
+#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
+#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
+#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
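+// e.g. npyv_set_s32(1, 2) yields {1, 2, 0, 0}; unspecified lanes default to zero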
+
+// Per-lane select: pick lanes from A where the mask is set, otherwise from B
+#ifdef NPY_HAVE_SSE41
+ #define npyv_select_u8(MASK, A, B) _mm_blendv_epi8(B, A, MASK)
+ #define npyv_select_f32(MASK, A, B) _mm_blendv_ps(B, A, _mm_castsi128_ps(MASK))
+ #define npyv_select_f64(MASK, A, B) _mm_blendv_pd(B, A, _mm_castsi128_pd(MASK))
+#else
+ NPY_FINLINE __m128i npyv_select_u8(__m128i mask, __m128i a, __m128i b)
+ { return _mm_xor_si128(b, _mm_and_si128(_mm_xor_si128(b, a), mask)); }
+ NPY_FINLINE __m128 npyv_select_f32(__m128i mask, __m128 a, __m128 b)
+ { return _mm_xor_ps(b, _mm_and_ps(_mm_xor_ps(b, a), _mm_castsi128_ps(mask))); }
+ NPY_FINLINE __m128d npyv_select_f64(__m128i mask, __m128d a, __m128d b)
+ { return _mm_xor_pd(b, _mm_and_pd(_mm_xor_pd(b, a), _mm_castsi128_pd(mask))); }
+#endif
+#define npyv_select_s8 npyv_select_u8
+#define npyv_select_u16 npyv_select_u8
+#define npyv_select_s16 npyv_select_u8
+#define npyv_select_u32 npyv_select_u8
+#define npyv_select_s32 npyv_select_u8
+#define npyv_select_u64 npyv_select_u8
+#define npyv_select_s64 npyv_select_u8
+
+// Reinterpret
+#define npyv_reinterpret_u8_u8(X) X
+#define npyv_reinterpret_u8_s8(X) X
+#define npyv_reinterpret_u8_u16(X) X
+#define npyv_reinterpret_u8_s16(X) X
+#define npyv_reinterpret_u8_u32(X) X
+#define npyv_reinterpret_u8_s32(X) X
+#define npyv_reinterpret_u8_u64(X) X
+#define npyv_reinterpret_u8_s64(X) X
+#define npyv_reinterpret_u8_f32 _mm_castps_si128
+#define npyv_reinterpret_u8_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_s8_s8(X) X
+#define npyv_reinterpret_s8_u8(X) X
+#define npyv_reinterpret_s8_u16(X) X
+#define npyv_reinterpret_s8_s16(X) X
+#define npyv_reinterpret_s8_u32(X) X
+#define npyv_reinterpret_s8_s32(X) X
+#define npyv_reinterpret_s8_u64(X) X
+#define npyv_reinterpret_s8_s64(X) X
+#define npyv_reinterpret_s8_f32 _mm_castps_si128
+#define npyv_reinterpret_s8_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_u16_u16(X) X
+#define npyv_reinterpret_u16_u8(X) X
+#define npyv_reinterpret_u16_s8(X) X
+#define npyv_reinterpret_u16_s16(X) X
+#define npyv_reinterpret_u16_u32(X) X
+#define npyv_reinterpret_u16_s32(X) X
+#define npyv_reinterpret_u16_u64(X) X
+#define npyv_reinterpret_u16_s64(X) X
+#define npyv_reinterpret_u16_f32 _mm_castps_si128
+#define npyv_reinterpret_u16_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_s16_s16(X) X
+#define npyv_reinterpret_s16_u8(X) X
+#define npyv_reinterpret_s16_s8(X) X
+#define npyv_reinterpret_s16_u16(X) X
+#define npyv_reinterpret_s16_u32(X) X
+#define npyv_reinterpret_s16_s32(X) X
+#define npyv_reinterpret_s16_u64(X) X
+#define npyv_reinterpret_s16_s64(X) X
+#define npyv_reinterpret_s16_f32 _mm_castps_si128
+#define npyv_reinterpret_s16_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_u32_u32(X) X
+#define npyv_reinterpret_u32_u8(X) X
+#define npyv_reinterpret_u32_s8(X) X
+#define npyv_reinterpret_u32_u16(X) X
+#define npyv_reinterpret_u32_s16(X) X
+#define npyv_reinterpret_u32_s32(X) X
+#define npyv_reinterpret_u32_u64(X) X
+#define npyv_reinterpret_u32_s64(X) X
+#define npyv_reinterpret_u32_f32 _mm_castps_si128
+#define npyv_reinterpret_u32_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_s32_s32(X) X
+#define npyv_reinterpret_s32_u8(X) X
+#define npyv_reinterpret_s32_s8(X) X
+#define npyv_reinterpret_s32_u16(X) X
+#define npyv_reinterpret_s32_s16(X) X
+#define npyv_reinterpret_s32_u32(X) X
+#define npyv_reinterpret_s32_u64(X) X
+#define npyv_reinterpret_s32_s64(X) X
+#define npyv_reinterpret_s32_f32 _mm_castps_si128
+#define npyv_reinterpret_s32_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_u64_u64(X) X
+#define npyv_reinterpret_u64_u8(X) X
+#define npyv_reinterpret_u64_s8(X) X
+#define npyv_reinterpret_u64_u16(X) X
+#define npyv_reinterpret_u64_s16(X) X
+#define npyv_reinterpret_u64_u32(X) X
+#define npyv_reinterpret_u64_s32(X) X
+#define npyv_reinterpret_u64_s64(X) X
+#define npyv_reinterpret_u64_f32 _mm_castps_si128
+#define npyv_reinterpret_u64_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_s64_s64(X) X
+#define npyv_reinterpret_s64_u8(X) X
+#define npyv_reinterpret_s64_s8(X) X
+#define npyv_reinterpret_s64_u16(X) X
+#define npyv_reinterpret_s64_s16(X) X
+#define npyv_reinterpret_s64_u32(X) X
+#define npyv_reinterpret_s64_s32(X) X
+#define npyv_reinterpret_s64_u64(X) X
+#define npyv_reinterpret_s64_f32 _mm_castps_si128
+#define npyv_reinterpret_s64_f64 _mm_castpd_si128
+
+#define npyv_reinterpret_f32_f32(X) X
+#define npyv_reinterpret_f32_u8 _mm_castsi128_ps
+#define npyv_reinterpret_f32_s8 _mm_castsi128_ps
+#define npyv_reinterpret_f32_u16 _mm_castsi128_ps
+#define npyv_reinterpret_f32_s16 _mm_castsi128_ps
+#define npyv_reinterpret_f32_u32 _mm_castsi128_ps
+#define npyv_reinterpret_f32_s32 _mm_castsi128_ps
+#define npyv_reinterpret_f32_u64 _mm_castsi128_ps
+#define npyv_reinterpret_f32_s64 _mm_castsi128_ps
+#define npyv_reinterpret_f32_f64 _mm_castpd_ps
+
+#define npyv_reinterpret_f64_f64(X) X
+#define npyv_reinterpret_f64_u8 _mm_castsi128_pd
+#define npyv_reinterpret_f64_s8 _mm_castsi128_pd
+#define npyv_reinterpret_f64_u16 _mm_castsi128_pd
+#define npyv_reinterpret_f64_s16 _mm_castsi128_pd
+#define npyv_reinterpret_f64_u32 _mm_castsi128_pd
+#define npyv_reinterpret_f64_s32 _mm_castsi128_pd
+#define npyv_reinterpret_f64_u64 _mm_castsi128_pd
+#define npyv_reinterpret_f64_s64 _mm_castsi128_pd
+#define npyv_reinterpret_f64_f32 _mm_castps_pd
+
+// Only required by AVX2/AVX512
+#define npyv_cleanup() ((void)0)
+
+#endif // _NPY_SIMD_SSE_MISC_H
diff --git a/numpy/core/src/common/simd/sse/operators.h b/numpy/core/src/common/simd/sse/operators.h
new file mode 100644
index 000000000..6e32ca4fd
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/operators.h
@@ -0,0 +1,258 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_OPERATORS_H
+#define _NPY_SIMD_SSE_OPERATORS_H
+
+/***************************
+ * Shifting
+ ***************************/
+
+// left
+#define npyv_shl_u16(A, C) _mm_sll_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s16(A, C) _mm_sll_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_u32(A, C) _mm_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s32(A, C) _mm_sll_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_u64(A, C) _mm_sll_epi64(A, _mm_cvtsi32_si128(C))
+#define npyv_shl_s64(A, C) _mm_sll_epi64(A, _mm_cvtsi32_si128(C))
+
+// left by an immediate constant
+#define npyv_shli_u16 _mm_slli_epi16
+#define npyv_shli_s16 _mm_slli_epi16
+#define npyv_shli_u32 _mm_slli_epi32
+#define npyv_shli_s32 _mm_slli_epi32
+#define npyv_shli_u64 _mm_slli_epi64
+#define npyv_shli_s64 _mm_slli_epi64
+
+// right
+#define npyv_shr_u16(A, C) _mm_srl_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s16(A, C) _mm_sra_epi16(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_u32(A, C) _mm_srl_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_s32(A, C) _mm_sra_epi32(A, _mm_cvtsi32_si128(C))
+#define npyv_shr_u64(A, C) _mm_srl_epi64(A, _mm_cvtsi32_si128(C))
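+// SSE lacks a 64-bit arithmetic right shift, so bias the input by the sign
+// bit, perform a logical shift, then subtract the shifted bias to restore the sign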
+NPY_FINLINE __m128i npyv_shr_s64(__m128i a, int c)
+{
+ const __m128i sbit = npyv_setall_s64(0x8000000000000000);
+ const __m128i cv = _mm_cvtsi32_si128(c);
+ __m128i r = _mm_srl_epi64(_mm_add_epi64(a, sbit), cv);
+ return _mm_sub_epi64(r, _mm_srl_epi64(sbit, cv));
+}
+
+// Right by an immediate constant
+#define npyv_shri_u16 _mm_srli_epi16
+#define npyv_shri_s16 _mm_srai_epi16
+#define npyv_shri_u32 _mm_srli_epi32
+#define npyv_shri_s32 _mm_srai_epi32
+#define npyv_shri_u64 _mm_srli_epi64
+#define npyv_shri_s64 npyv_shr_s64
+
+/***************************
+ * Logical
+ ***************************/
+
+// AND
+#define npyv_and_u8 _mm_and_si128
+#define npyv_and_s8 _mm_and_si128
+#define npyv_and_u16 _mm_and_si128
+#define npyv_and_s16 _mm_and_si128
+#define npyv_and_u32 _mm_and_si128
+#define npyv_and_s32 _mm_and_si128
+#define npyv_and_u64 _mm_and_si128
+#define npyv_and_s64 _mm_and_si128
+#define npyv_and_f32 _mm_and_ps
+#define npyv_and_f64 _mm_and_pd
+
+// OR
+#define npyv_or_u8 _mm_or_si128
+#define npyv_or_s8 _mm_or_si128
+#define npyv_or_u16 _mm_or_si128
+#define npyv_or_s16 _mm_or_si128
+#define npyv_or_u32 _mm_or_si128
+#define npyv_or_s32 _mm_or_si128
+#define npyv_or_u64 _mm_or_si128
+#define npyv_or_s64 _mm_or_si128
+#define npyv_or_f32 _mm_or_ps
+#define npyv_or_f64 _mm_or_pd
+
+// XOR
+#define npyv_xor_u8 _mm_xor_si128
+#define npyv_xor_s8 _mm_xor_si128
+#define npyv_xor_u16 _mm_xor_si128
+#define npyv_xor_s16 _mm_xor_si128
+#define npyv_xor_u32 _mm_xor_si128
+#define npyv_xor_s32 _mm_xor_si128
+#define npyv_xor_u64 _mm_xor_si128
+#define npyv_xor_s64 _mm_xor_si128
+#define npyv_xor_f32 _mm_xor_ps
+#define npyv_xor_f64 _mm_xor_pd
+
+// NOT
+#define npyv_not_u8(A) _mm_xor_si128(A, _mm_set1_epi32(-1))
+#define npyv_not_s8 npyv_not_u8
+#define npyv_not_u16 npyv_not_u8
+#define npyv_not_s16 npyv_not_u8
+#define npyv_not_u32 npyv_not_u8
+#define npyv_not_s32 npyv_not_u8
+#define npyv_not_u64 npyv_not_u8
+#define npyv_not_s64 npyv_not_u8
+#define npyv_not_f32(A) _mm_xor_ps(A, _mm_castsi128_ps(_mm_set1_epi32(-1)))
+#define npyv_not_f64(A) _mm_xor_pd(A, _mm_castsi128_pd(_mm_set1_epi32(-1)))
+
+/***************************
+ * Comparison
+ ***************************/
+
+// Int Equal
+#define npyv_cmpeq_u8 _mm_cmpeq_epi8
+#define npyv_cmpeq_s8 _mm_cmpeq_epi8
+#define npyv_cmpeq_u16 _mm_cmpeq_epi16
+#define npyv_cmpeq_s16 _mm_cmpeq_epi16
+#define npyv_cmpeq_u32 _mm_cmpeq_epi32
+#define npyv_cmpeq_s32 _mm_cmpeq_epi32
+#define npyv_cmpeq_s64 npyv_cmpeq_u64
+
+#ifdef NPY_HAVE_SSE41
+ #define npyv_cmpeq_u64 _mm_cmpeq_epi64
+#else
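+ // SSE2 has no 64-bit equality: compare the 32-bit halves, AND the upper
+ // half's result into the lower one, then broadcast it across the lane.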
+ NPY_FINLINE __m128i npyv_cmpeq_u64(__m128i a, __m128i b)
+ {
+ __m128i cmpeq = _mm_cmpeq_epi32(a, b);
+ __m128i cmpeq_h = _mm_srli_epi64(cmpeq, 32);
+ __m128i test = _mm_and_si128(cmpeq, cmpeq_h);
+ return _mm_shuffle_epi32(test, _MM_SHUFFLE(2, 2, 0, 0));
+ }
+#endif
+
+// Int Not Equal
+#ifdef NPY_HAVE_XOP
+ #define npyv_cmpneq_u8 _mm_comneq_epi8
+ #define npyv_cmpneq_u16 _mm_comneq_epi16
+ #define npyv_cmpneq_u32 _mm_comneq_epi32
+ #define npyv_cmpneq_u64 _mm_comneq_epi64
+#else
+ #define npyv_cmpneq_u8(A, B) npyv_not_u8(npyv_cmpeq_u8(A, B))
+ #define npyv_cmpneq_u16(A, B) npyv_not_u16(npyv_cmpeq_u16(A, B))
+ #define npyv_cmpneq_u32(A, B) npyv_not_u32(npyv_cmpeq_u32(A, B))
+ #define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B))
+#endif
+#define npyv_cmpneq_s8 npyv_cmpneq_u8
+#define npyv_cmpneq_s16 npyv_cmpneq_u16
+#define npyv_cmpneq_s32 npyv_cmpneq_u32
+#define npyv_cmpneq_s64 npyv_cmpneq_u64
+
+// signed greater than
+#define npyv_cmpgt_s8 _mm_cmpgt_epi8
+#define npyv_cmpgt_s16 _mm_cmpgt_epi16
+#define npyv_cmpgt_s32 _mm_cmpgt_epi32
+
+#ifdef NPY_HAVE_SSE42
+ #define npyv_cmpgt_s64 _mm_cmpgt_epi64
+#else
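+ // when a and b share the same sign, the sign of (b - a) answers a > b;
+ // otherwise a > b exactly when b is negative. The chosen sign bit is
+ // then broadcast across each 64-bit lane to form the mask.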
+ NPY_FINLINE __m128i npyv_cmpgt_s64(__m128i a, __m128i b)
+ {
+ __m128i sub = _mm_sub_epi64(b, a);
+ __m128i nsame_sbit = _mm_xor_si128(a, b);
+ // nsame_sbit ? b : sub
+ __m128i test = _mm_xor_si128(sub, _mm_and_si128(_mm_xor_si128(sub, b), nsame_sbit));
+ __m128i extend_sbit = _mm_shuffle_epi32(_mm_srai_epi32(test, 31), _MM_SHUFFLE(3, 3, 1, 1));
+ return extend_sbit;
+ }
+#endif
+
+// signed greater than or equal
+#ifdef NPY_HAVE_XOP
+ #define npyv_cmpge_s8 _mm_comge_epi8
+ #define npyv_cmpge_s16 _mm_comge_epi16
+ #define npyv_cmpge_s32 _mm_comge_epi32
+ #define npyv_cmpge_s64 _mm_comge_epi64
+#else
+ #define npyv_cmpge_s8(A, B) npyv_not_s8(_mm_cmpgt_epi8(B, A))
+ #define npyv_cmpge_s16(A, B) npyv_not_s16(_mm_cmpgt_epi16(B, A))
+ #define npyv_cmpge_s32(A, B) npyv_not_s32(_mm_cmpgt_epi32(B, A))
+ #define npyv_cmpge_s64(A, B) npyv_not_s64(npyv_cmpgt_s64(B, A))
+#endif
+
+// unsigned greater than
+#ifdef NPY_HAVE_XOP
+ #define npyv_cmpgt_u8 _mm_comgt_epu8
+ #define npyv_cmpgt_u16 _mm_comgt_epu16
+ #define npyv_cmpgt_u32 _mm_comgt_epu32
+ #define npyv_cmpgt_u64 _mm_comgt_epu64
+#else
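+ // SSE has no unsigned integer comparisons: flip the sign bit of both
+ // operands so the signed compare orders them as unsigned values.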
+ #define NPYV_IMPL_SSE_UNSIGNED_GT(LEN, SIGN) \
+ NPY_FINLINE __m128i npyv_cmpgt_u##LEN(__m128i a, __m128i b) \
+ { \
+ const __m128i sbit = _mm_set1_epi32(SIGN); \
+ return _mm_cmpgt_epi##LEN( \
+ _mm_xor_si128(a, sbit), _mm_xor_si128(b, sbit) \
+ ); \
+ }
+
+ NPYV_IMPL_SSE_UNSIGNED_GT(8, 0x80808080)
+ NPYV_IMPL_SSE_UNSIGNED_GT(16, 0x80008000)
+ NPYV_IMPL_SSE_UNSIGNED_GT(32, 0x80000000)
+
+ NPY_FINLINE __m128i npyv_cmpgt_u64(__m128i a, __m128i b)
+ {
+ const __m128i sbit = npyv_setall_s64(0x8000000000000000);
+ return npyv_cmpgt_s64(_mm_xor_si128(a, sbit), _mm_xor_si128(b, sbit));
+ }
+#endif
+
+// unsigned greater than or equal
+#ifdef NPY_HAVE_XOP
+ #define npyv_cmpge_u8 _mm_comge_epu8
+ #define npyv_cmpge_u16 _mm_comge_epu16
+ #define npyv_cmpge_u32 _mm_comge_epu32
+ #define npyv_cmpge_u64 _mm_comge_epu64
+#else
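+ // a >= b (unsigned) exactly when max(a, b) == a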
+ NPY_FINLINE __m128i npyv_cmpge_u8(__m128i a, __m128i b)
+ { return _mm_cmpeq_epi8(a, _mm_max_epu8(a, b)); }
+ #ifdef NPY_HAVE_SSE41
+ NPY_FINLINE __m128i npyv_cmpge_u16(__m128i a, __m128i b)
+ { return _mm_cmpeq_epi16(a, _mm_max_epu16(a, b)); }
+ NPY_FINLINE __m128i npyv_cmpge_u32(__m128i a, __m128i b)
+ { return _mm_cmpeq_epi32(a, _mm_max_epu32(a, b)); }
+ #else
+ #define npyv_cmpge_u16(A, B) _mm_cmpeq_epi16(_mm_subs_epu16(B, A), _mm_setzero_si128())
+ #define npyv_cmpge_u32(A, B) npyv_not_u32(npyv_cmpgt_u32(B, A))
+ #endif
+ #define npyv_cmpge_u64(A, B) npyv_not_u64(npyv_cmpgt_u64(B, A))
+#endif
+
+// less than
+#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A)
+#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A)
+#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
+#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
+#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
+#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
+#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
+#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
+
+// less than or equal
+#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A)
+#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A)
+#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
+#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
+#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
+#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
+#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
+#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
+
+// precision comparison
+#define npyv_cmpeq_f32(a, b) _mm_castps_si128(_mm_cmpeq_ps(a, b))
+#define npyv_cmpeq_f64(a, b) _mm_castpd_si128(_mm_cmpeq_pd(a, b))
+#define npyv_cmpneq_f32(a, b) _mm_castps_si128(_mm_cmpneq_ps(a, b))
+#define npyv_cmpneq_f64(a, b) _mm_castpd_si128(_mm_cmpneq_pd(a, b))
+#define npyv_cmplt_f32(a, b) _mm_castps_si128(_mm_cmplt_ps(a, b))
+#define npyv_cmplt_f64(a, b) _mm_castpd_si128(_mm_cmplt_pd(a, b))
+#define npyv_cmple_f32(a, b) _mm_castps_si128(_mm_cmple_ps(a, b))
+#define npyv_cmple_f64(a, b) _mm_castpd_si128(_mm_cmple_pd(a, b))
+#define npyv_cmpgt_f32(a, b) _mm_castps_si128(_mm_cmpgt_ps(a, b))
+#define npyv_cmpgt_f64(a, b) _mm_castpd_si128(_mm_cmpgt_pd(a, b))
+#define npyv_cmpge_f32(a, b) _mm_castps_si128(_mm_cmpge_ps(a, b))
+#define npyv_cmpge_f64(a, b) _mm_castpd_si128(_mm_cmpge_pd(a, b))
+
+#endif // _NPY_SIMD_SSE_OPERATORS_H
diff --git a/numpy/core/src/common/simd/sse/reorder.h b/numpy/core/src/common/simd/sse/reorder.h
new file mode 100644
index 000000000..3f68b4ad7
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/reorder.h
@@ -0,0 +1,84 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_SSE_REORDER_H
+#define _NPY_SIMD_SSE_REORDER_H
+
+// combine lower part of two vectors
+#define npyv_combinel_u8 _mm_unpacklo_epi64
+#define npyv_combinel_s8 _mm_unpacklo_epi64
+#define npyv_combinel_u16 _mm_unpacklo_epi64
+#define npyv_combinel_s16 _mm_unpacklo_epi64
+#define npyv_combinel_u32 _mm_unpacklo_epi64
+#define npyv_combinel_s32 _mm_unpacklo_epi64
+#define npyv_combinel_u64 _mm_unpacklo_epi64
+#define npyv_combinel_s64 _mm_unpacklo_epi64
+#define npyv_combinel_f32(A, B) _mm_castsi128_ps(_mm_unpacklo_epi64(_mm_castps_si128(A), _mm_castps_si128(B)))
+#define npyv_combinel_f64 _mm_unpacklo_pd
+
+// combine higher part of two vectors
+#define npyv_combineh_u8 _mm_unpackhi_epi64
+#define npyv_combineh_s8 _mm_unpackhi_epi64
+#define npyv_combineh_u16 _mm_unpackhi_epi64
+#define npyv_combineh_s16 _mm_unpackhi_epi64
+#define npyv_combineh_u32 _mm_unpackhi_epi64
+#define npyv_combineh_s32 _mm_unpackhi_epi64
+#define npyv_combineh_u64 _mm_unpackhi_epi64
+#define npyv_combineh_s64 _mm_unpackhi_epi64
+#define npyv_combineh_f32(A, B) _mm_castsi128_ps(_mm_unpackhi_epi64(_mm_castps_si128(A), _mm_castps_si128(B)))
+#define npyv_combineh_f64 _mm_unpackhi_pd
+
+// combine two vectors from lower and higher parts of two other vectors
+NPY_FINLINE npyv_m128ix2 npyv__combine(__m128i a, __m128i b)
+{
+ npyv_m128ix2 r;
+ r.val[0] = npyv_combinel_u8(a, b);
+ r.val[1] = npyv_combineh_u8(a, b);
+ return r;
+}
+NPY_FINLINE npyv_f32x2 npyv_combine_f32(__m128 a, __m128 b)
+{
+ npyv_f32x2 r;
+ r.val[0] = npyv_combinel_f32(a, b);
+ r.val[1] = npyv_combineh_f32(a, b);
+ return r;
+}
+NPY_FINLINE npyv_f64x2 npyv_combine_f64(__m128d a, __m128d b)
+{
+ npyv_f64x2 r;
+ r.val[0] = npyv_combinel_f64(a, b);
+ r.val[1] = npyv_combineh_f64(a, b);
+ return r;
+}
+#define npyv_combine_u8 npyv__combine
+#define npyv_combine_s8 npyv__combine
+#define npyv_combine_u16 npyv__combine
+#define npyv_combine_s16 npyv__combine
+#define npyv_combine_u32 npyv__combine
+#define npyv_combine_s32 npyv__combine
+#define npyv_combine_u64 npyv__combine
+#define npyv_combine_s64 npyv__combine
+
+// interleave two vectors
+#define NPYV_IMPL_SSE_ZIP(T_VEC, SFX, INTR_SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = _mm_unpacklo_##INTR_SFX(a, b); \
+ r.val[1] = _mm_unpackhi_##INTR_SFX(a, b); \
+ return r; \
+ }
+
+NPYV_IMPL_SSE_ZIP(npyv_u8, u8, epi8)
+NPYV_IMPL_SSE_ZIP(npyv_s8, s8, epi8)
+NPYV_IMPL_SSE_ZIP(npyv_u16, u16, epi16)
+NPYV_IMPL_SSE_ZIP(npyv_s16, s16, epi16)
+NPYV_IMPL_SSE_ZIP(npyv_u32, u32, epi32)
+NPYV_IMPL_SSE_ZIP(npyv_s32, s32, epi32)
+NPYV_IMPL_SSE_ZIP(npyv_u64, u64, epi64)
+NPYV_IMPL_SSE_ZIP(npyv_s64, s64, epi64)
+NPYV_IMPL_SSE_ZIP(npyv_f32, f32, ps)
+NPYV_IMPL_SSE_ZIP(npyv_f64, f64, pd)
+
+#endif // _NPY_SIMD_SSE_REORDER_H
diff --git a/numpy/core/src/common/simd/sse/sse.h b/numpy/core/src/common/simd/sse/sse.h
new file mode 100644
index 000000000..364b4baf1
--- /dev/null
+++ b/numpy/core/src/common/simd/sse/sse.h
@@ -0,0 +1,66 @@
+#ifndef _NPY_SIMD_H_
+ #error "Not a standalone header"
+#endif
+
+#define NPY_SIMD 128
+#define NPY_SIMD_WIDTH 16
+#define NPY_SIMD_F64 1
+
+typedef __m128i npyv_u8;
+typedef __m128i npyv_s8;
+typedef __m128i npyv_u16;
+typedef __m128i npyv_s16;
+typedef __m128i npyv_u32;
+typedef __m128i npyv_s32;
+typedef __m128i npyv_u64;
+typedef __m128i npyv_s64;
+typedef __m128 npyv_f32;
+typedef __m128d npyv_f64;
+
+typedef __m128i npyv_b8;
+typedef __m128i npyv_b16;
+typedef __m128i npyv_b32;
+typedef __m128i npyv_b64;
+
+typedef struct { __m128i val[2]; } npyv_m128ix2;
+typedef npyv_m128ix2 npyv_u8x2;
+typedef npyv_m128ix2 npyv_s8x2;
+typedef npyv_m128ix2 npyv_u16x2;
+typedef npyv_m128ix2 npyv_s16x2;
+typedef npyv_m128ix2 npyv_u32x2;
+typedef npyv_m128ix2 npyv_s32x2;
+typedef npyv_m128ix2 npyv_u64x2;
+typedef npyv_m128ix2 npyv_s64x2;
+
+typedef struct { __m128i val[3]; } npyv_m128ix3;
+typedef npyv_m128ix3 npyv_u8x3;
+typedef npyv_m128ix3 npyv_s8x3;
+typedef npyv_m128ix3 npyv_u16x3;
+typedef npyv_m128ix3 npyv_s16x3;
+typedef npyv_m128ix3 npyv_u32x3;
+typedef npyv_m128ix3 npyv_s32x3;
+typedef npyv_m128ix3 npyv_u64x3;
+typedef npyv_m128ix3 npyv_s64x3;
+
+typedef struct { __m128 val[2]; } npyv_f32x2;
+typedef struct { __m128d val[2]; } npyv_f64x2;
+typedef struct { __m128 val[3]; } npyv_f32x3;
+typedef struct { __m128d val[3]; } npyv_f64x3;
+
+#define npyv_nlanes_u8 16
+#define npyv_nlanes_s8 16
+#define npyv_nlanes_u16 8
+#define npyv_nlanes_s16 8
+#define npyv_nlanes_u32 4
+#define npyv_nlanes_s32 4
+#define npyv_nlanes_u64 2
+#define npyv_nlanes_s64 2
+#define npyv_nlanes_f32 4
+#define npyv_nlanes_f64 2
+
+#include "memory.h"
+#include "misc.h"
+#include "reorder.h"
+#include "operators.h"
+#include "conversion.h"
+#include "arithmetic.h"
diff --git a/numpy/core/src/common/simd/vsx/arithmetic.h b/numpy/core/src/common/simd/vsx/arithmetic.h
new file mode 100644
index 000000000..dd23b5b11
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/arithmetic.h
@@ -0,0 +1,103 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_ARITHMETIC_H
+#define _NPY_SIMD_VSX_ARITHMETIC_H
+
+/***************************
+ * Addition
+ ***************************/
+// non-saturated
+#define npyv_add_u8 vec_add
+#define npyv_add_s8 vec_add
+#define npyv_add_u16 vec_add
+#define npyv_add_s16 vec_add
+#define npyv_add_u32 vec_add
+#define npyv_add_s32 vec_add
+#define npyv_add_u64 vec_add
+#define npyv_add_s64 vec_add
+#define npyv_add_f32 vec_add
+#define npyv_add_f64 vec_add
+
+// saturated
+#define npyv_adds_u8 vec_adds
+#define npyv_adds_s8 vec_adds
+#define npyv_adds_u16 vec_adds
+#define npyv_adds_s16 vec_adds
+
+/***************************
+ * Subtraction
+ ***************************/
+// non-saturated
+#define npyv_sub_u8 vec_sub
+#define npyv_sub_s8 vec_sub
+#define npyv_sub_u16 vec_sub
+#define npyv_sub_s16 vec_sub
+#define npyv_sub_u32 vec_sub
+#define npyv_sub_s32 vec_sub
+#define npyv_sub_u64 vec_sub
+#define npyv_sub_s64 vec_sub
+#define npyv_sub_f32 vec_sub
+#define npyv_sub_f64 vec_sub
+
+// saturated
+#define npyv_subs_u8 vec_subs
+#define npyv_subs_s8 vec_subs
+#define npyv_subs_u16 vec_subs
+#define npyv_subs_s16 vec_subs
+
+/***************************
+ * Multiplication
+ ***************************/
+// non-saturated
+// up to GCC 6, vec_mul only supports single/double precision and long long
+#if defined(__GNUC__) && __GNUC__ < 7
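+ // multiply the even-indexed and odd-indexed elements separately with
+ // vec_mule/vec_mulo (which widen the products), then use vec_perm with
+ // the index tables below to gather the truncated low parts back into
+ // their original lane order.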
+ #define NPYV_IMPL_VSX_MUL(T_VEC, SFX, ...) \
+ NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \
+ { \
+ const npyv_u8 ev_od = {__VA_ARGS__}; \
+ return vec_perm( \
+ (T_VEC)vec_mule(a, b), \
+ (T_VEC)vec_mulo(a, b), ev_od \
+ ); \
+ }
+
+ NPYV_IMPL_VSX_MUL(npyv_u8, u8, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30)
+ NPYV_IMPL_VSX_MUL(npyv_s8, s8, 0, 16, 2, 18, 4, 20, 6, 22, 8, 24, 10, 26, 12, 28, 14, 30)
+ NPYV_IMPL_VSX_MUL(npyv_u16, u16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29)
+ NPYV_IMPL_VSX_MUL(npyv_s16, s16, 0, 1, 16, 17, 4, 5, 20, 21, 8, 9, 24, 25, 12, 13, 28, 29)
+
+ // vmuluwm can be used for unsigned or signed 32-bit integers
+ #define NPYV_IMPL_VSX_MUL_32(T_VEC, SFX) \
+ NPY_FINLINE T_VEC npyv_mul_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC ret; \
+ __asm__ __volatile__( \
+ "vmuluwm %0,%1,%2" : \
+ "=v" (ret) : "v" (a), "v" (b) \
+ ); \
+ return ret; \
+ }
+
+ NPYV_IMPL_VSX_MUL_32(npyv_u32, u32)
+ NPYV_IMPL_VSX_MUL_32(npyv_s32, s32)
+
+#else
+ #define npyv_mul_u8 vec_mul
+ #define npyv_mul_s8 vec_mul
+ #define npyv_mul_u16 vec_mul
+ #define npyv_mul_s16 vec_mul
+ #define npyv_mul_u32 vec_mul
+ #define npyv_mul_s32 vec_mul
+#endif
+#define npyv_mul_f32 vec_mul
+#define npyv_mul_f64 vec_mul
+
+/***************************
+ * Division
+ ***************************/
+#define npyv_div_f32 vec_div
+#define npyv_div_f64 vec_div
+
+#endif // _NPY_SIMD_VSX_ARITHMETIC_H
diff --git a/numpy/core/src/common/simd/vsx/conversion.h b/numpy/core/src/common/simd/vsx/conversion.h
new file mode 100644
index 000000000..6ed135990
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/conversion.h
@@ -0,0 +1,32 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_CVT_H
+#define _NPY_SIMD_VSX_CVT_H
+
+// convert boolean vectors to integer vectors
+#define npyv_cvt_u8_b8(BL) ((npyv_u8) BL)
+#define npyv_cvt_s8_b8(BL) ((npyv_s8) BL)
+#define npyv_cvt_u16_b16(BL) ((npyv_u16) BL)
+#define npyv_cvt_s16_b16(BL) ((npyv_s16) BL)
+#define npyv_cvt_u32_b32(BL) ((npyv_u32) BL)
+#define npyv_cvt_s32_b32(BL) ((npyv_s32) BL)
+#define npyv_cvt_u64_b64(BL) ((npyv_u64) BL)
+#define npyv_cvt_s64_b64(BL) ((npyv_s64) BL)
+#define npyv_cvt_f32_b32(BL) ((npyv_f32) BL)
+#define npyv_cvt_f64_b64(BL) ((npyv_f64) BL)
+
+// convert integer vectors to boolean vectors
+#define npyv_cvt_b8_u8(A) ((npyv_b8) A)
+#define npyv_cvt_b8_s8(A) ((npyv_b8) A)
+#define npyv_cvt_b16_u16(A) ((npyv_b16) A)
+#define npyv_cvt_b16_s16(A) ((npyv_b16) A)
+#define npyv_cvt_b32_u32(A) ((npyv_b32) A)
+#define npyv_cvt_b32_s32(A) ((npyv_b32) A)
+#define npyv_cvt_b64_u64(A) ((npyv_b64) A)
+#define npyv_cvt_b64_s64(A) ((npyv_b64) A)
+#define npyv_cvt_b32_f32(A) ((npyv_b32) A)
+#define npyv_cvt_b64_f64(A) ((npyv_b64) A)
+
+#endif // _NPY_SIMD_VSX_CVT_H
diff --git a/numpy/core/src/common/simd/vsx/memory.h b/numpy/core/src/common/simd/vsx/memory.h
new file mode 100644
index 000000000..e0d908bf9
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/memory.h
@@ -0,0 +1,150 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MEMORY_H
+#define _NPY_SIMD_VSX_MEMORY_H
+/****************************
+ * load/store
+ ****************************/
+// TODO: test load by cast
+#define VSX__CAST_LOAD 0
+#if VSX__CAST_LOAD
+ #define npyv__load(PTR, T_VEC) (*((T_VEC*)(PTR)))
+#else
+ /**
+ * Clang fails to load/store unaligned addresses via vec_xl and vec_xst,
+ * so we fall back to vec_vsx_ld and vec_vsx_st.
+ */
+ #if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define npyv__load(PTR, T_VEC) vec_vsx_ld(0, PTR)
+ #else
+ #define npyv__load(PTR, T_VEC) vec_xl(0, PTR)
+ #endif
+#endif
+// unaligned load
+#define npyv_load_u8(PTR) npyv__load(PTR, npyv_u8)
+#define npyv_load_s8(PTR) npyv__load(PTR, npyv_s8)
+#define npyv_load_u16(PTR) npyv__load(PTR, npyv_u16)
+#define npyv_load_s16(PTR) npyv__load(PTR, npyv_s16)
+#define npyv_load_u32(PTR) npyv__load(PTR, npyv_u32)
+#define npyv_load_s32(PTR) npyv__load(PTR, npyv_s32)
+#define npyv_load_f32(PTR) npyv__load(PTR, npyv_f32)
+#define npyv_load_f64(PTR) npyv__load(PTR, npyv_f64)
+#if VSX__CAST_LOAD
+ #define npyv_load_u64(PTR) npyv__load(PTR, npyv_u64)
+ #define npyv_load_s64(PTR) npyv__load(PTR, npyv_s64)
+#else
+ #define npyv_load_u64(PTR) ((npyv_u64)npyv_load_u32((const unsigned int*)PTR))
+ #define npyv_load_s64(PTR) ((npyv_s64)npyv_load_s32((const unsigned int*)PTR))
+#endif
+// aligned load
+#define npyv_loada_u8(PTR) vec_ld(0, PTR)
+#define npyv_loada_s8 npyv_loada_u8
+#define npyv_loada_u16 npyv_loada_u8
+#define npyv_loada_s16 npyv_loada_u8
+#define npyv_loada_u32 npyv_loada_u8
+#define npyv_loada_s32 npyv_loada_u8
+#define npyv_loada_u64 npyv_load_u64
+#define npyv_loada_s64 npyv_load_s64
+#define npyv_loada_f32 npyv_loada_u8
+#define npyv_loada_f64 npyv_load_f64
+// stream load
+#define npyv_loads_u8 npyv_loada_u8
+#define npyv_loads_s8 npyv_loada_s8
+#define npyv_loads_u16 npyv_loada_u16
+#define npyv_loads_s16 npyv_loada_s16
+#define npyv_loads_u32 npyv_loada_u32
+#define npyv_loads_s32 npyv_loada_s32
+#define npyv_loads_u64 npyv_loada_u64
+#define npyv_loads_s64 npyv_loada_s64
+#define npyv_loads_f32 npyv_loada_f32
+#define npyv_loads_f64 npyv_loada_f64
+// load lower part
+// helper to avoid violating the strict-aliasing rules
+#ifdef __cplusplus
+ template<typename T_PTR>
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(T_PTR *ptr)
+ { npy_uint64 *ptr64 = (npy_uint64*)ptr; return ptr64; }
+#else
+ NPY_FINLINE npy_uint64 *npyv__ptr2u64(void *ptr)
+ { npy_uint64 *ptr64 = ptr; return ptr64; }
+#endif // __cplusplus
+#if defined(__clang__) && !defined(__IBMC__)
+ // vec_promote doesn't support doubleword on clang
+ #define npyv_loadl_u64(PTR) npyv_setall_u64(*npyv__ptr2u64(PTR))
+#else
+ #define npyv_loadl_u64(PTR) vec_promote(*npyv__ptr2u64(PTR), 0)
+#endif
+#define npyv_loadl_u8(PTR) ((npyv_u8)npyv_loadl_u64(PTR))
+#define npyv_loadl_s8(PTR) ((npyv_s8)npyv_loadl_u64(PTR))
+#define npyv_loadl_u16(PTR) ((npyv_u16)npyv_loadl_u64(PTR))
+#define npyv_loadl_s16(PTR) ((npyv_s16)npyv_loadl_u64(PTR))
+#define npyv_loadl_u32(PTR) ((npyv_u32)npyv_loadl_u64(PTR))
+#define npyv_loadl_s32(PTR) ((npyv_s32)npyv_loadl_u64(PTR))
+#define npyv_loadl_s64(PTR) ((npyv_s64)npyv_loadl_u64(PTR))
+#define npyv_loadl_f32(PTR) ((npyv_f32)npyv_loadl_u64(PTR))
+#define npyv_loadl_f64(PTR) ((npyv_f64)npyv_loadl_u64(PTR))
+// unaligned store
+#if (defined(__GNUC__) && !defined(vec_xl)) || (defined(__clang__) && !defined(__IBMC__))
+ #define npyv_store_u8(PTR, VEC) vec_vsx_st(VEC, 0, PTR)
+#else
+ #define npyv_store_u8(PTR, VEC) vec_xst(VEC, 0, PTR)
+#endif
+#define npyv_store_s8 npyv_store_u8
+#define npyv_store_u16 npyv_store_u8
+#define npyv_store_s16 npyv_store_u8
+#define npyv_store_u32 npyv_store_u8
+#define npyv_store_s32 npyv_store_u8
+#define npyv_store_u64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
+#define npyv_store_s64(PTR, VEC) npyv_store_u8((unsigned int*)PTR, (npyv_u32)VEC)
+#define npyv_store_f32 npyv_store_u8
+#define npyv_store_f64 npyv_store_u8
+// aligned store
+#define npyv_storea_u8(PTR, VEC) vec_st(VEC, 0, PTR)
+#define npyv_storea_s8 npyv_storea_u8
+#define npyv_storea_u16 npyv_storea_u8
+#define npyv_storea_s16 npyv_storea_u8
+#define npyv_storea_u32 npyv_storea_u8
+#define npyv_storea_s32 npyv_storea_u8
+#define npyv_storea_u64 npyv_store_u64
+#define npyv_storea_s64 npyv_store_s64
+#define npyv_storea_f32 npyv_storea_u8
+#define npyv_storea_f64 npyv_store_f64
+// stream store
+#define npyv_stores_u8 npyv_storea_u8
+#define npyv_stores_s8 npyv_storea_s8
+#define npyv_stores_u16 npyv_storea_u16
+#define npyv_stores_s16 npyv_storea_s16
+#define npyv_stores_u32 npyv_storea_u32
+#define npyv_stores_s32 npyv_storea_s32
+#define npyv_stores_u64 npyv_storea_u64
+#define npyv_stores_s64 npyv_storea_s64
+#define npyv_stores_f32 npyv_storea_f32
+#define npyv_stores_f64 npyv_storea_f64
+// store lower part
+#define npyv_storel_u8(PTR, VEC) \
+ *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 0)
+#define npyv_storel_s8 npyv_storel_u8
+#define npyv_storel_u16 npyv_storel_u8
+#define npyv_storel_s16 npyv_storel_u8
+#define npyv_storel_u32 npyv_storel_u8
+#define npyv_storel_s32 npyv_storel_u8
+#define npyv_storel_s64 npyv_storel_u8
+#define npyv_storel_u64 npyv_storel_u8
+#define npyv_storel_f32 npyv_storel_u8
+#define npyv_storel_f64 npyv_storel_u8
+// store higher part
+#define npyv_storeh_u8(PTR, VEC) \
+ *npyv__ptr2u64(PTR) = vec_extract(((npyv_u64)VEC), 1)
+#define npyv_storeh_s8 npyv_storeh_u8
+#define npyv_storeh_u16 npyv_storeh_u8
+#define npyv_storeh_s16 npyv_storeh_u8
+#define npyv_storeh_u32 npyv_storeh_u8
+#define npyv_storeh_s32 npyv_storeh_u8
+#define npyv_storeh_s64 npyv_storeh_u8
+#define npyv_storeh_u64 npyv_storeh_u8
+#define npyv_storeh_f32 npyv_storeh_u8
+#define npyv_storeh_f64 npyv_storeh_u8
+
+#endif // _NPY_SIMD_VSX_MEMORY_H
diff --git a/numpy/core/src/common/simd/vsx/misc.h b/numpy/core/src/common/simd/vsx/misc.h
new file mode 100644
index 000000000..f7a0cdd5c
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/misc.h
@@ -0,0 +1,190 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_MISC_H
+#define _NPY_SIMD_VSX_MISC_H
+
+// vector with all lanes set to zero
+#define npyv_zero_u8() ((npyv_u8) npyv_setall_s32(0))
+#define npyv_zero_s8() ((npyv_s8) npyv_setall_s32(0))
+#define npyv_zero_u16() ((npyv_u16) npyv_setall_s32(0))
+#define npyv_zero_s16() ((npyv_s16) npyv_setall_s32(0))
+#define npyv_zero_u32() npyv_setall_u32(0)
+#define npyv_zero_s32() npyv_setall_s32(0)
+#define npyv_zero_u64() ((npyv_u64) npyv_setall_s32(0))
+#define npyv_zero_s64() ((npyv_s64) npyv_setall_s32(0))
+#define npyv_zero_f32() npyv_setall_f32(0.0f)
+#define npyv_zero_f64() npyv_setall_f64(0.0)
+
+// vector with a specific value set to all lanes
+// the safest way to generate vsplti* and vsplt* instructions
+#define NPYV_IMPL_VSX_SPLTB(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V, V, V, V, V, V, V, V, V})
+#define NPYV_IMPL_VSX_SPLTH(T_VEC, V) ((T_VEC){V, V, V, V, V, V, V, V})
+#define NPYV_IMPL_VSX_SPLTW(T_VEC, V) ((T_VEC){V, V, V, V})
+#define NPYV_IMPL_VSX_SPLTD(T_VEC, V) ((T_VEC){V, V})
+
+#define npyv_setall_u8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_u8, (unsigned char)VAL)
+#define npyv_setall_s8(VAL) NPYV_IMPL_VSX_SPLTB(npyv_s8, (signed char)VAL)
+#define npyv_setall_u16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_u16, (unsigned short)VAL)
+#define npyv_setall_s16(VAL) NPYV_IMPL_VSX_SPLTH(npyv_s16, (short)VAL)
+#define npyv_setall_u32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_u32, (unsigned int)VAL)
+#define npyv_setall_s32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_s32, (int)VAL)
+#define npyv_setall_f32(VAL) NPYV_IMPL_VSX_SPLTW(npyv_f32, VAL)
+#define npyv_setall_u64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_u64, (npy_uint64)VAL)
+#define npyv_setall_s64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_s64, (npy_int64)VAL)
+#define npyv_setall_f64(VAL) NPYV_IMPL_VSX_SPLTD(npyv_f64, VAL)
+
+// vector with specific values set to each lane and
+// a specific value set to all remaining lanes
+#define npyv_setf_u8(FILL, ...) ((npyv_u8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
+#define npyv_setf_s8(FILL, ...) ((npyv_s8){NPYV__SET_FILL_16(char, FILL, __VA_ARGS__)})
+#define npyv_setf_u16(FILL, ...) ((npyv_u16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_s16(FILL, ...) ((npyv_s16){NPYV__SET_FILL_8(short, FILL, __VA_ARGS__)})
+#define npyv_setf_u32(FILL, ...) ((npyv_u32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_s32(FILL, ...) ((npyv_s32){NPYV__SET_FILL_4(int, FILL, __VA_ARGS__)})
+#define npyv_setf_u64(FILL, ...) ((npyv_u64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_s64(FILL, ...) ((npyv_s64){NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)})
+#define npyv_setf_f32(FILL, ...) ((npyv_f32){NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)})
+#define npyv_setf_f64(FILL, ...) ((npyv_f64){NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)})
+
+// vector with specific values set to each lane and
+// zero set to all remaining lanes
+#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__)
+#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__)
+#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__)
+#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__)
+#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__)
+#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__)
+#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__)
+#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__)
+#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__)
+#define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__)
+
+// Per-lane select: pick lanes from A where the mask is set, otherwise from B
+#define npyv_select_u8(MASK, A, B) vec_sel(B, A, MASK)
+#define npyv_select_s8 npyv_select_u8
+#define npyv_select_u16 npyv_select_u8
+#define npyv_select_s16 npyv_select_u8
+#define npyv_select_u32 npyv_select_u8
+#define npyv_select_s32 npyv_select_u8
+#define npyv_select_u64 npyv_select_u8
+#define npyv_select_s64 npyv_select_u8
+#define npyv_select_f32 npyv_select_u8
+#define npyv_select_f64 npyv_select_u8
+
+// Reinterpret
+#define npyv_reinterpret_u8_u8(X) X
+#define npyv_reinterpret_u8_s8(X) ((npyv_u8)X)
+#define npyv_reinterpret_u8_u16 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_s16 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_u32 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_s32 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_u64 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_s64 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_f32 npyv_reinterpret_u8_s8
+#define npyv_reinterpret_u8_f64 npyv_reinterpret_u8_s8
+
+#define npyv_reinterpret_s8_s8(X) X
+#define npyv_reinterpret_s8_u8(X) ((npyv_s8)X)
+#define npyv_reinterpret_s8_u16 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_s16 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_u32 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_s32 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_u64 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_s64 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_f32 npyv_reinterpret_s8_u8
+#define npyv_reinterpret_s8_f64 npyv_reinterpret_s8_u8
+
+#define npyv_reinterpret_u16_u16(X) X
+#define npyv_reinterpret_u16_u8(X) ((npyv_u16)X)
+#define npyv_reinterpret_u16_s8 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_s16 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_u32 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_s32 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_u64 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_s64 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_f32 npyv_reinterpret_u16_u8
+#define npyv_reinterpret_u16_f64 npyv_reinterpret_u16_u8
+
+#define npyv_reinterpret_s16_s16(X) X
+#define npyv_reinterpret_s16_u8(X) ((npyv_s16)X)
+#define npyv_reinterpret_s16_s8 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_u16 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_u32 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_s32 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_u64 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_s64 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_f32 npyv_reinterpret_s16_u8
+#define npyv_reinterpret_s16_f64 npyv_reinterpret_s16_u8
+
+#define npyv_reinterpret_u32_u32(X) X
+#define npyv_reinterpret_u32_u8(X) ((npyv_u32)X)
+#define npyv_reinterpret_u32_s8 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_u16 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_s16 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_s32 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_u64 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_s64 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_f32 npyv_reinterpret_u32_u8
+#define npyv_reinterpret_u32_f64 npyv_reinterpret_u32_u8
+
+#define npyv_reinterpret_s32_s32(X) X
+#define npyv_reinterpret_s32_u8(X) ((npyv_s32)X)
+#define npyv_reinterpret_s32_s8 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_u16 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_s16 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_u32 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_u64 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_s64 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_f32 npyv_reinterpret_s32_u8
+#define npyv_reinterpret_s32_f64 npyv_reinterpret_s32_u8
+
+#define npyv_reinterpret_u64_u64(X) X
+#define npyv_reinterpret_u64_u8(X) ((npyv_u64)X)
+#define npyv_reinterpret_u64_s8 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_u16 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_s16 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_u32 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_s32 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_s64 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_f32 npyv_reinterpret_u64_u8
+#define npyv_reinterpret_u64_f64 npyv_reinterpret_u64_u8
+
+#define npyv_reinterpret_s64_s64(X) X
+#define npyv_reinterpret_s64_u8(X) ((npyv_s64)X)
+#define npyv_reinterpret_s64_s8 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_u16 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_s16 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_u32 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_s32 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_u64 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_f32 npyv_reinterpret_s64_u8
+#define npyv_reinterpret_s64_f64 npyv_reinterpret_s64_u8
+
+#define npyv_reinterpret_f32_f32(X) X
+#define npyv_reinterpret_f32_u8(X) ((npyv_f32)X)
+#define npyv_reinterpret_f32_s8 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_u16 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_s16 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_u32 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_s32 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_u64 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_s64 npyv_reinterpret_f32_u8
+#define npyv_reinterpret_f32_f64 npyv_reinterpret_f32_u8
+
+#define npyv_reinterpret_f64_f64(X) X
+#define npyv_reinterpret_f64_u8(X) ((npyv_f64)X)
+#define npyv_reinterpret_f64_s8 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_u16 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_s16 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_u32 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_s32 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_u64 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_s64 npyv_reinterpret_f64_u8
+#define npyv_reinterpret_f64_f32 npyv_reinterpret_f64_u8
+
+// Only required by AVX2/AVX512
+#define npyv_cleanup() ((void)0)
+
+#endif // _NPY_SIMD_VSX_MISC_H
diff --git a/numpy/core/src/common/simd/vsx/operators.h b/numpy/core/src/common/simd/vsx/operators.h
new file mode 100644
index 000000000..ca020d9e0
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/operators.h
@@ -0,0 +1,216 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_OPERATORS_H
+#define _NPY_SIMD_VSX_OPERATORS_H
+
+/***************************
+ * Shifting
+ ***************************/
+
+// Left
+#define npyv_shl_u16(A, C) vec_sl(A, npyv_setall_u16(C))
+#define npyv_shl_s16(A, C) vec_sl(A, npyv_setall_u16(C))
+#define npyv_shl_u32(A, C) vec_sl(A, npyv_setall_u32(C))
+#define npyv_shl_s32(A, C) vec_sl(A, npyv_setall_u32(C))
+#define npyv_shl_u64(A, C) vec_sl(A, npyv_setall_u64(C))
+#define npyv_shl_s64(A, C) vec_sl(A, npyv_setall_u64(C))
+
+// Left by an immediate constant
+#define npyv_shli_u16 npyv_shl_u16
+#define npyv_shli_s16 npyv_shl_s16
+#define npyv_shli_u32 npyv_shl_u32
+#define npyv_shli_s32 npyv_shl_s32
+#define npyv_shli_u64 npyv_shl_u64
+#define npyv_shli_s64 npyv_shl_s64
+
+// Right
+#define npyv_shr_u16(A, C) vec_sr(A, npyv_setall_u16(C))
+#define npyv_shr_s16(A, C) vec_sra(A, npyv_setall_u16(C))
+#define npyv_shr_u32(A, C) vec_sr(A, npyv_setall_u32(C))
+#define npyv_shr_s32(A, C) vec_sra(A, npyv_setall_u32(C))
+#define npyv_shr_u64(A, C) vec_sr(A, npyv_setall_u64(C))
+#define npyv_shr_s64(A, C) vec_sra(A, npyv_setall_u64(C))
+
+// Right by an immediate constant
+#define npyv_shri_u16 npyv_shr_u16
+#define npyv_shri_s16 npyv_shr_s16
+#define npyv_shri_u32 npyv_shr_u32
+#define npyv_shri_s32 npyv_shr_s32
+#define npyv_shri_u64 npyv_shr_u64
+#define npyv_shri_s64 npyv_shr_s64
+
+/***************************
+ * Logical
+ ***************************/
+
+// AND
+#define npyv_and_u8 vec_and
+#define npyv_and_s8 vec_and
+#define npyv_and_u16 vec_and
+#define npyv_and_s16 vec_and
+#define npyv_and_u32 vec_and
+#define npyv_and_s32 vec_and
+#define npyv_and_u64 vec_and
+#define npyv_and_s64 vec_and
+#define npyv_and_f32 vec_and
+#define npyv_and_f64 vec_and
+
+// OR
+#define npyv_or_u8 vec_or
+#define npyv_or_s8 vec_or
+#define npyv_or_u16 vec_or
+#define npyv_or_s16 vec_or
+#define npyv_or_u32 vec_or
+#define npyv_or_s32 vec_or
+#define npyv_or_u64 vec_or
+#define npyv_or_s64 vec_or
+#define npyv_or_f32 vec_or
+#define npyv_or_f64 vec_or
+
+// XOR
+#define npyv_xor_u8 vec_xor
+#define npyv_xor_s8 vec_xor
+#define npyv_xor_u16 vec_xor
+#define npyv_xor_s16 vec_xor
+#define npyv_xor_u32 vec_xor
+#define npyv_xor_s32 vec_xor
+#define npyv_xor_u64 vec_xor
+#define npyv_xor_s64 vec_xor
+#define npyv_xor_f32 vec_xor
+#define npyv_xor_f64 vec_xor
+
+// NOT
+// note: we also implement npyv_not_b* (boolean types) for internal use
+#define NPYV_IMPL_VSX_NOT_INT(VEC_LEN) \
+ NPY_FINLINE npyv_u##VEC_LEN npyv_not_u##VEC_LEN(npyv_u##VEC_LEN a) \
+ { return vec_nor(a, a); } \
+ NPY_FINLINE npyv_s##VEC_LEN npyv_not_s##VEC_LEN(npyv_s##VEC_LEN a) \
+ { return vec_nor(a, a); } \
+ NPY_FINLINE npyv_b##VEC_LEN npyv_not_b##VEC_LEN(npyv_b##VEC_LEN a) \
+ { return vec_nor(a, a); }
+
+NPYV_IMPL_VSX_NOT_INT(8)
+NPYV_IMPL_VSX_NOT_INT(16)
+NPYV_IMPL_VSX_NOT_INT(32)
+
+// up to GCC 5, vec_nor doesn't support bool long long
+#if defined(__GNUC__) && __GNUC__ > 5
+ NPYV_IMPL_VSX_NOT_INT(64)
+#else
+ NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a)
+ { return vec_nor(a, a); }
+ NPY_FINLINE npyv_s64 npyv_not_s64(npyv_s64 a)
+ { return vec_nor(a, a); }
+ NPY_FINLINE npyv_b64 npyv_not_b64(npyv_b64 a)
+ { return (npyv_b64)vec_nor((npyv_u64)a, (npyv_u64)a); }
+#endif
+
+NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a)
+{ return vec_nor(a, a); }
+NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a)
+{ return vec_nor(a, a); }
+
+/***************************
+ * Comparison
+ ***************************/
+
+
+// Equal
+#define npyv_cmpeq_s8 vec_cmpeq
+#define npyv_cmpeq_u16 vec_cmpeq
+#define npyv_cmpeq_s16 vec_cmpeq
+#define npyv_cmpeq_u32 vec_cmpeq
+#define npyv_cmpeq_s32 vec_cmpeq
+#define npyv_cmpeq_u64 vec_cmpeq
+#define npyv_cmpeq_s64 vec_cmpeq
+#define npyv_cmpeq_f32 vec_cmpeq
+#define npyv_cmpeq_f64 vec_cmpeq
+
+// Not Equal
+#ifdef NPY_HAVE_VSX3
+ #define npyv_cmpneq_u8 vec_cmpne
+ #define npyv_cmpneq_s8 vec_cmpne
+ #define npyv_cmpneq_u16 vec_cmpne
+ #define npyv_cmpneq_s16 vec_cmpne
+ #define npyv_cmpneq_u32 vec_cmpne
+ #define npyv_cmpneq_s32 vec_cmpne
+ #define npyv_cmpneq_u64 vec_cmpne
+ #define npyv_cmpneq_s64 vec_cmpne
+ #define npyv_cmpneq_f32 vec_cmpne
+ #define npyv_cmpneq_f64 vec_cmpne
+#else
+ #define npyv_cmpneq_u8(A, B) npyv_not_b8(vec_cmpeq(A, B))
+ #define npyv_cmpneq_s8(A, B) npyv_not_b8(vec_cmpeq(A, B))
+ #define npyv_cmpneq_u16(A, B) npyv_not_b16(vec_cmpeq(A, B))
+ #define npyv_cmpneq_s16(A, B) npyv_not_b16(vec_cmpeq(A, B))
+ #define npyv_cmpneq_u32(A, B) npyv_not_b32(vec_cmpeq(A, B))
+ #define npyv_cmpneq_s32(A, B) npyv_not_b32(vec_cmpeq(A, B))
+ #define npyv_cmpneq_u64(A, B) npyv_not_b64(vec_cmpeq(A, B))
+ #define npyv_cmpneq_s64(A, B) npyv_not_b64(vec_cmpeq(A, B))
+ #define npyv_cmpneq_f32(A, B) npyv_not_b32(vec_cmpeq(A, B))
+ #define npyv_cmpneq_f64(A, B) npyv_not_b64(vec_cmpeq(A, B))
+#endif
+
+// Greater than
+#define npyv_cmpgt_u8 vec_cmpgt
+#define npyv_cmpgt_s8 vec_cmpgt
+#define npyv_cmpgt_u16 vec_cmpgt
+#define npyv_cmpgt_s16 vec_cmpgt
+#define npyv_cmpgt_u32 vec_cmpgt
+#define npyv_cmpgt_s32 vec_cmpgt
+#define npyv_cmpgt_u64 vec_cmpgt
+#define npyv_cmpgt_s64 vec_cmpgt
+#define npyv_cmpgt_f32 vec_cmpgt
+#define npyv_cmpgt_f64 vec_cmpgt
+
+// Greater than or equal
+// GCC up to version 5 only provides vec_cmpge for single and double precision
+#if defined(__GNUC__) && __GNUC__ > 5
+ #define npyv_cmpge_u8 vec_cmpge
+ #define npyv_cmpge_s8 vec_cmpge
+ #define npyv_cmpge_u16 vec_cmpge
+ #define npyv_cmpge_s16 vec_cmpge
+ #define npyv_cmpge_u32 vec_cmpge
+ #define npyv_cmpge_s32 vec_cmpge
+ #define npyv_cmpge_u64 vec_cmpge
+ #define npyv_cmpge_s64 vec_cmpge
+#else
+ #define npyv_cmpge_u8(A, B) npyv_not_b8(vec_cmpgt(B, A))
+ #define npyv_cmpge_s8(A, B) npyv_not_b8(vec_cmpgt(B, A))
+ #define npyv_cmpge_u16(A, B) npyv_not_b16(vec_cmpgt(B, A))
+ #define npyv_cmpge_s16(A, B) npyv_not_b16(vec_cmpgt(B, A))
+ #define npyv_cmpge_u32(A, B) npyv_not_b32(vec_cmpgt(B, A))
+ #define npyv_cmpge_s32(A, B) npyv_not_b32(vec_cmpgt(B, A))
+ #define npyv_cmpge_u64(A, B) npyv_not_b64(vec_cmpgt(B, A))
+ #define npyv_cmpge_s64(A, B) npyv_not_b64(vec_cmpgt(B, A))
+#endif
+#define npyv_cmpge_f32 vec_cmpge
+#define npyv_cmpge_f64 vec_cmpge
+
+// Less than
+#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A)
+#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A)
+#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A)
+#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A)
+#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A)
+#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A)
+#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A)
+#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A)
+#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A)
+#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A)
+
+// Less than or equal
+#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A)
+#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A)
+#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A)
+#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A)
+#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A)
+#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A)
+#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A)
+#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A)
+#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A)
+#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A)
+
+#endif // _NPY_SIMD_VSX_OPERATORS_H
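Not part of the patch: a minimal usage sketch of how the shift, logical and comparison
operators above compose inside a VSX kernel. It leans only on helpers visible in this
patch (npyv_setall_u16 from misc.h is already relied on by the shift macros above); the
function name and threshold are illustrative, not the library's own code.

    // Illustrative sketch only (not in this patch):
    // shift each u16 lane left by 4, keep the low byte, then flag lanes above a threshold.
    NPY_FINLINE npyv_b16 scale_mask_and_flag(npyv_u16 a)
    {
        npyv_u16 scaled = npyv_shli_u16(a, 4);                           // shift by an immediate
        npyv_u16 masked = npyv_and_u16(scaled, npyv_setall_u16(0x00FF)); // keep the low byte
        return npyv_cmpgt_u16(masked, npyv_setall_u16(0x007F));          // lane-wise compare -> b16 mask
    }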
diff --git a/numpy/core/src/common/simd/vsx/reorder.h b/numpy/core/src/common/simd/vsx/reorder.h
new file mode 100644
index 000000000..bfb9115fa
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/reorder.h
@@ -0,0 +1,65 @@
+#ifndef NPY_SIMD
+ #error "Not a standalone header"
+#endif
+
+#ifndef _NPY_SIMD_VSX_REORDER_H
+#define _NPY_SIMD_VSX_REORDER_H
+
+// combine the lower parts of two vectors
+#define npyv__combinel(A, B) vec_mergeh((npyv_u64)(A), (npyv_u64)(B))
+#define npyv_combinel_u8(A, B) ((npyv_u8) npyv__combinel(A, B))
+#define npyv_combinel_s8(A, B) ((npyv_s8) npyv__combinel(A, B))
+#define npyv_combinel_u16(A, B) ((npyv_u16)npyv__combinel(A, B))
+#define npyv_combinel_s16(A, B) ((npyv_s16)npyv__combinel(A, B))
+#define npyv_combinel_u32(A, B) ((npyv_u32)npyv__combinel(A, B))
+#define npyv_combinel_s32(A, B) ((npyv_s32)npyv__combinel(A, B))
+#define npyv_combinel_u64 vec_mergeh
+#define npyv_combinel_s64 vec_mergeh
+#define npyv_combinel_f32(A, B) ((npyv_f32)npyv__combinel(A, B))
+#define npyv_combinel_f64 vec_mergeh
+
+// combine the higher parts of two vectors
+#define npyv__combineh(A, B) vec_mergel((npyv_u64)(A), (npyv_u64)(B))
+#define npyv_combineh_u8(A, B) ((npyv_u8) npyv__combineh(A, B))
+#define npyv_combineh_s8(A, B) ((npyv_s8) npyv__combineh(A, B))
+#define npyv_combineh_u16(A, B) ((npyv_u16)npyv__combineh(A, B))
+#define npyv_combineh_s16(A, B) ((npyv_s16)npyv__combineh(A, B))
+#define npyv_combineh_u32(A, B) ((npyv_u32)npyv__combineh(A, B))
+#define npyv_combineh_s32(A, B) ((npyv_s32)npyv__combineh(A, B))
+#define npyv_combineh_u64 vec_mergel
+#define npyv_combineh_s64 vec_mergel
+#define npyv_combineh_f32(A, B) ((npyv_f32)npyv__combineh(A, B))
+#define npyv_combineh_f64 vec_mergel
+
+/*
+ * combine: build one vector from the lower halves of two vectors and another from their higher halves
+ * zip:     interleave the lanes of two vectors
+ */
+#define NPYV_IMPL_VSX_COMBINE_ZIP(T_VEC, SFX) \
+ NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \
+ r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \
+ return r; \
+ } \
+ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \
+ { \
+ T_VEC##x2 r; \
+ r.val[0] = vec_mergeh(a, b); \
+ r.val[1] = vec_mergel(a, b); \
+ return r; \
+ }
+
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u8, u8)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s8, s8)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u16, u16)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s16, s16)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u32, u32)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s32, s32)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_u64, u64)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_s64, s64)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f32, f32)
+NPYV_IMPL_VSX_COMBINE_ZIP(npyv_f64, f64)
+
+#endif // _NPY_SIMD_VSX_REORDER_H
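Not part of the patch: the combine/zip helpers above are enough to express small
in-register transposes. A rough sketch for the two-lane double case; the lane ordering
follows the npyv contract stated above (val[0] pairs the lower halves, val[1] the higher),
and the function name is illustrative.

    // Illustrative sketch only (not in this patch):
    // 2x2 transpose of f64 lanes: rows {a0, a1} / {b0, b1} become columns {a0, b0} / {a1, b1}.
    NPY_FINLINE npyv_f64x2 transpose2x2_f64(npyv_f64 row0, npyv_f64 row1)
    {
        return npyv_zip_f64(row0, row1);   // val[0] = {row0[0], row1[0]}, val[1] = {row0[1], row1[1]}
    }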
diff --git a/numpy/core/src/common/simd/vsx/vsx.h b/numpy/core/src/common/simd/vsx/vsx.h
new file mode 100644
index 000000000..5525dc1e6
--- /dev/null
+++ b/numpy/core/src/common/simd/vsx/vsx.h
@@ -0,0 +1,64 @@
+#ifndef _NPY_SIMD_H_
+ #error "Not a standalone header"
+#endif
+
+#define NPY_SIMD 128
+#define NPY_SIMD_WIDTH 16
+#define NPY_SIMD_F64 1
+
+typedef __vector unsigned char npyv_u8;
+typedef __vector signed char npyv_s8;
+typedef __vector unsigned short npyv_u16;
+typedef __vector signed short npyv_s16;
+typedef __vector unsigned int npyv_u32;
+typedef __vector signed int npyv_s32;
+typedef __vector unsigned long long npyv_u64;
+typedef __vector signed long long npyv_s64;
+typedef __vector float npyv_f32;
+typedef __vector double npyv_f64;
+
+typedef struct { npyv_u8 val[2]; } npyv_u8x2;
+typedef struct { npyv_s8 val[2]; } npyv_s8x2;
+typedef struct { npyv_u16 val[2]; } npyv_u16x2;
+typedef struct { npyv_s16 val[2]; } npyv_s16x2;
+typedef struct { npyv_u32 val[2]; } npyv_u32x2;
+typedef struct { npyv_s32 val[2]; } npyv_s32x2;
+typedef struct { npyv_u64 val[2]; } npyv_u64x2;
+typedef struct { npyv_s64 val[2]; } npyv_s64x2;
+typedef struct { npyv_f32 val[2]; } npyv_f32x2;
+typedef struct { npyv_f64 val[2]; } npyv_f64x2;
+
+typedef struct { npyv_u8 val[3]; } npyv_u8x3;
+typedef struct { npyv_s8 val[3]; } npyv_s8x3;
+typedef struct { npyv_u16 val[3]; } npyv_u16x3;
+typedef struct { npyv_s16 val[3]; } npyv_s16x3;
+typedef struct { npyv_u32 val[3]; } npyv_u32x3;
+typedef struct { npyv_s32 val[3]; } npyv_s32x3;
+typedef struct { npyv_u64 val[3]; } npyv_u64x3;
+typedef struct { npyv_s64 val[3]; } npyv_s64x3;
+typedef struct { npyv_f32 val[3]; } npyv_f32x3;
+typedef struct { npyv_f64 val[3]; } npyv_f64x3;
+
+#define npyv_nlanes_u8 16
+#define npyv_nlanes_s8 16
+#define npyv_nlanes_u16 8
+#define npyv_nlanes_s16 8
+#define npyv_nlanes_u32 4
+#define npyv_nlanes_s32 4
+#define npyv_nlanes_u64 2
+#define npyv_nlanes_s64 2
+#define npyv_nlanes_f32 4
+#define npyv_nlanes_f64 2
+
+// using __bool with typedef causes ambiguity errors
+#define npyv_b8 __vector __bool char
+#define npyv_b16 __vector __bool short
+#define npyv_b32 __vector __bool int
+#define npyv_b64 __vector __bool long long
+
+#include "memory.h"
+#include "misc.h"
+#include "reorder.h"
+#include "operators.h"
+#include "conversion.h"
+#include "arithmetic.h"