summaryrefslogtreecommitdiff
path: root/numpy/distutils
diff options
context:
space:
mode:
author: Sayed Adel <seiko@imavr.com> 2022-06-09 05:09:31 +0200
committer: Sayed Adel <seiko@imavr.com> 2022-06-09 20:58:56 +0200
commit: cc92935d94b6703b5971c6c42b771ab54b041b76 (patch)
tree: 0a3eea8cfa85818be619f99e9dca57d80b8b5112 /numpy/distutils
parent: 5fedeb7e4f5140ab5b422fe7502a0c3b23feaf9b (diff)
download: numpy-cc92935d94b6703b5971c6c42b771ab54b041b76.tar.gz
BLD, SIMD: Hardened the Neon/ASIMD compile-time tests
Avoid passing any constants or traced pointers to the intrinsics, so the compiler cannot optimize them away; this makes sure the required instructions have actually been tested against the linker.
Diffstat (limited to 'numpy/distutils')
-rw-r--r-- numpy/distutils/checks/cpu_asimd.c 8
-rw-r--r-- numpy/distutils/checks/cpu_asimddp.c 5
-rw-r--r-- numpy/distutils/checks/cpu_asimdfhm.c 12
-rw-r--r-- numpy/distutils/checks/cpu_asimdhp.c 7
-rw-r--r-- numpy/distutils/checks/cpu_neon.c 10
-rw-r--r-- numpy/distutils/checks/cpu_neon_fp16.c 6
-rw-r--r-- numpy/distutils/checks/cpu_neon_vfpv4.c 16
7 files changed, 38 insertions, 26 deletions
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
index 8df556b6c..fc408feb0 100644
--- a/numpy/distutils/checks/cpu_asimd.c
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -3,9 +3,10 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ float *src = (float*)argv[argc-1];
+ float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
/* MAXMIN */
int ret = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
@@ -13,7 +14,8 @@ int main(void)
ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
#ifdef __aarch64__
{
- float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ double *src2 = (float*)argv[argc-1];
+ float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
/* MAXMIN */
ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
index 0158d1354..e7068ce02 100644
--- a/numpy/distutils/checks/cpu_asimddp.c
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -3,9 +3,10 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+ unsigned char *src = (unsigned char*)argv[argc-1];
+ uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
uint32x4_t va = vdupq_n_u32(3);
int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
#ifdef __aarch64__
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
index cb49751c4..af29d3003 100644
--- a/numpy/distutils/checks/cpu_asimdfhm.c
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -3,12 +3,14 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- float16x8_t vhp = vdupq_n_f16((float16_t)1);
- float16x4_t vlhp = vdup_n_f16((float16_t)1);
- float32x4_t vf = vdupq_n_f32(1.0f);
- float32x2_t vlf = vdup_n_f32(1.0f);
+ float16_t *src = (float16_t*)argv[argc-1];
+ float *src2 = (float*)argv[argc-2];
+ float16x8_t vhp = vdupq_n_f16(src[0]);
+ float16x4_t vlhp = vdup_n_f16((src[1]);
+ float32x4_t vf = vdupq_n_f32(src2[0]);
+ float32x2_t vlf = vdup_n_f32(src2[1]);
int ret = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
index 80b94000f..e2de0306e 100644
--- a/numpy/distutils/checks/cpu_asimdhp.c
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -3,10 +3,11 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- float16x8_t vhp = vdupq_n_f16((float16_t)-1);
- float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+ float16_t *src = (float16_t*)argv[argc-1];
+ float16x8_t vhp = vdupq_n_f16(src[0]);
+ float16x4_t vlhp = vdup_n_f16(src[1]);
int ret = (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
ret += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
index 4eab1f384..8c64f864d 100644
--- a/numpy/distutils/checks/cpu_neon.c
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -3,12 +3,16 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+ // passing from untraced pointers to avoid optimizing out any constants
+ // so we can test against the linker.
+ float *src = (float*)argv[argc-1];
+ float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
#ifdef __aarch64__
- float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+ double *src2 = (double*)argv[argc-2];
+ float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
#endif
return ret;
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
index 745d2e793..f3b949770 100644
--- a/numpy/distutils/checks/cpu_neon_fp16.c
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -3,9 +3,9 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
- float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+ short *src = (short*)argv[argc-1];
+ float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
return (int)vgetq_lane_f32(v_z4, 0);
}
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
index 45f7b5d69..a039159dd 100644
--- a/numpy/distutils/checks/cpu_neon_vfpv4.c
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -3,16 +3,18 @@
#endif
#include <arm_neon.h>
-int main(void)
+int main(int argc, char **argv)
{
- float32x4_t v1 = vdupq_n_f32(1.0f);
- float32x4_t v2 = vdupq_n_f32(2.0f);
- float32x4_t v3 = vdupq_n_f32(3.0f);
+ float *src = (float*)argv[argc-1];
+ float32x4_t v1 = vdupq_n_f32(src[0]);
+ float32x4_t v2 = vdupq_n_f32(src[1]);
+ float32x4_t v3 = vdupq_n_f32(src[2]);
int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
#ifdef __aarch64__
- float64x2_t vd1 = vdupq_n_f64(1.0);
- float64x2_t vd2 = vdupq_n_f64(2.0);
- float64x2_t vd3 = vdupq_n_f64(3.0);
+ double *src2 = (double*)argv[argc-2];
+ float64x2_t vd1 = vdupq_n_f64(src2[0]);
+ float64x2_t vd2 = vdupq_n_f64(src2[1]);
+ float64x2_t vd3 = vdupq_n_f64(src2[2]);
ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
#endif
return ret;