BLD, SIMD: Hardened the Neon/ASIMD compile-time tests

Avoid passing any constants or traced pointers, so that the compiler cannot optimize the intrinsics away; this makes sure the required instructions have actually been tested against the linker.
author: Sayed Adel <seiko@imavr.com> 2022-06-09 05:09:31 +0200
committer: Sayed Adel <seiko@imavr.com> 2022-06-09 20:58:56 +0200
commit: cc92935d94b6703b5971c6c42b771ab54b041b76 (patch)
tree: 0a3eea8cfa85818be619f99e9dca57d80b8b5112 /numpy/distutils
parent: 5fedeb7e4f5140ab5b422fe7502a0c3b23feaf9b (diff)
download: numpy-cc92935d94b6703b5971c6c42b771ab54b041b76.tar.gz
7 files changed, 38 insertions, 26 deletions
diff --git a/numpy/distutils/checks/cpu_asimd.c b/numpy/distutils/checks/cpu_asimd.c
index 8df556b6c..fc408feb0 100644
--- a/numpy/distutils/checks/cpu_asimd.c
+++ b/numpy/distutils/checks/cpu_asimd.c
@@ -3,9 +3,10 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
     /* MAXMIN */
     int ret  = (int)vgetq_lane_f32(vmaxnmq_f32(v1, v2), 0);
         ret += (int)vgetq_lane_f32(vminnmq_f32(v1, v2), 0);
@@ -13,7 +14,8 @@ int main(void)
     ret += (int)vgetq_lane_f32(vrndq_f32(v1), 0);
 #ifdef __aarch64__
     {
-        float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+        double *src2 = (double*)argv[argc-1];
+        float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
         /* MAXMIN */
         ret += (int)vgetq_lane_f64(vmaxnmq_f64(vd1, vd2), 0);
         ret += (int)vgetq_lane_f64(vminnmq_f64(vd1, vd2), 0);
diff --git a/numpy/distutils/checks/cpu_asimddp.c b/numpy/distutils/checks/cpu_asimddp.c
index 0158d1354..e7068ce02 100644
--- a/numpy/distutils/checks/cpu_asimddp.c
+++ b/numpy/distutils/checks/cpu_asimddp.c
@@ -3,9 +3,10 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    uint8x16_t v1 = vdupq_n_u8((unsigned char)1), v2 = vdupq_n_u8((unsigned char)2);
+    unsigned char *src = (unsigned char*)argv[argc-1];
+    uint8x16_t v1 = vdupq_n_u8(src[0]), v2 = vdupq_n_u8(src[1]);
     uint32x4_t va = vdupq_n_u32(3);
     int ret = (int)vgetq_lane_u32(vdotq_u32(va, v1, v2), 0);
 #ifdef __aarch64__
diff --git a/numpy/distutils/checks/cpu_asimdfhm.c b/numpy/distutils/checks/cpu_asimdfhm.c
index cb49751c4..af29d3003 100644
--- a/numpy/distutils/checks/cpu_asimdfhm.c
+++ b/numpy/distutils/checks/cpu_asimdfhm.c
@@ -3,12 +3,14 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float16x8_t vhp  = vdupq_n_f16((float16_t)1);
-    float16x4_t vlhp = vdup_n_f16((float16_t)1);
-    float32x4_t vf   = vdupq_n_f32(1.0f);
-    float32x2_t vlf  = vdup_n_f32(1.0f);
+    float16_t *src = (float16_t*)argv[argc-1];
+    float *src2 = (float*)argv[argc-2];
+    float16x8_t vhp  = vdupq_n_f16(src[0]);
+    float16x4_t vlhp = vdup_n_f16(src[1]);
+    float32x4_t vf   = vdupq_n_f32(src2[0]);
+    float32x2_t vlf  = vdup_n_f32(src2[1]);
 
     int ret  = (int)vget_lane_f32(vfmlal_low_f16(vlf, vlhp, vlhp), 0);
         ret += (int)vgetq_lane_f32(vfmlslq_high_f16(vf, vhp, vhp), 0);
diff --git a/numpy/distutils/checks/cpu_asimdhp.c b/numpy/distutils/checks/cpu_asimdhp.c
index 80b94000f..e2de0306e 100644
--- a/numpy/distutils/checks/cpu_asimdhp.c
+++ b/numpy/distutils/checks/cpu_asimdhp.c
@@ -3,10 +3,11 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float16x8_t vhp  = vdupq_n_f16((float16_t)-1);
-    float16x4_t vlhp = vdup_n_f16((float16_t)-1);
+    float16_t *src = (float16_t*)argv[argc-1];
+    float16x8_t vhp  = vdupq_n_f16(src[0]);
+    float16x4_t vlhp = vdup_n_f16(src[1]);
 
     int ret  =  (int)vgetq_lane_f16(vabdq_f16(vhp, vhp), 0);
         ret  += (int)vget_lane_f16(vabd_f16(vlhp, vlhp), 0);
diff --git a/numpy/distutils/checks/cpu_neon.c b/numpy/distutils/checks/cpu_neon.c
index 4eab1f384..8c64f864d 100644
--- a/numpy/distutils/checks/cpu_neon.c
+++ b/numpy/distutils/checks/cpu_neon.c
@@ -3,12 +3,16 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f), v2 = vdupq_n_f32(2.0f);
+    // passing from untraced pointers to avoid optimizing out any constants
+    // so we can test against the linker.
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]), v2 = vdupq_n_f32(src[1]);
     int ret = (int)vgetq_lane_f32(vmulq_f32(v1, v2), 0);
 #ifdef __aarch64__
-    float64x2_t vd1 = vdupq_n_f64(1.0), vd2 = vdupq_n_f64(2.0);
+    double *src2 = (double*)argv[argc-2];
+    float64x2_t vd1 = vdupq_n_f64(src2[0]), vd2 = vdupq_n_f64(src2[1]);
     ret += (int)vgetq_lane_f64(vmulq_f64(vd1, vd2), 0);
 #endif
     return ret;
diff --git a/numpy/distutils/checks/cpu_neon_fp16.c b/numpy/distutils/checks/cpu_neon_fp16.c
index 745d2e793..f3b949770 100644
--- a/numpy/distutils/checks/cpu_neon_fp16.c
+++ b/numpy/distutils/checks/cpu_neon_fp16.c
@@ -3,9 +3,9 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    short z4[] = {0, 0, 0, 0, 0, 0, 0, 0};
-    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16((const short*)z4));
+    short *src = (short*)argv[argc-1];
+    float32x4_t v_z4 = vcvt_f32_f16((float16x4_t)vld1_s16(src));
     return (int)vgetq_lane_f32(v_z4, 0);
 }
diff --git a/numpy/distutils/checks/cpu_neon_vfpv4.c b/numpy/distutils/checks/cpu_neon_vfpv4.c
index 45f7b5d69..a039159dd 100644
--- a/numpy/distutils/checks/cpu_neon_vfpv4.c
+++ b/numpy/distutils/checks/cpu_neon_vfpv4.c
@@ -3,16 +3,18 @@
 #endif
 #include <arm_neon.h>
 
-int main(void)
+int main(int argc, char **argv)
 {
-    float32x4_t v1 = vdupq_n_f32(1.0f);
-    float32x4_t v2 = vdupq_n_f32(2.0f);
-    float32x4_t v3 = vdupq_n_f32(3.0f);
+    float *src = (float*)argv[argc-1];
+    float32x4_t v1 = vdupq_n_f32(src[0]);
+    float32x4_t v2 = vdupq_n_f32(src[1]);
+    float32x4_t v3 = vdupq_n_f32(src[2]);
     int ret = (int)vgetq_lane_f32(vfmaq_f32(v1, v2, v3), 0);
 #ifdef __aarch64__
-    float64x2_t vd1 = vdupq_n_f64(1.0);
-    float64x2_t vd2 = vdupq_n_f64(2.0);
-    float64x2_t vd3 = vdupq_n_f64(3.0);
+    double *src2 = (double*)argv[argc-2];
+    float64x2_t vd1 = vdupq_n_f64(src2[0]);
+    float64x2_t vd2 = vdupq_n_f64(src2[1]);
+    float64x2_t vd3 = vdupq_n_f64(src2[2]);
     ret += (int)vgetq_lane_f64(vfmaq_f64(vd1, vd2, vd3), 0);
 #endif
     return ret;
author	Sayed Adel <seiko@imavr.com>	2022-06-09 05:09:31 +0200
committer	Sayed Adel <seiko@imavr.com>	2022-06-09 20:58:56 +0200
commit	cc92935d94b6703b5971c6c42b771ab54b041b76 (patch)
tree	0a3eea8cfa85818be619f99e9dca57d80b8b5112 /numpy/distutils
parent	5fedeb7e4f5140ab5b422fe7502a0c3b23feaf9b (diff)
download	numpy-cc92935d94b6703b5971c6c42b771ab54b041b76.tar.gz