From 52b5935ea1ab9a5f1043e7a4af2ced8311affe01 Mon Sep 17 00:00:00 2001
From: KIU Shueng Chuan <nixchuan@gmail.com>
Date: Tue, 19 Oct 2021 14:24:51 +0800
Subject: BUG: fix win32 np.clip slowness

The use of the macro _NPY_CLIP results in multiple re-evaluations of the
input arguments. Thus, for floating-point types, the NaN check is
performed multiple times.
This manifests as slowness on Win32 builds. See #18673.
---
 numpy/core/src/umath/clip.c.src | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/numpy/core/src/umath/clip.c.src b/numpy/core/src/umath/clip.c.src
index bc966b7ac..48786d4a2 100644
--- a/numpy/core/src/umath/clip.c.src
+++ b/numpy/core/src/umath/clip.c.src
@@ -76,9 +76,6 @@
  *         npy_datetime, npy_timedelta#
  */
 
-#define _NPY_CLIP(x, min, max) \
-    _NPY_@name@_MIN(_NPY_@name@_MAX((x), (min)), (max))
-
 NPY_NO_EXPORT void
 @name@_clip(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func))
 {
@@ -95,25 +92,33 @@ NPY_NO_EXPORT void
         /* contiguous, branch to let the compiler optimize */
         if (is1 == sizeof(@type@) && os1 == sizeof(@type@)) {
             for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
-                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+                @type@ t = *(@type@ *)ip1;
+                t = _NPY_@name@_MAX(t, min_val);
+                t = _NPY_@name@_MIN(t, max_val);
+                *(@type@ *)op1 = t;
             }
         }
         else {
             for(npy_intp i = 0; i < n; i++, ip1 += is1, op1 += os1) {
-                *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, min_val, max_val);
+                @type@ t = *(@type@ *)ip1;
+                t = _NPY_@name@_MAX(t, min_val);
+                t = _NPY_@name@_MIN(t, max_val);
+                *(@type@ *)op1 = t;
             }
         }
     }
     else {
         TERNARY_LOOP {
-            *(@type@ *)op1 = _NPY_CLIP(*(@type@ *)ip1, *(@type@ *)ip2, *(@type@ *)ip3);
+            @type@ t = *(@type@ *)ip1;
+            t = _NPY_@name@_MAX(t, *(@type@ *)ip2);
+            t = _NPY_@name@_MIN(t, *(@type@ *)ip3);
+            *(@type@ *)op1 = t;
         }
     }
     npy_clear_floatstatus_barrier((char*)dimensions);
 }
 
 // clean up the macros we defined above
-#undef _NPY_CLIP
 #undef _NPY_@name@_MAX
 #undef _NPY_@name@_MIN
 
-- 
cgit v1.2.1