Merge pull request #15162 from seberg/reduce-axes-try

BUG,MAINT: Fix issues with non-reduce broadcasting axes
author: Matti Picus <matti.picus@gmail.com> 2020-06-04 22:03:43 +0300
committer: GitHub <noreply@github.com> 2020-06-04 22:03:43 +0300
commit: a5d021a1b6f439a19812926bc4d796ef5f346c44 (patch)
tree: 998f27e5296a2894b41a9600c9bdb8b7806241c1
parent: abce2a0604a4db5d8072a0b1478c7af439882ed7 (diff)
parent: 8da4fba24c193807555dfcbc0e67834cae9185ea (diff)
download: numpy-a5d021a1b6f439a19812926bc4d796ef5f346c44.tar.gz
10 files changed, 479 insertions, 595 deletions
diff --git a/numpy/core/src/multiarray/common.h b/numpy/core/src/multiarray/common.h
index 78a15a63c..4ba25c079 100644
--- a/numpy/core/src/multiarray/common.h
+++ b/numpy/core/src/multiarray/common.h
@@ -339,5 +339,14 @@ NPY_NO_EXPORT PyArrayObject *
 new_array_for_sum(PyArrayObject *ap1, PyArrayObject *ap2, PyArrayObject* out,
                   int nd, npy_intp dimensions[], int typenum, PyArrayObject **result);
 
+
+/*
+ * Used to indicate a broadcast axis, see also `npyiter_get_op_axis` in
+ * `nditer_constr.c`.  This may be the preferred API for reduction axes
+ * probably. So we should consider making this public either as a macro or
+ * function (so that the way we flag the axis can be changed).
+ */
+#define NPY_ITER_REDUCTION_AXIS(axis) (axis + (1 << (NPY_BITSOF_INT - 2)))
+
 #endif
 
diff --git a/numpy/core/src/multiarray/einsum.c.src b/numpy/core/src/multiarray/einsum.c.src
index 1cc557825..b914e5bb3 100644
--- a/numpy/core/src/multiarray/einsum.c.src
+++ b/numpy/core/src/multiarray/einsum.c.src
@@ -2740,7 +2740,7 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
         op_axes[nop][idim] = idim;
     }
     for (idim = ndim_output; idim < ndim_iter; ++idim) {
-        op_axes[nop][idim] = -1;
+        op_axes[nop][idim] = NPY_ITER_REDUCTION_AXIS(-1);
     }
 
     /* Set the iterator per-op flags */
@@ -2753,13 +2753,11 @@ PyArray_EinsteinSum(char *subscripts, npy_intp nop,
     op_flags[nop] = NPY_ITER_READWRITE|
                     NPY_ITER_NBO|
                     NPY_ITER_ALIGNED|
-                    NPY_ITER_ALLOCATE|
-                    NPY_ITER_NO_BROADCAST;
+                    NPY_ITER_ALLOCATE;
     iter_flags = NPY_ITER_EXTERNAL_LOOP|
             NPY_ITER_BUFFERED|
             NPY_ITER_DELAY_BUFALLOC|
             NPY_ITER_GROWINNER|
-            NPY_ITER_REDUCE_OK|
             NPY_ITER_REFS_OK|
             NPY_ITER_ZEROSIZE_OK;
     if (out != NULL) {
diff --git a/numpy/core/src/multiarray/nditer_api.c b/numpy/core/src/multiarray/nditer_api.c
index e7fe0fa50..a5b5e5c51 100644
--- a/numpy/core/src/multiarray/nditer_api.c
+++ b/numpy/core/src/multiarray/nditer_api.c
@@ -935,13 +935,8 @@ NpyIter_GetShape(NpyIter *iter, npy_intp *outshape)
     if (itflags&NPY_ITFLAG_HASMULTIINDEX) {
         perm = NIT_PERM(iter);
         for(idim = 0; idim < ndim; ++idim) {
-            npy_int8 p = perm[idim];
-            if (p < 0) {
-                outshape[ndim+p] = NAD_SHAPE(axisdata);
-            }
-            else {
-                outshape[ndim-p-1] = NAD_SHAPE(axisdata);
-            }
+            int axis = npyiter_undo_iter_axis_perm(idim, ndim, perm, NULL);
+            outshape[axis] = NAD_SHAPE(axisdata);
 
             NIT_ADVANCE_AXISDATA(axisdata, 1);
         }
@@ -1005,8 +1000,9 @@ NpyIter_CreateCompatibleStrides(NpyIter *iter,
 
     perm = NIT_PERM(iter);
     for(idim = 0; idim < ndim; ++idim) {
-        npy_int8 p = perm[idim];
-        if (p < 0) {
+        npy_bool flipped;
+        npy_int8 axis = npyiter_undo_iter_axis_perm(idim, ndim, perm, &flipped);
+        if (flipped) {
             PyErr_SetString(PyExc_RuntimeError,
                     "Iterator CreateCompatibleStrides may only be called "
                     "if DONT_NEGATE_STRIDES was used to prevent reverse "
@@ -1014,7 +1010,7 @@ NpyIter_CreateCompatibleStrides(NpyIter *iter,
             return NPY_FAIL;
         }
         else {
-            outstrides[ndim-p-1] = itemsize;
+            outstrides[axis] = itemsize;
         }
 
         itemsize *= NAD_SHAPE(axisdata);
diff --git a/numpy/core/src/multiarray/nditer_constr.c b/numpy/core/src/multiarray/nditer_constr.c
index 620c7d593..f2dbc9f03 100644
--- a/numpy/core/src/multiarray/nditer_constr.c
+++ b/numpy/core/src/multiarray/nditer_constr.c
@@ -57,11 +57,12 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                     char **op_dataptr,
                     const npy_uint32 *op_flags, int **op_axes,
                     npy_intp const *itershape);
+static NPY_INLINE int
+npyiter_get_op_axis(int axis, npy_bool *reduction_axis);
 static void
-npyiter_replace_axisdata(NpyIter *iter, int iop,
-                      PyArrayObject *op,
-                      int op_ndim, char *op_dataptr,
-                      int *op_axes);
+npyiter_replace_axisdata(
+        NpyIter *iter, int iop, PyArrayObject *op,
+        int orig_op_ndim, const int *op_axes);
 static void
 npyiter_compute_index_strides(NpyIter *iter, npy_uint32 flags);
 static void
@@ -826,7 +827,8 @@ npyiter_check_op_axes(int nop, int oa_ndim, int **op_axes,
         if (axes != NULL) {
             memset(axes_dupcheck, 0, NPY_MAXDIMS);
             for (idim = 0; idim < oa_ndim; ++idim) {
-                int i = axes[idim];
+                int i = npyiter_get_op_axis(axes[idim], NULL);
+
                 if (i >= 0) {
                     if (i >= NPY_MAXDIMS) {
                         PyErr_Format(PyExc_ValueError,
@@ -1391,6 +1393,61 @@ check_mask_for_writemasked_reduction(NpyIter *iter, int iop)
 }
 
 /*
+ * Check whether a reduction is OK based on the flags and the operand being
+ * readwrite. This path is deprecated, since usually only specific axes
+ * should be reduced. If axes are specified explicitely, the flag is
+ * unnecessary.
+ */
+static int
+npyiter_check_reduce_ok_and_set_flags(
+        NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itflags,
+        int dim) {
+    /* If it's writeable, this means a reduction */
+    if (*op_itflags & NPY_OP_ITFLAG_WRITE) {
+        if (!(flags & NPY_ITER_REDUCE_OK)) {
+            PyErr_Format(PyExc_ValueError,
+                    "output operand requires a reduction along dimension %d, "
+                    "but the reduction is not enabled. The dimension size of 1 "
+                    "does not match the expected output shape.", dim);
+            return 0;
+        }
+        if (!(*op_itflags & NPY_OP_ITFLAG_READ)) {
+            PyErr_SetString(PyExc_ValueError,
+                    "output operand requires a reduction, but is flagged as "
+                    "write-only, not read-write");
+            return 0;
+        }
+        NPY_IT_DBG_PRINT("Iterator: Indicating that a reduction is"
+                         "occurring\n");
+
+        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
+        *op_itflags |= NPY_OP_ITFLAG_REDUCE;
+    }
+    return 1;
+}
+
+/**
+ * Removes the (additive) NPY_ITER_REDUCTION_AXIS indication and sets
+ * is_forced_broadcast to 1 if it is set. Otherwise to 0.
+ *
+ * @param axis The op_axes[i] to normalize.
+ * @param reduction_axis Output 1 if a reduction axis, otherwise 0.
+ * @returns The normalized axis (without reduce axis flag).
+ */
+static NPY_INLINE int
+npyiter_get_op_axis(int axis, npy_bool *reduction_axis) {
+    npy_bool forced_broadcast = axis >= NPY_ITER_REDUCTION_AXIS(-1);
+
+    if (reduction_axis != NULL) {
+        *reduction_axis = forced_broadcast;
+    }
+    if (forced_broadcast) {
+        return axis - NPY_ITER_REDUCTION_AXIS(0);
+    }
+    return axis;
+}
+
+/*
  * Fills in the AXISDATA for the 'nop' operands, broadcasting
  * the dimensionas as necessary.  Also fills
  * in the ITERSIZE data member.
@@ -1450,8 +1507,9 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                     return 0;
                 }
                 for (idim = 0; idim < ondim; ++idim) {
-                    npy_intp bshape = broadcast_shape[idim+ndim-ondim],
-                                      op_shape = shape[idim];
+                    npy_intp bshape = broadcast_shape[idim+ndim-ondim];
+                    npy_intp op_shape = shape[idim];
+
                     if (bshape == 1) {
                         broadcast_shape[idim+ndim-ondim] = op_shape;
                     }
@@ -1463,11 +1521,13 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
             else {
                 int *axes = op_axes[iop];
                 for (idim = 0; idim < ndim; ++idim) {
-                    int i = axes[idim];
+                    int i = npyiter_get_op_axis(axes[idim], NULL);
+
                     if (i >= 0) {
                         if (i < ondim) {
-                            npy_intp bshape = broadcast_shape[idim],
-                                              op_shape = shape[i];
+                            npy_intp bshape = broadcast_shape[idim];
+                            npy_intp op_shape = shape[i];
+
                             if (bshape == 1) {
                                 broadcast_shape[idim] = op_shape;
                             }
@@ -1589,9 +1649,37 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
             }
             else {
                 int *axes = op_axes[iop];
-                int i = axes[ndim-idim-1];
-                if (i >= 0) {
-                    if (bshape == 1 || op_cur == NULL) {
+                npy_bool reduction_axis;
+                int i;
+                i = npyiter_get_op_axis(axes[ndim - idim - 1], &reduction_axis);
+
+                if (reduction_axis) {
+                    /* This is explicitly a reduction axis */
+                    strides[iop] = 0;
+                    NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
+                    op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE;
+
+                    if (NPY_UNLIKELY((i >= 0) && (op_cur != NULL) &&
+                            (PyArray_DIM(op_cur, i) != 1))) {
+                        PyErr_Format(PyExc_ValueError,
+                                "operand was set up as a reduction along axis "
+                                "%d, but the length of the axis is %zd "
+                                "(it has to be 1)",
+                                i, (Py_ssize_t)PyArray_DIM(op_cur, i));
+                        return 0;
+                    }
+                }
+                else if (bshape == 1) {
+                    /*
+                     * If the full iterator shape is 1, zero always works.
+                     * NOTE: We thus always allow broadcast dimensions (i = -1)
+                     *       if the shape is 1.
+                     */
+                    strides[iop] = 0;
+                }
+                else if (i >= 0) {
+                    if (op_cur == NULL) {
+                        /* stride is filled later, shape will match `bshape` */
                         strides[iop] = 0;
                     }
                     else if (PyArray_DIM(op_cur, i) == 1) {
@@ -1599,51 +1687,20 @@ npyiter_fill_axisdata(NpyIter *iter, npy_uint32 flags, npyiter_opitflags *op_itf
                         if (op_flags[iop] & NPY_ITER_NO_BROADCAST) {
                             goto operand_different_than_broadcast;
                         }
-                        /* If it's writeable, this means a reduction */
-                        if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) {
-                            if (!(flags & NPY_ITER_REDUCE_OK)) {
-                                PyErr_SetString(PyExc_ValueError,
-                                        "output operand requires a reduction, but "
-                                        "reduction is not enabled");
-                                return 0;
-                            }
-                            if (!(op_itflags[iop] & NPY_OP_ITFLAG_READ)) {
-                                PyErr_SetString(PyExc_ValueError,
-                                        "output operand requires a reduction, but "
-                                        "is flagged as write-only, not "
-                                        "read-write");
-                                return 0;
-                            }
-                            NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
-                            op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE;
+                        if (!npyiter_check_reduce_ok_and_set_flags(
+                                iter, flags, &op_itflags[iop], i)) {
+                            return 0;
                         }
                     }
                     else {
                         strides[iop] = PyArray_STRIDE(op_cur, i);
                     }
                 }
-                else if (bshape == 1) {
-                    strides[iop] = 0;
-                }
                 else {
                     strides[iop] = 0;
-                    /* If it's writeable, this means a reduction */
-                    if (op_itflags[iop] & NPY_OP_ITFLAG_WRITE) {
-                        if (!(flags & NPY_ITER_REDUCE_OK)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output operand requires a reduction, but "
-                                    "reduction is not enabled");
-                            return 0;
-                        }
-                        if (!(op_itflags[iop] & NPY_OP_ITFLAG_READ)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output operand requires a reduction, but "
-                                    "is flagged as write-only, not "
-                                    "read-write");
-                            return 0;
-                        }
-                        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
-                        op_itflags[iop] |= NPY_OP_ITFLAG_REDUCE;
+                    if (!npyiter_check_reduce_ok_and_set_flags(
+                            iter, flags, &op_itflags[iop], i)) {
+                        return 0;
                     }
                 }
             }
@@ -1753,7 +1810,7 @@ broadcast_error: {
 
                     if (axes != NULL) {
                         for (idim = 0; idim < ndim; ++idim) {
-                            npy_intp i = axes[idim];
+                            int i = npyiter_get_op_axis(axes[idim], NULL);
 
                             if (i >= 0 && i < PyArray_NDIM(op[iop])) {
                                 remdims[idim] = PyArray_DIM(op[iop], i);
@@ -1899,14 +1956,14 @@ operand_different_than_broadcast: {
  * array.
  */
 static void
-npyiter_replace_axisdata(NpyIter *iter, int iop,
-                      PyArrayObject *op,
-                      int op_ndim, char *op_dataptr,
-                      int *op_axes)
+npyiter_replace_axisdata(
+        NpyIter *iter, int iop, PyArrayObject *op,
+        int orig_op_ndim, const int *op_axes)
 {
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
     int nop = NIT_NOP(iter);
+    char *op_dataptr = PyArray_DATA(op);
 
     NpyIter_AxisData *axisdata0, *axisdata;
     npy_intp sizeof_axisdata;
@@ -1925,25 +1982,20 @@ npyiter_replace_axisdata(NpyIter *iter, int iop,
 
     if (op_axes != NULL) {
         for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
-            npy_int8 p;
             int i;
+            npy_bool axis_flipped;
             npy_intp shape;
 
-            /* Apply the perm to get the original axis */
-            p = perm[idim];
-            if (p < 0) {
-                i = op_axes[ndim+p];
-            }
-            else {
-                i = op_axes[ndim-p-1];
-            }
+            /* Apply perm to get the original axis, and check if its flipped */
+            i = npyiter_undo_iter_axis_perm(idim, ndim, perm, &axis_flipped);
 
-            if (0 <= i && i < op_ndim) {
+            i = npyiter_get_op_axis(op_axes[i], NULL);
+            assert(i < orig_op_ndim);
+            if (i >= 0) {
                 shape = PyArray_DIM(op, i);
                 if (shape != 1) {
                     npy_intp stride = PyArray_STRIDE(op, i);
-                    if (p < 0) {
-                        /* If the perm entry is negative, flip the axis */
+                    if (axis_flipped) {
                         NAD_STRIDES(axisdata)[iop] = -stride;
                         baseoffset += stride*(shape-1);
                     }
@@ -1956,25 +2008,18 @@ npyiter_replace_axisdata(NpyIter *iter, int iop,
     }
     else {
         for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
-            npy_int8 p;
             int i;
+            npy_bool axis_flipped;
             npy_intp shape;
 
-            /* Apply the perm to get the original axis */
-            p = perm[idim];
-            if (p < 0) {
-                i = op_ndim+p;
-            }
-            else {
-                i = op_ndim-p-1;
-            }
+            i = npyiter_undo_iter_axis_perm(
+                    idim, orig_op_ndim, perm, &axis_flipped);
 
             if (i >= 0) {
                 shape = PyArray_DIM(op, i);
                 if (shape != 1) {
                     npy_intp stride = PyArray_STRIDE(op, i);
-                    if (p < 0) {
-                        /* If the perm entry is negative, flip the axis */
+                    if (axis_flipped) {
                         NAD_STRIDES(axisdata)[iop] = -stride;
                         baseoffset += stride*(shape-1);
                     }
@@ -2461,14 +2506,15 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
 {
     npy_uint32 itflags = NIT_ITFLAGS(iter);
     int idim, ndim = NIT_NDIM(iter);
+    int used_op_ndim;
     int nop = NIT_NOP(iter);
 
     npy_int8 *perm = NIT_PERM(iter);
-    npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS],
-             stride = op_dtype->elsize;
+    npy_intp new_shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp stride = op_dtype->elsize;
     NpyIter_AxisData *axisdata;
     npy_intp sizeof_axisdata;
-    npy_intp i;
+    int i;
 
     PyArrayObject *ret;
 
@@ -2495,39 +2541,46 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
     sizeof_axisdata = NIT_AXISDATA_SIZEOF(itflags, ndim, nop);
 
     /* Initialize the strides to invalid values */
-    for (i = 0; i < NPY_MAXDIMS; ++i) {
+    for (i = 0; i < op_ndim; ++i) {
         strides[i] = NPY_MAX_INTP;
     }
 
     if (op_axes != NULL) {
+        used_op_ndim = 0;
         for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
-            npy_int8 p;
+            npy_bool reduction_axis;
 
             /* Apply the perm to get the original axis */
-            p = perm[idim];
-            if (p < 0) {
-                i = op_axes[ndim+p];
-            }
-            else {
-                i = op_axes[ndim-p-1];
-            }
+            i = npyiter_undo_iter_axis_perm(idim, ndim, perm, NULL);
+            i = npyiter_get_op_axis(op_axes[i], &reduction_axis);
 
             if (i >= 0) {
                 NPY_IT_DBG_PRINT3("Iterator: Setting allocated stride %d "
                                     "for iterator dimension %d to %d\n", (int)i,
                                     (int)idim, (int)stride);
+                used_op_ndim += 1;
                 strides[i] = stride;
                 if (shape == NULL) {
-                    new_shape[i] = NAD_SHAPE(axisdata);
+                    if (reduction_axis) {
+                        /* reduction axes always have a length of 1 */
+                        new_shape[i] = 1;
+                    }
+                    else {
+                        new_shape[i] = NAD_SHAPE(axisdata);
+                    }
                     stride *= new_shape[i];
                     if (i >= ndim) {
-                        PyErr_SetString(PyExc_ValueError,
+                        PyErr_Format(PyExc_ValueError,
                                 "automatically allocated output array "
-                                "specified with an inconsistent axis mapping");
+                                "specified with an inconsistent axis mapping; "
+                                "the axis mapping cannot include dimension %d "
+                                "which is too large for the iterator dimension "
+                                "of %d.", i, ndim);
                         return NULL;
                     }
                 }
                 else {
+                    assert(!reduction_axis || shape[i] == 1);
                     stride *= shape[i];
                 }
             }
@@ -2535,44 +2588,25 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
                 if (shape == NULL) {
                     /*
                      * If deleting this axis produces a reduction, but
-                     * reduction wasn't enabled, throw an error
+                     * reduction wasn't enabled, throw an error.
+                     * NOTE: We currently always allow new-axis if the iteration
+                     *       size is 1 (thus allowing broadcasting sometimes).
                      */
-                    if (NAD_SHAPE(axisdata) != 1) {
-                        if (!(flags & NPY_ITER_REDUCE_OK)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output requires a reduction, but "
-                                    "reduction is not enabled");
-                            return NULL;
-                        }
-                        if (!((*op_itflags) & NPY_OP_ITFLAG_READ)) {
-                            PyErr_SetString(PyExc_ValueError,
-                                    "output requires a reduction, but "
-                                    "is flagged as write-only, not read-write");
+                    if (!reduction_axis && NAD_SHAPE(axisdata) != 1) {
+                        if (!npyiter_check_reduce_ok_and_set_flags(
+                                iter, flags, op_itflags, i)) {
                             return NULL;
                         }
-
-                        NPY_IT_DBG_PRINT("Iterator: Indicating that a "
-                                          "reduction is occurring\n");
-                        /* Indicate that a reduction is occurring */
-                        NIT_ITFLAGS(iter) |= NPY_ITFLAG_REDUCE;
-                        (*op_itflags) |= NPY_OP_ITFLAG_REDUCE;
                     }
                 }
             }
         }
     }
     else {
+        used_op_ndim = ndim;
         for (idim = 0; idim < ndim; ++idim, NIT_ADVANCE_AXISDATA(axisdata, 1)) {
-            npy_int8 p;
-
             /* Apply the perm to get the original axis */
-            p = perm[idim];
-            if (p < 0) {
-                i = op_ndim + p;
-            }
-            else {
-                i = op_ndim - p - 1;
-            }
+            i = npyiter_undo_iter_axis_perm(idim, op_ndim, perm, NULL);
 
             if (i >= 0) {
                 NPY_IT_DBG_PRINT3("Iterator: Setting allocated stride %d "
@@ -2590,73 +2624,58 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
         }
     }
 
-    /*
-     * If custom axes were specified, some dimensions may not have been used.
-     * Add the REDUCE itflag if this creates a reduction situation.
-     */
     if (shape == NULL) {
-        /* Ensure there are no dimension gaps in op_axes, and find op_ndim */
-        op_ndim = ndim;
-        if (op_axes != NULL) {
-            for (i = 0; i < ndim; ++i) {
-                if (strides[i] == NPY_MAX_INTP) {
-                    if (op_ndim == ndim) {
-                        op_ndim = i;
-                    }
-                }
-                /*
-                 * If there's a gap in the array's dimensions, it's an error.
-                 * For example, op_axes of [0,2] for the automatically
-                 * allocated output.
-                 */
-                else if (op_ndim != ndim) {
-                    PyErr_SetString(PyExc_ValueError,
-                            "automatically allocated output array "
-                            "specified with an inconsistent axis mapping");
-                    return NULL;
-                }
+        /* If shape was NULL, use the shape we calculated */
+        op_ndim = used_op_ndim;
+        shape = new_shape;
+        /*
+         * If there's a gap in the array's dimensions, it's an error.
+         * For instance, if op_axes [0, 2] is specified, there will a place
+         * in the strides array where the value is not set.
+         */
+        for (i = 0; i < op_ndim; i++) {
+            if (strides[i] == NPY_MAX_INTP) {
+                PyErr_Format(PyExc_ValueError,
+                        "automatically allocated output array "
+                        "specified with an inconsistent axis mapping; "
+                        "the axis mapping is missing an entry for "
+                        "dimension %d.", i);
+                return NULL;
             }
         }
     }
-    else {
-        for (i = 0; i < op_ndim; ++i) {
-            if (strides[i] == NPY_MAX_INTP) {
-                npy_intp factor, new_strides[NPY_MAXDIMS],
-                         itemsize;
-
-                /* Fill in the missing strides in C order */
-                factor = 1;
-                itemsize = op_dtype->elsize;
-                for (i = op_ndim-1; i >= 0; --i) {
-                    if (strides[i] == NPY_MAX_INTP) {
-                        new_strides[i] = factor * itemsize;
-                        factor *= shape[i];
-                    }
-                }
-
-                /*
-                 * Copy the missing strides, and multiply the existing strides
-                 * by the calculated factor.  This way, the missing strides
-                 * are tighter together in memory, which is good for nested
-                 * loops.
-                 */
-                for (i = 0; i < op_ndim; ++i) {
-                    if (strides[i] == NPY_MAX_INTP) {
-                        strides[i] = new_strides[i];
-                    }
-                    else {
-                        strides[i] *= factor;
-                    }
-                }
+    else if (used_op_ndim < op_ndim) {
+        /*
+         * If custom axes were specified, some dimensions may not have
+         * been used. These are additional axes which are ignored in the
+         * iterator but need to be handled here.
+         */
+        npy_intp factor, itemsize, new_strides[NPY_MAXDIMS];
 
-                break;
+        /* Fill in the missing strides in C order */
+        factor = 1;
+        itemsize = op_dtype->elsize;
+        for (i = op_ndim-1; i >= 0; --i) {
+            if (strides[i] == NPY_MAX_INTP) {
+                new_strides[i] = factor * itemsize;
+                factor *= shape[i];
             }
         }
-    }
 
-    /* If shape was NULL, set it to the shape we calculated */
-    if (shape == NULL) {
-        shape = new_shape;
+        /*
+         * Copy the missing strides, and multiply the existing strides
+         * by the calculated factor.  This way, the missing strides
+         * are tighter together in memory, which is good for nested
+         * loops.
+         */
+        for (i = 0; i < op_ndim; ++i) {
+            if (strides[i] == NPY_MAX_INTP) {
+                strides[i] = new_strides[i];
+            }
+            else {
+                strides[i] *= factor;
+            }
+        }
     }
 
     /* Allocate the temporary array */
@@ -2669,6 +2688,11 @@ npyiter_new_temp_array(NpyIter *iter, PyTypeObject *subtype,
 
     /* Double-check that the subtype didn't mess with the dimensions */
     if (subtype != &PyArray_Type) {
+        /*
+         * TODO: the dtype could have a subarray, which adds new dimensions
+         *       to `ret`, that should typically be fine, but will break
+         *       in this branch.
+         */
         if (PyArray_NDIM(ret) != op_ndim ||
                     !PyArray_CompareLists(shape, PyArray_DIMS(ret), op_ndim)) {
             PyErr_SetString(PyExc_RuntimeError,
@@ -2805,16 +2829,21 @@ npyiter_allocate_arrays(NpyIter *iter,
         if (op[iop] == NULL) {
             PyArrayObject *out;
             PyTypeObject *op_subtype;
-            int ondim = ndim;
 
             /* Check whether the subtype was disabled */
             op_subtype = (op_flags[iop] & NPY_ITER_NO_SUBTYPE) ?
                                                 &PyArray_Type : subtype;
 
-            /* Allocate the output array */
+            /*
+             * Allocate the output array.
+             *
+             * Note that here, ndim is always correct if no op_axes was given
+             * (but the actual dimension of op can be larger). If op_axes
+             * is given, ndim is not actually used.
+             */
             out = npyiter_new_temp_array(iter, op_subtype,
                                         flags, &op_itflags[iop],
-                                        ondim,
+                                        ndim,
                                         NULL,
                                         op_dtype[iop],
                                         op_axes ? op_axes[iop] : NULL);
@@ -2828,8 +2857,8 @@ npyiter_allocate_arrays(NpyIter *iter,
              * Now we need to replace the pointers and strides with values
              * from the new array.
              */
-            npyiter_replace_axisdata(iter, iop, op[iop], ondim,
-                    PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
+            npyiter_replace_axisdata(iter, iop, op[iop], ndim,
+                    op_axes ? op_axes[iop] : NULL);
 
             /*
              * New arrays are guaranteed true-aligned, but copy/cast code
@@ -2870,8 +2899,7 @@ npyiter_allocate_arrays(NpyIter *iter,
              * Now we need to replace the pointers and strides with values
              * from the temporary array.
              */
-            npyiter_replace_axisdata(iter, iop, op[iop], 0,
-                    PyArray_DATA(op[iop]), NULL);
+            npyiter_replace_axisdata(iter, iop, op[iop], 0, NULL);
 
             /*
              * New arrays are guaranteed true-aligned, but copy/cast code
@@ -2943,7 +2971,7 @@ npyiter_allocate_arrays(NpyIter *iter,
              * from the temporary array.
              */
             npyiter_replace_axisdata(iter, iop, op[iop], ondim,
-                    PyArray_DATA(op[iop]), op_axes ? op_axes[iop] : NULL);
+                    op_axes ? op_axes[iop] : NULL);
 
             /*
              * New arrays are guaranteed true-aligned, but copy/cast code
diff --git a/numpy/core/src/multiarray/nditer_impl.h b/numpy/core/src/multiarray/nditer_impl.h
index 5fb146026..1477c8631 100644
--- a/numpy/core/src/multiarray/nditer_impl.h
+++ b/numpy/core/src/multiarray/nditer_impl.h
@@ -301,6 +301,41 @@ struct NpyIter_AD {
         NIT_AXISDATA_SIZEOF(itflags, ndim, nop)*(ndim ? ndim : 1))
 
 /* Internal helper functions shared between implementation files */
+
+/**
+ * Undo the axis permutation of the iterator. When the operand has fewer
+ * dimensions then the iterator, this can return negative values for
+ * inserted (broadcast) dimensions.
+ *
+ * @param axis Axis for which to undo the iterator axis permutation.
+ * @param ndim If `op_axes` is being used, this is the iterator dimension,
+ *             otherwise this is the operand dimension.
+ * @param perm The iterator axis permutation NIT_PERM(iter)
+ * @param axis_flipped Will be set to true if this is a flipped axis
+ *        (i.e. is iterated in reversed order) and otherwise false.
+ *        Can be NULL if the information is not needed.
+ * @return The unpermuted axis. Without `op_axes` this is correct, with
+ *         `op_axes` this indexes into `op_axes` (unpermuted iterator axis)
+ */
+static NPY_INLINE int
+npyiter_undo_iter_axis_perm(
+        int axis, int ndim, const npy_int8 *perm, npy_bool *axis_flipped)
+{
+    npy_int8 p = perm[axis];
+    /* The iterator treats axis reversed, thus adjust by ndim */
+    npy_bool flipped = p < 0;
+    if (axis_flipped != NULL) {
+        *axis_flipped = flipped;
+    }
+    if (flipped) {
+        axis = ndim + p;
+    }
+    else {
+        axis = ndim - p - 1;
+    }
+    return axis;
+}
+
 NPY_NO_EXPORT void
 npyiter_coalesce_axes(NpyIter *iter);
 NPY_NO_EXPORT int
diff --git a/numpy/core/src/umath/reduction.c b/numpy/core/src/umath/reduction.c
index 79adb0051..4037a4757 100644
--- a/numpy/core/src/umath/reduction.c
+++ b/numpy/core/src/umath/reduction.c
@@ -25,235 +25,6 @@
 #include "reduction.h"
 #include "extobj.h"  /* for _check_ufunc_fperr */
 
-/*
- * Allocates a result array for a reduction operation, with
- * dimensions matching 'arr' except set to 1 with 0 stride
- * wherever axis_flags is True. Dropping the reduction axes
- * from the result must be done later by the caller once the
- * computation is complete.
- *
- * This function always allocates a base class ndarray.
- *
- * If 'dtype' isn't NULL, this function steals its reference.
- */
-static PyArrayObject *
-allocate_reduce_result(PyArrayObject *arr, const npy_bool *axis_flags,
-                        PyArray_Descr *dtype, int subok)
-{
-    npy_intp strides[NPY_MAXDIMS], stride;
-    npy_intp shape[NPY_MAXDIMS], *arr_shape = PyArray_DIMS(arr);
-    npy_stride_sort_item strideperm[NPY_MAXDIMS];
-    int idim, ndim = PyArray_NDIM(arr);
-
-    if (dtype == NULL) {
-        dtype = PyArray_DTYPE(arr);
-        Py_INCREF(dtype);
-    }
-
-    PyArray_CreateSortedStridePerm(PyArray_NDIM(arr),
-                                    PyArray_STRIDES(arr), strideperm);
-
-    /* Build the new strides and shape */
-    stride = dtype->elsize;
-    if (ndim) {
-        memcpy(shape, arr_shape, ndim * sizeof(shape[0]));
-    }
-    for (idim = ndim-1; idim >= 0; --idim) {
-        npy_intp i_perm = strideperm[idim].perm;
-        if (axis_flags[i_perm]) {
-            strides[i_perm] = 0;
-            shape[i_perm] = 1;
-        }
-        else {
-            strides[i_perm] = stride;
-            stride *= shape[i_perm];
-        }
-    }
-
-    /* Finally, allocate the array */
-    return (PyArrayObject *)PyArray_NewFromDescr(
-                                    subok ? Py_TYPE(arr) : &PyArray_Type,
-                                    dtype, ndim, shape, strides,
-                                    NULL, 0, subok ? (PyObject *)arr : NULL);
-}
-
-/*
- * Conforms an output parameter 'out' to have 'ndim' dimensions
- * with dimensions of size one added in the appropriate places
- * indicated by 'axis_flags'.
- *
- * The return value is a view into 'out'.
- */
-static PyArrayObject *
-conform_reduce_result(PyArrayObject *in, const npy_bool *axis_flags,
-                      PyArrayObject *out, int keepdims, const char *funcname,
-                      int need_copy)
-{
-    /* unpack shape information */
-    int const ndim = PyArray_NDIM(in);
-    npy_intp const *shape_in = PyArray_DIMS(in);
-
-    int const ndim_out = PyArray_NDIM(out);
-    npy_intp const *strides_out = PyArray_STRIDES(out);
-    npy_intp const *shape_out = PyArray_DIMS(out);
-
-    /*
-     * If the 'keepdims' parameter is true, do a simpler validation and
-     * return a new reference to 'out'.
-     */
-    if (keepdims) {
-        if (PyArray_NDIM(out) != ndim) {
-            PyErr_Format(PyExc_ValueError,
-                    "output parameter for reduction operation %s "
-                    "has the wrong number of dimensions (must match "
-                    "the operand's when keepdims=True)", funcname);
-            return NULL;
-        }
-
-        for (int idim = 0; idim < ndim; ++idim) {
-            if (axis_flags[idim]) {
-                if (shape_out[idim] != 1) {
-                    PyErr_Format(PyExc_ValueError,
-                            "output parameter for reduction operation %s "
-                            "has a reduction dimension not equal to one "
-                            "(required when keepdims=True)", funcname);
-                    return NULL;
-                }
-            }
-            else {
-                if (shape_out[idim] != shape_in[idim]) {
-                    PyErr_Format(PyExc_ValueError,
-                            "output parameter for reduction operation %s "
-                            "has a non-reduction dimension not equal to "
-                            "the input one.", funcname);
-                    return NULL;
-                }
-            }
-
-        }
-
-        Py_INCREF(out);
-        return out;
-    }
-
-    /* Construct the strides and shape */
-    npy_intp strides[NPY_MAXDIMS], shape[NPY_MAXDIMS];
-    int idim_out = 0;
-    for (int idim = 0; idim < ndim; ++idim) {
-        if (axis_flags[idim]) {
-            strides[idim] = 0;
-            shape[idim] = 1;
-        }
-        else {
-            if (idim_out >= ndim_out) {
-                PyErr_Format(PyExc_ValueError,
-                        "output parameter for reduction operation %s "
-                        "does not have enough dimensions", funcname);
-                return NULL;
-            }
-            if (shape_out[idim_out] != shape_in[idim]) {
-                PyErr_Format(PyExc_ValueError,
-                        "output parameter for reduction operation %s "
-                        "has a non-reduction dimension not equal to "
-                        "the input one.", funcname);
-                return NULL;
-            }
-            strides[idim] = strides_out[idim_out];
-            shape[idim] = shape_out[idim_out];
-            ++idim_out;
-        }
-    }
-
-    if (idim_out != ndim_out) {
-        PyErr_Format(PyExc_ValueError,
-                "output parameter for reduction operation %s "
-                "has too many dimensions", funcname);
-        return NULL;
-    }
-
-    /* Allocate the view */
-    PyArray_Descr *dtype = PyArray_DESCR(out);
-    Py_INCREF(dtype);
-
-    PyArrayObject_fields *ret = (PyArrayObject_fields *)PyArray_NewFromDescrAndBase(
-            &PyArray_Type, dtype,
-            ndim, shape, strides, PyArray_DATA(out),
-            PyArray_FLAGS(out), NULL, (PyObject *)out);
-    if (ret == NULL) {
-        return NULL;
-    }
-
-    if (need_copy) {
-        PyArrayObject *ret_copy = (PyArrayObject *)PyArray_NewLikeArray(
-                (PyArrayObject *)ret, NPY_ANYORDER, NULL, 0);
-        if (ret_copy == NULL) {
-            Py_DECREF(ret);
-            return NULL;
-        }
-
-        if (PyArray_CopyInto(ret_copy, (PyArrayObject *)ret) != 0) {
-            Py_DECREF(ret);
-            Py_DECREF(ret_copy);
-            return NULL;
-        }
-
-        if (PyArray_SetWritebackIfCopyBase(ret_copy, (PyArrayObject *)ret) < 0) {
-            Py_DECREF(ret_copy);
-            return NULL;
-        }
-
-        return ret_copy;
-    }
-    else {
-        return (PyArrayObject *)ret;
-    }
-}
-
-/*
- * Creates a result for reducing 'operand' along the axes specified
- * in 'axis_flags'. If 'dtype' isn't NULL, this function steals a
- * reference to 'dtype'.
- *
- * If 'out' isn't NULL, this function creates a view conforming
- * to the number of dimensions of 'operand', adding a singleton dimension
- * for each reduction axis specified. In this case, 'dtype' is ignored
- * (but its reference is still stolen), and the caller must handle any
- * type conversion/validity check for 'out'
- *
- * If 'subok' is true, creates a result with the subtype of 'operand',
- * otherwise creates on with the base ndarray class.
- *
- * If 'out' is NULL, it allocates a new array whose shape matches that of
- * 'operand', except for at the reduction axes. If 'dtype' is NULL, the dtype
- * of 'operand' is used for the result.
- */
-NPY_NO_EXPORT PyArrayObject *
-PyArray_CreateReduceResult(PyArrayObject *operand, PyArrayObject *out,
-                           PyArray_Descr *dtype, npy_bool *axis_flags,
-                           int keepdims, int subok,
-                           const char *funcname)
-{
-    PyArrayObject *result;
-
-    if (out == NULL) {
-        /* This function steals the reference to 'dtype' */
-        result = allocate_reduce_result(operand, axis_flags, dtype, subok);
-    }
-    else {
-        int need_copy = 0;
-
-        if (solve_may_share_memory(operand, out, 1) != 0) {
-            need_copy = 1;
-        }
-
-        /* Steal the dtype reference */
-        Py_XDECREF(dtype);
-        result = conform_reduce_result(operand, axis_flags,
-                                       out, keepdims, funcname, need_copy);
-    }
-
-    return result;
-}
 
 /*
  * Count the number of dimensions selected in 'axis_flags'
@@ -275,18 +46,15 @@ count_axes(int ndim, const npy_bool *axis_flags)
 /*
  * This function initializes a result array for a reduction operation
  * which has no identity. This means it needs to copy the first element
- * it sees along the reduction axes to result, then return a view of
- * the operand which excludes that element.
+ * it sees along the reduction axes to result.
  *
  * If a reduction has an identity, such as 0 or 1, the result should be
- * initialized by calling PyArray_AssignZero(result, NULL, NULL) or
- * PyArray_AssignOne(result, NULL, NULL), because this function raises an
- * exception when there are no elements to reduce (which appropriate iff the
- * reduction operation has no identity).
+ * fully initialized to the identity, because this function raises an
+ * exception when there are no elements to reduce (which is appropriate if,
+ * and only if, the reduction operation has no identity).
  *
  * This means it copies the subarray indexed at zero along each reduction axis
- * into 'result', then returns a view into 'operand' excluding those copied
- * elements.
+ * into 'result'.
  *
  * result  : The array into which the result is computed. This must have
  *           the same number of dimensions as 'operand', but for each
@@ -294,105 +62,83 @@ count_axes(int ndim, const npy_bool *axis_flags)
  * operand : The array being reduced.
  * axis_flags : An array of boolean flags, one for each axis of 'operand'.
  *              When a flag is True, it indicates to reduce along that axis.
- * out_skip_first_count : This gets populated with the number of first-visit
- *                        elements that should be skipped during the
- *                        iteration loop.
  * funcname : The name of the reduction operation, for the purpose of
  *            better quality error messages. For example, "numpy.max"
  *            would be a good name for NumPy's max function.
  *
- * Returns a view which contains the remaining elements on which to do
- * the reduction.
+ * Returns -1 if an error occurred, and otherwise the reduce arrays size,
+ * which is the number of elements already initialized.
  */
-NPY_NO_EXPORT PyArrayObject *
-PyArray_InitializeReduceResult(
+NPY_NO_EXPORT int
+PyArray_CopyInitialReduceValues(
                     PyArrayObject *result, PyArrayObject *operand,
-                    const npy_bool *axis_flags,
-                    npy_intp *out_skip_first_count, const char *funcname)
+                    const npy_bool *axis_flags, const char *funcname,
+                    int keepdims)
 {
-    npy_intp *strides, *shape, shape_orig[NPY_MAXDIMS];
+    npy_intp shape[NPY_MAXDIMS], strides[NPY_MAXDIMS];
+    npy_intp *shape_orig = PyArray_SHAPE(operand);
+    npy_intp *strides_orig = PyArray_STRIDES(operand);
     PyArrayObject *op_view = NULL;
-    int idim, ndim, nreduce_axes;
 
-    ndim = PyArray_NDIM(operand);
-
-    /* Default to no skipping first-visit elements in the iteration */
-    *out_skip_first_count = 0;
-
-    /* Take a view into 'operand' which we can modify. */
-    op_view = (PyArrayObject *)PyArray_View(operand, NULL, &PyArray_Type);
-    if (op_view == NULL) {
-        return NULL;
-    }
+    int ndim = PyArray_NDIM(operand);
 
     /*
-     * Now copy the subarray of the first element along each reduction axis,
-     * then return a view to the rest.
+     * Copy the subarray of the first element along each reduction axis.
      *
      * Adjust the shape to only look at the first element along
-     * any of the reduction axes. We count the number of reduction axes
-     * at the same time.
+     * any of the reduction axes. If keepdims is False remove the axes
+     * entirely.
      */
-    shape = PyArray_SHAPE(op_view);
-    nreduce_axes = 0;
-    if (ndim) {
-        memcpy(shape_orig, shape, ndim * sizeof(npy_intp));
-    }
-    for (idim = 0; idim < ndim; ++idim) {
+    int idim_out = 0;
+    npy_intp size = 1;
+    for (int idim = 0; idim < ndim; idim++) {
         if (axis_flags[idim]) {
-            if (shape[idim] == 0) {
+            if (NPY_UNLIKELY(shape_orig[idim] == 0)) {
                 PyErr_Format(PyExc_ValueError,
-                             "zero-size array to reduction operation %s "
-                             "which has no identity",
-                             funcname);
-                Py_DECREF(op_view);
-                return NULL;
+                        "zero-size array to reduction operation %s "
+                        "which has no identity", funcname);
+                return -1;
             }
-            shape[idim] = 1;
-            ++nreduce_axes;
+            if (keepdims) {
+                shape[idim_out] = 1;
+                strides[idim_out] = 0;
+                idim_out++;
+            }
+        }
+        else {
+            size *= shape_orig[idim];
+            shape[idim_out] = shape_orig[idim];
+            strides[idim_out] = strides_orig[idim];
+            idim_out++;
         }
     }
 
-    /*
-     * Copy the elements into the result to start.
-     */
-    if (PyArray_CopyInto(result, op_view) < 0) {
-        Py_DECREF(op_view);
-        return NULL;
+    PyArray_Descr *descr = PyArray_DESCR(operand);
+    Py_INCREF(descr);
+    op_view = (PyArrayObject *)PyArray_NewFromDescr(
+            &PyArray_Type, descr, idim_out, shape, strides,
+            PyArray_DATA(operand), 0, NULL);
+    if (op_view == NULL) {
+        return -1;
     }
 
     /*
-     * If there is one reduction axis, adjust the view's
-     * shape to only look at the remaining elements
+     * Copy the elements into the result to start.
      */
-    if (nreduce_axes == 1) {
-        strides = PyArray_STRIDES(op_view);
-        for (idim = 0; idim < ndim; ++idim) {
-            if (axis_flags[idim]) {
-                shape[idim] = shape_orig[idim] - 1;
-                ((PyArrayObject_fields *)op_view)->data += strides[idim];
-            }
-        }
-    }
-    /* If there are zero reduction axes, make the view empty */
-    else if (nreduce_axes == 0) {
-        for (idim = 0; idim < ndim; ++idim) {
-            shape[idim] = 0;
-        }
+    int res = PyArray_CopyInto(result, op_view);
+    Py_DECREF(op_view);
+    if (res < 0) {
+        return -1;
     }
+
     /*
-     * Otherwise iterate over the whole operand, but tell the inner loop
-     * to skip the elements we already copied by setting the skip_first_count.
+     * If there were no reduction axes, we would already be done here.
+     * Note that if there is only a single reduction axis, in principle the
+     * iteration could be set up more efficiently here by removing that
+     * axis before setting up the iterator (simplifying the iteration since
+     * `skip_first_count` (the returned size) can be set to 0).
      */
-    else {
-        *out_skip_first_count = PyArray_SIZE(result);
-
-        Py_DECREF(op_view);
-        Py_INCREF(operand);
-        op_view = operand;
-    }
-
-    return op_view;
+    return size;
 }
 
 /*
@@ -418,7 +164,7 @@ PyArray_InitializeReduceResult(
  *               with size one.
  * subok       : If true, the result uses the subclass of operand, otherwise
  *               it is always a base class ndarray.
- * identity    : If Py_None, PyArray_InitializeReduceResult is used, otherwise
+ * identity    : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise
  *               this value is used to initialize the result to
  *               the reduction's unit.
  * loop        : The loop which does the reduction.
@@ -444,13 +190,12 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                       NPY_CASTING casting,
                       npy_bool *axis_flags, int reorderable,
                       int keepdims,
-                      int subok,
                       PyObject *identity,
                       PyArray_ReduceLoopFunc *loop,
                       void *data, npy_intp buffersize, const char *funcname,
                       int errormask)
 {
-    PyArrayObject *result = NULL, *op_view = NULL;
+    PyArrayObject *result = NULL;
     npy_intp skip_first_count = 0;
 
     /* Iterator parameters */
@@ -476,49 +221,10 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
         return NULL;
     }
 
-    /*
-     * This either conforms 'out' to the ndim of 'operand', or allocates
-     * a new array appropriate for this reduction.
-     *
-     * A new array with WRITEBACKIFCOPY is allocated if operand and out have memory
-     * overlap.
-     */
-    Py_INCREF(result_dtype);
-    result = PyArray_CreateReduceResult(operand, out,
-                            result_dtype, axis_flags,
-                            keepdims, subok, funcname);
-    if (result == NULL) {
-        goto fail;
-    }
-
-    /*
-     * Initialize the result to the reduction unit if possible,
-     * otherwise copy the initial values and get a view to the rest.
-     */
-    if (identity != Py_None) {
-        if (PyArray_FillWithScalar(result, identity) < 0) {
-            goto fail;
-        }
-        op_view = operand;
-        Py_INCREF(op_view);
-    }
-    else {
-        op_view = PyArray_InitializeReduceResult(
-            result, operand, axis_flags, &skip_first_count, funcname);
-        if (op_view == NULL) {
-            goto fail;
-        }
-        /* empty op_view signals no reduction; but 0-d arrays cannot be empty */
-        if ((PyArray_SIZE(op_view) == 0) || (PyArray_NDIM(operand) == 0)) {
-            Py_DECREF(op_view);
-            op_view = NULL;
-            goto finish;
-        }
-    }
 
     /* Set up the iterator */
-    op[0] = result;
-    op[1] = op_view;
+    op[0] = out;
+    op[1] = operand;
     op_dtypes[0] = result_dtype;
     op_dtypes[1] = operand_dtype;
 
@@ -527,13 +233,17 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
             NPY_ITER_GROWINNER |
             NPY_ITER_DONT_NEGATE_STRIDES |
             NPY_ITER_ZEROSIZE_OK |
-            NPY_ITER_REDUCE_OK |
-            NPY_ITER_REFS_OK;
+            NPY_ITER_REFS_OK |
+            NPY_ITER_DELAY_BUFALLOC |
+            NPY_ITER_COPY_IF_OVERLAP;
     op_flags[0] = NPY_ITER_READWRITE |
                   NPY_ITER_ALIGNED |
+                  NPY_ITER_ALLOCATE |
                   NPY_ITER_NO_SUBTYPE;
     op_flags[1] = NPY_ITER_READONLY |
-                  NPY_ITER_ALIGNED;
+                  NPY_ITER_ALIGNED |
+                  NPY_ITER_NO_BROADCAST;
+
     if (wheremask != NULL) {
         op[2] = wheremask;
         /* wheremask is guaranteed to be NPY_BOOL, so borrow its reference */
@@ -545,15 +255,84 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
         op_flags[2] = NPY_ITER_READONLY;
     }
 
+    /* Set up result array axes mapping, operand and wheremask use default */
+    int result_axes[NPY_MAXDIMS];
+    int *op_axes[3] = {result_axes, NULL, NULL};
+
+    int curr_axis = 0;
+    for (int i = 0; i < PyArray_NDIM(operand); i++) {
+        if (axis_flags[i]) {
+            if (keepdims) {
+                result_axes[i] = NPY_ITER_REDUCTION_AXIS(curr_axis);
+                curr_axis++;
+            }
+            else {
+                result_axes[i] = NPY_ITER_REDUCTION_AXIS(-1);
+            }
+        }
+        else {
+            result_axes[i] = curr_axis;
+            curr_axis++;
+        }
+    }
+    if (out != NULL) {
+        /* NpyIter does not raise a good error message in this common case. */
+        if (NPY_UNLIKELY(curr_axis != PyArray_NDIM(out))) {
+            if (keepdims) {
+                PyErr_Format(PyExc_ValueError,
+                        "output parameter for reduction operation %s has the "
+                        "wrong number of dimensions: Found %d but expected %d "
+                        "(must match the operand's when keepdims=True)",
+                        funcname, PyArray_NDIM(out), curr_axis);
+            }
+            else {
+                PyErr_Format(PyExc_ValueError,
+                        "output parameter for reduction operation %s has the "
+                        "wrong number of dimensions: Found %d but expected %d",
+                        funcname, PyArray_NDIM(out), curr_axis);
+            }
+            goto fail;
+        }
+    }
+
     iter = NpyIter_AdvancedNew(wheremask == NULL ? 2 : 3, op, flags,
                                NPY_KEEPORDER, casting,
                                op_flags,
                                op_dtypes,
-                               -1, NULL, NULL, buffersize);
+                               PyArray_NDIM(operand), op_axes, NULL, buffersize);
     if (iter == NULL) {
         goto fail;
     }
 
+    result = NpyIter_GetOperandArray(iter)[0];
+
+    /*
+     * Initialize the result to the reduction unit if possible,
+     * otherwise copy the initial values and get a view to the rest.
+     */
+
+    if (identity != Py_None) {
+        if (PyArray_FillWithScalar(result, identity) < 0) {
+            goto fail;
+        }
+    }
+    else {
+        /*
+         * For 1-D skip_first_count could be optimized to 0, but no-identity
+         * reductions are not super common.
+         * (see also comment in CopyInitialReduceValues)
+         */
+        skip_first_count = PyArray_CopyInitialReduceValues(
+                result, operand, axis_flags, funcname, keepdims);
+        if (skip_first_count < 0) {
+            goto fail;
+        }
+    }
+
+    if (!NpyIter_Reset(iter, NULL)) {
+        goto fail;
+    }
+
     /* Start with the floating-point exception flags cleared */
     npy_clear_floatstatus_barrier((char*)&iter);
 
@@ -595,29 +374,15 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
         goto fail;
     }
 
-    NpyIter_Deallocate(iter);
-    Py_DECREF(op_view);
-
-finish:
-    /* Strip out the extra 'one' dimensions in the result */
-    if (out == NULL) {
-        if (!keepdims) {
-            PyArray_RemoveAxesInPlace(result, axis_flags);
-        }
-    }
-    else {
-        PyArray_ResolveWritebackIfCopy(result); /* prevent spurious warnings */
-        Py_DECREF(result);
+    if (out != NULL) {
         result = out;
-        Py_INCREF(result);
     }
+    Py_INCREF(result);
 
+    NpyIter_Deallocate(iter);
     return result;
 
 fail:
-    PyArray_ResolveWritebackIfCopy(result); /* prevent spurious warnings */
-    Py_XDECREF(result);
-    Py_XDECREF(op_view);
     if (iter != NULL) {
         NpyIter_Deallocate(iter);
     }
diff --git a/numpy/core/src/umath/reduction.h b/numpy/core/src/umath/reduction.h
index 0c2183ed6..372605dba 100644
--- a/numpy/core/src/umath/reduction.h
+++ b/numpy/core/src/umath/reduction.h
@@ -128,9 +128,7 @@ typedef int (PyArray_ReduceLoopFunc)(NpyIter *iter,
  *               of cache behavior or multithreading requirements.
  * keepdims    : If true, leaves the reduction dimensions in the result
  *               with size one.
- * subok       : If true, the result uses the subclass of operand, otherwise
- *               it is always a base class ndarray.
- * identity    : If Py_None, PyArray_InitializeReduceResult is used, otherwise
+ * identity    : If Py_None, PyArray_CopyInitialReduceValues is used, otherwise
  *               this value is used to initialize the result to
  *               the reduction's unit.
  * loop        : The loop which does the reduction.
@@ -147,7 +145,6 @@ PyUFunc_ReduceWrapper(PyArrayObject *operand, PyArrayObject *out,
                       NPY_CASTING casting,
                       npy_bool *axis_flags, int reorderable,
                       int keepdims,
-                      int subok,
                       PyObject *identity,
                       PyArray_ReduceLoopFunc *loop,
                       void *data, npy_intp buffersize, const char *funcname,
diff --git a/numpy/core/src/umath/ufunc_object.c b/numpy/core/src/umath/ufunc_object.c
index 591fbd7ad..b35f377d7 100644
--- a/numpy/core/src/umath/ufunc_object.c
+++ b/numpy/core/src/umath/ufunc_object.c
@@ -2701,9 +2701,13 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
             }
         }
 
-        /* Any output core dimensions shape should be ignored */
+        /*
+         * Any output core dimensions shape should be ignored, so we add
+         * it as a Reduce dimension (which can be broadcast with the rest).
+         * These will be removed before the actual iteration for gufuncs.
+         */
         for (idim = broadcast_ndim; idim < iter_ndim; ++idim) {
-            op_axes_arrays[i][idim] = -1;
+            op_axes_arrays[i][idim] = NPY_ITER_REDUCTION_AXIS(-1);
         }
 
         /* Except for when it belongs to this output */
@@ -2775,7 +2779,7 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
      */
     _ufunc_setup_flags(ufunc, NPY_ITER_COPY | NPY_UFUNC_DEFAULT_INPUT_FLAGS,
                        NPY_ITER_UPDATEIFCOPY |
-                       NPY_ITER_READWRITE |
+                       NPY_ITER_WRITEONLY |
                        NPY_UFUNC_DEFAULT_OUTPUT_FLAGS,
                        op_flags);
     /* For the generalized ufunc, we get the loop right away too */
@@ -2824,7 +2828,6 @@ PyUFunc_GeneralizedFunction(PyUFuncObject *ufunc,
     iter_flags = ufunc->iter_flags |
                  NPY_ITER_MULTI_INDEX |
                  NPY_ITER_REFS_OK |
-                 NPY_ITER_REDUCE_OK |
                  NPY_ITER_ZEROSIZE_OK |
                  NPY_ITER_COPY_IF_OVERLAP;
 
@@ -3650,7 +3653,7 @@ PyUFunc_Reduce(PyUFuncObject *ufunc, PyArrayObject *arr, PyArrayObject *out,
     result = PyUFunc_ReduceWrapper(arr, out, wheremask, dtype, dtype,
                                    NPY_UNSAFE_CASTING,
                                    axis_flags, reorderable,
-                                   keepdims, 0,
+                                   keepdims,
                                    initial,
                                    reduce_loop,
                                    ufunc, buffersize, ufunc_name, errormask);
diff --git a/numpy/core/tests/test_nditer.py b/numpy/core/tests/test_nditer.py
index c106c528d..7b3c3a40d 100644
--- a/numpy/core/tests/test_nditer.py
+++ b/numpy/core/tests/test_nditer.py
@@ -1520,6 +1520,13 @@ def test_iter_allocate_output_errors():
                         [['readonly'], ['writeonly', 'allocate']],
                         op_dtypes=[None, np.dtype('f4')],
                         op_axes=[None, [0, 2, 1, 0]])
+    # Not all axes may be specified if a reduction. If there is a hole
+    # in op_axes, this is an error.
+    a = arange(24, dtype='i4').reshape(2, 3, 4)
+    assert_raises(ValueError, nditer, [a, None], ["reduce_ok"],
+                        [['readonly'], ['readwrite', 'allocate']],
+                        op_dtypes=[None, np.dtype('f4')],
+                        op_axes=[None, [0, np.newaxis, 2]])
 
 def test_iter_remove_axis():
     a = arange(24).reshape(2, 3, 4)
@@ -2661,6 +2668,14 @@ def test_iter_allocated_array_dtypes():
         b[1] = a + 1
     assert_equal(it.operands[1], [[0, 2], [2, 4], [19, 21]])
 
+    # Check the same (less sensitive) thing when `op_axes` with -1 is given.
+    it = np.nditer(([[1, 3, 20]], None), op_dtypes=[None, ('i4', (2,))],
+                   flags=["reduce_ok"], op_axes=[None, (-1, 0)])
+    for a, b in it:
+        b[0] = a - 1
+        b[1] = a + 1
+    assert_equal(it.operands[1], [[0, 2], [2, 4], [19, 21]])
+
     # Make sure this works for scalars too
     it = np.nditer((10, 2, None), op_dtypes=[None, None, ('i4', (2, 2))])
     for a, b, c in it:
diff --git a/numpy/core/tests/test_ufunc.py b/numpy/core/tests/test_ufunc.py
index 642308d3b..1305f4877 100644
--- a/numpy/core/tests/test_ufunc.py
+++ b/numpy/core/tests/test_ufunc.py
@@ -610,7 +610,17 @@ class TestUfunc:
             warnings.simplefilter("always")
             u += v
             assert_equal(len(w), 1)
-            assert_(x[0,0]  != u[0, 0])
+            assert_(x[0, 0] != u[0, 0])
+
+        # Output reduction should not be allowed.
+        # See gh-15139
+        a = np.arange(6).reshape(3, 2)
+        b = np.ones(2)
+        out = np.empty(())
+        assert_raises(ValueError, umt.inner1d, a, b, out)
+        out2 = np.empty(3)
+        c = umt.inner1d(a, b, out2)
+        assert_(c is out2)
 
     def test_out_broadcasts(self):
         # For ufuncs and gufuncs (not for reductions), we currently allow
@@ -957,7 +967,10 @@ class TestUfunc:
         assert_array_equal(result, np.vstack((np.zeros(3), a[2], -a[1])))
         assert_raises(ValueError, umt.cross1d, np.eye(4), np.eye(4))
         assert_raises(ValueError, umt.cross1d, a, np.arange(4.))
+        # Wrong output core dimension.
         assert_raises(ValueError, umt.cross1d, a, np.arange(3.), np.zeros((3, 4)))
+        # Wrong output broadcast dimension (see gh-15139).
+        assert_raises(ValueError, umt.cross1d, a, np.arange(3.), np.zeros(3))
 
     def test_can_ignore_signature(self):
         # Comparing the effects of ? in signature:
@@ -1905,22 +1918,47 @@ class TestUfunc:
         assert_equal(y_base[1,:], y_base_copy[1,:])
         assert_equal(y_base[3,:], y_base_copy[3,:])
 
-    @pytest.mark.parametrize('output_shape',
-                             [(), (1,), (1, 1), (1, 3), (4, 3)])
+    @pytest.mark.parametrize('out_shape',
+                             [(), (1,), (3,), (1, 1), (1, 3), (4, 3)])
+    @pytest.mark.parametrize('keepdims', [True, False])
     @pytest.mark.parametrize('f_reduce', [np.add.reduce, np.minimum.reduce])
-    def test_reduce_wrong_dimension_output(self, f_reduce, output_shape):
+    def test_reduce_wrong_dimension_output(self, f_reduce, keepdims, out_shape):
         # Test that we're not incorrectly broadcasting dimensions.
         # See gh-15144 (failed for np.add.reduce previously).
         a = np.arange(12.).reshape(4, 3)
-        out = np.empty(output_shape, a.dtype)
-        assert_raises(ValueError, f_reduce, a, axis=0, out=out)
-        if output_shape != (1, 3):
-            assert_raises(ValueError, f_reduce, a, axis=0, out=out,
-                          keepdims=True)
+        out = np.empty(out_shape, a.dtype)
+
+        correct_out = f_reduce(a, axis=0, keepdims=keepdims)
+        if out_shape != correct_out.shape:
+            with assert_raises(ValueError):
+                f_reduce(a, axis=0, out=out, keepdims=keepdims)
         else:
-            check = f_reduce(a, axis=0, out=out, keepdims=True)
+            check = f_reduce(a, axis=0, out=out, keepdims=keepdims)
             assert_(check is out)
-            assert_array_equal(check, f_reduce(a, axis=0, keepdims=True))
+            assert_array_equal(check, correct_out)
+
+    def test_reduce_output_does_not_broadcast_input(self):
+        # Test that the output shape cannot broadcast an input dimension
+        # (it never can add dimensions, but it might expand an existing one)
+        a = np.ones((1, 10))
+        out_correct = (np.empty((1, 1)))
+        out_incorrect = np.empty((3, 1))
+        np.add.reduce(a, axis=-1, out=out_correct, keepdims=True)
+        np.add.reduce(a, axis=-1, out=out_correct[:, 0], keepdims=False)
+        with assert_raises(ValueError):
+            np.add.reduce(a, axis=-1, out=out_incorrect, keepdims=True)
+        with assert_raises(ValueError):
+            np.add.reduce(a, axis=-1, out=out_incorrect[:, 0], keepdims=False)
+
+    def test_reduce_output_subclass_ok(self):
+        class MyArr(np.ndarray):
+            pass
+
+        out = np.empty(())
+        np.add.reduce(np.ones(5), out=out)  # no subclass, all fine
+        out = out.view(MyArr)
+        assert np.add.reduce(np.ones(5), out=out) is out
+        assert type(np.add.reduce(out)) is MyArr
 
     def test_no_doc_string(self):
         # gh-9337
author	Matti Picus <matti.picus@gmail.com>	2020-06-04 22:03:43 +0300
committer	GitHub <noreply@github.com>	2020-06-04 22:03:43 +0300
commit	a5d021a1b6f439a19812926bc4d796ef5f346c44 (patch)
tree	998f27e5296a2894b41a9600c9bdb8b7806241c1
parent	abce2a0604a4db5d8072a0b1478c7af439882ed7 (diff)
parent	8da4fba24c193807555dfcbc0e67834cae9185ea (diff)
download	numpy-a5d021a1b6f439a19812926bc4d796ef5f346c44.tar.gz