MAINT: push back multifield copy->view changes to 1.16

author: Allan Haldane <allan.haldane@gmail.com> 2018-06-10 21:54:21 -0400
committer: Allan Haldane <allan.haldane@gmail.com> 2018-06-11 11:51:04 -0400
commit: e08eced7990fbdcecb2bd81d3fc736f69bad6dfd (patch)
tree: 9ded5059b440cff8c4f52519692e4c456c66717d
parent: 8525d3e30f3b57332dfd374bc64b068366126b52 (diff)
download: numpy-e08eced7990fbdcecb2bd81d3fc736f69bad6dfd.tar.gz
10 files changed, 297 insertions, 34 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index a269e25f1..cc193530c 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -154,6 +154,16 @@ Since ``np.ma.masked`` is a readonly scalar, copying should be a no-op. These
 functions now behave consistently with ``np.copy()``.
 
 
+Multifield Indexing of Structured Arrays will still return a copy
+-----------------------------------------------------------------
+The change that multi-field indexing of structured arrays returns a view
+instead of a copy is pushed back to 1.16. A new method
+``numpy.lib.recfunctions.repack_fields`` has been introduced to help mitigate
+the effects of this change, which can be used to write code compatible with
+both numpy 1.15 and 1.16. For more information on how to update code to account
+for this future change see "basics/structured arrays/accessing multiple fields"
+in the user guide.
+
 C API changes
 =============
 
diff --git a/numpy/core/src/multiarray/common.c b/numpy/core/src/multiarray/common.c
index f191f8db4..c70f8526e 100644
--- a/numpy/core/src/multiarray/common.c
+++ b/numpy/core/src/multiarray/common.c
@@ -424,7 +424,7 @@ PyArray_DTypeFromObjectHelper(PyObject *obj, int maxdims,
      * __len__ is not defined.
      */
     if (maxdims == 0 || !PySequence_Check(obj) || PySequence_Size(obj) < 0) {
-        // clear any PySequence_Size error, which corrupts further calls to it
+        /* clear any PySequence_Size error which corrupts further calls */
         PyErr_Clear();
 
         if (*out_dtype == NULL || (*out_dtype)->type_num != NPY_OBJECT) {
diff --git a/numpy/core/src/multiarray/convert.c b/numpy/core/src/multiarray/convert.c
index 0e38aaa61..e88582a51 100644
--- a/numpy/core/src/multiarray/convert.c
+++ b/numpy/core/src/multiarray/convert.c
@@ -613,11 +613,14 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
         subtype = Py_TYPE(self);
     }
 
-    if (type != NULL && (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) {
+    dtype = PyArray_DESCR(self);
+
+    if (type != NULL && !PyArray_EquivTypes(dtype, type) &&
+            (PyArray_FLAGS(self) & NPY_ARRAY_WARN_ON_WRITE)) {
         const char *msg =
             "Numpy has detected that you may be viewing or writing to an array "
             "returned by selecting multiple fields in a structured array. \n\n"
-            "This code may break in numpy 1.13 because this will return a view "
+            "This code may break in numpy 1.16 because this will return a view "
             "instead of a copy -- see release notes for details.";
         /* 2016-09-19, 1.12 */
         if (DEPRECATE_FUTUREWARNING(msg) < 0) {
@@ -629,7 +632,6 @@ PyArray_View(PyArrayObject *self, PyArray_Descr *type, PyTypeObject *pytype)
 
     flags = PyArray_FLAGS(self);
 
-    dtype = PyArray_DESCR(self);
     Py_INCREF(dtype);
     ret = (PyArrayObject *)PyArray_NewFromDescr_int(
             subtype, dtype,
diff --git a/numpy/core/src/multiarray/mapping.c b/numpy/core/src/multiarray/mapping.c
index 6a7ffd39d..82a0683f9 100644
--- a/numpy/core/src/multiarray/mapping.c
+++ b/numpy/core/src/multiarray/mapping.c
@@ -1387,14 +1387,54 @@ array_subscript_asarray(PyArrayObject *self, PyObject *op)
 }
 
 /*
+ * Helper function for _get_field_view which turns a multifield
+ * view into a "packed" copy, as done in numpy 1.15 and before.
+ * In numpy 1.16 this function should be removed.
+ */
+NPY_NO_EXPORT int
+_multifield_view_to_copy(PyArrayObject **view) {
+    static PyObject *copyfunc = NULL;
+    PyObject *viewcopy;
+
+    /* return a repacked copy of the view */
+    npy_cache_import("numpy.lib.recfunctions", "repack_fields", &copyfunc);
+    if (copyfunc == NULL) {
+        goto view_fail;
+    }
+
+    PyArray_CLEARFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE);
+    viewcopy = PyObject_CallFunction(copyfunc, "O", *view);
+    if (viewcopy == NULL) {
+        goto view_fail;
+    }
+    Py_DECREF(*view);
+    *view = (PyArrayObject*)viewcopy;
+
+    /* warn when writing to the copy */
+    PyArray_ENABLEFLAGS(*view, NPY_ARRAY_WARN_ON_WRITE);
+    return 0;
+
+view_fail:
+    Py_DECREF(*view);
+    *view = NULL;
+    return 0;
+}
+
+/*
  * Attempts to subscript an array using a field name or list of field names.
  *
  * If an error occurred, return 0 and set view to NULL. If the subscript is not
  * a string or list of strings, return -1 and set view to NULL. Otherwise
  * return 0 and set view to point to a new view into arr for the given fields.
+ *
+ * In numpy 1.15 and before, in the case of a list of field names the returned
+ * view will actually be a copy by default, with fields packed together.
+ * The `force_view` argument causes a view to be returned. This argument can be
+ * removed in 1.16 when we plan to return a view always.
  */
 NPY_NO_EXPORT int
-_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
+_get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view,
+                int force_view)
 {
     *view = NULL;
 
@@ -1489,12 +1529,12 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 Py_DECREF(names);
                 return 0;
             }
-            // disallow use of titles as index
+            /* disallow use of titles as index */
             if (PyTuple_Size(tup) == 3) {
                 PyObject *title = PyTuple_GET_ITEM(tup, 2);
                 int titlecmp = PyObject_RichCompareBool(title, name, Py_EQ);
                 if (titlecmp == 1) {
-                    // if title == name, we were given a title, not a field name
+                    /* if title == name, we got a title, not a field name */
                     PyErr_SetString(PyExc_KeyError,
                                 "cannot use field titles in multi-field index");
                 }
@@ -1507,7 +1547,7 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 }
                 Py_DECREF(title);
             }
-            // disallow duplicate field indices
+            /* disallow duplicate field indices */
             if (PyDict_Contains(fields, name)) {
                 PyObject *errmsg = PyUString_FromString(
                                        "duplicate field of name ");
@@ -1552,10 +1592,16 @@ _get_field_view(PyArrayObject *arr, PyObject *ind, PyArrayObject **view)
                 PyArray_FLAGS(arr),
                 (PyObject *)arr, (PyObject *)arr,
                 0, 1);
+
         if (*view == NULL) {
             return 0;
         }
-        return 0;
+
+        /* the code below can be replaced by "return 0" in 1.16 */
+        if (force_view) {
+            return 0;
+        }
+        return _multifield_view_to_copy(view);
     }
     return -1;
 }
@@ -1583,7 +1629,7 @@ array_subscript(PyArrayObject *self, PyObject *op)
     /* return fields if op is a string index */
     if (PyDataType_HASFIELDS(PyArray_DESCR(self))) {
         PyArrayObject *view;
-        int ret = _get_field_view(self, op, &view);
+        int ret = _get_field_view(self, op, &view, 0);
         if (ret == 0){
             if (view == NULL) {
                 return NULL;
@@ -1865,7 +1911,7 @@ array_assign_subscript(PyArrayObject *self, PyObject *ind, PyObject *op)
     /* field access */
     if (PyDataType_HASFIELDS(PyArray_DESCR(self))){
         PyArrayObject *view;
-        int ret = _get_field_view(self, ind, &view);
+        int ret = _get_field_view(self, ind, &view, 1);
         if (ret == 0){
             if (view == NULL) {
                 return -1;
diff --git a/numpy/core/src/private/npy_import.h b/numpy/core/src/private/npy_import.h
index 221e1e645..e145a843e 100644
--- a/numpy/core/src/private/npy_import.h
+++ b/numpy/core/src/private/npy_import.h
@@ -29,4 +29,18 @@ npy_cache_import(const char *module, const char *attr, PyObject **cache)
     }
 }
 
+NPY_INLINE static PyObject *
+npy_import(const char *module, const char *attr)
+{
+    PyObject *mod = PyImport_ImportModule(module);
+    PyObject *ret = NULL;
+
+    if (mod != NULL) {
+        ret = PyObject_GetAttrString(mod, attr);
+    }
+    Py_XDECREF(mod);
+
+    return ret;
+}
+
 #endif
diff --git a/numpy/core/tests/test_multiarray.py b/numpy/core/tests/test_multiarray.py
index b13e48fdc..37d73e42c 100644
--- a/numpy/core/tests/test_multiarray.py
+++ b/numpy/core/tests/test_multiarray.py
@@ -4796,9 +4796,25 @@ class TestRecord(object):
             fn2 = func('f2')
             b[fn2] = 3
 
-            assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
-            assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
-            assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+            # In 1.16 code below can be replaced by:
+            # assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
+            # assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
+            # assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+            with suppress_warnings() as sup:
+                sup.filter(FutureWarning,
+                           ".* selecting multiple fields .*")
+
+                assert_equal(b[['f1', 'f2']][0].tolist(), (2, 3))
+                assert_equal(b[['f2', 'f1']][0].tolist(), (3, 2))
+                assert_equal(b[['f1', 'f3']][0].tolist(), (2, (1,)))
+                # view of subfield view/copy
+                assert_equal(b[['f1', 'f2']][0].view(('i4', 2)).tolist(),
+                             (2, 3))
+                assert_equal(b[['f2', 'f1']][0].view(('i4', 2)).tolist(),
+                             (3, 2))
+                view_dtype = [('f1', 'i4'), ('f3', [('', 'i4')])]
+                assert_equal(b[['f1', 'f3']][0].view(view_dtype).tolist(),
+                             (2, (1,)))
 
         # non-ascii unicode field indexing is well behaved
         if not is_py3:
@@ -4808,6 +4824,51 @@ class TestRecord(object):
             assert_raises(ValueError, a.__setitem__, u'\u03e0', 1)
             assert_raises(ValueError, a.__getitem__, u'\u03e0')
 
+    # can be removed in 1.16
+    def test_field_names_deprecation(self):
+
+        def collect_warnings(f, *args, **kwargs):
+            with warnings.catch_warnings(record=True) as log:
+                warnings.simplefilter("always")
+                f(*args, **kwargs)
+            return [w.category for w in log]
+
+        a = np.zeros((1,), dtype=[('f1', 'i4'),
+                                  ('f2', 'i4'),
+                                  ('f3', [('sf1', 'i4')])])
+        a['f1'][0] = 1
+        a['f2'][0] = 2
+        a['f3'][0] = (3,)
+        b = np.zeros((1,), dtype=[('f1', 'i4'),
+                                  ('f2', 'i4'),
+                                  ('f3', [('sf1', 'i4')])])
+        b['f1'][0] = 1
+        b['f2'][0] = 2
+        b['f3'][0] = (3,)
+
+        # All the different functions raise a warning, but not an error
+        assert_equal(collect_warnings(a[['f1', 'f2']].__setitem__, 0, (10, 20)),
+                     [FutureWarning])
+        # For <=1.12 a is not modified, but it will be in 1.13
+        assert_equal(a, b)
+
+        # Views also warn
+        subset = a[['f1', 'f2']]
+        subset_view = subset.view()
+        assert_equal(collect_warnings(subset_view['f1'].__setitem__, 0, 10),
+                     [FutureWarning])
+        # But the write goes through:
+        assert_equal(subset['f1'][0], 10)
+        # Only one warning per multiple field indexing, though (even if there
+        # are multiple views involved):
+        assert_equal(collect_warnings(subset['f1'].__setitem__, 0, 10), [])
+
+        # make sure views of a multi-field index warn too
+        c = np.zeros(3, dtype='i8,i8,i8')
+        assert_equal(collect_warnings(c[['f0', 'f2']].view, 'i8,i8'),
+                     [FutureWarning])
+
+
     def test_record_hash(self):
         a = np.array([(1, 2), (1, 2)], dtype='i1,i2')
         a.flags.writeable = False
diff --git a/numpy/core/tests/test_records.py b/numpy/core/tests/test_records.py
index 9aeb93f74..d7c7d16e3 100644
--- a/numpy/core/tests/test_records.py
+++ b/numpy/core/tests/test_records.py
@@ -11,6 +11,7 @@ import pickle
 import warnings
 import textwrap
 from os import path
+import pytest
 
 import numpy as np
 from numpy.testing import (
@@ -360,6 +361,7 @@ class TestRecord(object):
         with assert_raises(ValueError):
             r.setfield([2,3], *r.dtype.fields['f'])
 
+    @pytest.mark.xfail(reason="See gh-10411, becomes real error in 1.16")
     def test_out_of_order_fields(self):
         # names in the same order, padding added to descr
         x = self.data[['col1', 'col2']]
diff --git a/numpy/doc/structured_arrays.py b/numpy/doc/structured_arrays.py
index ba667da59..ab97c5df6 100644
--- a/numpy/doc/structured_arrays.py
+++ b/numpy/doc/structured_arrays.py
@@ -133,10 +133,9 @@ summary they are:
 
      Offsets may be chosen such that the fields overlap, though this will mean
      that assigning to one field may clobber any overlapping field's data. As
-     an exception, fields of :class:`numpy.object` type .. (see 
-     :ref:`object arrays <arrays.object>`) cannot overlap with other fields,
-     because of the risk of clobbering the internal object pointer and then
-     dereferencing it.
+     an exception, fields of :class:`numpy.object` type cannot overlap with
+     other fields, because of the risk of clobbering the internal object
+     pointer and then dereferencing it.
 
      The optional 'aligned' value can be set to ``True`` to make the automatic
      offset computation use aligned offsets (see :ref:`offsets-and-alignment`),
@@ -235,6 +234,11 @@ If the offsets of the fields and itemsize of a structured array satisfy the
 alignment conditions, the array will have the ``ALIGNED`` :ref:`flag
 <numpy.ndarray.flags>` set.
 
+A convenience function :func:`numpy.lib.recfunctions.repack_fields` converts an
+aligned dtype or array to a packed one and vice versa. It takes either a dtype
+or structured ndarray as an argument, and returns a copy with fields re-packed,
+with or without padding bytes.
+
 .. _titles:
 
 Field Titles
@@ -396,27 +400,61 @@ typically a non-structured array, except in the case of nested structures.
 Accessing Multiple Fields
 ```````````````````````````
 
-One can index a structured array with a multi-field index, where the index is a
-list of field names::
+One can index and assign to a structured array with a multi-field index, where
+the index is a list of field names.
+
+.. warning::
+    The behavior of multi-field indexes will change from Numpy 1.15 to Numpy
+    1.16.
 
- >>> a = np.zeros(3, dtype=[('a', 'i8'), ('b', 'i4'), ('c', 'f8')])
+In Numpy 1.16, the result of indexing with a multi-field index will be a view
+into the original array, as follows::
+
+ >>> a = np.zeros(3, dtype=[('a', 'i4'), ('b', 'i4'), ('c', 'f4')])
  >>> a[['a', 'c']]
- array([(0, 0.0), (0, 0.0), (0, 0.0)],
-       dtype={'names':['a','c'], 'formats':['<i8','<f8'], 'offsets':[0,11], 'itemsize':19})
+ array([(0, 0.), (0, 0.), (0, 0.)],
+      dtype={'names':['a','c'], 'formats':['<i4','<f4'], 'offsets':[0,8], 'itemsize':12})
+
+Assignment to the view modifies the original array. The view's fields will be
+in the order they were indexed. Note that unlike for single-field indexing, the
+view's dtype has the same itemsize as the original array, and has fields at the
+same offsets as in the original array, and unindexed fields are merely missing.
+
+In Numpy 1.15, indexing an array with a multi-field index returns a copy of
+the result above for 1.16, but with fields packed together in memory as if
+passed through :func:`numpy.lib.recfunctions.repack_fields`. This is the
+behavior since Numpy 1.7.
+
+.. warning::
+   The new behavior in Numpy 1.16 leads to extra "padding" bytes at the
+   location of unindexed fields. You will need to update any code which depends
+   on the data having a "packed" layout. For instance code such as::
+
+    >>> a[['a','c']].view('i8')  # will fail in Numpy 1.16
+    ValueError: When changing to a smaller dtype, its size must be a divisor of the size of original dtype
+
+   will need to be changed. This code has raised a ``FutureWarning`` since
+   Numpy 1.12.
+
+   The following is a recommended fix, which will behave identically in Numpy
+   1.15 and Numpy 1.16::
+
+    >>> from numpy.lib.recfunctions import repack_fields
+    >>> repack_fields(a[['a','c']]).view('i8')  # supported 1.15 and 1.16
+    array([0, 0, 0])
+
+Assigning to an array with a multi-field index will behave the same in Numpy
+1.15 and Numpy 1.16. In both versions the assignment will modify the original
+array::
+
  >>> a[['a', 'c']] = (2, 3)
  >>> a
  array([(2, 0, 3.0), (2, 0, 3.0), (2, 0, 3.0)],
        dtype=[('a', '<i8'), ('b', '<i4'), ('c', '<f8')])
 
-The resulting array is a view into the original array, such that assignment to
-the view modifies the original array. The view's fields will be in the order
-they were indexed. Note that unlike for single-field indexing, the view's dtype
-has the same itemsize as the original array, and has fields at the same offsets
-as in the original array, and unindexed fields are merely missing.
-
-Since the view is a structured array itself, it obeys the assignment rules
-described above. For example, this means that one can swap the values of two
-fields using appropriate multi-field indexes::
+This obeys the structured array assignment rules described above. For example,
+this means that one can swap the values of two fields using appropriate
+multi-field indexes::
 
  >>> a[['a', 'c']] = a[['c', 'a']]
 
diff --git a/numpy/lib/recfunctions.py b/numpy/lib/recfunctions.py
index c455bd93f..b6453d5a2 100644
--- a/numpy/lib/recfunctions.py
+++ b/numpy/lib/recfunctions.py
@@ -732,6 +732,84 @@ def rec_append_fields(base, names, data, dtypes=None):
     return append_fields(base, names, data=data, dtypes=dtypes,
                          asrecarray=True, usemask=False)
 
+def repack_fields(a, align=False, recurse=False):
+    """
+    Re-pack the fields of a structured array or dtype in memory.
+
+    The memory layout of structured datatypes allows fields at arbitrary
+    byte offsets. This means the fields can be separated by padding bytes,
+    their offsets can be non-monotonically increasing, and they can overlap.
+
+    This method removes any overlaps and reorders the fields in memory so they
+    have increasing byte offsets, and adds or removes padding bytes depending
+    on the `align` option, which behaves like the `align` option to `np.dtype`.
+
+    If `align=False`, this method produces a "packed" memory layout in which
+    each field starts at the byte the previous field ended, and any padding
+    bytes are removed.
+
+    If `align=True`, this methods produces an "aligned" memory layout in which
+    each field's offset is a multiple of its alignment, and the total itemsize
+    is a multiple of the largest alignment, by adding padding bytes as needed.
+
+    Parameters
+    ----------
+    a : ndarray or dtype
+       array or dtype for which to repack the fields.
+    align : boolean
+       If true, use an "aligned" memory layout, otherwise use a "packed" layout.
+    recurse : boolean
+       If True, also repack nested structures.
+
+    Returns
+    -------
+    repacked : ndarray or dtype
+       Copy of `a` with fields repacked, or `a` itself if no repacking was
+       needed.
+
+    Examples
+    --------
+
+    >>> def print_offsets(d):
+    ...     print("offsets:", [d.fields[name][1] for name in d.names])
+    ...     print("itemsize:", d.itemsize)
+    ...
+    >>> dt = np.dtype('u1,i4,f4', align=True)
+    >>> dt
+    dtype({'names':['f0','f1','f2'], 'formats':['u1','<i4','<f8'], 'offsets':[0,4,8], 'itemsize':16}, align=True)
+    >>> print_offsets(dt)
+    offsets: [0, 4, 8]
+    itemsize: 16
+    >>> packed_dt = repack_fields(dt)
+    >>> packed_dt
+    dtype([('f0', 'u1'), ('f1', '<i4'), ('f2', '<f8')])
+    >>> print_offsets(packed_dt)
+    offsets: [0, 1, 5]
+    itemsize: 13
+
+    """
+    if not isinstance(a, np.dtype):
+        dt = repack_fields(a.dtype, align=align, recurse=recurse)
+        return a.astype(dt, copy=False)
+
+    if a.names is None:
+        return a
+
+    fieldinfo = []
+    for name in a.names:
+        tup = a.fields[name]
+        if recurse:
+            fmt = repack_fields(tup[0], align=align, recurse=True)
+        else:
+            fmt = tup[0]
+
+        if len(tup) == 3:
+            name = (tup[2], name)
+
+        fieldinfo.append((name, fmt))
+
+    dt = np.dtype(fieldinfo, align=align)
+    return np.dtype((a.type, dt))
 
 def stack_arrays(arrays, defaults=None, usemask=True, asrecarray=False,
                  autoconvert=False):
diff --git a/numpy/lib/tests/test_recfunctions.py b/numpy/lib/tests/test_recfunctions.py
index 219ae24fa..d4828bc1f 100644
--- a/numpy/lib/tests/test_recfunctions.py
+++ b/numpy/lib/tests/test_recfunctions.py
@@ -9,8 +9,8 @@ from numpy.ma.testutils import assert_equal
 from numpy.testing import assert_, assert_raises
 from numpy.lib.recfunctions import (
     drop_fields, rename_fields, get_fieldstructure, recursive_fill_fields,
-    find_duplicates, merge_arrays, append_fields, stack_arrays, join_by
-    )
+    find_duplicates, merge_arrays, append_fields, stack_arrays, join_by,
+    repack_fields)
 get_names = np.lib.recfunctions.get_names
 get_names_flat = np.lib.recfunctions.get_names_flat
 zip_descr = np.lib.recfunctions.zip_descr
@@ -192,6 +192,18 @@ class TestRecFunctions(object):
         assert_equal(sorted(test[-1]), control)
         assert_equal(test[0], a[test[-1]])
 
+    def test_repack_fields(self):
+        dt = np.dtype('u1,f4,i8', align=True)
+        a = np.zeros(2, dtype=dt)
+
+        assert_equal(repack_fields(dt), np.dtype('u1,f4,i8'))
+        assert_equal(repack_fields(a).itemsize, 13)
+        assert_equal(repack_fields(repack_fields(dt), align=True), dt)
+
+        # make sure type is preserved
+        dt = np.dtype((np.record, dt))
+        assert_(repack_fields(dt).type is np.record)
+
 
 class TestRecursiveFillFields(object):
     # Test recursive_fill_fields.
author	Allan Haldane <allan.haldane@gmail.com>	2018-06-10 21:54:21 -0400
committer	Allan Haldane <allan.haldane@gmail.com>	2018-06-11 11:51:04 -0400
commit	e08eced7990fbdcecb2bd81d3fc736f69bad6dfd (patch)
tree	9ded5059b440cff8c4f52519692e4c456c66717d
parent	8525d3e30f3b57332dfd374bc64b068366126b52 (diff)
download	numpy-e08eced7990fbdcecb2bd81d3fc736f69bad6dfd.tar.gz