BUG: lib: Fix histogram problem with signed integer arrays.

An input such as np.histogram(np.array([-2, 0, 127], dtype=np.int8), bins="auto") would raise the exception ValueError: Number of samples, -1, must be non-negative. The problem was that the peak-to-peak value for the input array was computed with the `ptp` method, which returned negative values for signed integer arrays when the actual value was more than the maximum signed value of the array's data type. The fix is to use a peak-to-peak function that returns an unsigned value for signed integer arrays. Closes gh-14379.
author: Warren Weckesser <warren.weckesser@gmail.com> 2019-08-27 15:15:26 -0400
committer: Charles Harris <charlesr.harris@gmail.com> 2019-11-08 06:03:21 -0700
commit: 461a64aac1f8eca81f400f6e783c90f5ccad5f48 (patch)
tree: f888ae8c85e4d638a9f3c25a1dcdd4ae05d5b690
parent: 426f03164ebe572e74fd26786b86e9dbc83d3487 (diff)
download: numpy-461a64aac1f8eca81f400f6e783c90f5ccad5f48.tar.gz
2 files changed, 26 insertions, 5 deletions
diff --git a/numpy/lib/histograms.py b/numpy/lib/histograms.py
index d69e04e80..bed1f46b0 100644
--- a/numpy/lib/histograms.py
+++ b/numpy/lib/histograms.py
@@ -21,6 +21,16 @@ array_function_dispatch = functools.partial(
 _range = range
 
 
+def _ptp(x):
+    """Peak-to-peak value of x.
+
+    This implementation avoids the problem of signed integer arrays having a
+    peak-to-peak value that cannot be represented with the array's data type.
+    This function returns an unsigned value for signed integer arrays.
+    """
+    return _unsigned_subtract(x.max(), x.min())
+
+
 def _hist_bin_sqrt(x, range):
     """
     Square root histogram bin estimator.
@@ -39,7 +49,7 @@ def _hist_bin_sqrt(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / np.sqrt(x.size)
+    return _ptp(x) / np.sqrt(x.size)
 
 
 def _hist_bin_sturges(x, range):
@@ -62,7 +72,7 @@ def _hist_bin_sturges(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / (np.log2(x.size) + 1.0)
+    return _ptp(x) / (np.log2(x.size) + 1.0)
 
 
 def _hist_bin_rice(x, range):
@@ -86,7 +96,7 @@ def _hist_bin_rice(x, range):
     h : An estimate of the optimal bin width for the given data.
     """
     del range  # unused
-    return x.ptp() / (2.0 * x.size ** (1.0 / 3))
+    return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
 
 
 def _hist_bin_scott(x, range):
@@ -136,7 +146,7 @@ def _hist_bin_stone(x, range):
     """
 
     n = x.size
-    ptp_x = np.ptp(x)
+    ptp_x = _ptp(x)
     if n <= 1 or ptp_x == 0:
         return 0
 
@@ -182,7 +192,7 @@ def _hist_bin_doane(x, range):
             np.true_divide(temp, sigma, temp)
             np.power(temp, 3, temp)
             g1 = np.mean(temp)
-            return x.ptp() / (1.0 + np.log2(x.size) +
+            return _ptp(x) / (1.0 + np.log2(x.size) +
                                     np.log2(1.0 + np.absolute(g1) / sg1))
     return 0.0
 
diff --git a/numpy/lib/tests/test_histograms.py b/numpy/lib/tests/test_histograms.py
index c96b01d42..594c8e782 100644
--- a/numpy/lib/tests/test_histograms.py
+++ b/numpy/lib/tests/test_histograms.py
@@ -8,6 +8,7 @@ from numpy.testing import (
     assert_array_almost_equal, assert_raises, assert_allclose,
     assert_array_max_ulp, assert_raises_regex, suppress_warnings,
     )
+import pytest
 
 
 class TestHistogram(object):
@@ -595,6 +596,16 @@ class TestHistogramOptimBinNums(object):
                 msg += " with datasize of {0}".format(testlen)
                 assert_equal(len(a), numbins, err_msg=msg)
 
+    @pytest.mark.parametrize("bins", ['auto', 'fd', 'doane', 'scott',
+                                      'stone', 'rice', 'sturges'])
+    def test_signed_integer_data(self, bins):
+        # Regression test for gh-14379.
+        a = np.array([-2, 0, 127], dtype=np.int8)
+        hist, edges = np.histogram(a, bins=bins)
+        hist32, edges32 = np.histogram(a.astype(np.int32), bins=bins)
+        assert_array_equal(hist, hist32)
+        assert_array_equal(edges, edges32)
+
     def test_simple_weighted(self):
         """
         Check that weighted data raises a TypeError
author	Warren Weckesser <warren.weckesser@gmail.com>	2019-08-27 15:15:26 -0400
committer	Charles Harris <charlesr.harris@gmail.com>	2019-11-08 06:03:21 -0700
commit	461a64aac1f8eca81f400f6e783c90f5ccad5f48 (patch)
tree	f888ae8c85e4d638a9f3c25a1dcdd4ae05d5b690
parent	426f03164ebe572e74fd26786b86e9dbc83d3487 (diff)
download	numpy-461a64aac1f8eca81f400f6e783c90f5ccad5f48.tar.gz