Merge pull request #10199 from chunweiyuan/quantile

ENH: Quantile
author: Stephan Hoyer <shoyer@gmail.com> 2018-04-23 17:03:15 -0700
committer: GitHub <noreply@github.com> 2018-04-23 17:03:15 -0700
commit: ac76793dafcd6e5f24ed052fc40413f29ebc5306 (patch)
tree: e5eed404c59488c4df688e4428fccb4d5afbad84
parent: a043c3ed2a08fc42fd2eff6669a862c5cb045bfc (diff)
parent: 5d5c379ed5af0df19eec68fdde1f87ad994ce6b1 (diff)
download: numpy-ac76793dafcd6e5f24ed052fc40413f29ebc5306.tar.gz
5 files changed, 284 insertions, 4 deletions
diff --git a/doc/release/1.15.0-notes.rst b/doc/release/1.15.0-notes.rst
index d076074ce..49e8ab22d 100644
--- a/doc/release/1.15.0-notes.rst
+++ b/doc/release/1.15.0-notes.rst
@@ -16,6 +16,9 @@ New functions
   common multiple.
 * `np.ma.stack`, the `np.stack` array-joining function generalized to masked
   arrays.
+* ``quantile`` function, an interface to ``percentile`` without factors of 100
+* ``nanquantile`` function, an interface to ``nanpercentile`` without factors
+  of 100
 
 * `np.printoptions`, the context manager which sets print options temporarily
   for the scope of the ``with`` block::
@@ -141,6 +144,13 @@ Creating a full iOS-compatible NumPy package requires building for the 5
 architectures supported by iOS (i386, x86_64, armv7, armv7s and arm64), and
 combining these 5 compiled builds products into a single "fat" binary.
 
+``np.quantile`` and ``np.nanquantile``
+--------------------------------------
+Like ``np.percentile`` and ``np.nanpercentile``, but takes quantiles in [0, 1]
+rather than percentiles in [0, 100]. ``np.percentile`` is now a thin wrapper
+around ``np.quantile`` with the extra step of dividing by 100.
+
+
 Build system
 ------------
 Added experimental support for the 64-bit RISC-V architecture.
diff --git a/numpy/lib/function_base.py b/numpy/lib/function_base.py
index 099b63c40..72beef471 100644
--- a/numpy/lib/function_base.py
+++ b/numpy/lib/function_base.py
@@ -54,7 +54,8 @@ __all__ = [
     'bincount', 'digitize', 'cov', 'corrcoef',
     'msort', 'median', 'sinc', 'hamming', 'hanning', 'bartlett',
     'blackman', 'kaiser', 'trapz', 'i0', 'add_newdoc', 'add_docstring',
-    'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc'
+    'meshgrid', 'delete', 'insert', 'append', 'interp', 'add_newdoc_ufunc',
+    'quantile'
     ]
 
 
@@ -3427,7 +3428,7 @@ def percentile(a, q, axis=None, out=None,
 
     interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
         This optional parameter specifies the interpolation method to
-        use when the desired quantile lies between two data points
+        use when the desired percentile lies between two data points
         ``i < j``:
 
         * 'linear': ``i + (j - i) * fraction``, where ``fraction``
@@ -3463,6 +3464,7 @@ def percentile(a, q, axis=None, out=None,
     mean
     median : equivalent to ``percentile(..., 50)``
     nanpercentile
+    quantile : equivalent to percentile, except with q in the range [0, 1].
 
     Notes
     -----
@@ -3539,6 +3541,110 @@ def percentile(a, q, axis=None, out=None,
         a, q, axis, out, overwrite_input, interpolation, keepdims)
 
 
+def quantile(a, q, axis=None, out=None,
+             overwrite_input=False, interpolation='linear', keepdims=False):
+    """
+    Compute the `q`th quantile of the data along the specified axis.
+    ..versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    a : array_like
+        Input array or object that can be converted to an array.
+    q : array_like of float
+        Quantile or sequence of quantiles to compute, which must be between
+        0 and 1 inclusive.
+    axis : {int, tuple of int, None}, optional
+        Axis or axes along which the quantiles are computed. The
+        default is to compute the quantile(s) along a flattened
+        version of the array.
+    out : ndarray, optional
+        Alternative output array in which to place the result. It must
+        have the same shape and buffer length as the expected output,
+        but the type (of the output) will be cast if necessary.
+    overwrite_input : bool, optional
+        If True, then allow the input array `a` to be modified by intermediate
+        calculations, to save memory. In this case, the contents of the input
+        `a` after this function completes is undefined.
+    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+        This optional parameter specifies the interpolation method to
+        use when the desired quantile lies between two data points
+        ``i < j``:
+            * linear: ``i + (j - i) * fraction``, where ``fraction``
+              is the fractional part of the index surrounded by ``i``
+              and ``j``.
+            * lower: ``i``.
+            * higher: ``j``.
+            * nearest: ``i`` or ``j``, whichever is nearest.
+            * midpoint: ``(i + j) / 2``.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left in
+        the result as dimensions with size one. With this option, the
+        result will broadcast correctly against the original array `a`.
+
+    Returns
+    -------
+    quantile : scalar or ndarray
+        If `q` is a single quantile and `axis=None`, then the result
+        is a scalar. If multiple quantiles are given, first axis of
+        the result corresponds to the quantiles. The other axes are
+        the axes that remain after the reduction of `a`. If the input
+        contains integers or floats smaller than ``float64``, the output
+        data-type is ``float64``. Otherwise, the output data-type is the
+        same as that of the input. If `out` is specified, that array is
+        returned instead.
+
+    See Also
+    --------
+    mean
+    percentile : equivalent to quantile, but with q in the range [0, 100].
+    median : equivalent to ``quantile(..., 0.5)``
+    nanquantile
+
+    Notes
+    -----
+    Given a vector ``V`` of length ``N``, the ``q``-th quantile of
+    ``V`` is the value ``q`` of the way from the minimum to the
+    maximum in a sorted copy of ``V``. The values and distances of
+    the two nearest neighbors as well as the `interpolation` parameter
+    will determine the quantile if the normalized ranking does not
+    match the location of ``q`` exactly. This function is the same as
+    the median if ``q=0.5``, the same as the minimum if ``q=0.0`` and the
+    same as the maximum if ``q=1.0``.
+
+    Examples
+    --------
+    >>> a = np.array([[10, 7, 4], [3, 2, 1]])
+    >>> a
+    array([[10,  7,  4],
+           [ 3,  2,  1]])
+    >>> np.quantile(a, 0.5)
+    3.5
+    >>> np.quantile(a, 0.5, axis=0)
+    array([[ 6.5,  4.5,  2.5]])
+    >>> np.quantile(a, 0.5, axis=1)
+    array([ 7.,  2.])
+    >>> np.quantile(a, 0.5, axis=1, keepdims=True)
+    array([[ 7.],
+           [ 2.]])
+    >>> m = np.quantile(a, 0.5, axis=0)
+    >>> out = np.zeros_like(m)
+    >>> np.quantile(a, 0.5, axis=0, out=out)
+    array([[ 6.5,  4.5,  2.5]])
+    >>> m
+    array([[ 6.5,  4.5,  2.5]])
+    >>> b = a.copy()
+    >>> np.quantile(b, 0.5, axis=1, overwrite_input=True)
+    array([ 7.,  2.])
+    >>> assert not np.all(a == b)
+    """
+    q = np.asanyarray(q)
+    if not _quantile_is_valid(q):
+        raise ValueError("Quantiles must be in the range [0, 1]")
+    return _quantile_unchecked(
+        a, q, axis, out, overwrite_input, interpolation, keepdims)
+
+
 def _quantile_unchecked(a, q, axis=None, out=None, overwrite_input=False,
                         interpolation='linear', keepdims=False):
     """Assumes that q is in [0, 1], and is an ndarray"""
diff --git a/numpy/lib/nanfunctions.py b/numpy/lib/nanfunctions.py
index dddc0e5b8..abd2da1a2 100644
--- a/numpy/lib/nanfunctions.py
+++ b/numpy/lib/nanfunctions.py
@@ -16,6 +16,7 @@ Functions
 - `nanvar` -- variance of non-NaN values
 - `nanstd` -- standard deviation of non-NaN values
 - `nanmedian` -- median of non-NaN values
+- `nanquantile` -- qth quantile of non-NaN values
 - `nanpercentile` -- qth percentile of non-NaN values
 
 """
@@ -29,7 +30,7 @@ from numpy.lib import function_base
 __all__ = [
     'nansum', 'nanmax', 'nanmin', 'nanargmax', 'nanargmin', 'nanmean',
     'nanmedian', 'nanpercentile', 'nanvar', 'nanstd', 'nanprod',
-    'nancumsum', 'nancumprod'
+    'nancumsum', 'nancumprod', 'nanquantile'
     ]
 
 
@@ -1057,7 +1058,7 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
         `a` after this function completes is undefined.
     interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
         This optional parameter specifies the interpolation method to
-        use when the desired quantile lies between two data points
+        use when the desired percentile lies between two data points
         ``i < j``:
 
         * 'linear': ``i + (j - i) * fraction``, where ``fraction``
@@ -1095,6 +1096,7 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
     nanmean
     nanmedian : equivalent to ``nanpercentile(..., 50)``
     percentile, median, mean
+    nanquantile : equivalent to nanpercentile, but with q in the range [0, 1].
 
     Notes
     -----
@@ -1144,6 +1146,110 @@ def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
         a, q, axis, out, overwrite_input, interpolation, keepdims)
 
 
+def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
+                interpolation='linear', keepdims=np._NoValue):
+    """
+    Compute the qth quantile of the data along the specified axis,
+    while ignoring nan values.
+    Returns the qth quantile(s) of the array elements.
+    .. versionadded:: 1.15.0
+
+    Parameters
+    ----------
+    a : array_like
+        Input array or object that can be converted to an array, containing
+        nan values to be ignored
+    q : array_like of float
+        Quantile or sequence of quantiles to compute, which must be between
+        0 and 1 inclusive.
+    axis : {int, tuple of int, None}, optional
+        Axis or axes along which the quantiles are computed. The
+        default is to compute the quantile(s) along a flattened
+        version of the array.
+    out : ndarray, optional
+        Alternative output array in which to place the result. It must
+        have the same shape and buffer length as the expected output,
+        but the type (of the output) will be cast if necessary.
+    overwrite_input : bool, optional
+        If True, then allow the input array `a` to be modified by intermediate
+        calculations, to save memory. In this case, the contents of the input
+        `a` after this function completes is undefined.
+    interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
+        This optional parameter specifies the interpolation method to
+        use when the desired quantile lies between two data points
+        ``i < j``:
+            * linear: ``i + (j - i) * fraction``, where ``fraction``
+              is the fractional part of the index surrounded by ``i``
+              and ``j``.
+            * lower: ``i``.
+            * higher: ``j``.
+            * nearest: ``i`` or ``j``, whichever is nearest.
+            * midpoint: ``(i + j) / 2``.
+    keepdims : bool, optional
+        If this is set to True, the axes which are reduced are left in
+        the result as dimensions with size one. With this option, the
+        result will broadcast correctly against the original array `a`.
+
+        If this is anything but the default value it will be passed
+        through (in the special case of an empty array) to the
+        `mean` function of the underlying array.  If the array is
+        a sub-class and `mean` does not have the kwarg `keepdims` this
+        will raise a RuntimeError.
+
+    Returns
+    -------
+    quantile : scalar or ndarray
+        If `q` is a single percentile and `axis=None`, then the result
+        is a scalar. If multiple quantiles are given, first axis of
+        the result corresponds to the quantiles. The other axes are
+        the axes that remain after the reduction of `a`. If the input
+        contains integers or floats smaller than ``float64``, the output
+        data-type is ``float64``. Otherwise, the output data-type is the
+        same as that of the input. If `out` is specified, that array is
+        returned instead.
+
+    See Also
+    --------
+    quantile
+    nanmean, nanmedian
+    nanmedian : equivalent to ``nanquantile(..., 0.5)``
+    nanpercentile : same as nanquantile, but with q in the range [0, 100].
+
+    Examples
+    --------
+    >>> a = np.array([[10., 7., 4.], [3., 2., 1.]])
+    >>> a[0][1] = np.nan
+    >>> a
+    array([[ 10.,  nan,   4.],
+          [  3.,   2.,   1.]])
+    >>> np.quantile(a, 0.5)
+    nan
+    >>> np.nanquantile(a, 0.5)
+    3.5
+    >>> np.nanquantile(a, 0.5, axis=0)
+    array([ 6.5,  2.,   2.5])
+    >>> np.nanquantile(a, 0.5, axis=1, keepdims=True)
+    array([[ 7.],
+           [ 2.]])
+    >>> m = np.nanquantile(a, 0.5, axis=0)
+    >>> out = np.zeros_like(m)
+    >>> np.nanquantile(a, 0.5, axis=0, out=out)
+    array([ 6.5,  2.,   2.5])
+    >>> m
+    array([ 6.5,  2. ,  2.5])
+    >>> b = a.copy()
+    >>> np.nanquantile(b, 0.5, axis=1, overwrite_input=True)
+    array([  7.,  2.])
+    >>> assert not np.all(a==b)
+    """
+    a = np.asanyarray(a)
+    q = np.asanyarray(q)
+    if not function_base._quantile_is_valid(q):
+        raise ValueError("Quantiles must be in the range [0, 1]")
+    return _nanquantile_unchecked(
+        a, q, axis, out, overwrite_input, interpolation, keepdims)
+
+
 def _nanquantile_unchecked(a, q, axis=None, out=None, overwrite_input=False,
                            interpolation='linear', keepdims=np._NoValue):
     """Assumes that q is in [0, 1], and is an ndarray"""
diff --git a/numpy/lib/tests/test_function_base.py b/numpy/lib/tests/test_function_base.py
index 6653b5ba1..43d62a7ff 100644
--- a/numpy/lib/tests/test_function_base.py
+++ b/numpy/lib/tests/test_function_base.py
@@ -2749,6 +2749,28 @@ class TestPercentile(object):
                 a, [0.3, 0.6], (0, 2), interpolation='nearest'), b)
 
 
+class TestQuantile(object):
+    # most of this is already tested by TestPercentile
+
+    def test_basic(self):
+        x = np.arange(8) * 0.5
+        assert_equal(np.quantile(x, 0), 0.)
+        assert_equal(np.quantile(x, 1), 3.5)
+        assert_equal(np.quantile(x, 0.5), 1.75)
+
+    def test_no_p_overwrite(self):
+        # this is worth retesting, beause quantile does not make a copy
+        p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
+        p = p0.copy()
+        np.quantile(np.arange(100.), p, interpolation="midpoint")
+        assert_array_equal(p, p0)
+
+        p0 = p0.tolist()
+        p = p.tolist()
+        np.quantile(np.arange(100.), p, interpolation="midpoint")
+        assert_array_equal(p, p0)
+
+
 class TestMedian(object):
 
     def test_basic(self):
diff --git a/numpy/lib/tests/test_nanfunctions.py b/numpy/lib/tests/test_nanfunctions.py
index 1f403f7b8..e69d9dd7d 100644
--- a/numpy/lib/tests/test_nanfunctions.py
+++ b/numpy/lib/tests/test_nanfunctions.py
@@ -886,3 +886,39 @@ class TestNanFunctions_Percentile(object):
 
         megamat = np.ones((3, 4, 5, 6))
         assert_equal(np.nanpercentile(megamat, perc, axis=(1, 2)).shape, (2, 3, 6))
+
+
+class TestNanFunctions_Quantile(object):
+    # most of this is already tested by TestPercentile
+
+    def test_regression(self):
+        ar = np.arange(24).reshape(2, 3, 4).astype(float)
+        ar[0][1] = np.nan
+
+        assert_equal(np.nanquantile(ar, q=0.5), np.nanpercentile(ar, q=50))
+        assert_equal(np.nanquantile(ar, q=0.5, axis=0),
+                     np.nanpercentile(ar, q=50, axis=0))
+        assert_equal(np.nanquantile(ar, q=0.5, axis=1),
+                     np.nanpercentile(ar, q=50, axis=1))
+        assert_equal(np.nanquantile(ar, q=[0.5], axis=1),
+                     np.nanpercentile(ar, q=[50], axis=1))
+        assert_equal(np.nanquantile(ar, q=[0.25, 0.5, 0.75], axis=1),
+                     np.nanpercentile(ar, q=[25, 50, 75], axis=1))
+
+    def test_basic(self):
+        x = np.arange(8) * 0.5
+        assert_equal(np.nanquantile(x, 0), 0.)
+        assert_equal(np.nanquantile(x, 1), 3.5)
+        assert_equal(np.nanquantile(x, 0.5), 1.75)
+
+    def test_no_p_overwrite(self):
+        # this is worth retesting, beause quantile does not make a copy
+        p0 = np.array([0, 0.75, 0.25, 0.5, 1.0])
+        p = p0.copy()
+        np.nanquantile(np.arange(100.), p, interpolation="midpoint")
+        assert_array_equal(p, p0)
+
+        p0 = p0.tolist()
+        p = p.tolist()
+        np.nanquantile(np.arange(100.), p, interpolation="midpoint")
+        assert_array_equal(p, p0)
author	Stephan Hoyer <shoyer@gmail.com>	2018-04-23 17:03:15 -0700
committer	GitHub <noreply@github.com>	2018-04-23 17:03:15 -0700
commit	ac76793dafcd6e5f24ed052fc40413f29ebc5306 (patch)
tree	e5eed404c59488c4df688e4428fccb4d5afbad84
parent	a043c3ed2a08fc42fd2eff6669a862c5cb045bfc (diff)
parent	5d5c379ed5af0df19eec68fdde1f87ad994ce6b1 (diff)
download	numpy-ac76793dafcd6e5f24ed052fc40413f29ebc5306.tar.gz