Issue 26002 and 25974

patches by Upendra Kumar and Stefan Krah speed up median by using bisect, and general speedup for Decimals using as_integer_ratio
author: Steven D'Aprano <steve@pearwood.info> 2016-05-05 03:54:29 +1000
committer: Steven D'Aprano <steve@pearwood.info> 2016-05-05 03:54:29 +1000
commit: 3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c (patch)
tree: 2c00f35c03dc6ff24ae5c72f1363ef2da761ddc0 /Lib/statistics.py
parent: ad039f754805dc9c9d4cd95ed249984bc1405bd6 (diff)
download: cpython-git-3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c.tar.gz
1 files changed, 30 insertions, 38 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index 518f546544..af5d41e89e 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -105,6 +105,7 @@ import math
 from fractions import Fraction
 from decimal import Decimal
 from itertools import groupby
+from bisect import bisect_left, bisect_right
 
 
 
@@ -223,56 +224,26 @@ def _exact_ratio(x):
         # Optimise the common case of floats. We expect that the most often
         # used numeric type will be builtin floats, so try to make this as
         # fast as possible.
-        if type(x) is float:
+        if type(x) is float or type(x) is Decimal:
             return x.as_integer_ratio()
         try:
             # x may be an int, Fraction, or Integral ABC.
             return (x.numerator, x.denominator)
         except AttributeError:
             try:
-                # x may be a float subclass.
+                # x may be a float or Decimal subclass.
                 return x.as_integer_ratio()
             except AttributeError:
-                try:
-                    # x may be a Decimal.
-                    return _decimal_to_ratio(x)
-                except AttributeError:
-                    # Just give up?
-                    pass
+                # Just give up?
+                pass
     except (OverflowError, ValueError):
         # float NAN or INF.
-        assert not math.isfinite(x)
+        assert not _isfinite(x)
         return (x, None)
     msg = "can't convert type '{}' to numerator/denominator"
     raise TypeError(msg.format(type(x).__name__))
 
 
-# FIXME This is faster than Fraction.from_decimal, but still too slow.
-def _decimal_to_ratio(d):
-    """Convert Decimal d to exact integer ratio (numerator, denominator).
-
-    >>> from decimal import Decimal
-    >>> _decimal_to_ratio(Decimal("2.6"))
-    (26, 10)
-
-    """
-    sign, digits, exp = d.as_tuple()
-    if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
-        assert not d.is_finite()
-        return (d, None)
-    num = 0
-    for digit in digits:
-        num = num*10 + digit
-    if exp < 0:
-        den = 10**-exp
-    else:
-        num *= 10**exp
-        den = 1
-    if sign:
-        num = -num
-    return (num, den)
-
-
 def _convert(value, T):
     """Convert value to given numeric type T."""
     if type(value) is T:
@@ -305,6 +276,21 @@ def _counts(data):
     return table
 
 
+def _find_lteq(a, x):
+    'Locate the leftmost value exactly equal to x'
+    i = bisect_left(a, x)
+    if i != len(a) and a[i] == x:
+        return i
+    raise ValueError
+
+
+def _find_rteq(a, l, x):
+    'Locate the rightmost value exactly equal to x'
+    i = bisect_right(a, x, lo=l)
+    if i != (len(a)+1) and a[i-1] == x:
+        return i-1
+    raise ValueError
+
 # === Measures of central tendency (averages) ===
 
 def mean(data):
@@ -442,9 +428,15 @@ def median_grouped(data, interval=1):
     except TypeError:
         # Mixed type. For now we just coerce to float.
         L = float(x) - float(interval)/2
-    cf = data.index(x)  # Number of values below the median interval.
-    # FIXME The following line could be more efficient for big lists.
-    f = data.count(x)  # Number of data points in the median interval.
+
+    # Uses bisection search to search for x in data with log(n) time complexity
+    # Find the position of leftmost occurence of x in data
+    l1 = _find_lteq(data, x)
+    # Find the position of rightmost occurence of x in data[l1...len(data)]
+    # Assuming always l1 <= l2
+    l2 = _find_rteq(data, l1, x)
+    cf = l1
+    f = l2 - l1 + 1
     return L + interval*(n/2 - cf)/f
author	Steven D'Aprano <steve@pearwood.info>	2016-05-05 03:54:29 +1000
committer	Steven D'Aprano <steve@pearwood.info>	2016-05-05 03:54:29 +1000
commit	3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c (patch)
tree	2c00f35c03dc6ff24ae5c72f1363ef2da761ddc0 /Lib/statistics.py
parent	ad039f754805dc9c9d4cd95ed249984bc1405bd6 (diff)
download	cpython-git-3b06e243527da5f4ca4dc2e3126dc9f5bbdc243c.tar.gz