summaryrefslogtreecommitdiff
path: root/Lib/statistics.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/statistics.py')
-rw-r--r--Lib/statistics.py127
1 files changed, 74 insertions, 53 deletions
diff --git a/Lib/statistics.py b/Lib/statistics.py
index ff07dc4a6b..8a6be7c759 100644
--- a/Lib/statistics.py
+++ b/Lib/statistics.py
@@ -80,12 +80,25 @@ A single exception is defined: StatisticsError is a subclass of ValueError.
"""
-__all__ = [ 'StatisticsError', 'NormalDist', 'quantiles',
- 'pstdev', 'pvariance', 'stdev', 'variance',
- 'median', 'median_low', 'median_high', 'median_grouped',
- 'mean', 'mode', 'multimode', 'harmonic_mean', 'fmean',
- 'geometric_mean',
- ]
+__all__ = [
+ 'NormalDist',
+ 'StatisticsError',
+ 'fmean',
+ 'geometric_mean',
+ 'harmonic_mean',
+ 'mean',
+ 'median',
+ 'median_grouped',
+ 'median_high',
+ 'median_low',
+ 'mode',
+ 'multimode',
+ 'pstdev',
+ 'pvariance',
+ 'quantiles',
+ 'stdev',
+ 'variance',
+]
import math
import numbers
@@ -304,8 +317,9 @@ def mean(data):
assert count == n
return _convert(total/n, T)
+
def fmean(data):
- """ Convert data to floats and compute the arithmetic mean.
+ """Convert data to floats and compute the arithmetic mean.
This runs faster than the mean() function and it always returns a float.
The result is highly accurate but not as perfect as mean().
@@ -313,7 +327,6 @@ def fmean(data):
>>> fmean([3.5, 4.0, 5.25])
4.25
-
"""
try:
n = len(data)
@@ -332,6 +345,7 @@ def fmean(data):
except ZeroDivisionError:
raise StatisticsError('fmean requires at least one data point') from None
+
def geometric_mean(data):
"""Convert data to floats and compute the geometric mean.
@@ -350,6 +364,7 @@ def geometric_mean(data):
raise StatisticsError('geometric mean requires a non-empty dataset '
' containing positive numbers') from None
+
def harmonic_mean(data):
"""Return the harmonic mean of data.
@@ -547,23 +562,23 @@ def mode(data):
def multimode(data):
- """ Return a list of the most frequently occurring values.
-
- Will return more than one result if there are multiple modes
- or an empty list if *data* is empty.
+ """Return a list of the most frequently occurring values.
- >>> multimode('aabbbbbbbbcc')
- ['b']
- >>> multimode('aabbbbccddddeeffffgg')
- ['b', 'd', 'f']
- >>> multimode('')
- []
+ Will return more than one result if there are multiple modes
+ or an empty list if *data* is empty.
+ >>> multimode('aabbbbbbbbcc')
+ ['b']
+ >>> multimode('aabbbbccddddeeffffgg')
+ ['b', 'd', 'f']
+ >>> multimode('')
+ []
"""
counts = Counter(iter(data)).most_common()
maxcount, mode_items = next(groupby(counts, key=itemgetter(1)), (0, []))
return list(map(itemgetter(0), mode_items))
+
# Notes on methods for computing quantiles
# ----------------------------------------
#
@@ -601,7 +616,7 @@ def multimode(data):
# external packages can be used for anything more advanced.
def quantiles(dist, /, *, n=4, method='exclusive'):
- '''Divide *dist* into *n* continuous intervals with equal probability.
+ """Divide *dist* into *n* continuous intervals with equal probability.
Returns a list of (n - 1) cut points separating the intervals.
@@ -616,7 +631,7 @@ def quantiles(dist, /, *, n=4, method='exclusive'):
If *method* is set to *inclusive*, *dist* is treated as population
data. The minimum value is treated as the 0th percentile and the
maximum value is treated as the 100th percentile.
- '''
+ """
if n < 1:
raise StatisticsError('n must be at least 1')
if hasattr(dist, 'inv_cdf'):
@@ -646,6 +661,7 @@ def quantiles(dist, /, *, n=4, method='exclusive'):
return result
raise ValueError(f'Unknown method: {method!r}')
+
# === Measures of spread ===
# See http://mathworld.wolfram.com/Variance.html
@@ -805,18 +821,21 @@ def pstdev(data, mu=None):
except AttributeError:
return math.sqrt(var)
+
## Normal Distribution #####################################################
class NormalDist:
- 'Normal distribution of a random variable'
+ "Normal distribution of a random variable"
# https://en.wikipedia.org/wiki/Normal_distribution
# https://en.wikipedia.org/wiki/Variance#Properties
- __slots__ = {'_mu': 'Arithmetic mean of a normal distribution',
- '_sigma': 'Standard deviation of a normal distribution'}
+ __slots__ = {
+ '_mu': 'Arithmetic mean of a normal distribution',
+ '_sigma': 'Standard deviation of a normal distribution',
+ }
def __init__(self, mu=0.0, sigma=1.0):
- 'NormalDist where mu is the mean and sigma is the standard deviation.'
+ "NormalDist where mu is the mean and sigma is the standard deviation."
if sigma < 0.0:
raise StatisticsError('sigma must be non-negative')
self._mu = mu
@@ -824,40 +843,42 @@ class NormalDist:
@classmethod
def from_samples(cls, data):
- 'Make a normal distribution instance from sample data.'
+ "Make a normal distribution instance from sample data."
if not isinstance(data, (list, tuple)):
data = list(data)
xbar = fmean(data)
return cls(xbar, stdev(data, xbar))
def samples(self, n, *, seed=None):
- 'Generate *n* samples for a given mean and standard deviation.'
+ "Generate *n* samples for a given mean and standard deviation."
gauss = random.gauss if seed is None else random.Random(seed).gauss
mu, sigma = self._mu, self._sigma
return [gauss(mu, sigma) for i in range(n)]
def pdf(self, x):
- 'Probability density function. P(x <= X < x+dx) / dx'
+ "Probability density function. P(x <= X < x+dx) / dx"
variance = self._sigma ** 2.0
if not variance:
raise StatisticsError('pdf() not defined when sigma is zero')
- return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau * variance)
+ return exp((x - self._mu)**2.0 / (-2.0*variance)) / sqrt(tau*variance)
def cdf(self, x):
- 'Cumulative distribution function. P(X <= x)'
+ "Cumulative distribution function. P(X <= x)"
if not self._sigma:
raise StatisticsError('cdf() not defined when sigma is zero')
return 0.5 * (1.0 + erf((x - self._mu) / (self._sigma * sqrt(2.0))))
def inv_cdf(self, p):
- '''Inverse cumulative distribution function. x : P(X <= x) = p
+ """Inverse cumulative distribution function. x : P(X <= x) = p
- Finds the value of the random variable such that the probability of the
- variable being less than or equal to that value equals the given probability.
+ Finds the value of the random variable such that the probability of
+ the variable being less than or equal to that value equals the given
+ probability.
- This function is also called the percent point function or quantile function.
- '''
- if (p <= 0.0 or p >= 1.0):
+ This function is also called the percent point function or quantile
+ function.
+ """
+ if p <= 0.0 or p >= 1.0:
raise StatisticsError('p must be in the range 0.0 < p < 1.0')
if self._sigma <= 0.0:
raise StatisticsError('cdf() not defined when sigma at or below zero')
@@ -933,7 +954,7 @@ class NormalDist:
return self._mu + (x * self._sigma)
def overlap(self, other):
- '''Compute the overlapping coefficient (OVL) between two normal distributions.
+ """Compute the overlapping coefficient (OVL) between two normal distributions.
Measures the agreement between two normal probability distributions.
Returns a value between 0.0 and 1.0 giving the overlapping area in
@@ -943,7 +964,7 @@ class NormalDist:
>>> N2 = NormalDist(3.2, 2.0)
>>> N1.overlap(N2)
0.8035050657330205
- '''
+ """
# See: "The overlapping coefficient as a measure of agreement between
# probability distributions and point estimation of the overlap of two
# normal densities" -- Henry F. Inman and Edwin L. Bradley Jr
@@ -968,21 +989,21 @@ class NormalDist:
@property
def mean(self):
- 'Arithmetic mean of the normal distribution.'
+ "Arithmetic mean of the normal distribution."
return self._mu
@property
def stdev(self):
- 'Standard deviation of the normal distribution.'
+ "Standard deviation of the normal distribution."
return self._sigma
@property
def variance(self):
- 'Square of the standard deviation.'
+ "Square of the standard deviation."
return self._sigma ** 2.0
def __add__(x1, x2):
- '''Add a constant or another NormalDist instance.
+ """Add a constant or another NormalDist instance.
If *other* is a constant, translate mu by the constant,
leaving sigma unchanged.
@@ -990,13 +1011,13 @@ class NormalDist:
If *other* is a NormalDist, add both the means and the variances.
Mathematically, this works only if the two distributions are
independent or if they are jointly normally distributed.
- '''
+ """
if isinstance(x2, NormalDist):
return NormalDist(x1._mu + x2._mu, hypot(x1._sigma, x2._sigma))
return NormalDist(x1._mu + x2, x1._sigma)
def __sub__(x1, x2):
- '''Subtract a constant or another NormalDist instance.
+ """Subtract a constant or another NormalDist instance.
If *other* is a constant, translate by the constant mu,
leaving sigma unchanged.
@@ -1004,51 +1025,51 @@ class NormalDist:
If *other* is a NormalDist, subtract the means and add the variances.
Mathematically, this works only if the two distributions are
independent or if they are jointly normally distributed.
- '''
+ """
if isinstance(x2, NormalDist):
return NormalDist(x1._mu - x2._mu, hypot(x1._sigma, x2._sigma))
return NormalDist(x1._mu - x2, x1._sigma)
def __mul__(x1, x2):
- '''Multiply both mu and sigma by a constant.
+ """Multiply both mu and sigma by a constant.
Used for rescaling, perhaps to change measurement units.
Sigma is scaled with the absolute value of the constant.
- '''
+ """
return NormalDist(x1._mu * x2, x1._sigma * fabs(x2))
def __truediv__(x1, x2):
- '''Divide both mu and sigma by a constant.
+ """Divide both mu and sigma by a constant.
Used for rescaling, perhaps to change measurement units.
Sigma is scaled with the absolute value of the constant.
- '''
+ """
return NormalDist(x1._mu / x2, x1._sigma / fabs(x2))
def __pos__(x1):
- 'Return a copy of the instance.'
+ "Return a copy of the instance."
return NormalDist(x1._mu, x1._sigma)
def __neg__(x1):
- 'Negates mu while keeping sigma the same.'
+ "Negates mu while keeping sigma the same."
return NormalDist(-x1._mu, x1._sigma)
__radd__ = __add__
def __rsub__(x1, x2):
- 'Subtract a NormalDist from a constant or another NormalDist.'
+ "Subtract a NormalDist from a constant or another NormalDist."
return -(x1 - x2)
__rmul__ = __mul__
def __eq__(x1, x2):
- 'Two NormalDist objects are equal if their mu and sigma are both equal.'
+ "Two NormalDist objects are equal if their mu and sigma are both equal."
if not isinstance(x2, NormalDist):
return NotImplemented
return (x1._mu, x1._sigma) == (x2._mu, x2._sigma)
def __hash__(self):
- 'NormalDist objects hash equal if their mu and sigma are both equal.'
+ "NormalDist objects hash equal if their mu and sigma are both equal."
return hash((self._mu, self._sigma))
def __repr__(self):