1##  Module statistics.py
2##
3##  Copyright (c) 2013 Steven D'Aprano <steve+python@pearwood.info>.
4##
5##  Licensed under the Apache License, Version 2.0 (the "License");
6##  you may not use this file except in compliance with the License.
7##  You may obtain a copy of the License at
8##
9##  http://www.apache.org/licenses/LICENSE-2.0
10##
11##  Unless required by applicable law or agreed to in writing, software
12##  distributed under the License is distributed on an "AS IS" BASIS,
13##  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14##  See the License for the specific language governing permissions and
15##  limitations under the License.
16
17
18"""
19Basic statistics module.
20
21This module provides functions for calculating statistics of data, including
22averages, variance, and standard deviation.
23
24Calculating averages
25--------------------
26
27==================  =============================================
28Function            Description
29==================  =============================================
30mean                Arithmetic mean (average) of data.
31median              Median (middle value) of data.
32median_low          Low median of data.
33median_high         High median of data.
34median_grouped      Median, or 50th percentile, of grouped data.
35mode                Mode (most common value) of data.
36==================  =============================================
37
38Calculate the arithmetic mean ("the average") of data:
39
40>>> mean([-1.0, 2.5, 3.25, 5.75])
412.625
42
43
44Calculate the standard median of discrete data:
45
46>>> median([2, 3, 4, 5])
473.5
48
49
50Calculate the median, or 50th percentile, of data grouped into class intervals
51centred on the data values provided. E.g. if your data points are rounded to
52the nearest whole number:
53
54>>> median_grouped([2, 2, 3, 3, 3, 4])  #doctest: +ELLIPSIS
552.8333333333...
56
57This should be interpreted in this way: you have two data points in the class
58interval 1.5-2.5, three data points in the class interval 2.5-3.5, and one in
59the class interval 3.5-4.5. The median of these data points is 2.8333...
60
61
62Calculating variability or spread
63---------------------------------
64
65==================  =============================================
66Function            Description
67==================  =============================================
68pvariance           Population variance of data.
69variance            Sample variance of data.
70pstdev              Population standard deviation of data.
71stdev               Sample standard deviation of data.
72==================  =============================================
73
74Calculate the standard deviation of sample data:
75
76>>> stdev([2.5, 3.25, 5.5, 11.25, 11.75])  #doctest: +ELLIPSIS
774.38961843444...
78
79If you have previously calculated the mean, you can pass it as the optional
80second argument to the four "spread" functions to avoid recalculating it:
81
82>>> data = [1, 2, 2, 4, 4, 4, 5, 6]
83>>> mu = mean(data)
84>>> pvariance(data, mu)
852.5
86
87
88Exceptions
89----------
90
91A single exception is defined: StatisticsError is a subclass of ValueError.
92
93"""
94
95__all__ = [ 'StatisticsError',
96            'pstdev', 'pvariance', 'stdev', 'variance',
97            'median',  'median_low', 'median_high', 'median_grouped',
98            'mean', 'mode',
99          ]
100
101
102import collections
103import math
104
105from fractions import Fraction
106from decimal import Decimal
107from itertools import groupby
108
109
110
111# === Exceptions ===
112
113class StatisticsError(ValueError):
114    pass
115
116
117# === Private utilities ===
118
119def _sum(data, start=0):
120    """_sum(data [, start]) -> (type, sum, count)
121
122    Return a high-precision sum of the given numeric data as a fraction,
123    together with the type to be converted to and the count of items.
124
125    If optional argument ``start`` is given, it is added to the total.
126    If ``data`` is empty, ``start`` (defaulting to 0) is returned.
127
128
129    Examples
130    --------
131
132    >>> _sum([3, 2.25, 4.5, -0.5, 1.0], 0.75)
133    (<class 'float'>, Fraction(11, 1), 5)
134
135    Some sources of round-off error will be avoided:
136
137    >>> _sum([1e50, 1, -1e50] * 1000)  # Built-in sum returns zero.
138    (<class 'float'>, Fraction(1000, 1), 3000)
139
140    Fractions and Decimals are also supported:
141
142    >>> from fractions import Fraction as F
143    >>> _sum([F(2, 3), F(7, 5), F(1, 4), F(5, 6)])
144    (<class 'fractions.Fraction'>, Fraction(63, 20), 4)
145
146    >>> from decimal import Decimal as D
147    >>> data = [D("0.1375"), D("0.2108"), D("0.3061"), D("0.0419")]
148    >>> _sum(data)
149    (<class 'decimal.Decimal'>, Fraction(6963, 10000), 4)
150
151    Mixed types are currently treated as an error, except that int is
152    allowed.
153    """
154    count = 0
155    n, d = _exact_ratio(start)
156    partials = {d: n}
157    partials_get = partials.get
158    T = _coerce(int, type(start))
159    for typ, values in groupby(data, type):
160        T = _coerce(T, typ)  # or raise TypeError
161        for n,d in map(_exact_ratio, values):
162            count += 1
163            partials[d] = partials_get(d, 0) + n
164    if None in partials:
165        # The sum will be a NAN or INF. We can ignore all the finite
166        # partials, and just look at this special one.
167        total = partials[None]
168        assert not _isfinite(total)
169    else:
170        # Sum all the partial sums using builtin sum.
171        # FIXME is this faster if we sum them in order of the denominator?
172        total = sum(Fraction(n, d) for d, n in sorted(partials.items()))
173    return (T, total, count)
174
175
176def _isfinite(x):
177    try:
178        return x.is_finite()  # Likely a Decimal.
179    except AttributeError:
180        return math.isfinite(x)  # Coerces to float first.
181
182
183def _coerce(T, S):
184    """Coerce types T and S to a common type, or raise TypeError.
185
186    Coercion rules are currently an implementation detail. See the CoerceTest
187    test class in test_statistics for details.
188    """
189    # See http://bugs.python.org/issue24068.
190    assert T is not bool, "initial type T is bool"
191    # If the types are the same, no need to coerce anything. Put this
192    # first, so that the usual case (no coercion needed) happens as soon
193    # as possible.
194    if T is S:  return T
195    # Mixed int & other coerce to the other type.
196    if S is int or S is bool:  return T
197    if T is int:  return S
198    # If one is a (strict) subclass of the other, coerce to the subclass.
199    if issubclass(S, T):  return S
200    if issubclass(T, S):  return T
201    # Ints coerce to the other type.
202    if issubclass(T, int):  return S
203    if issubclass(S, int):  return T
204    # Mixed fraction & float coerces to float (or float subclass).
205    if issubclass(T, Fraction) and issubclass(S, float):
206        return S
207    if issubclass(T, float) and issubclass(S, Fraction):
208        return T
209    # Any other combination is disallowed.
210    msg = "don't know how to coerce %s and %s"
211    raise TypeError(msg % (T.__name__, S.__name__))
212
213
214def _exact_ratio(x):
215    """Return Real number x to exact (numerator, denominator) pair.
216
217    >>> _exact_ratio(0.25)
218    (1, 4)
219
220    x is expected to be an int, Fraction, Decimal or float.
221    """
222    try:
223        # Optimise the common case of floats. We expect that the most often
224        # used numeric type will be builtin floats, so try to make this as
225        # fast as possible.
226        if type(x) is float:
227            return x.as_integer_ratio()
228        try:
229            # x may be an int, Fraction, or Integral ABC.
230            return (x.numerator, x.denominator)
231        except AttributeError:
232            try:
233                # x may be a float subclass.
234                return x.as_integer_ratio()
235            except AttributeError:
236                try:
237                    # x may be a Decimal.
238                    return _decimal_to_ratio(x)
239                except AttributeError:
240                    # Just give up?
241                    pass
242    except (OverflowError, ValueError):
243        # float NAN or INF.
244        assert not math.isfinite(x)
245        return (x, None)
246    msg = "can't convert type '{}' to numerator/denominator"
247    raise TypeError(msg.format(type(x).__name__))
248
249
250# FIXME This is faster than Fraction.from_decimal, but still too slow.
251def _decimal_to_ratio(d):
252    """Convert Decimal d to exact integer ratio (numerator, denominator).
253
254    >>> from decimal import Decimal
255    >>> _decimal_to_ratio(Decimal("2.6"))
256    (26, 10)
257
258    """
259    sign, digits, exp = d.as_tuple()
260    if exp in ('F', 'n', 'N'):  # INF, NAN, sNAN
261        assert not d.is_finite()
262        return (d, None)
263    num = 0
264    for digit in digits:
265        num = num*10 + digit
266    if exp < 0:
267        den = 10**-exp
268    else:
269        num *= 10**exp
270        den = 1
271    if sign:
272        num = -num
273    return (num, den)
274
275
276def _convert(value, T):
277    """Convert value to given numeric type T."""
278    if type(value) is T:
279        # This covers the cases where T is Fraction, or where value is
280        # a NAN or INF (Decimal or float).
281        return value
282    if issubclass(T, int) and value.denominator != 1:
283        T = float
284    try:
285        # FIXME: what do we do if this overflows?
286        return T(value)
287    except TypeError:
288        if issubclass(T, Decimal):
289            return T(value.numerator)/T(value.denominator)
290        else:
291            raise
292
293
294def _counts(data):
295    # Generate a table of sorted (value, frequency) pairs.
296    table = collections.Counter(iter(data)).most_common()
297    if not table:
298        return table
299    # Extract the values with the highest frequency.
300    maxfreq = table[0][1]
301    for i in range(1, len(table)):
302        if table[i][1] != maxfreq:
303            table = table[:i]
304            break
305    return table
306
307
308# === Measures of central tendency (averages) ===
309
310def mean(data):
311    """Return the sample arithmetic mean of data.
312
313    >>> mean([1, 2, 3, 4, 4])
314    2.8
315
316    >>> from fractions import Fraction as F
317    >>> mean([F(3, 7), F(1, 21), F(5, 3), F(1, 3)])
318    Fraction(13, 21)
319
320    >>> from decimal import Decimal as D
321    >>> mean([D("0.5"), D("0.75"), D("0.625"), D("0.375")])
322    Decimal('0.5625')
323
324    If ``data`` is empty, StatisticsError will be raised.
325    """
326    if iter(data) is data:
327        data = list(data)
328    n = len(data)
329    if n < 1:
330        raise StatisticsError('mean requires at least one data point')
331    T, total, count = _sum(data)
332    assert count == n
333    return _convert(total/n, T)
334
335
336# FIXME: investigate ways to calculate medians without sorting? Quickselect?
337def median(data):
338    """Return the median (middle value) of numeric data.
339
340    When the number of data points is odd, return the middle data point.
341    When the number of data points is even, the median is interpolated by
342    taking the average of the two middle values:
343
344    >>> median([1, 3, 5])
345    3
346    >>> median([1, 3, 5, 7])
347    4.0
348
349    """
350    data = sorted(data)
351    n = len(data)
352    if n == 0:
353        raise StatisticsError("no median for empty data")
354    if n%2 == 1:
355        return data[n//2]
356    else:
357        i = n//2
358        return (data[i - 1] + data[i])/2
359
360
361def median_low(data):
362    """Return the low median of numeric data.
363
364    When the number of data points is odd, the middle value is returned.
365    When it is even, the smaller of the two middle values is returned.
366
367    >>> median_low([1, 3, 5])
368    3
369    >>> median_low([1, 3, 5, 7])
370    3
371
372    """
373    data = sorted(data)
374    n = len(data)
375    if n == 0:
376        raise StatisticsError("no median for empty data")
377    if n%2 == 1:
378        return data[n//2]
379    else:
380        return data[n//2 - 1]
381
382
383def median_high(data):
384    """Return the high median of data.
385
386    When the number of data points is odd, the middle value is returned.
387    When it is even, the larger of the two middle values is returned.
388
389    >>> median_high([1, 3, 5])
390    3
391    >>> median_high([1, 3, 5, 7])
392    5
393
394    """
395    data = sorted(data)
396    n = len(data)
397    if n == 0:
398        raise StatisticsError("no median for empty data")
399    return data[n//2]
400
401
402def median_grouped(data, interval=1):
403    """Return the 50th percentile (median) of grouped continuous data.
404
405    >>> median_grouped([1, 2, 2, 3, 4, 4, 4, 4, 4, 5])
406    3.7
407    >>> median_grouped([52, 52, 53, 54])
408    52.5
409
410    This calculates the median as the 50th percentile, and should be
411    used when your data is continuous and grouped. In the above example,
412    the values 1, 2, 3, etc. actually represent the midpoint of classes
413    0.5-1.5, 1.5-2.5, 2.5-3.5, etc. The middle value falls somewhere in
414    class 3.5-4.5, and interpolation is used to estimate it.
415
416    Optional argument ``interval`` represents the class interval, and
417    defaults to 1. Changing the class interval naturally will change the
418    interpolated 50th percentile value:
419
420    >>> median_grouped([1, 3, 3, 5, 7], interval=1)
421    3.25
422    >>> median_grouped([1, 3, 3, 5, 7], interval=2)
423    3.5
424
425    This function does not check whether the data points are at least
426    ``interval`` apart.
427    """
428    data = sorted(data)
429    n = len(data)
430    if n == 0:
431        raise StatisticsError("no median for empty data")
432    elif n == 1:
433        return data[0]
434    # Find the value at the midpoint. Remember this corresponds to the
435    # centre of the class interval.
436    x = data[n//2]
437    for obj in (x, interval):
438        if isinstance(obj, (str, bytes)):
439            raise TypeError('expected number but got %r' % obj)
440    try:
441        L = x - interval/2  # The lower limit of the median interval.
442    except TypeError:
443        # Mixed type. For now we just coerce to float.
444        L = float(x) - float(interval)/2
445    cf = data.index(x)  # Number of values below the median interval.
446    # FIXME The following line could be more efficient for big lists.
447    f = data.count(x)  # Number of data points in the median interval.
448    return L + interval*(n/2 - cf)/f
449
450
451def mode(data):
452    """Return the most common data point from discrete or nominal data.
453
454    ``mode`` assumes discrete data, and returns a single value. This is the
455    standard treatment of the mode as commonly taught in schools:
456
457    >>> mode([1, 1, 2, 3, 3, 3, 3, 4])
458    3
459
460    This also works with nominal (non-numeric) data:
461
462    >>> mode(["red", "blue", "blue", "red", "green", "red", "red"])
463    'red'
464
465    If there is not exactly one most common value, ``mode`` will raise
466    StatisticsError.
467    """
468    # Generate a table of sorted (value, frequency) pairs.
469    table = _counts(data)
470    if len(table) == 1:
471        return table[0][0]
472    elif table:
473        raise StatisticsError(
474                'no unique mode; found %d equally common values' % len(table)
475                )
476    else:
477        raise StatisticsError('no mode for empty data')
478
479
480# === Measures of spread ===
481
482# See http://mathworld.wolfram.com/Variance.html
483#     http://mathworld.wolfram.com/SampleVariance.html
484#     http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
485#
486# Under no circumstances use the so-called "computational formula for
487# variance", as that is only suitable for hand calculations with a small
488# amount of low-precision data. It has terrible numeric properties.
489#
490# See a comparison of three computational methods here:
491# http://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
492
493def _ss(data, c=None):
494    """Return sum of square deviations of sequence data.
495
496    If ``c`` is None, the mean is calculated in one pass, and the deviations
497    from the mean are calculated in a second pass. Otherwise, deviations are
498    calculated from ``c`` as given. Use the second case with care, as it can
499    lead to garbage results.
500    """
501    if c is None:
502        c = mean(data)
503    T, total, count = _sum((x-c)**2 for x in data)
504    # The following sum should mathematically equal zero, but due to rounding
505    # error may not.
506    U, total2, count2 = _sum((x-c) for x in data)
507    assert T == U and count == count2
508    total -=  total2**2/len(data)
509    assert not total < 0, 'negative sum of square deviations: %f' % total
510    return (T, total)
511
512
513def variance(data, xbar=None):
514    """Return the sample variance of data.
515
516    data should be an iterable of Real-valued numbers, with at least two
517    values. The optional argument xbar, if given, should be the mean of
518    the data. If it is missing or None, the mean is automatically calculated.
519
520    Use this function when your data is a sample from a population. To
521    calculate the variance from the entire population, see ``pvariance``.
522
523    Examples:
524
525    >>> data = [2.75, 1.75, 1.25, 0.25, 0.5, 1.25, 3.5]
526    >>> variance(data)
527    1.3720238095238095
528
529    If you have already calculated the mean of your data, you can pass it as
530    the optional second argument ``xbar`` to avoid recalculating it:
531
532    >>> m = mean(data)
533    >>> variance(data, m)
534    1.3720238095238095
535
536    This function does not check that ``xbar`` is actually the mean of
537    ``data``. Giving arbitrary values for ``xbar`` may lead to invalid or
538    impossible results.
539
540    Decimals and Fractions are supported:
541
542    >>> from decimal import Decimal as D
543    >>> variance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
544    Decimal('31.01875')
545
546    >>> from fractions import Fraction as F
547    >>> variance([F(1, 6), F(1, 2), F(5, 3)])
548    Fraction(67, 108)
549
550    """
551    if iter(data) is data:
552        data = list(data)
553    n = len(data)
554    if n < 2:
555        raise StatisticsError('variance requires at least two data points')
556    T, ss = _ss(data, xbar)
557    return _convert(ss/(n-1), T)
558
559
560def pvariance(data, mu=None):
561    """Return the population variance of ``data``.
562
563    data should be an iterable of Real-valued numbers, with at least one
564    value. The optional argument mu, if given, should be the mean of
565    the data. If it is missing or None, the mean is automatically calculated.
566
567    Use this function to calculate the variance from the entire population.
568    To estimate the variance from a sample, the ``variance`` function is
569    usually a better choice.
570
571    Examples:
572
573    >>> data = [0.0, 0.25, 0.25, 1.25, 1.5, 1.75, 2.75, 3.25]
574    >>> pvariance(data)
575    1.25
576
577    If you have already calculated the mean of the data, you can pass it as
578    the optional second argument to avoid recalculating it:
579
580    >>> mu = mean(data)
581    >>> pvariance(data, mu)
582    1.25
583
584    This function does not check that ``mu`` is actually the mean of ``data``.
585    Giving arbitrary values for ``mu`` may lead to invalid or impossible
586    results.
587
588    Decimals and Fractions are supported:
589
590    >>> from decimal import Decimal as D
591    >>> pvariance([D("27.5"), D("30.25"), D("30.25"), D("34.5"), D("41.75")])
592    Decimal('24.815')
593
594    >>> from fractions import Fraction as F
595    >>> pvariance([F(1, 4), F(5, 4), F(1, 2)])
596    Fraction(13, 72)
597
598    """
599    if iter(data) is data:
600        data = list(data)
601    n = len(data)
602    if n < 1:
603        raise StatisticsError('pvariance requires at least one data point')
604    ss = _ss(data, mu)
605    T, ss = _ss(data, mu)
606    return _convert(ss/n, T)
607
608
609def stdev(data, xbar=None):
610    """Return the square root of the sample variance.
611
612    See ``variance`` for arguments and other details.
613
614    >>> stdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
615    1.0810874155219827
616
617    """
618    var = variance(data, xbar)
619    try:
620        return var.sqrt()
621    except AttributeError:
622        return math.sqrt(var)
623
624
625def pstdev(data, mu=None):
626    """Return the square root of the population variance.
627
628    See ``pvariance`` for arguments and other details.
629
630    >>> pstdev([1.5, 2.5, 2.5, 2.75, 3.25, 4.75])
631    0.986893273527251
632
633    """
634    var = pvariance(data, mu)
635    try:
636        return var.sqrt()
637    except AttributeError:
638        return math.sqrt(var)
639