# statistics.py revision a1401311d1ab56c4ed0a474bd38c108f75cb0cd9
1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""A collection of statistical utility functions to be used by metrics."""
6
7import bisect
8import math
9
10
def Clamp(value, low=0.0, high=1.0):
  """Clamp a value between some low and high value."""
  # Bound from below first, then from above (matches min(max(v, low), high)).
  bounded_below = max(value, low)
  return min(bounded_below, high)
14
15
def NormalizeSamples(samples):
  """Sorts the samples, and maps them linearly to the range [0,1].

  They're mapped such that for the N samples, the first sample is 0.5/N and the
  last sample is (N-0.5)/N.

  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  we don't want to distinguish between these two cases, as our original domain
  is not bounded (it is for Monte Carlo integration, where discrepancy was
  first used).

  Args:
    samples: A list of numerical samples.

  Returns:
    A tuple (sorted_normalized_samples, scale), where scale is the factor by
    which sample spacing was multiplied (1.0 if no rescaling was done).
  """
  if not samples:
    return samples, 1.0
  samples = sorted(samples)
  low = min(samples)
  high = max(samples)
  new_low = 0.5 / len(samples)
  new_high = (len(samples) - 0.5) / len(samples)
  if high - low == 0.0:
    # All samples are identical; there is no spacing to rescale.
    return samples, 1.0
  scale = (new_high - new_low) / (high - low)
  # range (not the Python-2-only xrange) keeps this working on Python 3.
  for i in range(len(samples)):
    samples[i] = float(samples[i] - low) * scale + new_low
  return samples, scale
41
42
43def Discrepancy(samples, interval_multiplier=10000):
44  """Computes the discrepancy of a set of 1D samples from the interval [0,1].
45
46  The samples must be sorted.
47
48  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
49  http://mathworld.wolfram.com/Discrepancy.html
50  """
51  if not samples:
52    return 1.0
53
54  max_local_discrepancy = 0
55  locations = []
56  # For each location, stores the number of samples less than that location.
57  left = []
58  # For each location, stores the number of samples less than or equal to that
59  # location.
60  right = []
61
62  interval_count = len(samples) * interval_multiplier
63  # Compute number of locations the will roughly result in the requested number
64  # of intervals.
65  location_count = int(math.ceil(math.sqrt(interval_count*2)))
66  inv_sample_count = 1.0 / len(samples)
67
68  # Generate list of equally spaced locations.
69  for i in xrange(0, location_count):
70    location = float(i) / (location_count-1)
71    locations.append(location)
72    left.append(bisect.bisect_left(samples, location))
73    right.append(bisect.bisect_right(samples, location))
74
75  # Iterate over the intervals defined by any pair of locations.
76  for i in xrange(0, len(locations)):
77    for j in xrange(i, len(locations)):
78      # Compute length of interval and number of samples in the interval.
79      length = locations[j] - locations[i]
80      count = right[j] - left[i]
81
82      # Compute local discrepancy and update max_local_discrepancy.
83      local_discrepancy = abs(float(count)*inv_sample_count - length)
84      max_local_discrepancy = max(local_discrepancy, max_local_discrepancy)
85
86  return max_local_discrepancy
87
88
def TimestampsDiscrepancy(timestamps, absolute=True,
                          interval_multiplier=10000):
  """A discrepancy based metric for measuring timestamp jank.

  TimestampsDiscrepancy quantifies the largest area of jank observed in a series
  of timestamps.  Note that this is different from metrics based on the
  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
  Discrepancy(B) > Discrepancy(A).

  Two variants of discrepancy can be computed:

  Relative discrepancy is following the original definition of
  discrepancy. It characterized the largest area of jank, relative to the
  duration of the entire time stamp series.  We normalize the raw results,
  because the best case discrepancy for a set of N samples is 1/N (for
  equally spaced samples), and we want our metric to report 0.0 in that
  case.

  Absolute discrepancy also characterizes the largest area of jank, but its
  value wouldn't change (except for imprecisions due to a low
  |interval_multiplier|) if additional 'good' intervals were added to an
  existing list of time stamps.  Its range is [0,inf] and the unit is
  milliseconds.

  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
  absolute discrepancy, but D has lower relative discrepancy than C.

  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
  S_i is a time stamp series. In that case, the discrepancy D(S) is:
  D(S) = max(D(S_1), D(S_2), ..., D(S_N))
  """
  if not timestamps:
    return 1.0

  if isinstance(timestamps[0], list):
    # Score each nested series with the caller's |absolute| and
    # |interval_multiplier| instead of silently falling back to the defaults.
    range_discrepancies = [
        TimestampsDiscrepancy(r, absolute, interval_multiplier)
        for r in timestamps]
    return max(range_discrepancies)

  samples, sample_scale = NormalizeSamples(timestamps)
  discrepancy = Discrepancy(samples, interval_multiplier)
  inv_sample_count = 1.0 / len(samples)
  if absolute:
    # Compute absolute discrepancy: undo the normalization scaling so the
    # result is in the units of the input timestamps.
    discrepancy /= sample_scale
  else:
    # Compute relative discrepancy: map the best case (1/N) to 0.0.
    discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
  return discrepancy
138
139
def DurationsDiscrepancy(durations, absolute=True,
                         interval_multiplier=10000):
  """A discrepancy based metric for measuring duration jank.

  DurationsDiscrepancy computes a jank metric which measures how irregular a
  given sequence of intervals is. In order to minimize jank, each duration
  should be equally long. This is similar to how timestamp jank works,
  and we therefore reuse the timestamp discrepancy function above to compute a
  similar duration discrepancy number.

  Because timestamp discrepancy is defined in terms of timestamps, we first
  convert the list of durations to monotonically increasing timestamps.

  Args:
    durations: List of interval lengths in milliseconds.
    absolute: See TimestampsDiscrepancy.
    interval_multiplier: See TimestampsDiscrepancy.
  """
  # Cumulative sum via an explicit loop rather than the bare |reduce| builtin,
  # which no longer exists on Python 3.
  timestamps = []
  elapsed = 0
  for duration in durations:
    elapsed += duration
    timestamps.append(elapsed)
  return TimestampsDiscrepancy(timestamps, absolute, interval_multiplier)
160
161
def ArithmeticMean(data):
  """Calculates the arithmetic mean.

  Args:
    data: A list of samples.

  Returns:
    The arithmetic mean value, or 0 if the list is empty.
  """
  # Total() converts both the sum and the count to floats;
  # DivideIfPossibleOrZero() handles the empty-list (count == 0) case.
  sample_sum = Total(data)
  sample_count = Total(len(data))
  return DivideIfPossibleOrZero(sample_sum, sample_count)
174
175
def StandardDeviation(data):
  """Calculates the standard deviation.

  Args:
    data: A list of samples.

  Returns:
    The standard deviation of the samples provided.
  """
  # A single sample has no spread by definition.
  if len(data) == 1:
    return 0.0

  mean = ArithmeticMean(data)
  deviations = [float(sample) - mean for sample in data]
  squared_deviations = [d * d for d in deviations]
  return math.sqrt(ArithmeticMean(squared_deviations))
194
195
def TrapezoidalRule(data, dx):
  """Calculates the integral according to the trapezoidal rule.

  TrapezoidalRule approximates the definite integral of f from a to b by
  the composite trapezoidal rule, using n subintervals.
  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

  Args:
    data: A list of samples.
    dx: The uniform distance along the x axis between any two samples.

  Returns:
    The area under the curve defined by the samples and the uniform distance
    according to the trapezoidal rule, or 0.0 for fewer than two samples.
  """
  n = len(data) - 1
  # Guard before touching data[0]/data[n]: fewer than two samples define no
  # subinterval, and an empty list would otherwise raise IndexError.
  if n <= 0:
    return 0.0

  # Endpoints count once, interior samples count twice.
  s = data[0] + data[n] + 2 * sum(data[1:n])
  return s * dx / 2.0
222
def Total(data):
  """Returns the float value of a number or the sum of a list.

  Args:
    data: A number (int or float) or a list of numbers.

  Returns:
    The value converted to a float; for a list, the sum of its elements.

  Raises:
    TypeError: If |data| is neither a number nor a list.
  """
  # isinstance is preferred over exact type() comparison and also accepts
  # subclasses of the listed types.
  if isinstance(data, float):
    return data
  if isinstance(data, int):
    return float(data)
  if isinstance(data, list):
    return float(sum(data))
  raise TypeError('Expected a float, int or list, got %r' % type(data))
234
235
def DivideIfPossibleOrZero(numerator, denominator):
  """Returns the quotient, or zero if the denominator is zero."""
  return numerator / denominator if denominator else 0.0
242
243
def GeneralizedMean(values, exponent):
  """See http://en.wikipedia.org/wiki/Generalized_mean"""
  if not values:
    return 0.0
  # Start the fold at 0.0 so accumulation happens in float arithmetic.
  power_sum = sum((v ** exponent for v in values), 0.0)
  return (power_sum / len(values)) ** (1.0 / exponent)
252
253
def Median(values):
  """Gets the median of a list of values."""
  # The median is the 50th percentile by definition.
  return Percentile(values, 50)
257
258
def Percentile(values, percentile):
  """Calculates the value below which a given percentage of values fall.

  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
  percentile for this set of values. When the percentage doesn't exactly
  match a rank in the list of values, the percentile is computed using linear
  interpolation between closest ranks.

  Args:
    values: A list of numerical values.
    percentile: A number between 0 and 100.

  Returns:
    The Nth percentile for the list of values, where N is the given percentage.
  """
  if not values:
    return 0.0
  ordered = sorted(values)
  count = len(ordered)
  fraction = percentile / 100.0
  # Outside the first/last half-rank, clamp to the extreme values.
  if fraction <= 0.5 / count:
    return ordered[0]
  if fraction >= (count - 0.5) / count:
    return ordered[-1]
  # Linearly interpolate between the two closest ranks.
  rank = count * fraction - 0.5
  lower_index = int(math.floor(rank))
  weight = rank - lower_index
  lower_value = ordered[lower_index]
  upper_value = ordered[lower_index + 1]
  return lower_value + weight * (upper_value - lower_value)
289
290
def GeometricMean(values):
  """Compute a rounded geometric mean from an array of values."""
  if not values:
    return None
  # Floor every value at 0.001 so math.log never sees a non-positive argument
  # (avoids infinite/undefined results).
  clamped = [value if value > 0.001 else 0.001 for value in values]
  # Geometric mean = e raised to the arithmetic mean of the logs.
  log_sum = sum(map(math.log, clamped))
  mean = math.pow(math.e, log_sum / len(clamped))
  return int(round(mean))
308
309