1# Copyright 2014 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5"""A collection of statistical utility functions to be used by metrics."""
6
7import math
8
9
10def Clamp(value, low=0.0, high=1.0):
11  """Clamp a value between some low and high value."""
12  return min(max(value, low), high)
13
14
15def NormalizeSamples(samples):
16  """Sorts the samples, and map them linearly to the range [0,1].
17
18  They're mapped such that for the N samples, the first sample is 0.5/N and the
19  last sample is (N-0.5)/N.
20
21  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
22  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
23  we don't want to distinguish between these two cases, as our original domain
24  is not bounded (it is for Monte Carlo integration, where discrepancy was
25  first used).
26  """
27  if not samples:
28    return samples, 1.0
29  samples = sorted(samples)
30  low = min(samples)
31  high = max(samples)
32  new_low = 0.5 / len(samples)
33  new_high = (len(samples)-0.5) / len(samples)
34  if high-low == 0.0:
35    return [0.5] * len(samples), 1.0
36  scale = (new_high - new_low) / (high - low)
37  for i in xrange(0, len(samples)):
38    samples[i] = float(samples[i] - low) * scale + new_low
39  return samples, scale
40
41
42def Discrepancy(samples, location_count=None):
43  """Computes the discrepancy of a set of 1D samples from the interval [0,1].
44
45  The samples must be sorted. We define the discrepancy of an empty set
46  of samples to be zero.
47
48  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
49  http://mathworld.wolfram.com/Discrepancy.html
50  """
51  if not samples:
52    return 0.0
53
54  max_local_discrepancy = 0
55  inv_sample_count = 1.0 / len(samples)
56  locations = []
57  # For each location, stores the number of samples less than that location.
58  count_less = []
59  # For each location, stores the number of samples less than or equal to that
60  # location.
61  count_less_equal = []
62
63  if location_count:
64    # Generate list of equally spaced locations.
65    sample_index = 0
66    for i in xrange(0, int(location_count)):
67      location = float(i) / (location_count-1)
68      locations.append(location)
69      while sample_index < len(samples) and samples[sample_index] < location:
70        sample_index += 1
71      count_less.append(sample_index)
72      while  sample_index < len(samples) and samples[sample_index] <= location:
73        sample_index += 1
74      count_less_equal.append(sample_index)
75  else:
76    # Populate locations with sample positions. Append 0 and 1 if necessary.
77    if samples[0] > 0.0:
78      locations.append(0.0)
79      count_less.append(0)
80      count_less_equal.append(0)
81    for i in xrange(0, len(samples)):
82      locations.append(samples[i])
83      count_less.append(i)
84      count_less_equal.append(i+1)
85    if samples[-1] < 1.0:
86      locations.append(1.0)
87      count_less.append(len(samples))
88      count_less_equal.append(len(samples))
89
90  # Iterate over the intervals defined by any pair of locations.
91  for i in xrange(0, len(locations)):
92    for j in xrange(i+1, len(locations)):
93      # Length of interval
94      length = locations[j] - locations[i]
95
96      # Local discrepancy for closed interval
97      count_closed = count_less_equal[j] - count_less[i]
98      local_discrepancy_closed = abs(float(count_closed) *
99                                     inv_sample_count - length)
100      max_local_discrepancy = max(local_discrepancy_closed,
101                                  max_local_discrepancy)
102
103      # Local discrepancy for open interval
104      count_open = count_less[j] - count_less_equal[i]
105      local_discrepancy_open = abs(float(count_open) *
106                                   inv_sample_count - length)
107      max_local_discrepancy = max(local_discrepancy_open,
108                                  max_local_discrepancy)
109
110  return max_local_discrepancy
111
112
113def TimestampsDiscrepancy(timestamps, absolute=True,
114                          location_count=None):
115  """A discrepancy based metric for measuring timestamp jank.
116
117  TimestampsDiscrepancy quantifies the largest area of jank observed in a series
118  of timestamps.  Note that this is different from metrics based on the
119  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
120  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
121  Discrepancy(B) > Discrepancy(A).
122
123  Two variants of discrepancy can be computed:
124
125  Relative discrepancy is following the original definition of
126  discrepancy. It characterized the largest area of jank, relative to the
127  duration of the entire time stamp series.  We normalize the raw results,
128  because the best case discrepancy for a set of N samples is 1/N (for
129  equally spaced samples), and we want our metric to report 0.0 in that
130  case.
131
132  Absolute discrepancy also characterizes the largest area of jank, but its
133  value wouldn't change (except for imprecisions due to a low
134  |interval_multiplier|) if additional 'good' intervals were added to an
135  exisiting list of time stamps.  Its range is [0,inf] and the unit is
136  milliseconds.
137
138  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
139  absolute discrepancy, but D has lower relative discrepancy than C.
140
141  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
142  S_i is a time stamp series. In that case, the discrepancy D(S) is:
143  D(S) = max(D(S_1), D(S_2), ..., D(S_N))
144  """
145  if not timestamps:
146    return 0.0
147
148  if isinstance(timestamps[0], list):
149    range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
150    return max(range_discrepancies)
151
152  samples, sample_scale = NormalizeSamples(timestamps)
153  discrepancy = Discrepancy(samples, location_count)
154  inv_sample_count = 1.0 / len(samples)
155  if absolute:
156    # Compute absolute discrepancy
157    discrepancy /= sample_scale
158  else:
159    # Compute relative discrepancy
160    discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count))
161  return discrepancy
162
163
164def DurationsDiscrepancy(durations, absolute=True,
165                         location_count=None):
166  """A discrepancy based metric for measuring duration jank.
167
168  DurationsDiscrepancy computes a jank metric which measures how irregular a
169  given sequence of intervals is. In order to minimize jank, each duration
170  should be equally long. This is similar to how timestamp jank works,
171  and we therefore reuse the timestamp discrepancy function above to compute a
172  similar duration discrepancy number.
173
174  Because timestamp discrepancy is defined in terms of timestamps, we first
175  convert the list of durations to monotonically increasing timestamps.
176
177  Args:
178    durations: List of interval lengths in milliseconds.
179    absolute: See TimestampsDiscrepancy.
180    interval_multiplier: See TimestampsDiscrepancy.
181  """
182  if not durations:
183    return 0.0
184
185  timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0])
186  return TimestampsDiscrepancy(timestamps, absolute, location_count)
187
188
189def ArithmeticMean(data):
190  """Calculates arithmetic mean.
191
192  Args:
193    data: A list of samples.
194
195  Returns:
196    The arithmetic mean value, or 0 if the list is empty.
197  """
198  numerator_total = Total(data)
199  denominator_total = Total(len(data))
200  return DivideIfPossibleOrZero(numerator_total, denominator_total)
201
202
203def StandardDeviation(data):
204  """Calculates the standard deviation.
205
206  Args:
207    data: A list of samples.
208
209  Returns:
210    The standard deviation of the samples provided.
211  """
212  if len(data) == 1:
213    return 0.0
214
215  mean = ArithmeticMean(data)
216  variances = [float(x) - mean for x in data]
217  variances = [x * x for x in variances]
218  std_dev = math.sqrt(ArithmeticMean(variances))
219
220  return std_dev
221
222
223def TrapezoidalRule(data, dx):
224  """ Calculate the integral according to the trapezoidal rule
225
226  TrapezoidalRule approximates the definite integral of f from a to b by
227  the composite trapezoidal rule, using n subintervals.
228  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid
229
230  Args:
231    data: A list of samples
232    dx: The uniform distance along the x axis between any two samples
233
234  Returns:
235    The area under the curve defined by the samples and the uniform distance
236    according to the trapezoidal rule.
237  """
238
239  n = len(data) - 1
240  s = data[0] + data[n]
241
242  if n == 0:
243    return 0.0
244
245  for i in range(1, n):
246    s += 2 * data[i]
247
248  return s * dx / 2.0
249
250def Total(data):
251  """Returns the float value of a number or the sum of a list."""
252  if type(data) == float:
253    total = data
254  elif type(data) == int:
255    total = float(data)
256  elif type(data) == list:
257    total = float(sum(data))
258  else:
259    raise TypeError
260  return total
261
262
263def DivideIfPossibleOrZero(numerator, denominator):
264  """Returns the quotient, or zero if the denominator is zero."""
265  if not denominator:
266    return 0.0
267  else:
268    return numerator / denominator
269
270
271def GeneralizedMean(values, exponent):
272  """See http://en.wikipedia.org/wiki/Generalized_mean"""
273  if not values:
274    return 0.0
275  sum_of_powers = 0.0
276  for v in values:
277    sum_of_powers += v ** exponent
278  return (sum_of_powers / len(values)) ** (1.0/exponent)
279
280
281def Median(values):
282  """Gets the median of a list of values."""
283  return Percentile(values, 50)
284
285
286def Percentile(values, percentile):
287  """Calculates the value below which a given percentage of values fall.
288
289  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
290  percentile for this set of values. When the percentage doesn't exactly
291  match a rank in the list of values, the percentile is computed using linear
292  interpolation between closest ranks.
293
294  Args:
295    values: A list of numerical values.
296    percentile: A number between 0 and 100.
297
298  Returns:
299    The Nth percentile for the list of values, where N is the given percentage.
300  """
301  if not values:
302    return 0.0
303  sorted_values = sorted(values)
304  n = len(values)
305  percentile /= 100.0
306  if percentile <= 0.5 / n:
307    return sorted_values[0]
308  elif percentile >= (n - 0.5) / n:
309    return sorted_values[-1]
310  else:
311    floor_index = int(math.floor(n * percentile -  0.5))
312    floor_value = sorted_values[floor_index]
313    ceil_value = sorted_values[floor_index+1]
314    alpha = n * percentile - 0.5 - floor_index
315    return floor_value + alpha * (ceil_value - floor_value)
316
317
318def GeometricMean(values):
319  """Compute a rounded geometric mean from an array of values."""
320  if not values:
321    return None
322  # To avoid infinite value errors, make sure no value is less than 0.001.
323  new_values = []
324  for value in values:
325    if value > 0.001:
326      new_values.append(value)
327    else:
328      new_values.append(0.001)
329  # Compute the sum of the log of the values.
330  log_sum = sum(map(math.log, new_values))
331  # Raise e to that sum over the number of values.
332  mean = math.pow(math.e, (log_sum / len(new_values)))
333  # Return the rounded mean.
334  return int(round(mean))
335