statistics.py revision a1401311d1ab56c4ed0a474bd38c108f75cb0cd9
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""A collection of statistical utility functions to be used by metrics."""

import bisect
import math


def Clamp(value, low=0.0, high=1.0):
  """Clamps a value to the inclusive range [low, high].

  Args:
    value: The number to clamp.
    low: The lower bound (default 0.0).
    high: The upper bound (default 1.0).

  Returns:
    |value|, limited to the range [low, high].
  """
  return min(max(value, low), high)


def NormalizeSamples(samples):
  """Sorts the samples, and maps them linearly to the range [0,1].

  They're mapped such that for the N samples, the first sample is 0.5/N and the
  last sample is (N-0.5)/N.

  Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N,
  twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case
  we don't want to distinguish between these two cases, as our original domain
  is not bounded (it is for Monte Carlo integration, where discrepancy was
  first used).

  Args:
    samples: A list of numeric samples.

  Returns:
    A (normalized_samples, scale) tuple. |scale| is the slope of the linear
    map applied, or 1.0 when no rescaling was performed (empty input or all
    samples identical).
  """
  if not samples:
    return samples, 1.0
  samples = sorted(samples)
  low = min(samples)
  high = max(samples)
  new_low = 0.5 / len(samples)
  new_high = (len(samples) - 0.5) / len(samples)
  if high - low == 0.0:
    # All samples are identical; a linear rescale is undefined.
    return samples, 1.0
  scale = (new_high - new_low) / (high - low)
  samples = [float(s - low) * scale + new_low for s in samples]
  return samples, scale


def Discrepancy(samples, interval_multiplier=10000):
  """Computes the discrepancy of a set of 1D samples from the interval [0,1].

  The samples must be sorted.

  http://en.wikipedia.org/wiki/Low-discrepancy_sequence
  http://mathworld.wolfram.com/Discrepancy.html

  Args:
    samples: A sorted list of floats in [0,1].
    interval_multiplier: Controls how many candidate intervals are examined;
        higher values trade speed for precision.

  Returns:
    The maximum local discrepancy over all examined intervals, or 1.0 for
    empty input.
  """
  if not samples:
    return 1.0

  max_local_discrepancy = 0
  locations = []
  # For each location, stores the number of samples less than that location.
  left = []
  # For each location, stores the number of samples less than or equal to that
  # location.
  right = []

  interval_count = len(samples) * interval_multiplier
  # Compute a number of locations that will roughly result in the requested
  # number of intervals (every pair of locations defines one interval).
  location_count = int(math.ceil(math.sqrt(interval_count * 2)))
  inv_sample_count = 1.0 / len(samples)

  # Generate list of equally spaced locations.
  for i in range(0, location_count):
    location = float(i) / (location_count - 1)
    locations.append(location)
    left.append(bisect.bisect_left(samples, location))
    right.append(bisect.bisect_right(samples, location))

  # Iterate over the intervals defined by any pair of locations.
  for i in range(0, len(locations)):
    for j in range(i, len(locations)):
      # Compute length of interval and number of samples in the interval.
      length = locations[j] - locations[i]
      count = right[j] - left[i]

      # Compute local discrepancy and update max_local_discrepancy.
      local_discrepancy = abs(float(count) * inv_sample_count - length)
      max_local_discrepancy = max(local_discrepancy, max_local_discrepancy)

  return max_local_discrepancy


def TimestampsDiscrepancy(timestamps, absolute=True,
                          interval_multiplier=10000):
  """A discrepancy based metric for measuring timestamp jank.

  TimestampsDiscrepancy quantifies the largest area of jank observed in a
  series of timestamps. Note that this is different from metrics based on the
  max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and
  B = [0,1,2,3,5,7] have the same max_time_interval = 2, but
  Discrepancy(B) > Discrepancy(A).

  Two variants of discrepancy can be computed:

  Relative discrepancy is following the original definition of
  discrepancy. It characterized the largest area of jank, relative to the
  duration of the entire time stamp series. We normalize the raw results,
  because the best case discrepancy for a set of N samples is 1/N (for
  equally spaced samples), and we want our metric to report 0.0 in that
  case.

  Absolute discrepancy also characterizes the largest area of jank, but its
  value wouldn't change (except for imprecisions due to a low
  |interval_multiplier|) if additional 'good' intervals were added to an
  existing list of time stamps. Its range is [0,inf] and the unit is
  milliseconds.

  The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same
  absolute discrepancy, but D has lower relative discrepancy than C.

  |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each
  S_i is a time stamp series. In that case, the discrepancy D(S) is:
  D(S) = max(D(S_1), D(S_2), ..., D(S_N))

  Args:
    timestamps: A list of timestamps, or a list of timestamp lists.
    absolute: If True, compute absolute discrepancy; otherwise relative.
    interval_multiplier: See Discrepancy.

  Returns:
    The discrepancy value, or 1.0 for empty input.
  """
  if not timestamps:
    return 1.0

  if isinstance(timestamps[0], list):
    # Recurse over each timestamp series and report the worst one.
    range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps]
    return max(range_discrepancies)

  samples, sample_scale = NormalizeSamples(timestamps)
  discrepancy = Discrepancy(samples, interval_multiplier)
  inv_sample_count = 1.0 / len(samples)
  if absolute:
    # Compute absolute discrepancy: undo the normalization scale.
    discrepancy /= sample_scale
  else:
    # Compute relative discrepancy: map the best case (1/N) to 0.0.
    discrepancy = Clamp((discrepancy - inv_sample_count) /
                        (1.0 - inv_sample_count))
  return discrepancy


def DurationsDiscrepancy(durations, absolute=True,
                         interval_multiplier=10000):
  """A discrepancy based metric for measuring duration jank.

  DurationsDiscrepancy computes a jank metric which measures how irregular a
  given sequence of intervals is. In order to minimize jank, each duration
  should be equally long. This is similar to how timestamp jank works,
  and we therefore reuse the timestamp discrepancy function above to compute a
  similar duration discrepancy number.

  Because timestamp discrepancy is defined in terms of timestamps, we first
  convert the list of durations to monotonically increasing timestamps.

  Args:
    durations: List of interval lengths in milliseconds.
    absolute: See TimestampsDiscrepancy.
    interval_multiplier: See TimestampsDiscrepancy.

  Returns:
    The discrepancy of the cumulative-sum timestamp series.
  """
  # Convert durations to cumulative timestamps (the implicit 0 start point is
  # excluded, matching the original reduce(...)[1:] behavior).
  timestamps = []
  total = 0
  for duration in durations:
    total += duration
    timestamps.append(total)
  return TimestampsDiscrepancy(timestamps, absolute, interval_multiplier)


def ArithmeticMean(data):
  """Calculates arithmetic mean.

  Args:
    data: A list of samples.

  Returns:
    The arithmetic mean value, or 0 if the list is empty.
  """
  numerator_total = Total(data)
  denominator_total = Total(len(data))
  return DivideIfPossibleOrZero(numerator_total, denominator_total)


def StandardDeviation(data):
  """Calculates the population standard deviation.

  Args:
    data: A list of samples.

  Returns:
    The standard deviation of the samples provided (0.0 for a single sample).
  """
  if len(data) == 1:
    return 0.0

  mean = ArithmeticMean(data)
  # Mean of squared deviations, then square root (population formula).
  squared_deviations = [(float(x) - mean) ** 2 for x in data]
  return math.sqrt(ArithmeticMean(squared_deviations))


def TrapezoidalRule(data, dx):
  """Calculates the integral according to the trapezoidal rule.

  TrapezoidalRule approximates the definite integral of f from a to b by
  the composite trapezoidal rule, using n subintervals.
  http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid

  Args:
    data: A list of samples.
    dx: The uniform distance along the x axis between any two samples.

  Returns:
    The area under the curve defined by the samples and the uniform distance
    according to the trapezoidal rule. Returns 0.0 when fewer than two
    samples are given (no interval to integrate over).
  """
  n = len(data) - 1
  # Guard before indexing: the original crashed on an empty list.
  if n <= 0:
    return 0.0

  s = data[0] + data[n]
  for i in range(1, n):
    s += 2 * data[i]

  return s * dx / 2.0


def Total(data):
  """Returns the float value of a number or the sum of a list.

  Args:
    data: A float, an int, or a list of numbers.

  Returns:
    The value as a float (sum of elements for a list).

  Raises:
    TypeError: If |data| is not a float, int, or list.
  """
  if isinstance(data, float):
    return data
  if isinstance(data, int):
    return float(data)
  if isinstance(data, list):
    return float(sum(data))
  raise TypeError('Unsupported type: %s' % type(data))


def DivideIfPossibleOrZero(numerator, denominator):
  """Returns the quotient, or zero if the denominator is zero."""
  if not denominator:
    return 0.0
  return numerator / denominator


def GeneralizedMean(values, exponent):
  """Computes the generalized (power) mean of a list of values.

  See http://en.wikipedia.org/wiki/Generalized_mean

  Args:
    values: A list of numeric values.
    exponent: The power-mean exponent. Must be non-zero (the exponent-0
        limit, the geometric mean, is not handled here).

  Returns:
    The generalized mean, or 0.0 for an empty list.
  """
  if not values:
    return 0.0
  sum_of_powers = 0.0
  for v in values:
    sum_of_powers += v ** exponent
  return (sum_of_powers / len(values)) ** (1.0 / exponent)


def Median(values):
  """Gets the median of a list of values."""
  return Percentile(values, 50)


def Percentile(values, percentile):
  """Calculates the value below which a given percentage of values fall.

  For example, if 17% of the values are less than 5.0, then 5.0 is the 17th
  percentile for this set of values. When the percentage doesn't exactly
  match a rank in the list of values, the percentile is computed using linear
  interpolation between closest ranks.

  Args:
    values: A list of numerical values.
    percentile: A number between 0 and 100.

  Returns:
    The Nth percentile for the list of values, where N is the given
    percentage, or 0.0 for an empty list.
  """
  if not values:
    return 0.0
  sorted_values = sorted(values)
  n = len(values)
  percentile /= 100.0
  if percentile <= 0.5 / n:
    return sorted_values[0]
  elif percentile >= (n - 0.5) / n:
    return sorted_values[-1]
  else:
    # Linear interpolation between the two closest ranks.
    floor_index = int(math.floor(n * percentile - 0.5))
    floor_value = sorted_values[floor_index]
    ceil_value = sorted_values[floor_index + 1]
    alpha = n * percentile - 0.5 - floor_index
    return floor_value + alpha * (ceil_value - floor_value)


def GeometricMean(values):
  """Computes a rounded geometric mean from an array of values.

  Args:
    values: A list of numeric values.

  Returns:
    The geometric mean rounded to the nearest int, or None for an empty list.
  """
  if not values:
    return None
  # To avoid infinite value errors, make sure no value is less than 0.001.
  new_values = [value if value > 0.001 else 0.001 for value in values]
  # Geometric mean = exp(mean of logs).
  log_sum = sum(map(math.log, new_values))
  mean = math.exp(log_sum / len(new_values))
  # Return the rounded mean.
  return int(round(mean))