1# Copyright 2014 The Chromium Authors. All rights reserved. 2# Use of this source code is governed by a BSD-style license that can be 3# found in the LICENSE file. 4 5"""A collection of statistical utility functions to be used by metrics.""" 6 7import math 8 9 10def Clamp(value, low=0.0, high=1.0): 11 """Clamp a value between some low and high value.""" 12 return min(max(value, low), high) 13 14 15def NormalizeSamples(samples): 16 """Sorts the samples, and map them linearly to the range [0,1]. 17 18 They're mapped such that for the N samples, the first sample is 0.5/N and the 19 last sample is (N-0.5)/N. 20 21 Background: The discrepancy of the sample set i/(N-1); i=0, ..., N-1 is 2/N, 22 twice the discrepancy of the sample set (i+1/2)/N; i=0, ..., N-1. In our case 23 we don't want to distinguish between these two cases, as our original domain 24 is not bounded (it is for Monte Carlo integration, where discrepancy was 25 first used). 26 """ 27 if not samples: 28 return samples, 1.0 29 samples = sorted(samples) 30 low = min(samples) 31 high = max(samples) 32 new_low = 0.5 / len(samples) 33 new_high = (len(samples)-0.5) / len(samples) 34 if high-low == 0.0: 35 return [0.5] * len(samples), 1.0 36 scale = (new_high - new_low) / (high - low) 37 for i in xrange(0, len(samples)): 38 samples[i] = float(samples[i] - low) * scale + new_low 39 return samples, scale 40 41 42def Discrepancy(samples, location_count=None): 43 """Computes the discrepancy of a set of 1D samples from the interval [0,1]. 44 45 The samples must be sorted. We define the discrepancy of an empty set 46 of samples to be zero. 47 48 http://en.wikipedia.org/wiki/Low-discrepancy_sequence 49 http://mathworld.wolfram.com/Discrepancy.html 50 """ 51 if not samples: 52 return 0.0 53 54 max_local_discrepancy = 0 55 inv_sample_count = 1.0 / len(samples) 56 locations = [] 57 # For each location, stores the number of samples less than that location. 58 count_less = [] 59 # For each location, stores the number of samples less than or equal to that 60 # location. 61 count_less_equal = [] 62 63 if location_count: 64 # Generate list of equally spaced locations. 65 sample_index = 0 66 for i in xrange(0, int(location_count)): 67 location = float(i) / (location_count-1) 68 locations.append(location) 69 while sample_index < len(samples) and samples[sample_index] < location: 70 sample_index += 1 71 count_less.append(sample_index) 72 while sample_index < len(samples) and samples[sample_index] <= location: 73 sample_index += 1 74 count_less_equal.append(sample_index) 75 else: 76 # Populate locations with sample positions. Append 0 and 1 if necessary. 77 if samples[0] > 0.0: 78 locations.append(0.0) 79 count_less.append(0) 80 count_less_equal.append(0) 81 for i in xrange(0, len(samples)): 82 locations.append(samples[i]) 83 count_less.append(i) 84 count_less_equal.append(i+1) 85 if samples[-1] < 1.0: 86 locations.append(1.0) 87 count_less.append(len(samples)) 88 count_less_equal.append(len(samples)) 89 90 # Iterate over the intervals defined by any pair of locations. 91 for i in xrange(0, len(locations)): 92 for j in xrange(i+1, len(locations)): 93 # Length of interval 94 length = locations[j] - locations[i] 95 96 # Local discrepancy for closed interval 97 count_closed = count_less_equal[j] - count_less[i] 98 local_discrepancy_closed = abs(float(count_closed) * 99 inv_sample_count - length) 100 max_local_discrepancy = max(local_discrepancy_closed, 101 max_local_discrepancy) 102 103 # Local discrepancy for open interval 104 count_open = count_less[j] - count_less_equal[i] 105 local_discrepancy_open = abs(float(count_open) * 106 inv_sample_count - length) 107 max_local_discrepancy = max(local_discrepancy_open, 108 max_local_discrepancy) 109 110 return max_local_discrepancy 111 112 113def TimestampsDiscrepancy(timestamps, absolute=True, 114 location_count=None): 115 """A discrepancy based metric for measuring timestamp jank. 116 117 TimestampsDiscrepancy quantifies the largest area of jank observed in a series 118 of timestamps. Note that this is different from metrics based on the 119 max_time_interval. For example, the time stamp series A = [0,1,2,3,5,6] and 120 B = [0,1,2,3,5,7] have the same max_time_interval = 2, but 121 Discrepancy(B) > Discrepancy(A). 122 123 Two variants of discrepancy can be computed: 124 125 Relative discrepancy is following the original definition of 126 discrepancy. It characterized the largest area of jank, relative to the 127 duration of the entire time stamp series. We normalize the raw results, 128 because the best case discrepancy for a set of N samples is 1/N (for 129 equally spaced samples), and we want our metric to report 0.0 in that 130 case. 131 132 Absolute discrepancy also characterizes the largest area of jank, but its 133 value wouldn't change (except for imprecisions due to a low 134 |interval_multiplier|) if additional 'good' intervals were added to an 135 exisiting list of time stamps. Its range is [0,inf] and the unit is 136 milliseconds. 137 138 The time stamp series C = [0,2,3,4] and D = [0,2,3,4,5] have the same 139 absolute discrepancy, but D has lower relative discrepancy than C. 140 141 |timestamps| may be a list of lists S = [S_1, S_2, ..., S_N], where each 142 S_i is a time stamp series. In that case, the discrepancy D(S) is: 143 D(S) = max(D(S_1), D(S_2), ..., D(S_N)) 144 """ 145 if not timestamps: 146 return 0.0 147 148 if isinstance(timestamps[0], list): 149 range_discrepancies = [TimestampsDiscrepancy(r) for r in timestamps] 150 return max(range_discrepancies) 151 152 samples, sample_scale = NormalizeSamples(timestamps) 153 discrepancy = Discrepancy(samples, location_count) 154 inv_sample_count = 1.0 / len(samples) 155 if absolute: 156 # Compute absolute discrepancy 157 discrepancy /= sample_scale 158 else: 159 # Compute relative discrepancy 160 discrepancy = Clamp((discrepancy-inv_sample_count) / (1.0-inv_sample_count)) 161 return discrepancy 162 163 164def DurationsDiscrepancy(durations, absolute=True, 165 location_count=None): 166 """A discrepancy based metric for measuring duration jank. 167 168 DurationsDiscrepancy computes a jank metric which measures how irregular a 169 given sequence of intervals is. In order to minimize jank, each duration 170 should be equally long. This is similar to how timestamp jank works, 171 and we therefore reuse the timestamp discrepancy function above to compute a 172 similar duration discrepancy number. 173 174 Because timestamp discrepancy is defined in terms of timestamps, we first 175 convert the list of durations to monotonically increasing timestamps. 176 177 Args: 178 durations: List of interval lengths in milliseconds. 179 absolute: See TimestampsDiscrepancy. 180 interval_multiplier: See TimestampsDiscrepancy. 181 """ 182 if not durations: 183 return 0.0 184 185 timestamps = reduce(lambda x, y: x + [x[-1] + y], durations, [0]) 186 return TimestampsDiscrepancy(timestamps, absolute, location_count) 187 188 189def ArithmeticMean(data): 190 """Calculates arithmetic mean. 191 192 Args: 193 data: A list of samples. 194 195 Returns: 196 The arithmetic mean value, or 0 if the list is empty. 197 """ 198 numerator_total = Total(data) 199 denominator_total = Total(len(data)) 200 return DivideIfPossibleOrZero(numerator_total, denominator_total) 201 202 203def StandardDeviation(data): 204 """Calculates the standard deviation. 205 206 Args: 207 data: A list of samples. 208 209 Returns: 210 The standard deviation of the samples provided. 211 """ 212 if len(data) == 1: 213 return 0.0 214 215 mean = ArithmeticMean(data) 216 variances = [float(x) - mean for x in data] 217 variances = [x * x for x in variances] 218 std_dev = math.sqrt(ArithmeticMean(variances)) 219 220 return std_dev 221 222 223def TrapezoidalRule(data, dx): 224 """ Calculate the integral according to the trapezoidal rule 225 226 TrapezoidalRule approximates the definite integral of f from a to b by 227 the composite trapezoidal rule, using n subintervals. 228 http://en.wikipedia.org/wiki/Trapezoidal_rule#Uniform_grid 229 230 Args: 231 data: A list of samples 232 dx: The uniform distance along the x axis between any two samples 233 234 Returns: 235 The area under the curve defined by the samples and the uniform distance 236 according to the trapezoidal rule. 237 """ 238 239 n = len(data) - 1 240 s = data[0] + data[n] 241 242 if n == 0: 243 return 0.0 244 245 for i in range(1, n): 246 s += 2 * data[i] 247 248 return s * dx / 2.0 249 250def Total(data): 251 """Returns the float value of a number or the sum of a list.""" 252 if type(data) == float: 253 total = data 254 elif type(data) == int: 255 total = float(data) 256 elif type(data) == list: 257 total = float(sum(data)) 258 else: 259 raise TypeError 260 return total 261 262 263def DivideIfPossibleOrZero(numerator, denominator): 264 """Returns the quotient, or zero if the denominator is zero.""" 265 if not denominator: 266 return 0.0 267 else: 268 return numerator / denominator 269 270 271def GeneralizedMean(values, exponent): 272 """See http://en.wikipedia.org/wiki/Generalized_mean""" 273 if not values: 274 return 0.0 275 sum_of_powers = 0.0 276 for v in values: 277 sum_of_powers += v ** exponent 278 return (sum_of_powers / len(values)) ** (1.0/exponent) 279 280 281def Median(values): 282 """Gets the median of a list of values.""" 283 return Percentile(values, 50) 284 285 286def Percentile(values, percentile): 287 """Calculates the value below which a given percentage of values fall. 288 289 For example, if 17% of the values are less than 5.0, then 5.0 is the 17th 290 percentile for this set of values. When the percentage doesn't exactly 291 match a rank in the list of values, the percentile is computed using linear 292 interpolation between closest ranks. 293 294 Args: 295 values: A list of numerical values. 296 percentile: A number between 0 and 100. 297 298 Returns: 299 The Nth percentile for the list of values, where N is the given percentage. 300 """ 301 if not values: 302 return 0.0 303 sorted_values = sorted(values) 304 n = len(values) 305 percentile /= 100.0 306 if percentile <= 0.5 / n: 307 return sorted_values[0] 308 elif percentile >= (n - 0.5) / n: 309 return sorted_values[-1] 310 else: 311 floor_index = int(math.floor(n * percentile - 0.5)) 312 floor_value = sorted_values[floor_index] 313 ceil_value = sorted_values[floor_index+1] 314 alpha = n * percentile - 0.5 - floor_index 315 return floor_value + alpha * (ceil_value - floor_value) 316 317 318def GeometricMean(values): 319 """Compute a rounded geometric mean from an array of values.""" 320 if not values: 321 return None 322 # To avoid infinite value errors, make sure no value is less than 0.001. 323 new_values = [] 324 for value in values: 325 if value > 0.001: 326 new_values.append(value) 327 else: 328 new_values.append(0.001) 329 # Compute the sum of the log of the values. 330 log_sum = sum(map(math.log, new_values)) 331 # Raise e to that sum over the number of values. 332 mean = math.pow(math.e, (log_sum / len(new_values))) 333 # Return the rounded mean. 334 return int(round(mean)) 335