1553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# Copyright (c) 2013 The Chromium OS Authors. All rights reserved. 2553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# Use of this source code is governed by a BSD-style license that can be 3553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# found in the LICENSE file. 4553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 5553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold"""Histogram generation tools.""" 6553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 7553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldfrom collections import defaultdict 8553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 9553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldimport format_utils 10553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 11553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 12553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldclass Histogram(object): 13553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """A histogram generating object. 14553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 15553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold This object serves the sole purpose of formatting (key, val) pairs as an 16553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold ASCII histogram, including bars and percentage markers, and taking care of 17553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold label alignment, scaling, etc. In addition to the standard __init__ 18553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold interface, two static methods are provided for conveniently converting data 19553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold in different formats into a histogram. Histogram generation is exported via 20553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold its __str__ method, and looks as follows: 21553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 22553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Yes |################ | 5 (83.3%) 23553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold No |### | 1 (16.6%) 24553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 25553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold TODO(garnold) we may want to add actual methods for adding data or tweaking 26553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold the output layout and formatting. For now, though, this is fine. 27553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 28553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """ 29553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 30553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold def __init__(self, data, scale=20, formatter=None): 31553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """Initialize a histogram object. 32553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 33553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Args: 34553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold data: list of (key, count) pairs constituting the histogram 35553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold scale: number of characters used to indicate 100% 36553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold formatter: function used for formatting raw histogram values 37553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 38553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """ 39553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold self.data = data 40553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold self.scale = scale 41553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold self.formatter = formatter or str 42553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold self.max_key_len = max([len(str(key)) for key, count in self.data]) 43553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold self.total = sum([count for key, count in self.data]) 44553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 45553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold @staticmethod 46553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold def FromCountDict(count_dict, scale=20, formatter=None, key_names=None): 47553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """Takes a dictionary of counts and returns a histogram object. 48553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 49553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold This simply converts a mapping from names to counts into a list of (key, 50553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold count) pairs, optionally translating keys into name strings, then 51553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold generating and returning a histogram for them. This is a useful convenience 52553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold call for clients that update a dictionary of counters as they (say) scan a 53553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold data stream. 54553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 55553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Args: 56553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold count_dict: dictionary mapping keys to occurrence counts 57553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold scale: number of characters used to indicate 100% 58553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold formatter: function used for formatting raw histogram values 59553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold key_names: dictionary mapping keys to name strings 60553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Returns: 61553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold A histogram object based on the given data. 62553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 63553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """ 64553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold namer = None 65553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold if key_names: 66553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold namer = lambda key: key_names[key] 67553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold else: 68553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold namer = lambda key: key 69553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 70553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist = [(namer(key), count) for key, count in count_dict.items()] 71553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold return Histogram(hist, scale, formatter) 72553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 73553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold @staticmethod 74553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold def FromKeyList(key_list, scale=20, formatter=None, key_names=None): 75553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """Takes a list of (possibly recurring) keys and returns a histogram object. 76553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 77553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold This converts the list into a dictionary of counters, then uses 78553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold FromCountDict() to generate the actual histogram. For example: 79553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 80553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold ['a', 'a', 'b', 'a', 'b'] --> {'a': 3, 'b': 2} --> ... 81553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 82553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Args: 83553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold key_list: list of (possibly recurring) keys 84553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold scale: number of characters used to indicate 100% 85553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold formatter: function used for formatting raw histogram values 86553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold key_names: dictionary mapping keys to name strings 87553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold Returns: 88553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold A histogram object based on the given data. 89553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 90553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """ 91553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold count_dict = defaultdict(int) # Unset items default to zero 92553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold for key in key_list: 93553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold count_dict[key] += 1 94553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold return Histogram.FromCountDict(count_dict, scale, formatter, key_names) 95553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 96553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold def __str__(self): 97553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist_lines = [] 98553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist_bar = '|' 99553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold for key, count in self.data: 100553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold if self.total: 101553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold bar_len = count * self.scale / self.total 102553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist_bar = '|%s|' % ('#' * bar_len).ljust(self.scale) 103553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 1046a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold line = '%s %s %s' % ( 105553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold str(key).ljust(self.max_key_len), 106553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist_bar, 1076a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold self.formatter(count)) 1086a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold percent_str = format_utils.NumToPercent(count, self.total) 1096a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold if percent_str: 1106a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold line += ' (%s)' % percent_str 111553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold hist_lines.append(line) 112553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 113553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold return '\n'.join(hist_lines) 114553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold 115553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold def GetKeys(self): 116553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold """Returns the keys of the histogram.""" 117553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold return [key for key, _ in self.data] 118