1553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# Copyright (c) 2013 The Chromium OS Authors. All rights reserved.
2553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# Use of this source code is governed by a BSD-style license that can be
3553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold# found in the LICENSE file.
4553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
5553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold"""Histogram generation tools."""
6553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
7553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldfrom collections import defaultdict
8553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
9553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldimport format_utils
10553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
11553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
12553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnoldclass Histogram(object):
13553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  """A histogram generating object.
14553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
15553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  This object serves the sole purpose of formatting (key, val) pairs as an
16553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  ASCII histogram, including bars and percentage markers, and taking care of
17553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  label alignment, scaling, etc. In addition to the standard __init__
18553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  interface, two static methods are provided for conveniently converting data
19553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  in different formats into a histogram. Histogram generation is exported via
20553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  its __str__ method, and looks as follows:
21553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
22553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Yes |################    | 5 (83.3%)
23553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    No  |###                 | 1 (16.6%)
24553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
25553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  TODO(garnold) we may want to add actual methods for adding data or tweaking
26553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  the output layout and formatting. For now, though, this is fine.
27553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
28553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  """
29553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
30553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  def __init__(self, data, scale=20, formatter=None):
31553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """Initialize a histogram object.
32553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
33553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Args:
34553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      data: list of (key, count) pairs constituting the histogram
35553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      scale: number of characters used to indicate 100%
36553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      formatter: function used for formatting raw histogram values
37553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
38553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """
39553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    self.data = data
40553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    self.scale = scale
41553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    self.formatter = formatter or str
42553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    self.max_key_len = max([len(str(key)) for key, count in self.data])
43553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    self.total = sum([count for key, count in self.data])
44553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
45553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  @staticmethod
46553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  def FromCountDict(count_dict, scale=20, formatter=None, key_names=None):
47553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """Takes a dictionary of counts and returns a histogram object.
48553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
49553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    This simply converts a mapping from names to counts into a list of (key,
50553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    count) pairs, optionally translating keys into name strings, then
51553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    generating and returning a histogram for them. This is a useful convenience
52553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    call for clients that update a dictionary of counters as they (say) scan a
53553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    data stream.
54553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
55553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Args:
56553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      count_dict: dictionary mapping keys to occurrence counts
57553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      scale: number of characters used to indicate 100%
58553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      formatter: function used for formatting raw histogram values
59553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      key_names: dictionary mapping keys to name strings
60553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Returns:
61553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      A histogram object based on the given data.
62553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
63553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """
64553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    namer = None
65553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    if key_names:
66553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      namer = lambda key: key_names[key]
67553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    else:
68553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      namer = lambda key: key
69553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
70553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    hist = [(namer(key), count) for key, count in count_dict.items()]
71553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    return Histogram(hist, scale, formatter)
72553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
73553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  @staticmethod
74553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  def FromKeyList(key_list, scale=20, formatter=None, key_names=None):
75553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """Takes a list of (possibly recurring) keys and returns a histogram object.
76553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
77553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    This converts the list into a dictionary of counters, then uses
78553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    FromCountDict() to generate the actual histogram. For example:
79553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
80553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      ['a', 'a', 'b', 'a', 'b'] --> {'a': 3, 'b': 2} --> ...
81553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
82553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Args:
83553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      key_list: list of (possibly recurring) keys
84553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      scale: number of characters used to indicate 100%
85553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      formatter: function used for formatting raw histogram values
86553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      key_names: dictionary mapping keys to name strings
87553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    Returns:
88553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      A histogram object based on the given data.
89553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
90553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """
91553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    count_dict = defaultdict(int)  # Unset items default to zero
92553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    for key in key_list:
93553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      count_dict[key] += 1
94553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    return Histogram.FromCountDict(count_dict, scale, formatter, key_names)
95553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
96553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  def __str__(self):
97553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    hist_lines = []
98553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    hist_bar = '|'
99553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    for key, count in self.data:
100553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      if self.total:
101553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold        bar_len = count * self.scale / self.total
102553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold        hist_bar = '|%s|' % ('#' * bar_len).ljust(self.scale)
103553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
1046a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold      line = '%s %s %s' % (
105553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold          str(key).ljust(self.max_key_len),
106553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold          hist_bar,
1076a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold          self.formatter(count))
1086a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold      percent_str = format_utils.NumToPercent(count, self.total)
1096a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold      if percent_str:
1106a3a3878daddc2536a2ac9033189274cfc5ef52dGilad Arnold        line += ' (%s)' % percent_str
111553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold      hist_lines.append(line)
112553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
113553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    return '\n'.join(hist_lines)
114553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold
115553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold  def GetKeys(self):
116553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    """Returns the keys of the histogram."""
117553b0ec49bc64fc4b7df4358cd31396a87276d2bGilad Arnold    return [key for key, _ in self.data]
118