1#!/usr/bin/env python
2# Copyright 2013 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6"""Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text
7at 80 chars, enforcing standard attribute ordering, and standardizing
8indentation.
9
10This is quite a bit more complicated than just calling tree.toprettyxml();
11we need additional customization, like special attribute ordering in tags
12and wrapping text nodes, so we implement our own full custom XML pretty-printer.
13"""
14
15from __future__ import with_statement
16
17import diffutil
18import json
19import logging
20import shutil
21import sys
22import textwrap
23import xml.dom.minidom
24
25
26WRAP_COLUMN = 80
27
28# Desired order for tag attributes; attributes listed here will appear first,
29# and in the same order as in these lists.
30# { tag_name: [attribute_name, ...] }
31ATTRIBUTE_ORDER = {
32  'enum': ['name', 'type'],
33  'histogram': ['name', 'enum', 'units'],
34  'int': ['value', 'label'],
35  'fieldtrial': ['name', 'separator', 'ordering'],
36  'group': ['name', 'label'],
37  'affected-histogram': ['name'],
38  'with-group': ['name'],
39}
40
41# Tag names for top-level nodes whose children we don't want to indent.
42TAGS_THAT_DONT_INDENT = [
43  'histogram-configuration',
44  'histograms',
45  'fieldtrials',
46  'enums'
47]
48
49# Extra vertical spacing rules for special tag names.
50# {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)}
51TAGS_THAT_HAVE_EXTRA_NEWLINE = {
52  'histogram-configuration': (2, 1, 1),
53  'histograms': (2, 1, 1),
54  'fieldtrials': (2, 1, 1),
55  'enums': (2, 1, 1),
56  'histogram': (1, 1, 1),
57  'enum': (1, 1, 1),
58  'fieldtrial': (1, 1, 1),
59}
60
61# Tags that we allow to be squished into a single line for brevity.
62TAGS_THAT_ALLOW_SINGLE_LINE = [
63  'summary',
64  'int',
65]
66
67# Tags whose children we want to alphabetize. The key is the parent tag name,
68# and the value is a pair of the tag name of the children we want to sort,
69# and a key function that maps each child node to the desired sort key.
70ALPHABETIZATION_RULES = {
71  'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()),
72  'enums': ('enum', lambda n: n.attributes['name'].value.lower()),
73  'enum': ('int', lambda n: int(n.attributes['value'].value)),
74  'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()),
75  'fieldtrial': ('affected-histogram',
76                 lambda n: n.attributes['name'].value.lower()),
77}
78
79
80class Error(Exception):
81  pass
82
83
84def LastLineLength(s):
85  """Returns the length of the last line in s.
86
87  Args:
88    s: A multi-line string, including newlines.
89
90  Returns:
91    The length of the last line in s, in characters.
92  """
93  if s.rfind('\n') == -1: return len(s)
94  return len(s) - s.rfind('\n') - len('\n')
95
96
97def XmlEscape(s):
98  """XML-escapes the given string, replacing magic characters (&<>") with their
99  escaped equivalents."""
100  s = s.replace("&", "&amp;").replace("<", "&lt;")
101  s = s.replace("\"", "&quot;").replace(">", "&gt;")
102  return s
103
104
105def PrettyPrintNode(node, indent=0):
106  """Pretty-prints the given XML node at the given indent level.
107
108  Args:
109    node: The minidom node to pretty-print.
110    indent: The current indent level.
111
112  Returns:
113    The pretty-printed string (including embedded newlines).
114
115  Raises:
116    Error if the XML has unknown tags or attributes.
117  """
118  # Handle the top-level document node.
119  if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE:
120    return '\n'.join([PrettyPrintNode(n) for n in node.childNodes])
121
122  # Handle text nodes.
123  if node.nodeType == xml.dom.minidom.Node.TEXT_NODE:
124    # Wrap each paragraph in the text to fit in the 80 column limit.
125    wrapper = textwrap.TextWrapper()
126    wrapper.initial_indent = ' ' * indent
127    wrapper.subsequent_indent = ' ' * indent
128    wrapper.break_on_hyphens = False
129    wrapper.break_long_words = False
130    wrapper.width = WRAP_COLUMN
131    text = XmlEscape(node.data)
132    # Remove any common indent.
133    text = textwrap.dedent(text.strip('\n'))
134    lines = text.split('\n')
135    # Split the text into paragraphs at blank line boundaries.
136    paragraphs = [[]]
137    for l in lines:
138      if len(l.strip()) == 0 and len(paragraphs[-1]) > 0:
139        paragraphs.append([])
140      else:
141        paragraphs[-1].append(l)
142    # Remove trailing empty paragraph if present.
143    if len(paragraphs) > 0 and len(paragraphs[-1]) == 0:
144      paragraphs = paragraphs[:-1]
145    # Wrap each paragraph and separate with two newlines.
146    return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs])
147
148  # Handle element nodes.
149  if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE:
150    newlines_after_open, newlines_before_close, newlines_after_close = (
151      TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0)))
152    # Open the tag.
153    s = ' ' * indent + '<' + node.tagName
154
155    # Calculate how much space to allow for the '>' or '/>'.
156    closing_chars = 1
157    if not node.childNodes:
158      closing_chars = 2
159
160    # Pretty-print the attributes.
161    attributes = node.attributes.keys()
162    if attributes:
163      # Reorder the attributes.
164      if not node.tagName in ATTRIBUTE_ORDER:
165        unrecognized_attributes = attributes;
166      else:
167        unrecognized_attributes = (
168          [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]])
169        attributes = (
170          [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes])
171
172      for a in unrecognized_attributes:
173        logging.error(
174            'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName))
175      if unrecognized_attributes:
176        raise Error()
177
178      for a in attributes:
179        value = XmlEscape(node.attributes[a].value)
180        # Replace sequences of whitespace with single spaces.
181        words = value.split()
182        a_str = ' %s="%s"' % (a, ' '.join(words))
183        # Start a new line if the attribute will make this line too long.
184        if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN:
185          s += '\n' + ' ' * (indent + 3)
186        # Output everything up to the first quote.
187        s += ' %s="' % (a)
188        value_indent_level = LastLineLength(s)
189        # Output one word at a time, splitting to the next line where necessary.
190        column = value_indent_level
191        for i, word in enumerate(words):
192          # This is slightly too conservative since not every word will be
193          # followed by the closing characters...
194          if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN):
195            s = s.rstrip()  # remove any trailing whitespace
196            s += '\n' + ' ' * value_indent_level
197            column = value_indent_level
198          s += word + ' '
199          column += len(word) + 1
200        s = s.rstrip()  # remove any trailing whitespace
201        s += '"'
202      s = s.rstrip()  # remove any trailing whitespace
203
204    # Pretty-print the child nodes.
205    if node.childNodes:
206      s += '>'
207      # Calculate the new indent level for child nodes.
208      new_indent = indent
209      if node.tagName not in TAGS_THAT_DONT_INDENT:
210        new_indent += 2
211      child_nodes = node.childNodes
212
213      # Recursively pretty-print the child nodes.
214      child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes]
215      child_nodes = [c for c in child_nodes if len(c.strip()) > 0]
216
217      # Determine whether we can fit the entire node on a single line.
218      close_tag = '</%s>' % node.tagName
219      space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag)
220      if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and
221          len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left):
222        s += child_nodes[0].strip()
223      else:
224        s += '\n' * newlines_after_open + '\n'.join(child_nodes)
225        s += '\n' * newlines_before_close + ' ' * indent
226      s += close_tag
227    else:
228      s += '/>'
229    s += '\n' * newlines_after_close
230    return s
231
232  # Handle comment nodes.
233  if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE:
234    return '<!--%s-->\n' % node.data
235
236  # Ignore other node types. This could be a processing instruction (<? ... ?>)
237  # or cdata section (<![CDATA[...]]!>), neither of which are legal in the
238  # histograms XML at present.
239  logging.error('Ignoring unrecognized node data: %s' % node.toxml())
240  raise Error()
241
242
243def unsafeAppendChild(parent, child):
244  """Append child to parent's list of children, ignoring the possibility that it
245  is already in another node's childNodes list.  Requires that the previous
246  parent of child is discarded (to avoid non-tree DOM graphs).
247  This can provide a significant speedup as O(n^2) operations are removed (in
248  particular, each child insertion avoids the need to traverse the old parent's
249  entire list of children)."""
250  child.parentNode = None
251  parent.appendChild(child)
252  child.parentNode = parent
253
254
255def TransformByAlphabetizing(node):
256  """Transform the given XML by alphabetizing specific node types according to
257  the rules in ALPHABETIZATION_RULES.
258
259  Args:
260    node: The minidom node to transform.
261
262  Returns:
263    The minidom node, with children appropriately alphabetized. Note that the
264    transformation is done in-place, i.e. the original minidom tree is modified
265    directly.
266  """
267  if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE:
268    for c in node.childNodes: TransformByAlphabetizing(c)
269    return node
270
271  # Element node with a tag name that we alphabetize the children of?
272  if node.tagName in ALPHABETIZATION_RULES:
273    # Put subnodes in a list of node,key pairs to allow for custom sorting.
274    subtag, key_function = ALPHABETIZATION_RULES[node.tagName]
275    subnodes = []
276    last_key = -1
277    for c in node.childNodes:
278      if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and
279          c.tagName == subtag):
280        last_key = key_function(c)
281      # Subnodes that we don't want to rearrange use the last node's key,
282      # so they stay in the same relative position.
283      subnodes.append( (c, last_key) )
284
285    # Sort the subnode list.
286    subnodes.sort(key=lambda pair: pair[1])
287
288    # Re-add the subnodes, transforming each recursively.
289    while node.firstChild:
290      node.removeChild(node.firstChild)
291    for (c, _) in subnodes:
292      unsafeAppendChild(node, TransformByAlphabetizing(c))
293    return node
294
295  # Recursively handle other element nodes and other node types.
296  for c in node.childNodes: TransformByAlphabetizing(c)
297  return node
298
299
300def PrettyPrint(raw_xml):
301  """Pretty-print the given XML.
302
303  Args:
304    xml: The contents of the histograms XML file, as a string.
305
306  Returns:
307    The pretty-printed version.
308  """
309  tree = xml.dom.minidom.parseString(raw_xml)
310  tree = TransformByAlphabetizing(tree)
311  return PrettyPrintNode(tree)
312
313
314def main():
315  logging.basicConfig(level=logging.INFO)
316
317  presubmit = ('--presubmit' in sys.argv)
318
319  logging.info('Loading histograms.xml...')
320  with open('histograms.xml', 'rb') as f:
321    xml = f.read()
322
323  # Check there are no CR ('\r') characters in the file.
324  if '\r' in xml:
325    logging.info('DOS-style line endings (CR characters) detected - these are '
326                 'not allowed. Please run dos2unix histograms.xml')
327    sys.exit(1)
328
329  logging.info('Pretty-printing...')
330  try:
331    pretty = PrettyPrint(xml)
332  except Error:
333    logging.error('Aborting parsing due to fatal errors.')
334    sys.exit(1)
335
336  if xml == pretty:
337    logging.info('histograms.xml is correctly pretty-printed.')
338    sys.exit(0)
339  if presubmit:
340    logging.info('histograms.xml is not formatted correctly; run '
341                 'pretty_print.py to fix.')
342    sys.exit(1)
343  if not diffutil.PromptUserToAcceptDiff(
344      xml, pretty,
345      'Is the prettified version acceptable?'):
346    logging.error('Aborting')
347    return
348
349  logging.info('Creating backup file histograms.before.pretty-print.xml')
350  shutil.move('histograms.xml', 'histograms.before.pretty-print.xml')
351
352  logging.info('Writing new histograms.xml file')
353  with open('histograms.xml', 'wb') as f:
354    f.write(pretty)
355
356
357if __name__ == '__main__':
358  main()
359