1#!/usr/bin/env python 2# Copyright 2013 The Chromium Authors. All rights reserved. 3# Use of this source code is governed by a BSD-style license that can be 4# found in the LICENSE file. 5 6"""Pretty-prints the histograms.xml file, alphabetizing tags, wrapping text 7at 80 chars, enforcing standard attribute ordering, and standardizing 8indentation. 9 10This is quite a bit more complicated than just calling tree.toprettyxml(); 11we need additional customization, like special attribute ordering in tags 12and wrapping text nodes, so we implement our own full custom XML pretty-printer. 13""" 14 15from __future__ import with_statement 16 17import diffutil 18import json 19import logging 20import shutil 21import sys 22import textwrap 23import xml.dom.minidom 24 25 26WRAP_COLUMN = 80 27 28# Desired order for tag attributes; attributes listed here will appear first, 29# and in the same order as in these lists. 30# { tag_name: [attribute_name, ...] } 31ATTRIBUTE_ORDER = { 32 'enum': ['name', 'type'], 33 'histogram': ['name', 'enum', 'units'], 34 'int': ['value', 'label'], 35 'fieldtrial': ['name', 'separator', 'ordering'], 36 'group': ['name', 'label'], 37 'affected-histogram': ['name'], 38 'with-group': ['name'], 39} 40 41# Tag names for top-level nodes whose children we don't want to indent. 42TAGS_THAT_DONT_INDENT = [ 43 'histogram-configuration', 44 'histograms', 45 'fieldtrials', 46 'enums' 47] 48 49# Extra vertical spacing rules for special tag names. 50# {tag_name: (newlines_after_open, newlines_before_close, newlines_after_close)} 51TAGS_THAT_HAVE_EXTRA_NEWLINE = { 52 'histogram-configuration': (2, 1, 1), 53 'histograms': (2, 1, 1), 54 'fieldtrials': (2, 1, 1), 55 'enums': (2, 1, 1), 56 'histogram': (1, 1, 1), 57 'enum': (1, 1, 1), 58 'fieldtrial': (1, 1, 1), 59} 60 61# Tags that we allow to be squished into a single line for brevity. 62TAGS_THAT_ALLOW_SINGLE_LINE = [ 63 'summary', 64 'int', 65] 66 67# Tags whose children we want to alphabetize. The key is the parent tag name, 68# and the value is a pair of the tag name of the children we want to sort, 69# and a key function that maps each child node to the desired sort key. 70ALPHABETIZATION_RULES = { 71 'histograms': ('histogram', lambda n: n.attributes['name'].value.lower()), 72 'enums': ('enum', lambda n: n.attributes['name'].value.lower()), 73 'enum': ('int', lambda n: int(n.attributes['value'].value)), 74 'fieldtrials': ('fieldtrial', lambda n: n.attributes['name'].value.lower()), 75 'fieldtrial': ('affected-histogram', 76 lambda n: n.attributes['name'].value.lower()), 77} 78 79 80class Error(Exception): 81 pass 82 83 84def LastLineLength(s): 85 """Returns the length of the last line in s. 86 87 Args: 88 s: A multi-line string, including newlines. 89 90 Returns: 91 The length of the last line in s, in characters. 92 """ 93 if s.rfind('\n') == -1: return len(s) 94 return len(s) - s.rfind('\n') - len('\n') 95 96 97def XmlEscape(s): 98 """XML-escapes the given string, replacing magic characters (&<>") with their 99 escaped equivalents.""" 100 s = s.replace("&", "&").replace("<", "<") 101 s = s.replace("\"", """).replace(">", ">") 102 return s 103 104 105def PrettyPrintNode(node, indent=0): 106 """Pretty-prints the given XML node at the given indent level. 107 108 Args: 109 node: The minidom node to pretty-print. 110 indent: The current indent level. 111 112 Returns: 113 The pretty-printed string (including embedded newlines). 114 115 Raises: 116 Error if the XML has unknown tags or attributes. 117 """ 118 # Handle the top-level document node. 119 if node.nodeType == xml.dom.minidom.Node.DOCUMENT_NODE: 120 return '\n'.join([PrettyPrintNode(n) for n in node.childNodes]) 121 122 # Handle text nodes. 123 if node.nodeType == xml.dom.minidom.Node.TEXT_NODE: 124 # Wrap each paragraph in the text to fit in the 80 column limit. 125 wrapper = textwrap.TextWrapper() 126 wrapper.initial_indent = ' ' * indent 127 wrapper.subsequent_indent = ' ' * indent 128 wrapper.break_on_hyphens = False 129 wrapper.break_long_words = False 130 wrapper.width = WRAP_COLUMN 131 text = XmlEscape(node.data) 132 # Remove any common indent. 133 text = textwrap.dedent(text.strip('\n')) 134 lines = text.split('\n') 135 # Split the text into paragraphs at blank line boundaries. 136 paragraphs = [[]] 137 for l in lines: 138 if len(l.strip()) == 0 and len(paragraphs[-1]) > 0: 139 paragraphs.append([]) 140 else: 141 paragraphs[-1].append(l) 142 # Remove trailing empty paragraph if present. 143 if len(paragraphs) > 0 and len(paragraphs[-1]) == 0: 144 paragraphs = paragraphs[:-1] 145 # Wrap each paragraph and separate with two newlines. 146 return '\n\n'.join([wrapper.fill('\n'.join(p)) for p in paragraphs]) 147 148 # Handle element nodes. 149 if node.nodeType == xml.dom.minidom.Node.ELEMENT_NODE: 150 newlines_after_open, newlines_before_close, newlines_after_close = ( 151 TAGS_THAT_HAVE_EXTRA_NEWLINE.get(node.tagName, (1, 1, 0))) 152 # Open the tag. 153 s = ' ' * indent + '<' + node.tagName 154 155 # Calculate how much space to allow for the '>' or '/>'. 156 closing_chars = 1 157 if not node.childNodes: 158 closing_chars = 2 159 160 # Pretty-print the attributes. 161 attributes = node.attributes.keys() 162 if attributes: 163 # Reorder the attributes. 164 if not node.tagName in ATTRIBUTE_ORDER: 165 unrecognized_attributes = attributes; 166 else: 167 unrecognized_attributes = ( 168 [a for a in attributes if not a in ATTRIBUTE_ORDER[node.tagName]]) 169 attributes = ( 170 [a for a in ATTRIBUTE_ORDER[node.tagName] if a in attributes]) 171 172 for a in unrecognized_attributes: 173 logging.error( 174 'Unrecognized attribute "%s" in tag "%s"' % (a, node.tagName)) 175 if unrecognized_attributes: 176 raise Error() 177 178 for a in attributes: 179 value = XmlEscape(node.attributes[a].value) 180 # Replace sequences of whitespace with single spaces. 181 words = value.split() 182 a_str = ' %s="%s"' % (a, ' '.join(words)) 183 # Start a new line if the attribute will make this line too long. 184 if LastLineLength(s) + len(a_str) + closing_chars > WRAP_COLUMN: 185 s += '\n' + ' ' * (indent + 3) 186 # Output everything up to the first quote. 187 s += ' %s="' % (a) 188 value_indent_level = LastLineLength(s) 189 # Output one word at a time, splitting to the next line where necessary. 190 column = value_indent_level 191 for i, word in enumerate(words): 192 # This is slightly too conservative since not every word will be 193 # followed by the closing characters... 194 if i > 0 and (column + len(word) + 1 + closing_chars > WRAP_COLUMN): 195 s = s.rstrip() # remove any trailing whitespace 196 s += '\n' + ' ' * value_indent_level 197 column = value_indent_level 198 s += word + ' ' 199 column += len(word) + 1 200 s = s.rstrip() # remove any trailing whitespace 201 s += '"' 202 s = s.rstrip() # remove any trailing whitespace 203 204 # Pretty-print the child nodes. 205 if node.childNodes: 206 s += '>' 207 # Calculate the new indent level for child nodes. 208 new_indent = indent 209 if node.tagName not in TAGS_THAT_DONT_INDENT: 210 new_indent += 2 211 child_nodes = node.childNodes 212 213 # Recursively pretty-print the child nodes. 214 child_nodes = [PrettyPrintNode(n, indent=new_indent) for n in child_nodes] 215 child_nodes = [c for c in child_nodes if len(c.strip()) > 0] 216 217 # Determine whether we can fit the entire node on a single line. 218 close_tag = '</%s>' % node.tagName 219 space_left = WRAP_COLUMN - LastLineLength(s) - len(close_tag) 220 if (node.tagName in TAGS_THAT_ALLOW_SINGLE_LINE and 221 len(child_nodes) == 1 and len(child_nodes[0].strip()) <= space_left): 222 s += child_nodes[0].strip() 223 else: 224 s += '\n' * newlines_after_open + '\n'.join(child_nodes) 225 s += '\n' * newlines_before_close + ' ' * indent 226 s += close_tag 227 else: 228 s += '/>' 229 s += '\n' * newlines_after_close 230 return s 231 232 # Handle comment nodes. 233 if node.nodeType == xml.dom.minidom.Node.COMMENT_NODE: 234 return '<!--%s-->\n' % node.data 235 236 # Ignore other node types. This could be a processing instruction (<? ... ?>) 237 # or cdata section (<![CDATA[...]]!>), neither of which are legal in the 238 # histograms XML at present. 239 logging.error('Ignoring unrecognized node data: %s' % node.toxml()) 240 raise Error() 241 242 243def unsafeAppendChild(parent, child): 244 """Append child to parent's list of children, ignoring the possibility that it 245 is already in another node's childNodes list. Requires that the previous 246 parent of child is discarded (to avoid non-tree DOM graphs). 247 This can provide a significant speedup as O(n^2) operations are removed (in 248 particular, each child insertion avoids the need to traverse the old parent's 249 entire list of children).""" 250 child.parentNode = None 251 parent.appendChild(child) 252 child.parentNode = parent 253 254 255def TransformByAlphabetizing(node): 256 """Transform the given XML by alphabetizing specific node types according to 257 the rules in ALPHABETIZATION_RULES. 258 259 Args: 260 node: The minidom node to transform. 261 262 Returns: 263 The minidom node, with children appropriately alphabetized. Note that the 264 transformation is done in-place, i.e. the original minidom tree is modified 265 directly. 266 """ 267 if node.nodeType != xml.dom.minidom.Node.ELEMENT_NODE: 268 for c in node.childNodes: TransformByAlphabetizing(c) 269 return node 270 271 # Element node with a tag name that we alphabetize the children of? 272 if node.tagName in ALPHABETIZATION_RULES: 273 # Put subnodes in a list of node,key pairs to allow for custom sorting. 274 subtag, key_function = ALPHABETIZATION_RULES[node.tagName] 275 subnodes = [] 276 last_key = -1 277 for c in node.childNodes: 278 if (c.nodeType == xml.dom.minidom.Node.ELEMENT_NODE and 279 c.tagName == subtag): 280 last_key = key_function(c) 281 # Subnodes that we don't want to rearrange use the last node's key, 282 # so they stay in the same relative position. 283 subnodes.append( (c, last_key) ) 284 285 # Sort the subnode list. 286 subnodes.sort(key=lambda pair: pair[1]) 287 288 # Re-add the subnodes, transforming each recursively. 289 while node.firstChild: 290 node.removeChild(node.firstChild) 291 for (c, _) in subnodes: 292 unsafeAppendChild(node, TransformByAlphabetizing(c)) 293 return node 294 295 # Recursively handle other element nodes and other node types. 296 for c in node.childNodes: TransformByAlphabetizing(c) 297 return node 298 299 300def PrettyPrint(raw_xml): 301 """Pretty-print the given XML. 302 303 Args: 304 xml: The contents of the histograms XML file, as a string. 305 306 Returns: 307 The pretty-printed version. 308 """ 309 tree = xml.dom.minidom.parseString(raw_xml) 310 tree = TransformByAlphabetizing(tree) 311 return PrettyPrintNode(tree) 312 313 314def main(): 315 logging.basicConfig(level=logging.INFO) 316 317 presubmit = ('--presubmit' in sys.argv) 318 319 logging.info('Loading histograms.xml...') 320 with open('histograms.xml', 'rb') as f: 321 xml = f.read() 322 323 # Check there are no CR ('\r') characters in the file. 324 if '\r' in xml: 325 logging.info('DOS-style line endings (CR characters) detected - these are ' 326 'not allowed. Please run dos2unix histograms.xml') 327 sys.exit(1) 328 329 logging.info('Pretty-printing...') 330 try: 331 pretty = PrettyPrint(xml) 332 except Error: 333 logging.error('Aborting parsing due to fatal errors.') 334 sys.exit(1) 335 336 if xml == pretty: 337 logging.info('histograms.xml is correctly pretty-printed.') 338 sys.exit(0) 339 if presubmit: 340 logging.info('histograms.xml is not formatted correctly; run ' 341 'pretty_print.py to fix.') 342 sys.exit(1) 343 if not diffutil.PromptUserToAcceptDiff( 344 xml, pretty, 345 'Is the prettified version acceptable?'): 346 logging.error('Aborting') 347 return 348 349 logging.info('Creating backup file histograms.before.pretty-print.xml') 350 shutil.move('histograms.xml', 'histograms.before.pretty-print.xml') 351 352 logging.info('Writing new histograms.xml file') 353 with open('histograms.xml', 'wb') as f: 354 f.write(pretty) 355 356 357if __name__ == '__main__': 358 main() 359