1#!/usr/bin/python
2
3# Copyright (c) 2011 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7'''This utility cleans up the html files as emitted by doxygen so
8that they are suitable for publication on a Google documentation site.
9'''
10
11import glob
12import optparse
13import os
14import re
15import shutil
16import sys
17try:
18  from BeautifulSoup import BeautifulSoup, Tag
19except (ImportError, NotImplementedError):
20  print ("This tool requires the BeautifulSoup package "
21         "(see http://www.crummy.com/software/BeautifulSoup/).\n"
22         "Make sure that the file BeautifulSoup.py is either in this directory "
23         "or is available in your PYTHON_PATH")
24  raise
25
26
def Trace(msg):
  '''Writes |msg| followed by a newline to stderr when tracing is enabled.

  Tracing is controlled by the function attribute Trace.verbose; while it is
  false the call does nothing.

  Args:
    msg: the object to report; it is converted with str().
  '''
  if not Trace.verbose:
    return
  line = str(msg) + '\n'
  sys.stderr.write(line)

# Tracing is off until something sets Trace.verbose to True.
Trace.verbose = False
32
33
# Glob patterns, relative to the output directory, of the doxygen-generated
# files that main() deletes after the 'html' directory has been flattened.
# Entries are laid out one filename family per row; the order is significant
# only for the order of the trace output.
FILES_TO_REMOVE = [
  '*.css', '*.map', '*.md5',
  'annotated.html',
  'bc_s.png',
  'classes.html', 'closed.png',
  'doxygen.png',
  'files.html', 'functions*.html',
  'globals_0x*.html', 'globals_enum.html', 'globals_eval.html',
  'globals_func.html', 'globals.html', 'globals_type.html',
  'globals_vars.html',
  'graph_legend.html', 'graph_legend.png',
  'hierarchy.html',
  'index_8dox.html', 'index.html',
  'modules.html',
  'namespacemembers_func.html', 'namespacemembers.html', 'namespaces.html',
  'nav_f.png', 'nav_h.png',
  'open.png',
  'tab_a.png', 'tab_b.png', 'tab_h.png', 'tab_s.png',
]
69
70
class HTMLFixer(object):
  '''Cleans up the html of a single page as produced by Doxygen.

  The page is parsed once by the constructor; FixAll() applies every cleanup
  step and returns the resulting html as a string.
  '''

  def __init__(self, html):
    '''Parses |html|, the text of one doxygen-generated page.'''
    self.soup = BeautifulSoup(html)

  def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    The anchor's "name" attribute is only used to recognise heading rows; it
    is not copied onto the generated <h2>.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>
    '''

    table_headers = []
    for tag in self.soup.findAll('tr'):
      # A heading row looks like <tr><td><h2><a name="..."></a>Title</h2>.
      # get() is used instead of ['name'] so that an anchor without a name
      # attribute is skipped rather than raising KeyError.
      if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a.get('name'):
        # Replace the row's contents with the element parsed right after the
        # anchor (the heading text in the example above) and turn the row
        # itself into the <h2>.
        tag.string = tag.td.h2.a.next
        tag.name = 'h2'
        table_headers.append(tag)

    # Walk the headings last-to-first so that earlier tags don't delete
    # later tags.
    for tag in reversed(table_headers):
      # tag.string can be None when the heading is not a single text node;
      # fall back to '' so that a trace message can never raise.
      heading_text = (tag.string or '').strip()
      Trace("Header tag: %s is %s" % (tag.name, heading_text))
      # Is this a heading in the middle of a table?
      if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
        Trace("Splitting Table named %s" % heading_text)
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        # The new table takes over the attributes of the one being split and
        # is placed directly after it.
        new_table = Tag(self.soup, name='table', attrs=table.attrs)
        table_parent.insert(table_index + 1, new_table)
        tag_index = table.contents.index(tag)
        # Iterate over a slice (a copy): inserting a row into new_table
        # removes it from the original table's contents.
        for index, row in enumerate(table.contents[tag_index:]):
          new_table.insert(index, row)
      # Now move the <h2> tag to be in front of the <table> tag
      assert tag.parent.name == 'table'
      table = tag.parent
      table_parent = table.parent
      table_index = table_parent.contents.index(table)
      table_parent.insert(table_index, tag)

  def RemoveTopHeadings(self):
    '''Removes <div> sections with a header, tabs, or navpath class attribute'''
    header_tags = self.soup.findAll(
        name='div',
        attrs={'class' : re.compile('^(header|tabs[0-9]*|navpath)$')})
    # Plain loop: extract() is called for its side effect only.
    for tag in header_tags:
      tag.extract()

  def RemoveVersionNumbers(self, html):
    '''Horrible hack to strip _#_# from struct names.

    A "_<digit>_<digit>" suffix is removed only when it is immediately
    followed by a double quote, a colon, a space or "<".

    Args:
      html: the html text to process.

    Returns:
      |html| with the matching suffixes removed.
    '''
    return re.sub(r'(_\d_\d)(?=[": <])', '', html)

  def FixAll(self):
    '''Applies every cleanup step and returns the fixed html as a string.'''
    self.FixTableHeadings()
    self.RemoveTopHeadings()
    html = str(self.soup)
    html = self.RemoveVersionNumbers(html)
    return html
149
150
def main(argv):
  """Runs the doxy_cleanup utility on one doxygen output directory.

  The 'html' subdirectory that doxygen generates is flattened into the
  directory named on the command line, the files matched by FILES_TO_REMOVE
  are deleted, and every remaining .html file under that directory is
  rewritten in place with HTMLFixer.

  Args:
    argv: the command-line arguments, without the program name.

  Returns:
    0 on success. Invalid arguments exit through parser.error() instead.
  """

  parser = optparse.OptionParser(usage='Usage: %prog [options] directory')
  parser.add_option('-v', '--verbose', help='verbose output.',
                    action='store_true')
  opts, args = parser.parse_args(argv)

  if len(args) != 1:
    parser.error('Expected one directory')

  if opts.verbose:
    Trace.verbose = True

  root_dir = args[0]
  html_dir = os.path.join(root_dir, 'html')

  # Doxygen puts all files in an 'html' directory; flatten it into root_dir.
  for entry in glob.glob(os.path.join(html_dir, '*')):
    Trace('Moving %s -> %s' % (entry, root_dir))
    shutil.move(entry, root_dir)

  # The 'html' directory itself is no longer needed.
  Trace('Removing %s' % html_dir)
  os.rmdir(html_dir)

  # Delete the generated files that should not be published.
  for pattern in FILES_TO_REMOVE:
    Trace('Removing "%s":' % pattern)
    for doomed in glob.glob(os.path.join(root_dir, pattern)):
      Trace('  Removing "%s"' % doomed)
      os.remove(doomed)

  # Finally, clean up the HTML files that were kept.
  Trace('Fixing HTML files...')
  for dirpath, _, basenames in os.walk(root_dir):
    for basename in basenames:
      extension = os.path.splitext(basename)[1]
      if extension != '.html':
        Trace('Skipping %s' % basename)
        continue

      html_path = os.path.join(dirpath, basename)
      Trace('Processing "%s"...' % html_path)
      try:
        with open(html_path) as infile:
          original_html = infile.read()

        cleaned_html = HTMLFixer(original_html).FixAll()
        with open(html_path, 'w') as outfile:
          outfile.write(cleaned_html)
      except:
        # Name the offending file, then let the original error propagate.
        sys.stderr.write("Error while processing %s\n" % html_path)
        raise

  return 0
212
if __name__ == '__main__':
  # An interrupt (Ctrl-C) is reported as a one-line message and exit status 1
  # rather than a traceback; any other exception propagates unchanged.
  rtn = 1
  try:
    rtn = main(sys.argv[1:])
  except KeyboardInterrupt:
    sys.stderr.write('%s: interrupted\n' % os.path.basename(__file__))
  sys.exit(rtn)
220