1#!/usr/bin/python
2
3# Copyright (c) 2011 The Chromium Authors. All rights reserved.
4# Use of this source code is governed by a BSD-style license that can be
5# found in the LICENSE file.
6
7'''This utility cleans up the html files as emitted by doxygen so
8that they are suitable for publication on a Google documentation site.
9'''
10
11import optparse
12import os
13import re
14import shutil
15import string
16import sys
try:
  from BeautifulSoup import BeautifulSoup, Tag
except (ImportError, NotImplementedError):
  # Print an actionable hint about the missing dependency, then re-raise so
  # the original traceback is still shown.
  print ("This tool requires the BeautifulSoup package "
         "(see http://www.crummy.com/software/BeautifulSoup/).\n"
         "Make sure that the file BeautifulSoup.py is either in this directory "
         "or is available in your PYTHONPATH")
  raise
25
26
class HTMLFixer(object):
  '''Cleans up the html strings as produced by Doxygen.

  The markup is parsed once in the constructor and kept in self.soup. The
  Fix*/Remove* methods edit that parsed tree in place, and str() on the
  fixer returns str() of the current tree.
  '''

  def __init__(self, html):
    '''Parses |html| with BeautifulSoup and stores the result in self.soup.

    Args:
      html: the markup to clean up.
    '''
    self.soup = BeautifulSoup(html)

  def FixTableHeadings(self):
    '''Fixes the doxygen table headings.

    This includes:
      - Using bare <h2> title row instead of row embedded in <tr><td> in table
      - Splitting up tables into multiple separate tables if a table
        heading appears in the middle of a table.

    A <tr> is treated as a heading row when it contains a <td> holding an
    <h2> whose <a> has a non-empty "name" attribute. Rows whose anchor has no
    "name" attribute are left untouched.

    For example, this html:
     <table>
      <tr><td colspan="2"><h2><a name="pub-attribs"></a>
      Data Fields List</h2></td></tr>
      ...
     </table>

    would be converted to this:
     <h2>Data Fields List</h2>
     <table>
      ...
     </table>

    Raises:
      AssertionError: if a heading row is not a direct child of a <table>.
    '''

    table_headers = []
    for tag in self.soup.findAll('tr'):
      # Use get() rather than ['name']: indexing a tag by a missing attribute
      # raises KeyError, which would abort on an <h2><a href="..."> row.
      if tag.td and tag.td.h2 and tag.td.h2.a and tag.td.h2.a.get('name'):
        # The node right after the anchor becomes the heading's string, and
        # the row itself is turned into the <h2> element.
        tag.string = tag.td.h2.a.next
        tag.name = 'h2'
        table_headers.append(tag)

    # reverse the list so that earlier tags don't delete later tags
    table_headers.reverse()
    # Split up tables that have multiple table header (th) rows
    for tag in table_headers:
      print("Header tag: %s is %s" % (tag.name, tag.string.strip()))
      # Is this a heading in the middle of a table?
      if tag.findPreviousSibling('tr') and tag.parent.name == 'table':
        print("Splitting Table named %s" % tag.string.strip())
        table = tag.parent
        table_parent = table.parent
        table_index = table_parent.contents.index(table)
        # The new table reuses the original table's attributes and is placed
        # immediately after it.
        new_table = Tag(self.soup, name='table', attrs=table.attrs)
        table_parent.insert(table_index + 1, new_table)
        tag_index = table.contents.index(tag)
        # Iterate over a slice (a copy) because inserting each node into the
        # new table changes table.contents while we go.
        for index, row in enumerate(table.contents[tag_index:]):
          new_table.insert(index, row)
      # Now move the <h2> tag to be in front of the <table> tag
      assert tag.parent.name == 'table'
      table = tag.parent
      table_parent = table.parent
      table_index = table_parent.contents.index(table)
      table_parent.insert(table_index, tag)

  def RemoveTopHeadings(self):
    '''Removes <div> sections with a header, tabs, or navpath class attribute'''
    header_tags = self.soup.findAll(
        name='div',
        attrs={'class' : re.compile('^(header|tabs[0-9]*|navpath)$')})
    # A plain loop: extract() is called only for its side effect.
    for tag in header_tags:
      tag.extract()

  def FixAll(self):
    '''Applies every cleanup: table headings first, then top headings.'''
    self.FixTableHeadings()
    self.RemoveTopHeadings()

  def __str__(self):
    '''Returns str() of the (possibly modified) parsed tree.'''
    return str(self.soup)
101
102
def main():
  '''Main entry for the doxy_cleanup utility

  doxy_cleanup takes a list of html files and modifies them in place.

  Each file named on the command line is read, run through
  HTMLFixer.FixAll() and written back to the same path. With -m/--move the
  rewritten file is then moved into an "original_html" directory located
  next to the directory that contains the file; that directory is created
  if it does not exist yet.

  Returns:
    1 if no files were given (after printing the usage text), 0 otherwise.

  Raises:
    Any exception hit while processing a file is re-raised after the name of
    the offending file has been printed.
  '''

  parser = optparse.OptionParser(usage='Usage: %prog [options] files...')

  parser.add_option('-m', '--move', dest='move', action='store_true',
                    default=False, help='move html files to "original_html"')

  options, files = parser.parse_args()

  if not files:
    parser.print_usage()
    return 1

  for filename in files:
    try:
      with open(filename, 'r') as html_file:
        html = html_file.read()

      print("Processing %s" % filename)
      fixer = HTMLFixer(html)
      fixer.FixAll()
      with open(filename, 'w') as html_file:
        html_file.write(str(fixer))
      if options.move:
        new_directory = os.path.join(
            os.path.dirname(os.path.dirname(filename)), 'original_html')
        # isdir() rather than exists(): if a regular file named
        # "original_html" is in the way, mkdir fails loudly instead of
        # shutil.move() silently renaming the html file over it.
        if not os.path.isdir(new_directory):
          os.mkdir(new_directory)
        shutil.move(filename, new_directory)
    except Exception:
      # Say which file failed, then let the original error propagate.
      print("Error while processing %s" % filename)
      raise

  return 0
140
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == '__main__':
  exit_status = main()
  sys.exit(exit_status)
143