1# Copyright 2013 The Chromium Authors. All rights reserved.
2# Use of this source code is governed by a BSD-style license that can be
3# found in the LICENSE file.
4
5from HTMLParser import HTMLParser
6
7
class ParseResult(object):
  '''Holds everything |ParseDocument| extracted from a document.

  |title|             Text of the page title, taken from the first <h1>.
  |title_attributes|  Attributes of the <h1> tag that supplied the title.
  |sections|          List of the Sections found within the document.
  |warnings|          Warnings collected while the document was parsed.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    # Title information is stored together, followed by the parsed body data.
    self.title, self.title_attributes = title, title_attributes
    self.sections, self.warnings = sections, warnings
21
22
class DocumentSection(object):
  '''One section of a document, as delimited by <section>...</section>.

  Content that sits outside any section tags forms an implicit section of its
  own, so "Foo <section>Bar</section> Baz" contains 3 sections.

  |structure|  List holding a DocumentStructureEntry per top-level heading.
  '''

  def __init__(self):
    # Every instance gets its own, initially empty, list of entries.
    self.structure = list()
32
33
class DocumentStructureEntry(object):
  '''A single heading within the document structure.

  |attributes| Attributes of the header tag that produced this entry.
  |name|       Display name of the entry, taken from that header tag.
  |entries|    List of DocumentStructureEntry children nested under this one.
  '''

  def __init__(self, tag, attributes):
    # The tag is kept private: it is only needed for sanity checking, and
    # since every <h1> after the first is treated as an <h2>, exposing it to
    # callers would be misleading.
    self._tag = tag
    # Flipped to True when the document overrides the name with title="...".
    self._has_explicit_name = False
    self.name = ''
    self.attributes = attributes
    self.entries = []

  def __repr__(self):
    # Renders the entry as the header element it was derived from.
    tag = self._tag
    return '<%s>%s</%s>' % (tag, self.name, tag)

  def __str__(self):
    # The string form is deliberately the same as the repr.
    return repr(self)
58
59
def ParseDocument(document, expect_title=False):
  '''Parses |document| and returns a ParseResult describing its title and
  document structure.

  |expect_title| is handed to the parser: when true, a document without a
  title yields a warning; when false, a document that does have a title yields
  a warning and the result carries no title.
  '''
  doc_parser = _DocumentParser(expect_title)
  doc_parser.feed(document)
  # close() finishes parsing and is what populates |parse_result|.
  doc_parser.close()
  return doc_parser.parse_result
68
69
def RemoveTitle(document):
  '''Strips the first <h1>..</h1> tag out of |document|, returning a
  (result, warning) tuple.

  On success the result is |document| without its title and the warning is
  None. If there is no title, or |document| is malformed in some way, the
  result is the untouched |document| and the warning describes the problem.
  '''

  def first_occurrence(*needles):
    # Earliest index at which any of |needles| occurs, or -1 if none do.
    hits = [pos for pos in map(document.find, needles) if pos != -1]
    return min(hits) if hits else -1

  open_at = first_occurrence('<h1', '<H1')
  if open_at == -1:
    return document, 'No opening <h1> was found'

  close_at = first_occurrence('/h1>', '/H1>')
  if close_at == -1:
    return document, 'No closing </h1> was found'
  if close_at < open_at:
    return document, 'The </h1> appeared before the <h1>'

  # Resume just past the matched "/h1>" so the whole closing tag is dropped.
  before_title = document[:open_at]
  after_title = document[close_at + len('/h1>'):]
  return before_title + after_title, None
95
96
# Header tags that become entries in a section's structure, ordered from the
# outermost nesting level to the innermost. <h1> is deliberately absent: the
# first <h1> is treated as the document title rather than a structure entry,
# and any later <h1> is classified as an <h2>.
_HEADER_TAGS = ['h2', 'h3', 'h4']
98
99
class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Feed the document to the parser and then call close(); the outcome is
  published as |parse_result|, which stays None until close() has run.
  '''

  def __init__(self, expect_title):
    '''|expect_title| controls which title-related warning close() may emit:
    a missing title when true, an unexpected title when false.
    '''
    HTMLParser.__init__(self)
    # Public.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    # Entry for the first <h1> encountered, i.e. the document title.
    self._title_entry = None
    # Completed, non-empty sections in document order.
    self._sections = []
    # The section that headers are currently being added to.
    self._processing_section = DocumentSection()
    # Entry for the header tag that is currently open, or None.
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new section on <section>, or a new entry on a header tag.
    All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is not None:
      # Nested headers are not supported; keep processing the outer one.
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # Note: this removes "title" from the entry's attributes as well, since
    # the entry holds a reference to the same dict.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      # The entry remains the one being processed, so the matching end tag is
      # still consumed without a warning.
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      # Only the placement changes; the entry keeps its 'h1' tag so that the
      # closing </h1> still matches it.
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Walk down one level for every header rank above |tag|, descending
      # into the most recent entry at each level.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Closes the current section on </section>, or the entry being processed
    on a closing header tag. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return

    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.

    self._processing_entry = None

  def handle_data(self, data):
    '''Appends text found inside an open header tag to that entry's name,
    unless the name was given explicitly via title="...".
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing, records any end-of-document warnings and publishes
    the outcome as |parse_result|.
    '''
    HTMLParser.close(self)

    # Flush the section that was still being filled in.
    self._OnSectionBoundary()

    if self._processing_entry is not None:
      # No position is reported here: the document has ended.
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    '''Commits the section being processed and starts a fresh one.'''
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    '''Records |message| as a warning, tagged with the parser's position.'''
    line, col = self.getpos()
    # The column is shifted by one so that it is reported 1-based.
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))
222