# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

try:
  from HTMLParser import HTMLParser
except ImportError:
  # Python 3 moved the class into the html.parser module.
  from html.parser import HTMLParser


class ParseResult(object):
  '''The result of |ParseDocument|:
  |title|            The title of the page, as pulled from the first <h1>.
  |title_attributes| The attributes of the <h1> tag the title is derived from.
  |sections|         The list of Sections within this document.
  |warnings|         Any warnings while parsing the document.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    self.title = title
    self.title_attributes = title_attributes
    self.sections = sections
    self.warnings = warnings


class DocumentSection(object):
  '''A section of the document as grouped by <section>...</section>. Any content
  not within section tags is considered an implicit section, so:
  "Foo <section>Bar</section> Baz" is 3 sections.
  |structure| A list of DocumentStructureEntry for each top-level heading.
  '''

  def __init__(self):
    self.structure = []


class DocumentStructureEntry(object):
  '''An entry in the document structure.
  |attributes| The attributes of the header tag this entry is derived from.
  |name|       The name of this entry, as pulled from the header tag this entry
               is derived from.
  |entries|    A list of child DocumentStructureEntry items.
  '''

  def __init__(self, tag, attributes):
    self.attributes = attributes
    self.name = ''
    self.entries = []
    # Callers shouldn't care about the tag, but we need it for sanity checking,
    # so make it private. In particular we pretend that anything but the first
    # h1 is an h2, and it'd be odd to expose that.
    self._tag = tag
    # Documents can override the name of the entry using title="".
    self._has_explicit_name = False

  def __repr__(self):
    return '<%s>%s</%s>' % (self._tag, self.name, self._tag)

  def __str__(self):
    return repr(self)


def ParseDocument(document, expect_title=False):
  '''Parses the title and a document structure from |document| and returns a
  ParseResult.

  If |expect_title| is True a missing <h1> produces a warning and the result's
  title is ''; if it is False a present <h1> produces a warning and the
  result's title and title_attributes are None.
  '''
  parser = _DocumentParser(expect_title)
  parser.feed(document)
  parser.close()
  return parser.parse_result


def RemoveTitle(document):
  '''Removes the first <h1>..</h1> tag found in |document| and returns a
  (result, warning) tuple.

  If no title is found or |document| is malformed in some way, returns the
  original document and a warning message. Otherwise, returns the result of
  removing the title from |document| with a None warning message.
  '''

  def min_index(lhs, rhs):
    # Returns the earliest position of either needle, or -1 if neither occurs.
    lhs_index, rhs_index = document.find(lhs), document.find(rhs)
    if lhs_index == -1:
      return rhs_index
    if rhs_index == -1:
      return lhs_index
    return min(lhs_index, rhs_index)

  title_start = min_index('<h1', '<H1')
  if title_start == -1:
    return document, 'No opening <h1> was found'
  title_end = min_index('/h1>', '/H1>')
  if title_end == -1:
    return document, 'No closing </h1> was found'
  if title_end < title_start:
    return document, 'The </h1> appeared before the <h1>'

  # |title_end| points at the '/' of the closing tag; skip the 4 characters of
  # '/h1>' to resume right after it.
  return (document[:title_start] + document[title_end + 4:], None)


_HEADER_TAGS = ['h2', 'h3', 'h4']


class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Feed it a document, call close(), then read |parse_result|.
  '''

  def __init__(self, expect_title):
    HTMLParser.__init__(self)
    # Public.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    self._title_entry = None
    self._sections = []
    self._processing_section = DocumentSection()
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new structure entry for header tags, and a new section for
    <section>. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is not None:
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # Note: this removes "title" from the dict the entry holds as well, so the
    # entry's |attributes| never contain it.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Walk down one nesting level per header tag that ranks above |tag|,
      # always following the most recently added entry.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Finishes the entry being processed on a closing header tag, and marks a
    section boundary on </section>. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return

    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.

    self._processing_entry = None

  def handle_data(self, data):
    '''Appends text found inside a header to that entry's name, unless the
    header supplied an explicit name via title="...".
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing and populates |parse_result|.'''
    HTMLParser.close(self)

    self._OnSectionBoundary()

    if self._processing_entry is not None:
      # The document ended inside a header that was never closed.
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    # getpos() reports a 0-based column; warnings use 1-based columns.
    line, col = self.getpos()
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))