# docs/server2/document_parser.py

# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from HTMLParser import HTMLParser


class ParseResult(object):
  '''Holds everything |ParseDocument| hands back to its caller:
  |title|             Text of the page title, taken from the first <h1>.
  |title_attributes|  Attributes of the <h1> tag that supplied the title.
  |sections|          The Sections found in the document, as a list.
  |warnings|          Warnings collected while the document was parsed.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    # Plain value holder: every argument is stored exactly as given.
    self.title, self.title_attributes = title, title_attributes
    self.sections, self.warnings = sections, warnings


class DocumentSection(object):
  '''One <section>...</section> group of the document. Content that sits
  outside any section tags counts as an implicit section of its own, which
  means "Foo <section>Bar</section> Baz" is 3 sections.
  |structure|  The DocumentStructureEntry items for the top-level headings of
               this section, as a list.
  '''

  def __init__(self):
    # Starts out empty; every instance gets its own list.
    self.structure = list()


class DocumentStructureEntry(object):
  '''A single heading in the document structure.
  |attributes| Attributes of the header tag that produced this entry.
  |name|       Name of the entry, taken from that same header tag.
  |entries|    Child DocumentStructureEntry items, as a list.
  '''

  def __init__(self, tag, attributes):
    # The tag stays private: callers have no reason to look at it, but the
    # sanity checks need it. In particular every <h1> after the first one is
    # treated as an <h2>, and exposing that would be odd.
    self._tag = tag
    # Flipped to True when the document overrides the name using title="".
    self._has_explicit_name = False
    self.entries = []
    self.name = ''
    self.attributes = attributes

  def __repr__(self):
    # Renders the entry like the tag it came from, e.g. <h2>Name</h2>.
    return '<%(tag)s>%(name)s</%(tag)s>' % {'tag': self._tag,
                                            'name': self.name}

  def __str__(self):
    return repr(self)


def ParseDocument(document, expect_title=False):
  '''Runs |document| through the document parser and returns the resulting
  ParseResult, which carries the title and the document structure.

  |expect_title| is handed to the parser unchanged; it controls whether a
  missing title or a present title is the case that produces a warning.
  '''
  document_parser = _DocumentParser(expect_title)
  document_parser.feed(document)
  # close() is the step that populates |parse_result|.
  document_parser.close()
  return document_parser.parse_result


def RemoveTitle(document):
  '''Strips the first <h1>..</h1> tag out of |document| and returns a
  (result, warning) tuple.

  On success the result is |document| without its title and the warning is
  None. When no title can be found, or |document| is malformed in some way,
  the result is the untouched document together with a warning message.
  '''

  def first_occurrence(*needles):
    # Earliest position at which any of |needles| occurs, or -1 if none do.
    hits = [position
            for position in (document.find(needle) for needle in needles)
            if position != -1]
    return min(hits) if hits else -1

  # Both lower- and upper-case spellings of the tag are recognised.
  opening = first_occurrence('<h1', '<H1')
  if opening == -1:
    return document, 'No opening <h1> was found'

  closing = first_occurrence('/h1>', '/H1>')
  if closing == -1:
    return document, 'No closing </h1> was found'
  if closing < opening:
    return document, 'The </h1> appeared before the <h1>'

  # |closing| points at the '/', so skip the 4 characters of "/h1>".
  before_title = document[:opening]
  after_title = document[closing + 4:]
  return (before_title + after_title, None)


# Header tags, other than <h1>, that produce document structure entries. The
# order matters: a tag's position in this list decides how deeply its entry is
# nested beneath the headers that precede it.
_HEADER_TAGS = ['h2', 'h3', 'h4']


class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Feed it a document and then call close(); after that the outcome is
  available as |parse_result|. Only <section>, <h1> and the tags listed in
  _HEADER_TAGS are inspected, every other tag is ignored.
  '''

  def __init__(self, expect_title):
    '''|expect_title| decides which case is warned about in close(): a missing
    title when True, a title being present when False.
    '''
    HTMLParser.__init__(self)
    # Public.
    # The ParseResult; stays None until close() has been called.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    # The entry for the first <h1> encountered, if any.
    self._title_entry = None
    # Finished, non-empty sections in document order.
    self._sections = []
    # The section that headers are currently being added to.
    self._processing_section = DocumentSection()
    # The header entry whose closing tag has not been seen yet, if any.
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new section on <section>, or a new structure entry on a
    header tag. Other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is not None:
      # Headers cannot nest; keep processing the header that is already open.
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # The entry shares |attrs_dict|, so popping here also removes "title" from
    # the attributes stored on the entry.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      # The entry still stays "in progress" so that its closing tag matches.
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Walk down one level for every header tag that ranks above |tag|,
      # always following the most recently added entry.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Ends the current section on </section>, or finishes the header entry
    in progress on a closing header tag. Other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return

    if tag != 'h1' and tag not in _HEADER_TAGS:
      return

    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return

    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.

    self._processing_entry = None

  def handle_data(self, data):
    '''Appends text found inside a header to that header's name, unless the
    name was set explicitly through title="...".
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing and builds |parse_result| from what was collected.'''
    HTMLParser.close(self)

    # Flush the section that was still being filled in.
    self._OnSectionBoundary()

    if self._processing_entry is not None:
      # No position here: the whole document has been consumed.
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    '''Closes off the section in progress and begins a fresh one.'''
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    '''Records |message| as a warning, suffixed with the parser's current
    position.
    '''
    line, col = self.getpos()
    # The column is reported as the parser's offset plus one.
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))