docs/server2/document_parser_test.py

#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import unittest

from document_parser import ParseDocument, RemoveTitle


# Fixture for testWholeDocument: an HTML fragment with a leading <h1> title
# followed by <h2>-<h5> headings. Several headings are malformed on purpose
# (an <h2> closed by </h3>, a second <h1>, an <h3> opened inside an <h2>, and
# an <h4> that is not inside any <h3>) so that the warnings and the resulting
# heading structure can be checked.
#
# NOTE: the literal starts with a newline, so "Preamble" is on line 2 of the
# string. The (line, column) positions in the warnings expected by
# testWholeDocument are consistent with counting from that leading blank line
# (e.g. the Grapefruit heading is line 19), so adding or removing lines here
# means updating those positions as well.
_WHOLE_DOCUMENT = '''
Preamble before heading.

<h1 id='main' class='header'>Main header</h1>
Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''


# The document text that testWholeDocument expects back from
# RemoveTitle(_WHOLE_DOCUMENT): identical to _WHOLE_DOCUMENT except that the
# first <h1>...</h1> element is gone. The line it occupied is left empty rather
# than deleted, so every later line keeps the same line number in both
# fixtures. The second <h1> ("Not the main header") is still present.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
Preamble before heading.


Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''


class DocumentParserUnittest(unittest.TestCase):
  '''Tests for RemoveTitle and ParseDocument from document_parser.'''

  def _AssertSectionless(self, parsed, title, title_attributes, warnings):
    '''Checks a ParseDocument result that should contain no sections.

    |parsed| is the value returned by ParseDocument; the other arguments are
    the expected values of its title, title_attributes and warnings.
    '''
    self.assertEqual(title, parsed.title)
    self.assertEqual(title_attributes, parsed.title_attributes)
    self.assertEqual([], parsed.sections)
    self.assertEqual(warnings, parsed.warnings)

  def _AssertLeaf(self, entry, name, attributes):
    '''Checks the name/attributes of a structure entry with no sub-entries.'''
    self.assertEqual(name, entry.name)
    self.assertEqual(attributes, entry.attributes)
    self.assertEqual([], entry.entries)

  def _AssertBranch(self, entry, name, attributes, child_count):
    '''Checks an entry with |child_count| sub-entries and returns them.'''
    self.assertEqual(name, entry.name)
    self.assertEqual(attributes, entry.attributes)
    self.assertEqual(child_count, len(entry.entries))
    return entry.entries

  def testEmptyDocument(self):
    # There is no <h1> in an empty string, so it comes back with a warning.
    self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

    # Without expect_title there is no title data and nothing to warn about.
    self._AssertSectionless(ParseDocument(''), None, None, [])

    # With expect_title the title data is empty and a warning is recorded.
    self._AssertSectionless(
        ParseDocument('', expect_title=True), '', {}, ['Expected a title'])

  def testRemoveTitle(self):
    # Malformed input: the document is returned untouched with a warning.
    untouched_cases = (
      ('<h1>No closing tag', 'No closing </h1> was found'),
      ('No opening tag</h1>', 'No opening <h1> was found'),
      ('</h1>Tags in wrong order<h1>', 'The </h1> appeared before the <h1>'),
    )
    for document, warning in untouched_cases:
      self.assertEqual((document, warning), RemoveTitle(document))

    # Well-formed input: only the first <h1> element is stripped, and the tag
    # match does not depend on case.
    stripped_cases = (
      ('<h1>First header</h1> and <h1>Second header</h1>',
       ' and <h1>Second header</h1>'),
      ('<H1>Upper case header tag</H1> hi', ' hi'),
      ('<H1>Mixed case header tag</h1> hi', ' hi'),
    )
    for document, remainder in stripped_cases:
      self.assertEqual((remainder, None), RemoveTitle(document))

  def testOnlyTitleDocument(self):
    title_only = '<h1 id="header">heading</h1>'
    self.assertEqual(('', None), RemoveTitle(title_only))

    # A title that was not asked for is dropped and reported.
    self._AssertSectionless(ParseDocument(title_only),
                            None,
                            None,
                            ['Found unexpected title "heading"'])

    # A title that was asked for is returned along with its attributes.
    self._AssertSectionless(ParseDocument(title_only, expect_title=True),
                            'heading',
                            {'id': 'header'},
                            [])

  def testWholeDocument(self):
    stripped = RemoveTitle(_WHOLE_DOCUMENT)
    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None), stripped)

    parsed = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
    self.assertEqual('Main header', parsed.title)
    self.assertEqual({'id': 'main', 'class': 'header'}, parsed.title_attributes)
    expected_warnings = [
      'Found closing </h3> while processing a <h2> (line 19, column 15)',
      'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
          '<h2> for the purpose of the structure (line 22, column 1)',
      'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
      # TODO(kalman): Re-enable this warning once the reference pages have
      # their references fixed.
      #'Found <h4> without any preceding <h3> (line 28, column 1)',
    ]
    self.assertEqual(expected_warnings, parsed.warnings)

    # The whole document forms one section; walk its heading structure.
    self.assertEqual(1, len(parsed.sections))
    toc = parsed.sections[0].structure

    self.assertEqual(5, len(toc), toc)
    bananas, oranges, grapefruit, second_h1, not_a_banana = toc

    self._AssertLeaf(bananas, 'Bananas', {'id': 'banana', 'class': 'header'})

    valencia, seville = self._AssertBranch(
        oranges, 'Oranges', {'id': 'orange'}, 2)
    self._AssertLeaf(valencia, 'Valencia Oranges', {'id': 'valencia'})
    self._AssertLeaf(seville, 'Seville Oranges', {'id': 'seville'})

    self._AssertLeaf(grapefruit, 'Grapefruit', {})

    # The second <h1> appears at the same level as the <h2> entries.
    self._AssertLeaf(second_h1, 'Not the main header', {'id': 'not-main'})

    # The <h4> that precedes any <h3> still ends up under the <h2>.
    stray_h4, plantains = self._AssertBranch(
        not_a_banana, 'Not a banana', {}, 2)
    self._AssertLeaf(stray_h4, 'It\'s a h4', {})

    (nested_h4,) = self._AssertBranch(plantains, 'Plantains', {}, 1)
    self._AssertLeaf(nested_h4, 'Another h4', {})

  def testSingleExplicitSection(self):
    # A single section, one with the title inside the section, the other out.
    title_outside = ('<h1>Header</h1>'
                     '<section>'
                     'Just a single section here.'
                     '<h2>An inner header</h2>'
                     '</section>')
    title_inside = ('<section>'
                    'Another single section here.'
                    '<h1>Header</h1>'
                    '<h2>An inner header</h2>'
                    '</section>')
    for document in (title_outside, title_inside):
      parsed = ParseDocument(document, expect_title=True)
      self.assertEqual([], parsed.warnings)
      self.assertEqual('Header', parsed.title)
      self.assertEqual(1, len(parsed.sections))
      (only_section,) = parsed.sections
      (only_entry,) = only_section.structure
      self.assertEqual('An inner header', only_entry.name)

  def testMultipleSections(self):
    document = ('<h1>Header</h1>'
                '<h2>First header</h2>'
                'This content outside a section is the first section.'
                '<section>'
                'Second section'
                '<h2>Second header</h2>'
                '</section>'
                '<section>'
                'Third section'
                '<h2>Third header</h2>'
                '</section>')
    parsed = ParseDocument(document, expect_title=True)
    self.assertEqual([], parsed.warnings)
    self.assertEqual('Header', parsed.title)
    self.assertEqual(3, len(parsed.sections))

    # Each section should hold exactly one header, in document order.
    expected_headers = ('First header', 'Second header', 'Third header')
    for section, header in zip(parsed.sections, expected_headers):
      self.assertEqual(1, len(section.structure))
      self.assertEqual(header, section.structure[0].name)


# Run the tests in this module when the file is executed as a script.
if __name__ == '__main__':
  unittest.main()