document_parser_test.py revision a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7
1#!/usr/bin/env python
2# Copyright 2013 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6import unittest
7
8from document_parser import ParseDocument, RemoveTitle
9
10
# HTML fixture for testWholeDocument. It has a titled document with <h1> to
# <h5> headings, including deliberately malformed ones (a mismatched closing
# tag, a second <h1>, a heading nested inside another heading).
#
# The warnings asserted in testWholeDocument quote 1-based line and column
# positions within this string, so any change to its layout must be mirrored
# there and in _WHOLE_DOCUMENT_WITHOUT_TITLE.
_WHOLE_DOCUMENT = '''
Preamble before heading.

<h1 id='main' class='header'>Main header</h1>
Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''
50
51
# The text testWholeDocument expects RemoveTitle(_WHOLE_DOCUMENT) to return.
# It is identical to _WHOLE_DOCUMENT except that the first <h1>...</h1>
# element is gone, leaving an empty line where it used to be. The later
# <h1 id='not-main'> is intentionally still present.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
Preamble before heading.


Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''
91
92
class DocumentParserUnittest(unittest.TestCase):
  '''Tests for ParseDocument and RemoveTitle from document_parser.
  '''

  def testEmptyDocument(self):
    '''An empty document has no title to remove and yields no sections; a
    warning is only reported when a title was expected.
    '''
    self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

    result = ParseDocument('')
    self.assertIsNone(result.title)
    self.assertIsNone(result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual([], result.warnings)

    result = ParseDocument('', expect_title=True)
    self.assertEqual('', result.title)
    self.assertEqual({}, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual(['Expected a title'], result.warnings)

  def testRemoveTitle(self):
    '''RemoveTitle returns a (document, error) pair: the document untouched
    plus an error message for malformed titles, otherwise the document with
    its first <h1> element removed and None.
    '''
    # Malformed titles: the document must come back unchanged with an error.
    malformed_cases = (
      ('<h1>No closing tag', 'No closing </h1> was found'),
      ('No opening tag</h1>', 'No opening <h1> was found'),
      ('</h1>Tags in wrong order<h1>', 'The </h1> appeared before the <h1>'),
    )
    for document, expected_error in malformed_cases:
      self.assertEqual((document, expected_error), RemoveTitle(document))

    # Only the first of several titles is removed.
    multiple_titles = '<h1>First header</h1> and <h1>Second header</h1>'
    self.assertEqual((' and <h1>Second header</h1>', None),
                     RemoveTitle(multiple_titles))

    # Tag matching does not depend on the case of the tag name.
    upper_case = '<H1>Upper case header tag</H1> hi'
    self.assertEqual((' hi', None), RemoveTitle(upper_case))
    mixed_case = '<H1>Mixed case header tag</h1> hi'
    self.assertEqual((' hi', None), RemoveTitle(mixed_case))

  def testOnlyTitleDocument(self):
    '''A document consisting solely of a title: the title is reported as
    unexpected by default, and captured when expect_title=True.
    '''
    document = '<h1 id="header">heading</h1>'
    self.assertEqual(('', None), RemoveTitle(document))

    result = ParseDocument(document)
    self.assertIsNone(result.title)
    self.assertIsNone(result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual(['Found unexpected title "heading"'], result.warnings)

    result = ParseDocument(document, expect_title=True)
    self.assertEqual('heading', result.title)
    self.assertEqual({'id': 'header'}, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual([], result.warnings)

  def testWholeDocument(self):
    '''Parses _WHOLE_DOCUMENT and checks the title, the warnings for its
    malformed headings, and the resulting table-of-contents structure.
    '''
    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
                     RemoveTitle(_WHOLE_DOCUMENT))
    result = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
    self.assertEqual('Main header', result.title)
    self.assertEqual({'id': 'main', 'class': 'header'}, result.title_attributes)
    # The line/column numbers below are 1-based positions in _WHOLE_DOCUMENT.
    self.assertEqual([
      'Found closing </h3> while processing a <h2> (line 19, column 15)',
      'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
          '<h2> for the purpose of the structure (line 22, column 1)',
      'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
      # TODO(kalman): Re-enable this warning once the reference pages have
      # their references fixed.
      #'Found <h4> without any preceding <h3> (line 28, column 1)',
    ], result.warnings)

    # The non-trivial table of contents assertions...
    self.assertEqual(1, len(result.sections))
    entries = result.sections[0].structure

    self.assertEqual(5, len(entries), entries)
    entry0, entry1, entry2, entry3, entry4 = entries

    self.assertEqual('Bananas', entry0.name)
    self.assertEqual({'id': 'banana', 'class': 'header'}, entry0.attributes)
    self.assertEqual([], entry0.entries)

    self.assertEqual('Oranges', entry1.name)
    self.assertEqual({'id': 'orange'}, entry1.attributes)
    self.assertEqual(2, len(entry1.entries))
    entry1_0, entry1_1 = entry1.entries

    self.assertEqual('Valencia Oranges', entry1_0.name)
    self.assertEqual({'id': 'valencia'}, entry1_0.attributes)
    self.assertEqual([], entry1_0.entries)
    self.assertEqual('Seville Oranges', entry1_1.name)
    self.assertEqual({'id': 'seville'}, entry1_1.attributes)
    self.assertEqual([], entry1_1.entries)

    self.assertEqual('Grapefruit', entry2.name)
    self.assertEqual({}, entry2.attributes)
    self.assertEqual([], entry2.entries)

    # The second <h1> is listed alongside the <h2> entries.
    self.assertEqual('Not the main header', entry3.name)
    self.assertEqual({'id': 'not-main'}, entry3.attributes)
    self.assertEqual([], entry3.entries)

    # The <h3> embedded in this <h2> contributes only its text to the name.
    self.assertEqual('Not a banana', entry4.name)
    self.assertEqual({}, entry4.attributes)
    self.assertEqual(2, len(entry4.entries))
    entry4_0, entry4_1 = entry4.entries

    self.assertEqual('It\'s a h4', entry4_0.name)
    self.assertEqual({}, entry4_0.attributes)
    self.assertEqual([], entry4_0.entries)

    self.assertEqual('Plantains', entry4_1.name)
    self.assertEqual({}, entry4_1.attributes)
    self.assertEqual(1, len(entry4_1.entries))
    entry4_1_0, = entry4_1.entries

    self.assertEqual('Another h4', entry4_1_0.name)
    self.assertEqual({}, entry4_1_0.attributes)
    self.assertEqual([], entry4_1_0.entries)

  def testSingleExplicitSection(self):
    '''A document with one explicit <section> parses to exactly one section,
    whether the title sits inside or outside of that section.
    '''
    def assert_single_section(document):
      result = ParseDocument(document, expect_title=True)
      self.assertEqual([], result.warnings)
      self.assertEqual('Header', result.title)
      self.assertEqual(1, len(result.sections))
      section0, = result.sections
      entry0, = section0.structure
      self.assertEqual('An inner header', entry0.name)

    # A single section, one with the title inside the section, the other out.
    assert_single_section('<h1>Header</h1>'
                          '<section>'
                          'Just a single section here.'
                          '<h2>An inner header</h2>'
                          '</section>')
    assert_single_section('<section>'
                          'Another single section here.'
                          '<h1>Header</h1>'
                          '<h2>An inner header</h2>'
                          '</section>')

  def testMultipleSections(self):
    '''Content outside any <section> forms the first section, and each
    explicit <section> that follows becomes a section of its own.
    '''
    def assert_single_header(section, name):
      self.assertEqual(1, len(section.structure))
      self.assertEqual(name, section.structure[0].name)

    result = ParseDocument(
        '<h1>Header</h1>'
        '<h2>First header</h2>'
        'This content outside a section is the first section.'
        '<section>'
        'Second section'
        '<h2>Second header</h2>'
        '</section>'
        '<section>'
        'Third section'
        '<h2>Third header</h2>'
        '</section>',
        expect_title=True)
    self.assertEqual([], result.warnings)
    self.assertEqual('Header', result.title)
    self.assertEqual(3, len(result.sections))
    section0, section1, section2 = result.sections
    assert_single_header(section0, 'First header')
    assert_single_header(section1, 'Second header')
    assert_single_header(section2, 'Third header')
258
259
# Hand control to unittest's command-line runner when this file is executed
# directly rather than imported.
if __name__ == '__main__':
  unittest.main()
262