#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
#
# document_parser_test.py revision a3f6a49ab37290eeeb8db0f41ec0f1cb74a68be7

"""Unit tests for document_parser's ParseDocument and RemoveTitle."""

import unittest

from document_parser import ParseDocument, RemoveTitle


# Fixture document. The line breaks are significant: testWholeDocument asserts
# the exact (line, column) positions reported in the parser's warnings, where
# line 1 is the empty line that immediately follows the opening quotes.
_WHOLE_DOCUMENT = '''
Preamble before heading.

<h1 id='main' class='header'>Main header</h1>
Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''


# Expected first element of RemoveTitle(_WHOLE_DOCUMENT): identical to the
# fixture above except that the first <h1>...</h1> element is gone, leaving an
# empty line where it used to be.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '''
Preamble before heading.


Some intro to the content.

<h2 id='banana' class='header'>Bananas</h2>
Something about bananas.

<h2 id='orange'>Oranges</h2>
Something about oranges.

<h3 id='valencia'>Valencia Oranges</h3>
A description of valencia oranges.

<h3 id='seville'>Seville Oranges</h3>
A description of seville oranges.

<h2>Grapefruit</h3>
Grapefruit closed a h2 with a h3. This should be a warning.

<h1 id='not-main'>Not the main header</h1>
But it should still show up in the TOC as though it were an h2.

<h2>Not <h3>a banana</h2>
The embedded h3 should be ignored.

<h4>It's a h4</h4>
h4 are part of the document structure, but this is not inside a h3.

<h3>Plantains</h3>
Now I'm just getting lazy.

<h4>Another h4</h4>
This h4 is inside a h3 so will show up.

<h5>Header 5</h5>
Header 5s are not parsed.
'''


class DocumentParserUnittest(unittest.TestCase):
  """Exercises RemoveTitle and ParseDocument on a range of documents."""

  def testEmptyDocument(self):
    """An empty string yields no sections, with or without expect_title."""
    self.assertEqual(('', 'No opening <h1> was found'), RemoveTitle(''))

    result = ParseDocument('')
    self.assertEqual(None, result.title)
    self.assertEqual(None, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual([], result.warnings)

    result = ParseDocument('', expect_title=True)
    self.assertEqual('', result.title)
    self.assertEqual({}, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual(['Expected a title'], result.warnings)

  def testRemoveTitle(self):
    """RemoveTitle returns a (document, warning) pair for each input shape."""
    no_closing_tag = '<h1>No closing tag'
    self.assertEqual((no_closing_tag, 'No closing </h1> was found'),
                     RemoveTitle(no_closing_tag))

    no_opening_tag = 'No opening tag</h1>'
    self.assertEqual((no_opening_tag, 'No opening <h1> was found'),
                     RemoveTitle(no_opening_tag))

    tags_wrong_order = '</h1>Tags in wrong order<h1>'
    self.assertEqual((tags_wrong_order, 'The </h1> appeared before the <h1>'),
                     RemoveTitle(tags_wrong_order))

    # Only the first <h1> element is removed.
    multiple_titles = '<h1>First header</h1> and <h1>Second header</h1>'
    self.assertEqual((' and <h1>Second header</h1>', None),
                     RemoveTitle(multiple_titles))

    # Tag matching is case-insensitive.
    upper_case = '<H1>Upper case header tag</H1> hi'
    self.assertEqual((' hi', None), RemoveTitle(upper_case))
    mixed_case = '<H1>Mixed case header tag</h1> hi'
    self.assertEqual((' hi', None), RemoveTitle(mixed_case))

  def testOnlyTitleDocument(self):
    """A document that is just an <h1> is only accepted with expect_title."""
    document = '<h1 id="header">heading</h1>'
    self.assertEqual(('', None), RemoveTitle(document))

    result = ParseDocument(document)
    self.assertEqual(None, result.title)
    self.assertEqual(None, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual(['Found unexpected title "heading"'], result.warnings)

    result = ParseDocument(document, expect_title=True)
    self.assertEqual('heading', result.title)
    self.assertEqual({'id': 'header'}, result.title_attributes)
    self.assertEqual([], result.sections)
    self.assertEqual([], result.warnings)

  def testWholeDocument(self):
    """Checks title, warnings and the nested structure of _WHOLE_DOCUMENT."""
    self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
                     RemoveTitle(_WHOLE_DOCUMENT))
    result = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
    self.assertEqual('Main header', result.title)
    self.assertEqual({'id': 'main', 'class': 'header'}, result.title_attributes)
    self.assertEqual([
      'Found closing </h3> while processing a <h2> (line 19, column 15)',
      'Found multiple <h1> tags. Subsequent <h1> tags will be classified as '
      '<h2> for the purpose of the structure (line 22, column 1)',
      'Found <h3> in the middle of processing a <h2> (line 25, column 9)',
      # TODO(kalman): Re-enable this warning once the reference pages have
      # their references fixed.
      #'Found <h4> without any preceding <h3> (line 28, column 1)',
    ], result.warnings)

    # The non-trivial table of contents assertions...
    self.assertEqual(1, len(result.sections))
    entries = result.sections[0].structure

    self.assertEqual(5, len(entries), entries)
    entry0, entry1, entry2, entry3, entry4 = entries

    self.assertEqual('Bananas', entry0.name)
    self.assertEqual({'id': 'banana', 'class': 'header'}, entry0.attributes)
    self.assertEqual([], entry0.entries)

    self.assertEqual('Oranges', entry1.name)
    self.assertEqual({'id': 'orange'}, entry1.attributes)
    self.assertEqual(2, len(entry1.entries))
    entry1_0, entry1_1 = entry1.entries

    self.assertEqual('Valencia Oranges', entry1_0.name)
    self.assertEqual({'id': 'valencia'}, entry1_0.attributes)
    self.assertEqual([], entry1_0.entries)
    self.assertEqual('Seville Oranges', entry1_1.name)
    self.assertEqual({'id': 'seville'}, entry1_1.attributes)
    self.assertEqual([], entry1_1.entries)

    self.assertEqual('Grapefruit', entry2.name)
    self.assertEqual({}, entry2.attributes)
    self.assertEqual([], entry2.entries)

    self.assertEqual('Not the main header', entry3.name)
    self.assertEqual({'id': 'not-main'}, entry3.attributes)
    self.assertEqual([], entry3.entries)

    self.assertEqual('Not a banana', entry4.name)
    self.assertEqual({}, entry4.attributes)
    self.assertEqual(2, len(entry4.entries))
    entry4_1, entry4_2 = entry4.entries

    self.assertEqual('It\'s a h4', entry4_1.name)
    self.assertEqual({}, entry4_1.attributes)
    self.assertEqual([], entry4_1.entries)

    self.assertEqual('Plantains', entry4_2.name)
    self.assertEqual({}, entry4_2.attributes)
    self.assertEqual(1, len(entry4_2.entries))
    entry4_2_1, = entry4_2.entries

    self.assertEqual('Another h4', entry4_2_1.name)
    self.assertEqual({}, entry4_2_1.attributes)
    self.assertEqual([], entry4_2_1.entries)

  def testSingleExplicitSection(self):
    """One <section> gives one section, title inside or outside it."""
    def test(document):
      result = ParseDocument(document, expect_title=True)
      self.assertEqual([], result.warnings)
      self.assertEqual('Header', result.title)
      self.assertEqual(1, len(result.sections))
      section0, = result.sections
      entry0, = section0.structure
      self.assertEqual('An inner header', entry0.name)
    # A single section, one with the title inside the section, the other out.
    test('<h1>Header</h1>'
         '<section>'
         'Just a single section here.'
         '<h2>An inner header</h2>'
         '</section>')
    test('<section>'
         'Another single section here.'
         '<h1>Header</h1>'
         '<h2>An inner header</h2>'
         '</section>')

  def testMultipleSections(self):
    """Content before the first <section> counts as a section of its own."""
    result = ParseDocument(
        '<h1>Header</h1>'
        '<h2>First header</h2>'
        'This content outside a section is the first section.'
        '<section>'
        'Second section'
        '<h2>Second header</h2>'
        '</section>'
        '<section>'
        'Third section'
        '<h2>Third header</h2>'
        '</section>',
        expect_title=True)
    self.assertEqual([], result.warnings)
    self.assertEqual('Header', result.title)
    self.assertEqual(3, len(result.sections))
    section0, section1, section2 = result.sections
    def assert_single_header(section, name):
      self.assertEqual(1, len(section.structure))
      self.assertEqual(name, section.structure[0].name)
    assert_single_header(section0, 'First header')
    assert_single_header(section1, 'Second header')
    assert_single_header(section2, 'Third header')


if __name__ == '__main__':
  unittest.main()