#!/usr/bin/env python
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import unittest
from document_parser import ParseDocument, RemoveTitle
# Fixture document that testWholeDocument passes to both RemoveTitle and
# ParseDocument. It is assembled line by line; the empty first and last items
# give the value its leading and trailing newline.
# NOTE(review): testWholeDocument expects heading attributes (ids, classes)
# that do not appear anywhere in this text -- presumably the heading markup was
# lost from this copy of the fixture; confirm against the original file.
_WHOLE_DOCUMENT = '\n'.join((
    '',
    'Preamble before heading.',
    'Main header',
    'Some intro to the content.',
    'Something about bananas.',
    'Oranges',
    'Something about oranges.',
    'Valencia Oranges',
    'A description of valencia oranges.',
    'Seville Oranges',
    'A description of seville oranges.',
    'Grapefruit',
    'Grapefruit closed a h2 with a h3. This should be a warning.',
    'Not the main header',
    'But it should still show up in the TOC as though it were an h2.',
    'Not a banana',
    'The embedded h3 should be ignored.',
    "It's a h4",
    'h4 are part of the document structure, but this is not inside a h3.',
    'Plantains',
    "Now I'm just getting lazy.",
    'Another h4',
    'This h4 is inside a h3 so will show up.',
    'Header 5',
    'Header 5s are not parsed.',
    '',
))
# The document text that testWholeDocument expects RemoveTitle to return for
# _WHOLE_DOCUMENT: the same lines with the 'Main header' title line absent.
# The empty first and last items give the value its leading and trailing
# newline.
_WHOLE_DOCUMENT_WITHOUT_TITLE = '\n'.join((
    '',
    'Preamble before heading.',
    'Some intro to the content.',
    'Something about bananas.',
    'Oranges',
    'Something about oranges.',
    'Valencia Oranges',
    'A description of valencia oranges.',
    'Seville Oranges',
    'A description of seville oranges.',
    'Grapefruit',
    'Grapefruit closed a h2 with a h3. This should be a warning.',
    'Not the main header',
    'But it should still show up in the TOC as though it were an h2.',
    'Not a banana',
    'The embedded h3 should be ignored.',
    "It's a h4",
    'h4 are part of the document structure, but this is not inside a h3.',
    'Plantains',
    "Now I'm just getting lazy.",
    'Another h4',
    'This h4 is inside a h3 so will show up.',
    'Header 5',
    'Header 5s are not parsed.',
    '',
))
# Unit tests for ParseDocument and RemoveTitle from document_parser.
# NOTE(review): this copy of the file has lost its indentation, and several
# string literals below are split across lines or look incomplete (e.g. an
# empty `document` that is expected to yield the title "heading", and error
# messages such as 'No opening was found'). Presumably markup was stripped
# from the literals -- restore them from the original file before relying on
# these tests. Code lines are left exactly as found.
class DocumentParserUnittest(unittest.TestCase):
# Empty input: RemoveTitle is expected to hand back the input together with an
# error message. ParseDocument is expected to report no title, sections or
# warnings by default, and an empty title plus an 'Expected a title' warning
# when called with expect_title=True.
def testEmptyDocument(self):
self.assertEqual(('', 'No opening was found'), RemoveTitle(''))
result = ParseDocument('')
self.assertEqual(None, result.title)
self.assertEqual(None, result.title_attributes)
self.assertEqual([], result.sections)
self.assertEqual([], result.warnings)
result = ParseDocument('', expect_title=True)
self.assertEqual('', result.title)
self.assertEqual({}, result.title_attributes)
self.assertEqual([], result.sections)
self.assertEqual(['Expected a title'], result.warnings)
# RemoveTitle is expected to return a 2-tuple: on failure the unchanged input
# and an error message; on success the remaining text and None. The cases are
# named by their input variables (no closing tag, no opening tag, tags in the
# wrong order, multiple titles, upper-case and mixed-case tags).
def testRemoveTitle(self):
no_closing_tag = 'No closing tag'
self.assertEqual((no_closing_tag, 'No closing
was found'),
RemoveTitle(no_closing_tag))
no_opening_tag = 'No opening tag
'
self.assertEqual((no_opening_tag, 'No opening was found'),
RemoveTitle(no_opening_tag))
tags_wrong_order = '
Tags in wrong order'
self.assertEqual((tags_wrong_order, 'The
appeared before the '),
RemoveTitle(tags_wrong_order))
multiple_titles = 'First header
and Second header
'
self.assertEqual((' and Second header
', None),
RemoveTitle(multiple_titles))
upper_case = 'Upper case header tag
hi'
self.assertEqual((' hi', None), RemoveTitle(upper_case))
mixed_case = 'Mixed case header tag
hi'
self.assertEqual((' hi', None), RemoveTitle(mixed_case))
# A document consisting only of a title: without expect_title the parser is
# expected to warn about the unexpected title and report none; with
# expect_title=True it is expected to report the title text and attributes
# and no warnings.
# NOTE(review): `document` is empty here yet the assertions expect the title
# "heading" with id "header" -- the literal has presumably lost its markup.
def testOnlyTitleDocument(self):
document = ''
self.assertEqual(('', None), RemoveTitle(document))
result = ParseDocument(document)
self.assertEqual(None, result.title)
self.assertEqual(None, result.title_attributes)
self.assertEqual([], result.sections)
self.assertEqual(['Found unexpected title "heading"'], result.warnings)
result = ParseDocument(document, expect_title=True)
self.assertEqual('heading', result.title)
self.assertEqual({'id': 'header'}, result.title_attributes)
self.assertEqual([], result.sections)
self.assertEqual([], result.warnings)
# Runs the module-level _WHOLE_DOCUMENT fixture through RemoveTitle and
# ParseDocument, then checks the title, the exact list of warnings (each
# carrying a line/column position) and the nested structure of the single
# resulting section.
def testWholeDocument(self):
self.assertEqual((_WHOLE_DOCUMENT_WITHOUT_TITLE, None),
RemoveTitle(_WHOLE_DOCUMENT))
result = ParseDocument(_WHOLE_DOCUMENT, expect_title=True)
self.assertEqual('Main header', result.title)
self.assertEqual({'id': 'main', 'class': 'header'}, result.title_attributes)
self.assertEqual([
'Found closing while processing a (line 19, column 15)',
'Found multiple tags. Subsequent tags will be classified as '
' for the purpose of the structure (line 22, column 1)',
'Found in the middle of processing a (line 25, column 9)',
# TODO(kalman): Re-enable this warning once the reference pages have
# their references fixed.
#'Found without any preceding (line 28, column 1)',
], result.warnings)
# The non-trivial table of contents assertions...
self.assertEqual(1, len(result.sections))
entries = result.sections[0].structure
# `entries` is also passed as the assertion message so that a length mismatch
# prints the actual entries.
self.assertEqual(4, len(entries), entries)
entry0, entry1, entry2, entry3 = entries
# First top-level entry: expected to be named 'hello' and to have two children.
self.assertEqual('hello', entry0.name)
self.assertEqual({'id': 'orange'}, entry0.attributes)
self.assertEqual(2, len(entry0.entries))
entry0_0, entry0_1 = entry0.entries
self.assertEqual('Valencia Oranges', entry0_0.name)
self.assertEqual({'id': 'valencia'}, entry0_0.attributes)
self.assertEqual([], entry0_0.entries)
self.assertEqual('Seville Oranges', entry0_1.name)
self.assertEqual({'id': 'seville'}, entry0_1.attributes)
self.assertEqual([], entry0_1.entries)
# Second and third top-level entries: expected to have no children.
self.assertEqual('Grapefruit', entry1.name)
self.assertEqual({}, entry1.attributes)
self.assertEqual([], entry1.entries)
self.assertEqual('Not the main header', entry2.name)
self.assertEqual({'id': 'not-main'}, entry2.attributes)
self.assertEqual([], entry2.entries)
# Fourth top-level entry: expected to have two children, the second of which
# has one child of its own.
self.assertEqual('Not a banana', entry3.name)
self.assertEqual({}, entry3.attributes)
self.assertEqual(2, len(entry3.entries))
entry3_1, entry3_2 = entry3.entries
self.assertEqual('It\'s a h4', entry3_1.name)
self.assertEqual({}, entry3_1.attributes)
self.assertEqual([], entry3_1.entries)
self.assertEqual('Plantains', entry3_2.name)
self.assertEqual({}, entry3_2.attributes)
self.assertEqual(1, len(entry3_2.entries))
entry3_2_1, = entry3_2.entries
self.assertEqual('Another h4', entry3_2_1.name)
self.assertEqual({}, entry3_2_1.attributes)
self.assertEqual([], entry3_2_1.entries)
# Two documents are each expected to parse, with expect_title=True, into
# exactly one section whose structure is a single entry named
# 'An inner header', with the title 'Header' and no warnings.
def testSingleExplicitSection(self):
# Shared assertions applied to each document variant below.
def test(document):
result = ParseDocument(document, expect_title=True)
self.assertEqual([], result.warnings)
self.assertEqual('Header', result.title)
self.assertEqual(1, len(result.sections))
section0, = result.sections
entry0, = section0.structure
self.assertEqual('An inner header', entry0.name)
# A single section, one with the title inside the section, the other out.
test('Header
'
''
'Just a single section here.'
'An inner header
'
'')
test(''
'Another single section here.'
'Header
'
'An inner header
'
'')
# A document expected to parse into three sections, each of whose structure
# holds exactly one header entry, with the title 'Header' and no warnings.
def testMultipleSections(self):
result = ParseDocument(
'Header
'
'First header
'
'This content outside a section is the first section.'
''
'Second section'
'Second header
'
''
''
'Third section'
'Third header
'
'',
expect_title=True)
self.assertEqual([], result.warnings)
self.assertEqual('Header', result.title)
self.assertEqual(3, len(result.sections))
section0, section1, section2 = result.sections
# Asserts that `section` has exactly one structure entry, named `name`.
def assert_single_header(section, name):
self.assertEqual(1, len(section.structure))
self.assertEqual(name, section.structure[0].name)
assert_single_header(section0, 'First header')
assert_single_header(section1, 'Second header')
assert_single_header(section2, 'Third header')
# Run the test suite when this file is executed directly as a script. The call
# must be indented under the guard: without the indent the `if` has no body
# (an IndentationError) and the file cannot be imported or run.
if __name__ == '__main__':
  unittest.main()