# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
from HTMLParser import HTMLParser
class ParseResult(object):
    '''The result of |ParseDocument|:
    |title|            The title of the page, as pulled from the first <h1>.
    |title_attributes| The attributes of the <h1> tag the title is derived
                       from.
    |sections|         The list of DocumentSections within this document.
    |warnings|         Any warnings while parsing the document.
    '''

    def __init__(self, title, title_attributes, sections, warnings):
        '''Stores the four parse outputs verbatim; no copies are made.
        '''
        self.title = title
        self.title_attributes = title_attributes
        self.sections = sections
        self.warnings = warnings
class DocumentSection(object):
    '''A section of the document as grouped by <section>...</section>. Any
    content not within section tags is considered an implicit section, so:
    "Foo <section>Bar</section> Baz" is 3 sections.
    |structure| A list of DocumentStructureEntry for each top-level heading.
    '''

    def __init__(self):
        '''Creates a section with an empty structure list.
        '''
        self.structure = []
class DocumentStructureEntry(object):
    '''An entry in the document structure.
    |attributes| The attributes of the header tag this entry is derived from.
    |name|       The name of this entry, as pulled from the header tag this
                 entry is derived from.
    |entries|    A list of child DocumentStructureEntry items.
    '''

    def __init__(self, tag, attributes):
        '''Creates an unnamed entry with no children for the header |tag|
        carrying |attributes|.
        '''
        self.attributes = attributes
        self.name = ''
        self.entries = []
        # Callers shouldn't care about the tag, but we need it for sanity
        # checking, so make it private. In particular we pretend that anything
        # but the first h1 is an h2, and it'd be odd to expose that.
        self._tag = tag
        # Documents can override the name of the entry using title="".
        self._has_explicit_name = False

    def __repr__(self):
        '''Renders the entry as its header element, e.g. <h2>Name</h2>.
        '''
        # The closing tag needs its own "</"; without it the output was the
        # malformed "<h2>Nameh2>".
        return '<%s>%s</%s>' % (self._tag, self.name, self._tag)

    def __str__(self):
        return repr(self)
def ParseDocument(document, expect_title=False):
    '''Parses the title and a document structure from |document| and returns a
    ParseResult.

    |document|     The HTML text to parse.
    |expect_title| Whether the document is expected to contain a title. A
                   warning is recorded in the result when the expectation is
                   not met (a missing title when one is expected, or a title
                   when none is expected).
    '''
    parser = _DocumentParser(expect_title)
    # The parser only builds |parse_result| in close(), so the document must
    # be fed and the parser closed before the result is read.
    parser.feed(document)
    parser.close()
    return parser.parse_result
def RemoveTitle(document):
    '''Removes the first <h1>..</h1> tag found in |document| and returns a
    (result, warning) tuple.

    If no title is found or |document| is malformed in some way, returns the
    original document and a warning message. Otherwise, returns the result of
    removing the title from |document| with a None warning message.
    '''
    def min_index(lhs, rhs):
        '''Returns the lowest index at which |lhs| or |rhs| occurs in
        |document|, or -1 if neither occurs.
        '''
        lhs_index, rhs_index = document.find(lhs), document.find(rhs)
        if lhs_index == -1:
            return rhs_index
        if rhs_index == -1:
            return lhs_index
        return min(lhs_index, rhs_index)

    # Search for "<h1" rather than "<h1>" so that opening tags carrying
    # attributes are found too. Both lower and upper case are accepted.
    title_start = min_index('<h1', '<H1')
    if title_start == -1:
        return document, 'No opening <h1> was found'
    title_end = min_index('/h1>', '/H1>')
    if title_end == -1:
        return document, 'No closing </h1> was found'
    if title_end < title_start:
        return document, 'The </h1> appeared before the <h1>'
    # |title_end| points at the "/" of the closing tag; skip the 4 characters
    # of "/h1>" so the whole closing tag is dropped.
    return (document[:title_start] + document[title_end + 4:], None)
_HEADER_TAGS = ['h2', 'h3', 'h4']
class _DocumentParser(HTMLParser):
    '''HTMLParser for ParseDocument.

    Feed it a document and then call close(); the outcome is then available
    as |parse_result|.
    '''

    def __init__(self, expect_title):
        # The base class must be initialised for feed() and getpos() to work.
        HTMLParser.__init__(self)
        # Public.
        self.parse_result = None
        # Private.
        self._expect_title = expect_title
        self._title_entry = None
        self._sections = []
        self._processing_section = DocumentSection()
        self._processing_entry = None
        self._warnings = []

    def handle_starttag(self, tag, attrs):
        '''Starts a new section on <section>, or a new structure entry on
        <h1> and the tags in _HEADER_TAGS. All other tags are ignored.
        '''
        if tag == 'section':
            self._OnSectionBoundary()
            return
        if tag != 'h1' and tag not in _HEADER_TAGS:
            return
        if self._processing_entry is not None:
            self._WarnWithPosition(
                'Found <%s> in the middle of processing a <%s>' %
                (tag, self._processing_entry._tag))
            # Keep processing the header that is already open.
            return

        attrs_dict = dict(attrs)
        self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

        explicit_name = attrs_dict.pop('title', None)
        if explicit_name == '':
            # Don't create a TOC entry at all if the tag has specified
            # title="". The entry stays as |_processing_entry| so that the
            # matching end tag is still accepted.
            return
        if explicit_name is not None:
            self._processing_entry.name = explicit_name
            self._processing_entry._has_explicit_name = True

        if tag == 'h1' and self._title_entry is not None:
            self._WarnWithPosition(
                'Found multiple <h1> tags. Subsequent <h1> tags '
                'will be classified as <h2> for the purpose of '
                'the structure')
            tag = 'h2'

        if tag == 'h1':
            self._title_entry = self._processing_entry
        else:
            # Descend one level for every header tag that outranks |tag|,
            # following the most recent entry at each level.
            belongs_to = self._processing_section.structure
            for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
                if len(belongs_to) == 0:
                    # TODO(kalman): Re-enable this warning once the reference
                    # pages have their references fixed.
                    #self._WarnWithPosition(
                    #    'Found <%s> without any preceding <%s>' %
                    #    (tag, header))
                    break
                belongs_to = belongs_to[-1].entries
            belongs_to.append(self._processing_entry)

    def handle_endtag(self, tag):
        '''Ends the current section on </section>, or finishes the entry
        being processed on a closing header tag.
        '''
        if tag == 'section':
            self._OnSectionBoundary()
            return
        if tag != 'h1' and tag not in _HEADER_TAGS:
            return
        if self._processing_entry is None:
            self._WarnWithPosition(
                'Found closing </%s> without an opening <%s>' % (tag, tag))
            return
        if self._processing_entry._tag != tag:
            self._WarnWithPosition(
                'Found closing </%s> while processing a <%s>' %
                (tag, self._processing_entry._tag))
            # Note: no early return, it's more likely that the mismatched
            # header was a typo rather than a misplaced closing header tag.
        self._processing_entry = None

    def handle_data(self, data):
        '''Appends text found inside a header to the name of the entry being
        processed, unless the entry was named explicitly via title="...".
        '''
        if (self._processing_entry is not None and
                not self._processing_entry._has_explicit_name):
            # += is inefficient, but probably fine here because the chances
            # of a large number of nested tags within header tags is pretty
            # low.
            self._processing_entry.name += data

    def close(self):
        '''Finishes parsing and populates |parse_result|.
        '''
        HTMLParser.close(self)

        # Flush the section that was still being built.
        self._OnSectionBoundary()

        if self._processing_entry is not None:
            self._warnings.append(
                'Finished parsing while still processing a <%s>' %
                self._processing_entry._tag)

        if self._expect_title:
            if not self._title_entry:
                self._warnings.append('Expected a title')
                title, title_attributes = '', {}
            else:
                title, title_attributes = (
                    self._title_entry.name, self._title_entry.attributes)
        else:
            if self._title_entry:
                self._warnings.append('Found unexpected title "%s"' %
                                      self._title_entry.name)
            title, title_attributes = None, None

        self.parse_result = ParseResult(
            title, title_attributes, self._sections, self._warnings)

    def _OnSectionBoundary(self):
        # Only start a new section if the previous section was non-empty.
        if self._processing_section.structure:
            self._sections.append(self._processing_section)
            self._processing_section = DocumentSection()

    def _WarnWithPosition(self, message):
        # getpos() reports a 0-based column; warnings use 1-based columns.
        line, col = self.getpos()
        self._warnings.append(
            '%s (line %s, column %s)' % (message, line, col + 1))