# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

try:
  from HTMLParser import HTMLParser  # Python 2.
except ImportError:
  from html.parser import HTMLParser  # Python 3.


class ParseResult(object):
  '''The result of |ParseDocument|:
  |title|            The title of the page, as pulled from the first <h1>.
  |title_attributes| The attributes of the <h1> tag the title is derived from.
  |sections|         The list of Sections within this document.
  |warnings|         Any warnings while parsing the document.
  '''

  def __init__(self, title, title_attributes, sections, warnings):
    self.title = title
    self.title_attributes = title_attributes
    self.sections = sections
    self.warnings = warnings


class DocumentSection(object):
  '''A section of the document as grouped by <section>...</section>. Any
  content not within section tags is considered an implicit section, so:
  "Foo <section>Bar</section> Baz" is 3 sections.
  |structure| A list of DocumentStructureEntry for each top-level heading.
  '''

  def __init__(self):
    self.structure = []


class DocumentStructureEntry(object):
  '''An entry in the document structure.
  |attributes| The attributes of the header tag this entry is derived from.
  |name|       The name of this entry, as pulled from the header tag this entry
               is derived from.
  |entries|    A list of child DocumentStructureEntry items.
  '''

  def __init__(self, tag, attributes):
    self.attributes = attributes
    self.name = ''
    self.entries = []
    # Callers shouldn't care about the tag, but we need it for sanity checking,
    # so make it private. In particular we pretend that anything but the first
    # h1 is an h2, and it'd be odd to expose that.
    self._tag = tag
    # Documents can override the name of the entry using title="".
    self._has_explicit_name = False

  def __repr__(self):
    # Rendered like the header element itself, e.g. '<h2>Name</h2>'.
    return '<%s>%s</%s>' % (self._tag, self.name, self._tag)

  def __str__(self):
    return repr(self)


def ParseDocument(document, expect_title=False):
  '''Parses the title and a document structure from |document| and returns a
  ParseResult.

  If |expect_title| is True the result's title is taken from the first <h1>
  (or is '' with an 'Expected a title' warning when there is none). If it is
  False the result's title and title_attributes are None, and a warning is
  recorded when an <h1> was nevertheless found.
  '''
  parser = _DocumentParser(expect_title)
  parser.feed(document)
  parser.close()
  return parser.parse_result


def RemoveTitle(document):
  '''Removes the first <h1>..</h1> tag found in |document| and returns a
  (result, warning) tuple.

  If no title is found or |document| is malformed in some way, returns the
  original document and a warning message. Otherwise, returns the result of
  removing the title from |document| with a None warning message.
  '''
  def min_index(lhs, rhs):
    # Index of whichever of |lhs| / |rhs| occurs first in |document|, or -1 if
    # neither occurs. Used to match both the lower- and upper-case spellings.
    lhs_index, rhs_index = document.find(lhs), document.find(rhs)
    if lhs_index == -1:
      return rhs_index
    if rhs_index == -1:
      return lhs_index
    return min(lhs_index, rhs_index)

  # Match '<h1' without the closing '>' so that tags with attributes are found.
  title_start = min_index('<h1', '<H1')
  if title_start == -1:
    return document, 'No opening <h1> was found'
  title_end = min_index('/h1>', '/H1>')
  if title_end == -1:
    return document, 'No closing </h1> was found'
  if title_end < title_start:
    return document, 'The </h1> appeared before the <h1>'
  # |title_end| points at the '/' of the closing tag; '/h1>' is 4 characters.
  return (document[:title_start] + document[title_end + 4:], None)


_HEADER_TAGS = ['h2', 'h3', 'h4']


class _DocumentParser(HTMLParser):
  '''HTMLParser for ParseDocument.

  Feed it a document, call close(), then read |parse_result|.
  '''

  def __init__(self, expect_title):
    HTMLParser.__init__(self)
    # Public.
    self.parse_result = None
    # Private.
    self._expect_title = expect_title
    self._title_entry = None
    self._sections = []
    self._processing_section = DocumentSection()
    self._processing_entry = None
    self._warnings = []

  def handle_starttag(self, tag, attrs):
    '''Starts a new section on <section>, or a new structure entry on a header
    tag (<h1> or one of _HEADER_TAGS). All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return
    if tag != 'h1' and tag not in _HEADER_TAGS:
      return
    if self._processing_entry is not None:
      self._WarnWithPosition('Found <%s> in the middle of processing a <%s>' %
                             (tag, self._processing_entry._tag))
      return

    attrs_dict = dict(attrs)
    self._processing_entry = DocumentStructureEntry(tag, attrs_dict)

    # Note: this removes 'title' from the attributes stored on the entry.
    explicit_name = attrs_dict.pop('title', None)
    if explicit_name == '':
      # Don't create a TOC entry at all if the tag has specified title="".
      return
    if explicit_name is not None:
      self._processing_entry.name = explicit_name
      self._processing_entry._has_explicit_name = True

    if tag == 'h1' and self._title_entry is not None:
      self._WarnWithPosition('Found multiple <h1> tags. Subsequent <h1> tags '
                             'will be classified as <h2> for the purpose of '
                             'the structure')
      # Only the placement changes; the entry keeps its real tag so that the
      # matching closing tag is still recognised.
      tag = 'h2'

    if tag == 'h1':
      self._title_entry = self._processing_entry
    else:
      # Walk down from the top-level headings to the list that should own an
      # entry of this depth, following the most recent entry at each level.
      belongs_to = self._processing_section.structure
      for header in _HEADER_TAGS[:_HEADER_TAGS.index(tag)]:
        if not belongs_to:
          # TODO(kalman): Re-enable this warning once the reference pages have
          # their references fixed.
          #self._WarnWithPosition('Found <%s> without any preceding <%s>' %
          #                       (tag, header))
          break
        belongs_to = belongs_to[-1].entries
      belongs_to.append(self._processing_entry)

  def handle_endtag(self, tag):
    '''Ends the current section on </section>, or finishes the entry being
    processed on a closing header tag. All other tags are ignored.
    '''
    if tag == 'section':
      self._OnSectionBoundary()
      return
    if tag != 'h1' and tag not in _HEADER_TAGS:
      return
    if self._processing_entry is None:
      self._WarnWithPosition('Found closing </%s> without an opening <%s>' %
                             (tag, tag))
      return
    if self._processing_entry._tag != tag:
      self._WarnWithPosition('Found closing </%s> while processing a <%s>' %
                             (tag, self._processing_entry._tag))
      # Note: no early return, it's more likely that the mismatched header was
      # a typo rather than a misplaced closing header tag.
    self._processing_entry = None

  def handle_data(self, data):
    '''Appends text found inside a header tag to the current entry's name,
    unless the entry was given an explicit name via title="...".
    '''
    if (self._processing_entry is not None and
        not self._processing_entry._has_explicit_name):
      # += is inefficient, but probably fine here because the chances of a
      # large number of nested tags within header tags is pretty low.
      self._processing_entry.name += data

  def close(self):
    '''Finishes parsing, flushes the last section, and sets |parse_result|.'''
    HTMLParser.close(self)

    self._OnSectionBoundary()

    if self._processing_entry is not None:
      self._warnings.append('Finished parsing while still processing a <%s>' %
                            self._processing_entry._tag)

    if self._expect_title:
      if not self._title_entry:
        self._warnings.append('Expected a title')
        title, title_attributes = '', {}
      else:
        title, title_attributes = (
            self._title_entry.name, self._title_entry.attributes)
    else:
      if self._title_entry:
        self._warnings.append('Found unexpected title "%s"' %
                              self._title_entry.name)
      title, title_attributes = None, None

    self.parse_result = ParseResult(
        title, title_attributes, self._sections, self._warnings)

  def _OnSectionBoundary(self):
    # Only start a new section if the previous section was non-empty.
    if self._processing_section.structure:
      self._sections.append(self._processing_section)
      self._processing_section = DocumentSection()

  def _WarnWithPosition(self, message):
    # Records |message| suffixed with the parser's current position; the
    # column is reported as getpos()'s offset plus one.
    line, col = self.getpos()
    self._warnings.append('%s (line %s, column %s)' % (message, line, col + 1))