1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
|
#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
'''This file provides a simple HTML checker, based on the standard HTMLParser
library.'''
import HTMLParser
import os
import sys
class HTMLChecker(HTMLParser.HTMLParser):
  '''Parser that enforces balanced tags and gathers the targets of <a> tags.

  tag_list is the stack of tags that are currently open (innermost last) and
  links is the set of non-empty href values found on <a> tags.'''

  def __init__(self):
    self.tag_list = []
    self.links = set()
    HTMLParser.HTMLParser.__init__(self)

  def handle_starttag(self, tag, attrs):
    '''Push an opening tag on the stack and record the href of <a> tags.'''
    # While a <script> is the innermost open tag nothing new is pushed, so
    # markup-like text inside a script cannot unbalance the stack.
    in_script = bool(self.tag_list) and self.tag_list[-1] == 'script'
    if not in_script:
      self.tag_list.append(tag)

    # Links are collected regardless of whether the tag was pushed.
    if tag != 'a':
      return
    href = dict(attrs).get('href')
    if href:
      self.links.add(href)

  def handle_endtag(self, tag):
    '''Pop the innermost open tag and check that it matches the closing tag.

    Raises:
      Exception: if no tag is open, or if the innermost open tag is neither
        this tag nor a <script>.'''
    if not self.tag_list:
      raise Exception('Unmatched tag %s at %s' % (tag, self.getpos()))

    expected = self.tag_list.pop()
    if expected == tag:
      return
    if expected != 'script':
      raise Exception('Wrong tag: Expected %s but got %s at %s' %
                      (expected, tag, self.getpos()))
    # A stray closing tag inside a <script> is ignored: put the script back.
    self.tag_list.append(expected)

  def close(self):
    '''Finish parsing; fail if any tag is still open.

    Raises:
      Exception: if the tag stack is not empty.'''
    if not self.tag_list:
      HTMLParser.HTMLParser.close(self)
      return
    raise Exception('Reached end-of-file with unclosed tags: %s'
                    % self.tag_list)
def ValidateFile(filename):
  '''Run simple html syntax checks on given file to validate tags and links

  Args:
    filename: Name of file to validate
  Returns:
    tuple containing:
      set of urls from this file (links starting with http:// or https://)
      set of absolute paths from this file, resolved against the directory
        of filename, with any '#fragment' or '?query' suffix removed
  Raises:
    Exception: if HTMLChecker finds unmatched, mismatched or unclosed tags.
  '''
  directory = os.path.split(os.path.abspath(filename))[0]
  parser = HTMLChecker()
  with open(filename, 'r') as html_file:
    parser.feed(html_file.read())
  parser.close()

  files = set()
  urls = set()
  for link in parser.links:
    if link.startswith(('http://', 'https://')):
      urls.add(link)
      continue
    # NOTE(review): links using any other scheme (e.g. mailto:) still fall
    # through to the file branch below.
    # A fragment or query string is not part of the on-disk file name.
    path = link.partition('#')[0].partition('?')[0]
    # An empty path (e.g. href="#top") points back at this same document, so
    # there is no other file to record.
    if path:
      files.add(os.path.abspath(os.path.join(directory, path)))
  return urls, files
def ValidateAllLinks(filenames):
  '''Validate the given files and every local file reachable from them.

  Each file is checked with ValidateFile; the local files it links to are
  queued and checked in turn until no unvisited file remains. The http/https
  urls reported by ValidateFile are not checked.

  Args:
    filenames: Iterable of names of the files to start from.
  '''
  validated_files = set()
  need_to_validate = set(os.path.abspath(name) for name in filenames)
  while need_to_validate:
    current = need_to_validate.pop()
    # Single-argument parenthesised form prints identically under Python 2
    # and is also valid Python 3.
    print('Evaluating %s' % current)
    _, linked_files = ValidateFile(current)
    validated_files.add(current)
    # Only queue files that have not been checked yet, so link cycles end.
    need_to_validate |= linked_files - validated_files
def main(argv):
  '''Validate every file named in argv, following their local links.

  Args:
    argv: Command-line arguments (the names of the files to validate)
  Returns:
    0 once ValidateAllLinks has returned.'''
  ValidateAllLinks(argv)
  # Reaching this point means no check raised.
  return 0
if __name__ == '__main__':
  # Drop argv[0] (the script name); main()'s result becomes the exit status.
  status = main(sys.argv[1:])
  sys.exit(status)
|