# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
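
"""Generates a list of "safe" urls by collecting hrefs from a seed list of
popular sites. See GenerateSafeUrls() below; the resulting urls are printed
to stdout as a sorted JSON array.
"""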

import HTMLParser
import json
import logging
import urllib2
import urlparse


class _HRefParser(HTMLParser.HTMLParser):
  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.hrefs = []

  def handle_starttag(self, tag, attrs):
    if tag == "a":
      for name, value in attrs:
        if name == "href":
          self.hrefs.append(value)
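
# Illustrative example (not executed; the hrefs are made up):
#   parser = _HRefParser()
#   parser.feed('<a href="/world">x</a><a href="https://example.com">y</a>')
# leaves parser.hrefs == ["/world", "https://example.com"]. Values are kept
# verbatim, so relative hrefs stay relative until urljoin() resolves them.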


def _AbsoluteUrlHasSaneScheme(absolute_url):
  if len(absolute_url) < 4:
    return False
  return absolute_url[0:4] == 'http'
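
# The prefix test above accepts both "http://..." and "https://..." urls and
# rejects hrefs with schemes such as "javascript:", "mailto:" or "ftp:".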


def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Every href found in the
  HTML fetched from a url in the seed list is added to the safe list. The
  safe list contains unsanitized urls.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "http://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()
  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
      encoding = response.headers.getparam('charset')
      html = response.read()
      if encoding:
        html = html.decode(encoding)

      parser = _HRefParser()
      parser.feed(html)
    except:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Look at all hrefs found in the page.
    for relative_url in parser.hrefs:
      if not relative_url:
        continue
      absolute_url = urlparse.urljoin(url, relative_url)
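      # E.g. urljoin("http://www.cnn.com", "/world") yields
      # "http://www.cnn.com/world"; hrefs that are already absolute are
      # returned unchanged.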
      if not _AbsoluteUrlHasSaneScheme(absolute_url):
        continue
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = list(safe_urls)
  safe_urls_list.sort()
  print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))


if __name__ == "__main__":
  GenerateSafeUrls()
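
# Example usage (a sketch; the file name "generate_safe_urls.py" is assumed):
#   $ python generate_safe_urls.py > safe_urls.json
# Note this is Python 2 code: HTMLParser, urllib2 and urlparse became
# html.parser and urllib.request / urllib.parse in Python 3.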