# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import HTMLParser
import json
import logging
import urllib2
import urlparse


class _HRefParser(HTMLParser.HTMLParser):
  """Collects the href attribute of every <a> tag in the parsed HTML."""

  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)
    self.hrefs = []

  def handle_starttag(self, tag, attrs):
    if tag == "a":
      for name, value in attrs:
        if name == "href":
          self.hrefs.append(value)


def _AbsoluteUrlHasSaneScheme(absolute_url):
  """Returns True if |absolute_url| uses an http or https scheme."""
  return absolute_url.startswith("http")


def GenerateSafeUrls():
  """Prints a list of safe urls.

  Generates a safe list of urls from a seed list. Each href in the HTML
  fetched from the url from the seed list is placed into the safe list. The
  safe list contains unsanitized urls.
  """
  # A list of websites whose hrefs are unlikely to link to sites that contain
  # malware.
  seed_urls = [
    "http://www.cnn.com",
    "https://www.youtube.com",
    "https://www.facebook.com",
    "https://www.twitter.com",
    "https://www.yahoo.com",
    "https://www.amazon.com",
    "https://www.wikipedia.com",
    "https://www.bing.com",
    "https://www.dailymotion.com",
    "https://www.stackoverflow.com",
    "https://www.google.com/#q=dumpling",
    "http://www.baidu.com/s?wd=rice",
    "http://www.baidu.com/s?wd=cow",
    "https://www.google.com/#q=fox",
    "http://www.yahoo.co.jp/",
    "http://www.yandex.ru/",
    "https://www.imdb.com/",
    "http://www.huffingtonpost.com/",
    "https://www.deviantart.com/",
    "http://www.wsj.com/",
  ]

  safe_urls = set()

  for url in seed_urls:
    try:
      # Fetch and parse the HTML.
      response = urllib2.urlopen(url)
      encoding = response.headers.getparam('charset')
      html = response.read()
      if encoding:
        html = html.decode(encoding)

      parser = _HRefParser()
      parser.feed(html)
    except Exception:
      logging.exception("Error fetching or parsing url: %s", url)
      raise

    # Convert each collected href into an absolute url and keep only those
    # with an http or https scheme.
    for relative_url in parser.hrefs:
      if not relative_url:
        continue

      absolute_url = urlparse.urljoin(url, relative_url)
      if not _AbsoluteUrlHasSaneScheme(absolute_url):
        continue
      safe_urls.add(absolute_url)

  # Sort the urls, to make them easier to view in bulk.
  safe_urls_list = list(safe_urls)
  safe_urls_list.sort()

  print json.dumps(safe_urls_list, indent=2, separators=(",", ":"))

if __name__ == "__main__":
  GenerateSafeUrls()
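
# A minimal usage sketch (the output filename and the consuming snippet below
# are assumptions for illustration, not part of this script): run the script
# and redirect stdout to a file, then load the JSON list in another tool.
#
#   $ python profile_safe_url_generator.py > safe_urls.json
#
#   import json
#   with open("safe_urls.json") as f:
#     safe_urls = json.load(f)  # sorted list of absolute url strings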