path: root/common
diff options
authorShimeng (Simon) Wang <>2010-02-10 11:22:01 -0800
committerShimeng (Simon) Wang <>2010-02-10 11:22:01 -0800
commitb4489000d1355d10426bb4ad4eedb5c2fb80e886 (patch)
tree4f32cbbd94751e9c63755cff815b3f5f69fd349f /common
parent3ed6fbd9e141f20ca382306aa6a355cd544158d1 (diff)
Add back lost python script.
The script is used to generate top level domains' regular expressions. This is enhanced and used to regenerate the new top level domains. new file: common/tools/
Diffstat (limited to 'common')
1 files changed, 160 insertions, 0 deletions
diff --git a/common/tools/ b/common/tools/
new file mode 100755
index 0000000..ece4dcf
--- /dev/null
+++ b/common/tools/
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+from urllib2 import urlopen
# Java source text that makePattern() prints around the generated
# alternation of top-level domains.  The *_PREFIX strings are printed before
# the per-letter bucket lines and the *_SUFFIX strings after them.
#
# All three multi-line templates are raw strings, so the doubled backslashes
# reach the generated Java source unchanged.

# Opening of the TOP_LEVEL_DOMAIN declaration.
TLD_PREFIX = r"""
 /**
 * Regular expression pattern to match all IANA top-level domains.
 * List accurate as of 2010/02/05. List taken from:
 *
 * This pattern is auto-generated by frameworks/base/common/tools/
 */
 public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
"""

# Closes the Java string literal and the Pattern.compile( call.
TLD_SUFFIX = '");'

# Opening of the WEB_URL declaration: scheme, optional user info and the
# named-host part, ending with the group that the TLD alternation fills.
URL_PREFIX = r"""
 /**
 * Regular expression pattern to match RFC 1738 URLs
 * List accurate as of 2010/02/05. List taken from:
 *
 * This pattern is auto-generated by frameworks/base/common/tools/
 */
 public static final Pattern WEB_URL = Pattern.compile(
 "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
 + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
 + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
 + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+" // named host
 + "(?:" // plus top level domain
"""

# Remainder of the WEB_URL declaration: the IP-address alternative, the
# optional port, the optional path/query part and the trailing boundary.
URL_SUFFIX = r"""
 + "|(?:(?:25[0-5]|2[0-4]" // or ip address
 + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
 + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
 + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
 + "|[1-9][0-9]|[0-9])))"
 + "(?:\\:\\d{1,5})?)" // plus option port number
 + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~" // plus option query params
 + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
 + "(?:\\b|$)"); // and finally, a word boundary or end of
 // input. This is to stop foo.sure from
 // matching as
"""
class Bucket:
    """Entries that share one first letter, renderable as one line of Java regex source.

    Two-character entries are stored as just their second letter so they can
    be emitted compactly as ``<base>[...]``; every other entry is stored whole.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for entries starting with ``baseLetter``."""
        self.base = baseLetter  # first letter shared by the entries
        self.words = []         # entries whose length is not 2, stored whole
        self.letters = []       # second letters of the two-character entries

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Sorts ``self.words`` and ``self.letters`` in place as a side effect.

        Args:
            isWebUrl: when true, the word group is non-capturing (``(?:``) and
                a first bucket starts with ``+ "`` instead of ``"(``.
            isFirst: when true, the line opens the alternation; otherwise it
                starts with ``+ "|`` to continue it.
            isLast: when true, the closing quote and newline are omitted so
                that the caller can append more text to the same line.

        Returns:
            The rendered line, or ``''`` when the bucket holds no entries.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('
        pieces = [' ', opener]

        if self.words:
            pieces.append('(?:' if isWebUrl else '(')
            # Each '-' is preceded by two backslash characters in the output,
            # which is an escaped backslash inside the Java string literal.
            pieces.append('|'.join(word.replace('-', '\\\\-')
                                   for word in self.words))
            if self.letters:
                pieces.append('|')

        if len(self.letters) == 1:
            pieces.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            pieces.append('%c[%s]' % (self.base, ''.join(self.letters)))

        if self.words:
            pieces.append(')')

        if not isLast:
            pieces.append('"\n')

        return ''.join(pieces)

    def add(self, line):
        """Record one entry; empty lines and lines starting with '#' are ignored.

        Args:
            line: the entry text. The caller is expected to pass an entry
                whose first character is this bucket's base letter; this
                method does not check it.
        """
        if not line or line.startswith('#'):
            return
        if len(line) == 2:
            self.letters.append(line[1])
        else:
            self.words.append(line)
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of ``line``.

    A new Bucket is created and stored in ``buckets`` when no bucket (or a
    None value) is registered under that character.

    Args:
        buckets: dict mapping a single character to its Bucket; may be
            mutated by this call.
        line: non-empty string whose first character selects the bucket.

    Returns:
        The existing or newly created bucket.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing
    created = Bucket(key)
    buckets[key] = created
    return created
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one Java pattern declaration built from the buckets for 'a'..'z'.

    The output is ``prefix``, one line per non-empty bucket, the closing
    parentheses of the alternation, then ``suffix``.

    The opening and closing fragments are attached to the first and last
    buckets that actually contain entries, rather than unconditionally to 'a'
    and 'z', so the output stays well-formed when either of those letters has
    no entries.

    Args:
        prefix: text printed before the bucket lines.
        suffix: text printed after the alternation is closed.
        buckets: dict mapping a letter to its Bucket; getBucket() adds an
            empty bucket for any letter of 'a'..'z' that is missing.
        isWebUrl: forwarded to Bucket.dump(); also selects the closing text
            (``))"`` when true, ``)`` otherwise).
    """
    candidates = [getBucket(buckets, chr(code))
                  for code in range(ord('a'), ord('z') + 1)]
    # Empty buckets dump as '' and must not take the first/last role.
    populated = [bucket for bucket in candidates
                 if bucket.words or bucket.letters]

    output = prefix
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        output += bucket.dump(isWebUrl=isWebUrl,
                              isFirst=(index == 0),
                              isLast=(index == lastIndex))

    if isWebUrl:
        output += '))"'
    else:
        output += ')'
    output += suffix

    # Parenthesised single-argument form prints the same text on Python 2 and 3.
    print(output)
if __name__ == "__main__":
    # Download the domain list, group the entries by first letter and print
    # both generated Java pattern declarations.
    #
    # NOTE(review): the URL literal is empty in this copy of the script, so
    # urlopen() cannot succeed as written; presumably it should be the address
    # of the IANA top-level-domain list -- confirm and restore before running.
    f = urlopen('')
    try:
        domains = f.readlines()
    finally:
        # Close the connection even when reading fails.
        f.close()

    buckets = {}
    for domain in domains:
        # Strip before looking at the first character so that blank or
        # whitespace-led lines are not bucketed under a whitespace key.
        domain = domain.strip().lower()
        if domain:
            # Bucket.add() itself skips lines starting with '#'.
            getBucket(buckets, domain[0]).add(domain)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)