author | Shimeng (Simon) Wang <swang@google.com> | 2010-02-10 11:22:01 -0800 |
---|---|---|
committer | Shimeng (Simon) Wang <swang@google.com> | 2010-02-10 11:22:01 -0800 |
commit | b4489000d1355d10426bb4ad4eedb5c2fb80e886 (patch) | |
tree | 4f32cbbd94751e9c63755cff815b3f5f69fd349f /common | |
parent | 3ed6fbd9e141f20ca382306aa6a355cd544158d1 (diff) | |
download | frameworks_base-b4489000d1355d10426bb4ad4eedb5c2fb80e886.zip frameworks_base-b4489000d1355d10426bb4ad4eedb5c2fb80e886.tar.gz frameworks_base-b4489000d1355d10426bb4ad4eedb5c2fb80e886.tar.bz2 |
Add back lost Python script.
The script is used to generate the regular expression that matches all top-level domains.
It has been enhanced and used to regenerate the pattern for the new top-level domains.
new file: common/tools/make-iana-tld-pattern.py
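The script prints the generated Java TOP_LEVEL_DOMAIN and WEB_URL Pattern constants to standard output; the commit does not say how that output is consumed, so the capture step below is only a sketch. A minimal invocation wrapper, assuming Python 2 (to match the script itself) and an illustrative output file name:

```python
#!/usr/bin/env python
# Hypothetical wrapper around the generator: run it and capture its stdout.
# Python 2, matching the script; the output path is an illustration only.
import subprocess

generated = subprocess.check_output(
    ['python', 'common/tools/make-iana-tld-pattern.py'])

with open('iana-tld-patterns.txt', 'w') as out:
    out.write(generated)  # the constants are then presumably pasted into the Java source by hand
```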
Diffstat (limited to 'common')
-rwxr-xr-x | common/tools/make-iana-tld-pattern.py | 160 |
1 file changed, 160 insertions, 0 deletions
diff --git a/common/tools/make-iana-tld-pattern.py b/common/tools/make-iana-tld-pattern.py
new file mode 100755
index 0000000..ece4dcf
--- /dev/null
+++ b/common/tools/make-iana-tld-pattern.py
@@ -0,0 +1,160 @@
+#!/usr/bin/env python
+
+from urllib2 import urlopen
+
+TLD_PREFIX = r"""
+    /**
+     * Regular expression pattern to match all IANA top-level domains.
+     * List accurate as of 2010/02/05. List taken from:
+     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+     */
+    public static final Pattern TOP_LEVEL_DOMAIN = Pattern.compile(
+"""
+TLD_SUFFIX = '");'
+
+URL_PREFIX = r"""
+    /**
+     * Regular expression pattern to match RFC 1738 URLs
+     * List accurate as of 2010/02/05. List taken from:
+     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
+     * This pattern is auto-generated by frameworks/base/common/tools/make-iana-tld-pattern.py
+     */
+    public static final Pattern WEB_URL = Pattern.compile(
+        "((?:(http|https|Http|Https|rtsp|Rtsp):\\/\\/(?:(?:[a-zA-Z0-9\\$\\-\\_\\.\\+\\!\\*\\'\\(\\)"
+        + "\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,64}(?:\\:(?:[a-zA-Z0-9\\$\\-\\_"
+        + "\\.\\+\\!\\*\\'\\(\\)\\,\\;\\?\\&\\=]|(?:\\%[a-fA-F0-9]{2})){1,25})?\\@)?)?"
+        + "((?:(?:[a-zA-Z0-9][a-zA-Z0-9\\-]{0,64}\\.)+"   // named host
+        + "(?:"                                           // plus top level domain
+"""
+
+URL_SUFFIX = r"""
+        + "|(?:(?:25[0-5]|2[0-4]"                         // or ip address
+        + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]"
+        + "|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1]"
+        + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
+        + "|[1-9][0-9]|[0-9])))"
+        + "(?:\\:\\d{1,5})?)"                             // plus optional port number
+        + "(\\/(?:(?:[a-zA-Z0-9\\;\\/\\?\\:\\@\\&\\=\\#\\~"  // plus optional query params
+        + "\\-\\.\\+\\!\\*\\'\\(\\)\\,\\_])|(?:\\%[a-fA-F0-9]{2}))*)?"
+        + "(?:\\b|$)");                                   // and finally, a word boundary or end of
+                                                          // input. This is to stop foo.sure from
+                                                          // matching as foo.su
+"""
+
+class Bucket:
+    def __init__(self, baseLetter):
+        self.base=baseLetter
+        self.words=[]
+        self.letters=[]
+
+    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
+        if (len(self.words) == 0) and (len(self.letters) == 0):
+            return ''
+
+        self.words.sort()
+        self.letters.sort()
+
+        output = '        ';
+
+        if isFirst:
+            if isWebUrl:
+                output += '+ "'
+            else:
+                output += '"('
+        else:
+            output += '+ "|'
+
+        if len(self.words) != 0:
+            output += '('
+
+            if isWebUrl:
+                output += '?:'
+
+            firstWord = 1
+            for word in self.words:
+                if firstWord == 0:
+                    output += '|'
+                firstWord = 0
+                for letter in word:
+                    if letter == '-':
+                        output += '\\\\'        # escape the '-' character.
+                    output += letter
+
+        if len(self.words) > 0 and len(self.letters) > 0:
+            output += '|'
+
+        if len(self.letters) == 1:
+            output += '%c%c' % (self.base, self.letters[0])
+        elif len(self.letters) > 0:
+            output += '%c[' % self.base
+
+            for letter in self.letters:
+                output += letter
+
+            output += ']'
+
+        if len(self.words) != 0:
+            output += ')'
+
+        if not isLast:
+            output += '"'
+        output += '\n'
+
+        return output;
+
+    def add(self, line):
+        length = len(line)
+
+        if line.startswith('#') or (length == 0):
+            return;
+
+        if length == 2:
+            self.letters.append(line[1:2])
+        else:
+            self.words.append(line)
+
+def getBucket(buckets, line):
+    letter = line[0]
+    bucket = buckets.get(letter)
+
+    if bucket is None:
+        bucket = Bucket(letter)
+        buckets[letter] = bucket
+
+    return bucket
+
+def makePattern(prefix, suffix, buckets, isWebUrl=False):
+    output = prefix
+
+    output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl)
+
+    for letter in range(ord('b'), ord('z')):
+        output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl)
+
+    output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl)
+
+    if isWebUrl:
+        output += '))"'
+    else:
+        output += ')'
+
+    output += suffix
+
+    print output
+
+if __name__ == "__main__":
+    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
+    domains = f.readlines()
+    f.close()
+
+    buckets = {}
+
+    for domain in domains:
+        domain = domain.lower()
+
+        if len(domain) > 0:
+            getBucket(buckets, domain[0]).add(domain.strip())
+
+    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
+    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
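For readers skimming the diff: Bucket.dump() groups the TLDs by first letter, collapsing two-letter TLDs into a character class and longer names into an alternation (the non-capturing `?:` is emitted only for the WEB_URL pattern), and makePattern() concatenates one such fragment per letter between the prefix and suffix strings. A standalone sketch of that grouping on a hand-picked sample, not the live IANA list:

```python
# Illustration of the per-letter grouping performed by Bucket.dump();
# the sample TLDs are hard-coded here rather than fetched from IANA.
sample = ['aero', 'arpa', 'asia', 'ac', 'ad', 'ae', 'af']

words   = sorted(t for t in sample if len(t) > 2)      # full names -> alternation
letters = sorted(t[1] for t in sample if len(t) == 2)  # 2-letter codes -> character class

fragment = '(?:%s|a[%s])' % ('|'.join(words), ''.join(letters))
print(fragment)   # -> (?:aero|arpa|asia|a[cdef])
```

The real script emits one such fragment for each letter from 'a' to 'z' and joins them between TLD_PREFIX/TLD_SUFFIX or URL_PREFIX/URL_SUFFIX to produce the final Java Pattern source.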