1823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang#!/usr/bin/env python 2823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 3823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangfrom urllib2 import urlopen 4823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 5823b6f3516076b92f78c3fc27037d24bb514e653Ying WangTLD_PREFIX = r""" 6823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang /** 7823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang * Regular expression to match all IANA top-level domains. 8829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang * List accurate as of 2011/07/18. List taken from: 9823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 10829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py 11823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang */ 12823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang public static final String TOP_LEVEL_DOMAIN_STR = 13823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang""" 14823b6f3516076b92f78c3fc27037d24bb514e653Ying WangTLD_SUFFIX = '";' 15823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 16823b6f3516076b92f78c3fc27037d24bb514e653Ying WangURL_PREFIX = r""" 17823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang /** 18823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang * Regular expression to match all IANA top-level domains for WEB_URL. 19829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang * List accurate as of 2011/07/18. List taken from: 20823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang * http://data.iana.org/TLD/tlds-alpha-by-domain.txt 21829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py 22823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang */ 23823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = 24823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang "(?:" 25823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang""" 26823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 27823b6f3516076b92f78c3fc27037d24bb514e653Ying WangURL_SUFFIX = ';' 28823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 29823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangclass Bucket: 30823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang def __init__(self, baseLetter): 31823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.base=baseLetter 32823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.words=[] 33823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.letters=[] 34823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 35823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang def dump(self, isWebUrl=False, isFirst=False, isLast=False): 36823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if (len(self.words) == 0) and (len(self.letters) == 0): 37823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang return '' 38823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 39823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.words.sort() 40823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.letters.sort() 41823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 42823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output = ' '; 43823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 44823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if isFirst: 45823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if isWebUrl: 46823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '+ "' 47823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang else: 48823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '"(' 49823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang else: 50823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '+ "|' 51823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 52823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if len(self.words) != 0: 53823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '(' 54823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 55823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if isWebUrl: 56823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '?:' 57823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 58823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang firstWord = 1 59823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang for word in self.words: 60823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if firstWord == 0: 61823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '|' 62823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang firstWord = 0 63823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang for letter in word: 64823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if letter == '-': 65823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '\\\\' # escape the '-' character. 66823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += letter 67823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 68823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if len(self.words) > 0 and len(self.letters) > 0: 69823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '|' 70823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 71823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if len(self.letters) == 1: 72823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '%c%c' % (self.base, self.letters[0]) 73823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang elif len(self.letters) > 0: 74823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '%c[' % self.base 75823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 76823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang for letter in self.letters: 77823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += letter 78823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 79823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += ']' 80823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 81823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if len(self.words) != 0: 82823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += ')' 83823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 84823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if not isLast: 85823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '"' 86823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '\n' 87823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 88823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang return output; 89823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 90823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang def add(self, line): 91823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang length = len(line) 92823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 93823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if line.startswith('#') or (length == 0): 94823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang return; 95823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 96823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if length == 2: 97823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.letters.append(line[1:2]) 98823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang else: 99823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang self.words.append(line) 100823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 101823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangdef getBucket(buckets, line): 102823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang letter = line[0] 103823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang bucket = buckets.get(letter) 104823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 105823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if bucket is None: 106823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang bucket = Bucket(letter) 107823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang buckets[letter] = bucket 108823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 109823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang return bucket 110823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 111823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangdef makePattern(prefix, suffix, buckets, isWebUrl=False): 112823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output = prefix 113823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 114823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl) 115823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 116823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang for letter in range(ord('b'), ord('z')): 117823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl) 118823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 119823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl) 120823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 121823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if isWebUrl: 122823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += '))"' 123823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang else: 124823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += ')' 125823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 126823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang output += suffix 127823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 128823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang print output 129823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 130823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangif __name__ == "__main__": 131823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt') 132823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang domains = f.readlines() 133823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang f.close() 134823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 135823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang buckets = {} 136823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 137823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang for domain in domains: 138823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang domain = domain.lower() 139823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 140823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang if len(domain) > 0: 141823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang getBucket(buckets, domain[0]).add(domain.strip()) 142823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang 143829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang if domain.startswith('xn--'): 144829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang puny = domain.strip()[4:] 145829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang result = puny.decode('punycode') 146829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang result = repr(result) 147829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang getBucket(buckets, 'xn--').add(result[2:-1]) 148829b84ec6b4f5fac7eea8da1de3378ce47033838Shimeng (Simon) Wang 149823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False) 150823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True) 151