#!/usr/bin/env python
"""Generate the Java TLD regular-expression constants from the IANA TLD list.

The script downloads the IANA top-level-domain list, groups the entries by
their first letter and prints two Java string constants:
TOP_LEVEL_DOMAIN_STR and TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL.

The code runs on both Python 2 and Python 3.
"""

import codecs
from contextlib import closing

try:  # Python 3
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urllib2 import urlopen

# Location of the IANA list that is downloaded when run as a script.
TLD_LIST_URL = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# NOTE(review): the whitespace inside the templates below and in TAB sits
# outside every Java string literal, so it only affects the layout of the
# generated Java source, not the pattern. Confirm the exact indentation against
# the checked-in generated file.
TLD_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '";'

URL_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

URL_SUFFIX = ';'

# Indentation emitted in front of every generated Java string fragment.
TAB = '        '


class BucketOutput(object):
    """Text accumulator for one bucket that tracks the current line length.

    ``buffer`` holds everything emitted so far and ``lineLength`` counts the
    characters added since the last wrap, so that ``addPipe`` can break an
    overlong Java string literal into several concatenated fragments.
    """

    # A new fragment is started before an alternative separator once the
    # current line is longer than this many characters.
    MAX_LINE_LENGTH = 90

    def __init__(self):
        self.buffer = TAB
        self.lineLength = len(TAB)

    def __iadd__(self, other):
        """Append ``other`` to the buffer and count its length; returns self."""
        self.buffer += other
        self.lineLength += len(other)
        return self

    def addPipe(self):
        """Append a ``|`` separator, wrapping to a new fragment when needed."""
        if self.lineLength > self.MAX_LINE_LENGTH:
            # Close the current Java literal and open a new concatenated one.
            self.buffer += '"\n' + TAB + '+ "'
            self.lineLength = len(TAB)
        self.buffer += '|'
        self.lineLength += 1

    def value(self):
        """Return the accumulated text."""
        return self.buffer


class Bucket(object):
    """All TLDs that share one first letter.

    Two-character entries are stored in ``letters`` as their second character
    only; every other entry is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket rendered as one or more Java string fragments.

        Args:
            isWebUrl: use non-capturing ``(?:`` groups and, for the first
                bucket, start with ``+ "`` instead of ``"(``.
            isFirst: this is the first fragment of the pattern.
            isLast: leave the final fragment without its closing quote and
                newline so the caller can append the closing text.

        Returns:
            The rendered text, or ``''`` when the bucket holds no entries.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        output = BucketOutput()

        if not isFirst:
            output += '+ "|'
        elif isWebUrl:
            output += '+ "'
        else:
            output += '"('

        if self.words:
            output += '(?:' if isWebUrl else '('

            for index, word in enumerate(self.words):
                if index:
                    output.addPipe()
                # Two backslashes in the Java source yield one in the regex,
                # which escapes the '-' character.
                output += word.replace('-', '\\\\-')

            if self.letters:
                output.addPipe()

        if len(self.letters) == 1:
            output += self.base + self.letters[0]
        elif self.letters:
            output += self.base + '[' + ''.join(self.letters) + ']'

        if self.words:
            output += ')'

        if not isLast:
            output += '"\n'

        return output.value()

    def add(self, line):
        """File one entry; empty lines and ``#`` comment lines are ignored."""
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    """Return the bucket for the first character of ``line``.

    A new ``Bucket`` is created and stored in ``buckets`` when none exists yet.
    """
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = buckets[letter] = Bucket(letter)

    return bucket


def _decodePunycode(label):
    """Decode the Punycode part of an ``xn--`` label to a Unicode string."""
    raw = label if isinstance(label, bytes) else label.encode('ascii')
    return codecs.decode(raw, 'punycode')


def _javaEscape(text):
    r"""Return ``text`` with every non-ASCII character as Java ``\uXXXX``.

    Characters above U+FFFF are written as a UTF-16 surrogate pair because a
    Java string literal has no longer escape form.
    """
    escaped = []
    for char in text:
        code = ord(char)
        if code < 0x80:
            escaped.append(char)
        elif code < 0x10000:
            escaped.append('\\u%04x' % code)
        else:
            code -= 0x10000
            escaped.append('\\u%04x\\u%04x'
                           % (0xD800 + (code >> 10), 0xDC00 + (code & 0x3FF)))
    return ''.join(escaped)


def buildBuckets(domains):
    """Group the lines of a TLD list into buckets keyed by first letter.

    Args:
        domains: iterable of lines (text or ASCII bytes). Blank lines and
            lines starting with ``#`` are skipped; case is folded to lower.

    Returns:
        A dict mapping a first letter to its ``Bucket``. Every ``xn--`` entry
        is filed twice in the 'x' bucket: once as written and once as its
        decoded Unicode form spelled with Java ``\\uXXXX`` escapes.
    """
    buckets = {}

    for domain in domains:
        if not isinstance(domain, str):
            # urlopen() yields bytes on Python 3.
            domain = domain.decode('ascii')

        # Strip before looking at the first character so that blank or
        # whitespace-led lines cannot end up in a bogus bucket.
        domain = domain.strip().lower()
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain).add(domain)

        if domain.startswith('xn--'):
            decoded = _decodePunycode(domain[4:])
            # getBucket keys on the first character, so this is bucket 'x'.
            getBucket(buckets, 'xn--').add(_javaEscape(decoded))

    return buckets


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print, and return, one generated Java constant.

    Buckets 'a' through 'z' are rendered in order. The first and last
    *non-empty* buckets get the opening and closing treatment, so the output
    stays well-formed even when no TLD starts with 'a' or with 'z'.

    Args:
        prefix: text emitted before the pattern (comment and declaration).
        suffix: text emitted after the pattern.
        buckets: dict as produced by ``buildBuckets``; missing letters are
            added to it as empty buckets.
        isWebUrl: generate the WEB_URL variant.

    Returns:
        The text that was printed.

    Raises:
        ValueError: if none of the buckets 'a'..'z' holds an entry.
    """
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = getBucket(buckets, chr(code))
        if bucket.words or bucket.letters:
            populated.append(bucket)

    if not populated:
        raise ValueError('no top-level domains to generate a pattern from')

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)

    output = ''.join(pieces)
    print(output)
    return output


def main():
    """Download the IANA list and print both generated Java constants."""
    # closing() guarantees the HTTP response is released even if reading fails.
    with closing(urlopen(TLD_LIST_URL)) as response:
        domains = response.readlines()

    buckets = buildBuckets(domains)

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    main()