1823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang#!/usr/bin/env python
2823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
3823b6f3516076b92f78c3fc27037d24bb514e653Ying Wangfrom urllib2 import urlopen
4823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
# Java source printed ahead of the generated capturing-group pattern: a Javadoc
# comment plus the opening of the TOP_LEVEL_DOMAIN_STR declaration.  The
# generated string-literal fragments are appended directly after this text.
# NOTE(review): the "List accurate as of" date is hard-coded in the literal and
# is not refreshed by the script; update it by hand when regenerating.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last (still open) Java string literal and ends the statement.
TLD_SUFFIX = '";'

# Same as TLD_PREFIX but for the WEB_URL variant; it additionally emits the
# first literal, "(?:", which opens a non-capturing group around the pattern.
# NOTE(review): this date is hard-coded as well.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the statement terminator: in the WEB_URL case makePattern has already
# written the closing quote before appending this suffix.
URL_SUFFIX = ';'
# Indentation placed at the start of every generated line of Java source.
TAB = '        '


class BucketOutput:
    """Text accumulator that keeps a running count of the current line length.

    The buffer starts out holding one TAB.  Text is appended with ``+=``;
    addPipe() inserts the '|' alternation separator and, when the current
    line has grown past 90 characters, first breaks the Java string literal
    onto a new TAB-indented line.
    """

    def __init__(self):
        self.lineLength = len(TAB)
        self.buffer = TAB

    def __iadd__(self, other):
        """Append *other* and count its length toward the current line."""
        self.buffer = self.buffer + other
        self.lineLength = self.lineLength + len(other)
        return self

    def addPipe(self):
        """Append '|', wrapping to a fresh literal if the line exceeds 90 chars.

        On a wrap the open literal is closed with '"', a new line is started
        with TAB followed by '+ "', and the length counter restarts at
        len(TAB) (the '+ "' continuation itself is not counted).
        """
        lineIsFull = self.lineLength > 90
        if lineIsFull:
            self.buffer = self.buffer + '"\n' + TAB + '+ "'
            self.lineLength = len(TAB)

        self.__iadd__('|')

    def value(self):
        """Return everything accumulated so far as a single string."""
        return self.buffer
51823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
class Bucket:
    """All TLD entries that share one leading character.

    Two-character entries are stored as just their second character in
    ``letters`` (so they can be emitted as a character class such as
    ``a[cd]``); every other entry is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket rendered as Java string-literal source text.

        isWebUrl -- use a non-capturing '(?:' group instead of '(' and, for
                    the first bucket, open the literal with '+ "'.
        isFirst  -- open with '"(' (or '+ "' for web URLs) instead of the
                    '+ "|' used to chain onto a previous bucket.
        isLast   -- leave the string literal open (no closing quote/newline)
                    so the caller can append the closing text.

        An empty bucket yields ''.  Both lists are sorted in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()
        hasWords = len(self.words) != 0

        output = BucketOutput()

        # Opening of this bucket's literal.
        if not isFirst:
            output += '+ "|'
        elif isWebUrl:
            output += '+ "'
        else:
            output += '"('

        # Buckets containing whole words are wrapped in their own group.
        if hasWords:
            output += '(?:' if isWebUrl else '('

        for position, word in enumerate(self.words):
            if position > 0:
                output.addPipe()
            # Each '-' is preceded by two backslash characters in the output.
            output += word.replace('-', '\\\\-')

        if hasWords and len(self.letters) > 0:
            output.addPipe()

        letterCount = len(self.letters)
        if letterCount == 1:
            output += '%c%c' % (self.base, self.letters[0])
        elif letterCount > 0:
            output += '%c[' % self.base
            output += ''.join(self.letters)
            output += ']'

        if hasWords:
            output += ')'

        if not isLast:
            output += '"\n'

        return output.value()

    def add(self, line):
        """File *line* under this bucket; empty and '#' lines are ignored."""
        size = len(line)

        if size == 0 or line.startswith('#'):
            return

        if size == 2:
            # Keep only the second character; the first is the bucket's base.
            self.letters.append(line[1:2])
        else:
            self.words.append(line)
123823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
def getBucket(buckets, line):
    """Return the bucket for the first character of *line*.

    The bucket is looked up in the *buckets* dict under line[0]; when no
    bucket is stored there yet a new Bucket is created, stored and returned.
    """
    key = line[0]
    existing = buckets.get(key)

    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
133823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated Java constant built from the 'a'..'z' buckets.

    prefix   -- text printed before the pattern (Javadoc + declaration).
    suffix   -- text printed after the pattern's closing parenthesis(es).
    buckets  -- dict mapping a leading character to its Bucket; buckets for
                letters that are missing are created (empty) as a side effect.
    isWebUrl -- passed through to Bucket.dump(); also selects the closing
                text: '))"' for the web-URL variant, ')' otherwise.

    The first and last *populated* buckets get the isFirst / isLast flags.
    Previously these were tied to 'a' and 'z', so an empty 'a' or 'z' bucket
    (whose dump() is '') silently dropped the opening '"(' or left the final
    string literal closed too early, producing invalid Java.  Output is
    unchanged whenever both 'a' and 'z' have entries.
    """
    # Look up every letter so absent buckets are still created, as before.
    ordered = [getBucket(buckets, chr(code))
               for code in range(ord('a'), ord('z') + 1)]
    populated = [bucket for bucket in ordered
                 if bucket.words or bucket.letters]

    pieces = [prefix]
    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    if isWebUrl:
        # NOTE(review): two ')' are emitted although URL_PREFIX opens only one
        # '(?:'; presumably the extra one closes a group opened where the
        # constant is used -- verify against the consumer before changing.
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Function-call form prints identically under Python 2 and also parses
    # under Python 3.
    print(''.join(pieces))
152823b6f3516076b92f78c3fc27037d24bb514e653Ying Wang
if __name__ == "__main__":
    # Download the current IANA list and print both generated Java constants.
    f = urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')
    try:
        domains = f.readlines()
    finally:
        # Release the connection even if the download fails part-way.
        f.close()

    buckets = {}

    for domain in domains:
        domain = domain.strip().lower()

        # Skip blank and '#' comment lines before choosing a bucket, so no
        # junk buckets are created for '#', newline or leading whitespace.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain).add(domain)

        if domain.startswith('xn--'):
            # Also add the decoded (Unicode) form of an internationalized TLD,
            # written with Java \uXXXX escapes.  repr() was used for this
            # before, but it renders U+0080..U+00FF as \xNN, which is not a
            # valid escape in a Java string literal.
            decoded = domain[4:].decode('punycode')
            escaped = []
            for ch in decoded:
                code = ord(ch)
                if code < 0x80:
                    escaped.append(str(ch))
                elif code <= 0xFFFF:
                    escaped.append('\\u%04x' % code)
                else:
                    # Java has no \U escape: emit a UTF-16 surrogate pair.
                    code -= 0x10000
                    escaped.append('\\u%04x\\u%04x' % (0xD800 + (code >> 10),
                                                       0xDC00 + (code & 0x3FF)))
            # getBucket keys on the first character, so this is the 'x' bucket.
            getBucket(buckets, 'xn--').add(''.join(escaped))

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
174