#!/usr/bin/env python

from contextlib import closing
from urllib2 import urlopen

# Java source emitted ahead of the generated TOP_LEVEL_DOMAIN_STR pattern.
# The text is written to the output verbatim, so its indentation is the
# indentation the generated Java declaration will have.
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last Java string literal and terminates the declaration.
TLD_SUFFIX = '";'

# Java source emitted ahead of the WEB_URL variant of the pattern.  It ends
# with the first string literal of the concatenation, which opens a
# non-capturing group.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of 2011/07/18.  List taken from:
     *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the statement terminator: for the WEB_URL variant the closing quote is
# appended together with the closing parentheses by the pattern generator.
URL_SUFFIX = ';'

class Bucket:
    """Top-level domains sharing one base letter, rendered as a regex fragment.

    Two-character entries are stored by their second character only (in
    ``letters``) so they can be collapsed into a character class such as
    ``a[cdef]``.  Every other entry is kept whole in ``words``.
    """

    def __init__(self, baseLetter):
        """Create an empty bucket for entries associated with baseLetter."""
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: Use a non-capturing ``(?:`` group around the words, and
                start the first line with ``+ "`` instead of ``"(``.
            isFirst: This bucket opens the pattern, so no leading ``|``.
            isLast: Leave the string literal open (no closing quote or
                newline) so the caller can append the closing parentheses.

        Returns:
            The fragment, or '' when the bucket holds no entries.  As a side
            effect, ``words`` and ``letters`` are sorted in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters: the output is a
        # Java string literal, so "\\-" there yields the regex escape \-.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append('%c%c' % (self.base, self.letters[0]))
        elif self.letters:
            alternatives.append('%c[%s]' % (self.base, ''.join(self.letters)))

        body = '|'.join(alternatives)

        # A group is only emitted when there are whole words; a lone
        # character class needs no parentheses.
        if self.words:
            groupOpen = '(?:' if isWebUrl else '('
            body = groupOpen + body + ')'

        terminator = '' if isLast else '"\n'

        return '        ' + opener + body + terminator

    def add(self, line):
        """Record one line of the TLD list; comment and empty lines are ignored."""
        if line.startswith('#') or not line:
            return

        if len(line) == 2:
            # Keep only the second character; the first is the bucket's base.
            self.letters.append(line[1:2])
        else:
            self.words.append(line)

def getBucket(buckets, line):
    """Return the bucket keyed by the first character of line.

    A new Bucket is created and stored in buckets when that key is not
    present yet.  Only line[0] is used as the key, so a multi-character
    argument selects the bucket of its first character.
    """
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = Bucket(letter)
        buckets[letter] = bucket

    return bucket

def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated Java pattern declaration to standard output.

    The output is prefix, then one line per non-empty bucket for the letters
    'a' through 'z', then the closing parentheses, then suffix.

    Args:
        prefix: Text emitted before the first bucket line.
        suffix: Text emitted after the closing parentheses.
        buckets: Dict mapping a letter to its Bucket.  Buckets missing for
            any letter 'a'..'z' are created (empty) by getBucket().
        isWebUrl: Produce the WEB_URL flavour (non-capturing groups).
    """
    ordered = [getBucket(buckets, chr(code))
               for code in range(ord('a'), ord('z') + 1)]

    # Pick first/last among the buckets that actually produce output, so an
    # empty 'a' or 'z' bucket cannot drop the opening '"(' or leave the
    # previous line's string literal closed before the final parentheses.
    populated = [bucket for bucket in ordered if bucket.words or bucket.letters]
    lastIndex = len(populated) - 1

    pieces = [prefix]

    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    if isWebUrl:
        # One ')' closes the "(?:" opened in the prefix; the second one
        # presumably closes a group opened by the pattern that embeds this
        # constant -- confirm against the consuming Java code.
        pieces.append('))"')
    else:
        pieces.append(')')

    pieces.append(suffix)

    # Parenthesised single argument: same output under the Python 2 print
    # statement and the Python 3 print function.
    print(''.join(pieces))

def javaUnicodeEscape(text):
    """Return text with every non-ASCII character written as a Java \\uXXXX escape.

    ASCII characters are passed through unchanged.  Characters above U+FFFF
    are written as a UTF-16 surrogate pair, which is how Java source spells
    them.  repr() is not used here because it emits \\xNN for U+0080..U+00FF,
    and that is not a legal escape inside a Java string literal.
    """
    pieces = []

    for ch in text:
        code = ord(ch)

        if code < 0x80:
            pieces.append(ch)
        elif code > 0xFFFF:
            code -= 0x10000
            pieces.append('\\u%04x\\u%04x' % (0xD800 + (code >> 10),
                                              0xDC00 + (code & 0x3FF)))
        else:
            pieces.append('\\u%04x' % code)

    # The result is pure ASCII; str() keeps it a byte string on Python 2.
    return str(''.join(pieces))

if __name__ == "__main__":
    # closing() releases the HTTP response even if reading it fails.
    with closing(urlopen('http://data.iana.org/TLD/tlds-alpha-by-domain.txt')) as f:
        domains = f.readlines()

    buckets = {}

    for domain in domains:
        # Strip before picking the bucket so stray whitespace or a bare
        # newline never becomes a bucket key.
        domain = domain.strip().lower()

        if len(domain) > 0:
            getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            # Internationalized TLD: besides the ASCII "xn--" form added
            # above, also add the decoded name spelled with Java escapes.
            puny = domain[4:]
            result = puny.decode('punycode')
            getBucket(buckets, 'xn--').add(javaUnicodeEscape(result))

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
151