1#!/usr/bin/env python
2
3from urllib2 import urlopen
4from datetime import date
5
# Plain-text list of top-level domains published by IANA; downloaded by the
# script entry point and quoted in the generated Javadoc via {url}.
URL='http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Java source emitted before the generic TLD pattern.  {gen_date} and {url}
# are filled in by makePattern() through str.format().
TLD_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains.
     *  List accurate as of {gen_date}.  List taken from:
     *  {url}
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
# Closes the last Java string literal and terminates the statement;
# makePattern() appends the closing ')' itself before adding this.
TLD_SUFFIX = '";'

# Java source emitted before the WEB_URL variant of the pattern.  It already
# opens the outer non-capturing group "(?:" as its own string literal.
URL_PREFIX = r"""
    /**
     *  Regular expression to match all IANA top-level domains for WEB_URL.
     *  List accurate as of {gen_date}.  List taken from:
     *  {url}
     *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

# Only the statement terminator: for the WEB_URL variant makePattern()
# appends the closing parentheses and quote ('))"') on its own.
URL_SUFFIX = ';'
# Indentation (eight spaces) placed at the start of every generated line.
TAB = '        '
32
class BucketOutput:
    """Text accumulator for one bucket of the generated Java pattern.

    The buffer starts with one TAB of indentation, and ``lineLength`` tracks
    how many characters have been appended to the current output line.
    """

    def __init__(self):
        self.lineLength = len(TAB)
        self.buffer = TAB

    def __iadd__(self, other):
        """Support ``out += text``: append text and count it toward the line."""
        self.buffer = self.buffer + other
        self.lineLength = self.lineLength + len(other)
        return self

    def addPipe(self):
        """Append a '|' separator, wrapping first if the line passed 90 chars.

        Wrapping closes the current Java string literal, starts a new
        TAB-indented line and reopens the literal with ``+ "``.  The counter
        restarts at len(TAB); the reopened ``+ "`` is not counted.
        """
        if self.lineLength > 90:
            self.buffer += '"\n' + TAB + '+ "'
            self.lineLength = len(TAB)

        self.__iadd__('|')

    def value(self):
        """Return everything accumulated so far."""
        return self.buffer
54
class Bucket:
    """TLDs filed under one leading character.

    Two-character entries are stored as just their second character in
    ``letters``; every other entry is stored whole in ``words``.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Render this bucket as one (possibly wrapped) Java string fragment.

        isWebUrl -- use non-capturing groups ``(?:`` and the WEB_URL opener.
        isFirst  -- this is the first fragment, so no leading ``|`` is added.
        isLast   -- leave the string literal open; the caller closes it.

        Returns '' when the bucket holds nothing.  Sorts ``words`` and
        ``letters`` in place as a side effect.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        hasWords = len(self.words) != 0
        out = BucketOutput()

        # Opening of the fragment: every fragment but the first is an
        # alternative joined with '|'.
        if not isFirst:
            out += '+ "|'
        elif isWebUrl:
            out += '+ "'
        else:
            out += '"('

        # Words get their own group; two-letter codes follow inside it.
        if hasWords:
            out += '('
            if isWebUrl:
                out += '?:'

        for index, word in enumerate(self.words):
            if index:
                out.addPipe()
            # Each '-' is preceded by two backslash characters in the output.
            out += word.replace('-', '\\\\' + '-')

        if hasWords and self.letters:
            out.addPipe()

        count = len(self.letters)
        if count == 1:
            out += '%c%c' % (self.base, self.letters[0])
        elif count > 1:
            out += '%c[' % self.base
            out += ''.join(self.letters)
            out += ']'

        if hasWords:
            out += ')'

        if not isLast:
            out += '"\n'

        return out.value()

    def add(self, line):
        """File one entry; empty strings and '#' comment lines are ignored."""
        size = len(line)

        if size == 0 or line.startswith('#'):
            return

        if size == 2:
            # Only the second character is kept; dump() prepends self.base.
            self.letters.append(line[1])
        else:
            self.words.append(line)
126
def getBucket(buckets, line):
    """Return the bucket keyed by the first character of `line`.

    A new Bucket is created and stored in `buckets` when no bucket (or a
    None entry) exists for that character yet.
    """
    key = line[0]
    existing = buckets.get(key)
    if existing is not None:
        return existing

    created = Bucket(key)
    buckets[key] = created
    return created
136
def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print one generated Java constant built from the 'a'..'z' buckets.

    prefix   -- Java text emitted first; its {gen_date} and {url}
                placeholders are filled with today's date and URL.
    suffix   -- text appended after the closing parenthesis/parentheses.
    buckets  -- dict mapping a leading character to its Bucket.  Letters
                without an entry get an empty Bucket created by getBucket().
    isWebUrl -- emit the WEB_URL flavour (non-capturing groups, '))"' tail).

    The result is written to standard output; nothing is returned.
    """
    output = prefix.format(gen_date=date.today(), url=URL)

    # Only non-empty buckets produce text.  The opening fragment and the
    # unterminated final fragment must therefore go to the first and last
    # *populated* buckets; tying them to 'a' and 'z' would drop the opener
    # or the final fragment whenever one of those letters has no TLDs.
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = getBucket(buckets, chr(code))
        if bucket.words or bucket.letters:
            populated.append(bucket)

    lastIndex = len(populated) - 1
    for index, bucket in enumerate(populated):
        output += bucket.dump(isWebUrl=isWebUrl,
                              isFirst=(index == 0),
                              isLast=(index == lastIndex))

    if isWebUrl:
        output += '))"'
    else:
        output += ')'

    output += suffix

    # Parenthesised form prints the same text under the Python 2 print
    # statement and is also valid as a Python 3 function call.
    print(output)
155
if __name__ == "__main__":
    # Fetch the IANA list; release the connection even if the read fails.
    f = urlopen(URL)
    try:
        domains = f.readlines()
    finally:
        f.close()

    buckets = {}

    for domain in domains:
        domain = domain.strip().lower()

        # Blank lines and '#' comment lines carry no TLD.  Bucket.add()
        # ignores them anyway, so skipping here only avoids creating
        # throw-away buckets keyed by '#' or a newline.
        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain).add(domain)

        if domain.startswith('xn--'):
            # Punycode-encoded entry: additionally register its decoded form.
            puny = domain[4:]
            result = puny.decode('punycode')
            # Under Python 2, repr() of the decoded unicode string looks like
            # u'...' with non-ASCII characters escaped; [2:-1] strips the
            # leading u' and the trailing quote, leaving only the escapes.
            result = repr(result)
            # 'xn--' selects the 'x' bucket, so the decoded form is listed
            # next to the xn-- spellings.
            getBucket(buckets, 'xn--').add(result[2:-1])

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)
177