1#!/usr/bin/env python 2 3from urllib2 import urlopen 4from datetime import date 5 6URL='http://data.iana.org/TLD/tlds-alpha-by-domain.txt' 7 8TLD_PREFIX = r""" 9 /** 10 * Regular expression to match all IANA top-level domains. 11 * List accurate as of {gen_date}. List taken from: 12 * {url} 13 * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py 14 */ 15 public static final String TOP_LEVEL_DOMAIN_STR = 16""" 17TLD_SUFFIX = '";' 18 19URL_PREFIX = r""" 20 /** 21 * Regular expression to match all IANA top-level domains for WEB_URL. 22 * List accurate as of {gen_date}. List taken from: 23 * {url} 24 * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py 25 */ 26 public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL = 27 "(?:" 28""" 29 30URL_SUFFIX = ';' 31TAB = ' ' 32 33class BucketOutput: 34 def __init__(self): 35 self.buffer = TAB 36 self.lineLength = len(TAB) 37 38 def __iadd__(self, other): 39 self.buffer += other 40 self.lineLength += len(other) 41 return self 42 43 def addPipe(self): 44 if self.lineLength > 90: 45 self.buffer += '"\n' 46 self.buffer += TAB 47 self.buffer += '+ "' 48 self.lineLength = len(TAB) 49 50 self += '|' 51 52 def value(self): 53 return self.buffer 54 55class Bucket: 56 def __init__(self, baseLetter): 57 self.base=baseLetter 58 self.words=[] 59 self.letters=[] 60 61 def dump(self, isWebUrl=False, isFirst=False, isLast=False): 62 if (len(self.words) == 0) and (len(self.letters) == 0): 63 return '' 64 65 self.words.sort() 66 self.letters.sort() 67 68 output = BucketOutput() 69 70 if isFirst: 71 if isWebUrl: 72 output += '+ "' 73 else: 74 output += '"(' 75 else: 76 output += '+ "|' 77 78 if len(self.words) != 0: 79 output += '(' 80 81 if isWebUrl: 82 output += '?:' 83 84 firstWord = 1 85 for word in self.words: 86 if firstWord == 0: 87 output.addPipe() 88 firstWord = 0 89 for letter in word: 90 if letter == '-': 91 output += '\\\\' # escape the '-' character. 92 output += letter 93 94 if len(self.words) > 0 and len(self.letters) > 0: 95 output.addPipe() 96 97 if len(self.letters) == 1: 98 output += '%c%c' % (self.base, self.letters[0]) 99 elif len(self.letters) > 0: 100 output += '%c[' % self.base 101 102 for letter in self.letters: 103 output += letter 104 105 output += ']' 106 107 if len(self.words) != 0: 108 output += ')' 109 110 if not isLast: 111 output += '"' 112 output += '\n' 113 114 return output.value(); 115 116 def add(self, line): 117 length = len(line) 118 119 if line.startswith('#') or (length == 0): 120 return; 121 122 if length == 2: 123 self.letters.append(line[1:2]) 124 else: 125 self.words.append(line) 126 127def getBucket(buckets, line): 128 letter = line[0] 129 bucket = buckets.get(letter) 130 131 if bucket is None: 132 bucket = Bucket(letter) 133 buckets[letter] = bucket 134 135 return bucket 136 137def makePattern(prefix, suffix, buckets, isWebUrl=False): 138 output = prefix.format(gen_date = date.today(), url=URL) 139 140 output += getBucket(buckets, 'a').dump(isFirst=True, isWebUrl=isWebUrl) 141 142 for letter in range(ord('b'), ord('z')): 143 output += getBucket(buckets, chr(letter)).dump(isWebUrl=isWebUrl) 144 145 output += getBucket(buckets, 'z').dump(isLast=True, isWebUrl=isWebUrl) 146 147 if isWebUrl: 148 output += '))"' 149 else: 150 output += ')' 151 152 output += suffix 153 154 print output 155 156if __name__ == "__main__": 157 f = urlopen(URL) 158 domains = f.readlines() 159 f.close() 160 161 buckets = {} 162 163 for domain in domains: 164 domain = domain.lower() 165 166 if len(domain) > 0: 167 getBucket(buckets, domain[0]).add(domain.strip()) 168 169 if domain.startswith('xn--'): 170 puny = domain.strip()[4:] 171 result = puny.decode('punycode') 172 result = repr(result) 173 getBucket(buckets, 'xn--').add(result[2:-1]) 174 175 makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False) 176 makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True) 177