#!/usr/bin/env python
r"""Generate Java regular-expression constants matching the IANA TLD list.

The script downloads the IANA "tlds-alpha-by-domain" list, groups the
entries by their first letter and prints two Java string constants to
standard output: TOP_LEVEL_DOMAIN_STR and TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL.

The code runs unchanged on Python 2 and Python 3.
"""

from contextlib import closing

try:
    from urllib2 import urlopen  # Python 2
except ImportError:
    from urllib.request import urlopen  # Python 3

# Java source emitted in front of / behind the plain TLD pattern.
TLD_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR =
"""
TLD_SUFFIX = '";'

# Java source emitted in front of / behind the WEB_URL variant, which uses
# non-capturing groups.
URL_PREFIX = r"""
    /**
     * Regular expression to match all IANA top-level domains for WEB_URL.
     * List accurate as of 2011/07/18. List taken from:
     * http://data.iana.org/TLD/tlds-alpha-by-domain.txt
     * This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
     */
    public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
        "(?:"
"""

URL_SUFFIX = ';'

# Location of the list downloaded when the file is run as a script.
_IANA_TLD_URL = 'http://data.iana.org/TLD/tlds-alpha-by-domain.txt'

# Leading whitespace of every generated Java continuation line.
_JAVA_LINE_INDENT = '        '


class Bucket(object):
    """All TLDs that share one initial letter.

    Attributes:
        base: the shared first character.
        words: entries whose length is not 2, stored in full.
        letters: for two-character entries, only the second character.
    """

    def __init__(self, baseLetter):
        self.base = baseLetter
        self.words = []
        self.letters = []

    def dump(self, isWebUrl=False, isFirst=False, isLast=False):
        """Return this bucket as one line of a Java string concatenation.

        Args:
            isWebUrl: use non-capturing "(?:" groups instead of "(".
            isFirst: this is the first emitted bucket, so no leading "|"
                alternation is written.
            isLast: this is the last emitted bucket; the closing quote and
                newline are left off so the caller can append the final
                parentheses inside the same Java literal.

        Returns:
            The generated line, or '' when the bucket holds no entries.
            As a side effect ``words`` and ``letters`` are sorted in place.
        """
        if not self.words and not self.letters:
            return ''

        self.words.sort()
        self.letters.sort()

        if not isFirst:
            opener = '+ "|'
        elif isWebUrl:
            opener = '+ "'
        else:
            opener = '"('

        # Each '-' is preceded by two backslash characters in the output, so
        # the Java literal reads \\- and the resulting regex sees \-.
        alternatives = [word.replace('-', '\\\\-') for word in self.words]

        if len(self.letters) == 1:
            alternatives.append(self.base + self.letters[0])
        elif self.letters:
            # Several two-letter codes collapse into a character class.
            alternatives.append(
                self.base + '[' + ''.join(self.letters) + ']')

        body = '|'.join(alternatives)
        if self.words:
            # A group is only written when full words are present.
            body = ('(?:' if isWebUrl else '(') + body + ')'

        line = _JAVA_LINE_INDENT + opener + body
        if not isLast:
            line += '"\n'
        return line

    def add(self, line):
        """Record one entry; empty strings and '#' comments are ignored.

        A two-character entry contributes only its second character to
        ``letters``; anything else is appended to ``words`` unchanged.
        """
        if not line or line.startswith('#'):
            return

        if len(line) == 2:
            self.letters.append(line[1:2])
        else:
            self.words.append(line)


def getBucket(buckets, line):
    """Return the bucket for the first character of ``line``.

    The bucket is created and stored in ``buckets`` when it does not exist
    yet.  ``line`` must not be empty (indexing it raises IndexError).
    """
    letter = line[0]
    bucket = buckets.get(letter)

    if bucket is None:
        bucket = buckets[letter] = Bucket(letter)

    return bucket


def _buildPattern(prefix, suffix, buckets, isWebUrl=False):
    """Return the complete Java snippet for the buckets 'a' through 'z'."""
    populated = []
    for code in range(ord('a'), ord('z') + 1):
        bucket = getBucket(buckets, chr(code))
        if bucket.words or bucket.letters:
            populated.append(bucket)

    # The first/last markers are taken from the buckets that really produce
    # output; a letter without any TLD must not leave the Java string
    # literal unopened or unterminated.
    lastIndex = len(populated) - 1
    pieces = [prefix]
    for index, bucket in enumerate(populated):
        pieces.append(bucket.dump(isWebUrl=isWebUrl,
                                  isFirst=(index == 0),
                                  isLast=(index == lastIndex)))

    pieces.append('))"' if isWebUrl else ')')
    pieces.append(suffix)
    return ''.join(pieces)


def makePattern(prefix, suffix, buckets, isWebUrl=False):
    """Print the Java constant built from ``buckets`` to standard output.

    Args:
        prefix: text written before the generated alternation.
        suffix: text written after it.
        buckets: dict mapping a first letter to its Bucket; buckets for
            missing letters between 'a' and 'z' are created empty.
        isWebUrl: generate the WEB_URL flavour (non-capturing groups and
            the '))"' terminator) instead of the plain one.
    """
    print(_buildPattern(prefix, suffix, buckets, isWebUrl=isWebUrl))


def _javaEscape(text):
    r"""Return ``text`` with every non-ASCII character written as a Java
    unicode escape (backslash, 'u', four lower-case hex digits).

    Characters outside the Basic Multilingual Plane become a surrogate
    pair, which is the only form Java source accepts.
    """
    pieces = []
    for char in text:
        code = ord(char)
        if code < 0x80:
            pieces.append(char)
        elif code > 0xFFFF:
            code -= 0x10000
            pieces.append('\\u%04x\\u%04x' % (0xD800 + (code >> 10),
                                              0xDC00 + (code & 0x3FF)))
        else:
            pieces.append('\\u%04x' % code)
    return ''.join(pieces)


def _fillBuckets(buckets, lines):
    """Sort every TLD found in ``lines`` into ``buckets`` and return it."""
    for rawLine in lines:
        domain = rawLine.strip().lower()

        if not domain or domain.startswith('#'):
            continue

        getBucket(buckets, domain[0]).add(domain)

        if domain.startswith('xn--'):
            # Also list the decoded (native script) form of an IDN entry.
            # getBucket() keys on the first character only, so this lands
            # in the 'x' bucket next to the punycode spelling.
            decoded = domain[4:].encode('ascii').decode('punycode')
            getBucket(buckets, 'xn--').add(_javaEscape(decoded))

    return buckets


def _fetchLines(url):
    """Download ``url`` and return its content as a list of text lines."""
    with closing(urlopen(url)) as response:
        return response.read().decode('utf-8').splitlines()


def _main():
    """Fetch the IANA list and print both generated Java constants."""
    buckets = _fillBuckets({}, _fetchLines(_IANA_TLD_URL))

    makePattern(TLD_PREFIX, TLD_SUFFIX, buckets, isWebUrl=False)
    makePattern(URL_PREFIX, URL_SUFFIX, buckets, isWebUrl=True)


if __name__ == "__main__":
    _main()