""" Unicode Mapping Parser and Codec Generator.

This script parses Unicode mapping files as available from the Unicode
site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
modules from them. The codecs use the standard character mapping codec
to actually apply the mapping.

Synopsis: gencodec.py dir codec_prefix

All files in dir are scanned and those producing non-empty mappings
will be written to <codec_prefix><mapname>.py with <mapname> being the
first part of the map's filename ('a' in a.b.c.txt) converted to
lowercase with hyphens replaced by underscores.

The tool also writes marshalled versions of the mapping tables to the
same location (with .mapping extension).

Written by Marc-Andre Lemburg (mal@lemburg.com).

(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
(c) Copyright Guido van Rossum, 2000.

Table generation:
(c) Copyright Marc-Andre Lemburg, 2005.
    Licensed to PSF under a Contributor Agreement.

"""#"

import re
import os
import marshal
import codecs

# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point
UNI_UNDEFINED = chr(0xFFFE)

# Placeholder for a missing code point
MISSING_CODE = -1

# One mapping line: <encoded code(s)> <unicode code(s) or meta-codes> [# comment]
# NOTE(review): the second group accepts [0-9a-fA-Z] rather than [0-9a-fA-F];
# this looks like a typo, but a lone malformed value currently surfaces as a
# ValueError from parsecodes() -- confirm before tightening the pattern.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')

def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a '+'-separated string of hexadecimal codes. In a
        combination, parts that are not valid hexadecimal numbers
        (e.g. the meta-codes <LR> and <RL>) are ignored.

        Returns MISSING_CODE for an empty string, an integer if
        exactly one code remains and a tuple of integers otherwise.
        A lone code that is not valid hexadecimal raises ValueError.

        The len and range parameters are kept for backward
        compatibility with existing callers.

    """
    if not codes:
        return MISSING_CODE
    parts = codes.split('+')
    if len(parts) == 1:
        return int(parts[0], 16)
    values = []
    for part in parts:
        try:
            value = int(part, 16)
        except ValueError:
            # Meta-code or empty fragment (e.g. after a trailing '+')
            continue
        if value != MISSING_CODE:
            values.append(value)
    if len(values) == 1:
        return values[0]
    return tuple(values)

def readmap(filename):

    """ Reads the mapping file filename and returns the decoding map.

        The result maps encoded code points to (unicode, comment)
        tuples. If identity mappings dominate, the codes that remain
        unmapped are added with MISSING_CODE as target and the special
        key 'IDENTITY' is set to 256.

    """
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    # The context manager closes the file even if parsing fails
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            if not line or line[0] == '#':
                continue
            m = mapRE.match(line)
            if not m:
                continue
            enc, uni, comment = m.groups()
            enc = parsecodes(enc)
            uni = parsecodes(uni)
            if comment is None:
                comment = ''
            else:
                comment = comment[1:].strip()
            if not isinstance(enc, tuple) and enc < 256:
                # Track single-byte codes for the identity heuristic below
                if enc in unmapped:
                    unmapped.remove(enc)
                if enc == uni:
                    identity.append(enc)
            enc2uni[enc] = (uni, comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni

def hexrepr(t, precision=4):

    """ Returns Python source text for the code t.

        None and MISSING_CODE give 'None', an integer gives a hex
        literal zero-padded to precision digits and a sequence of
        integers gives a parenthesized, comma-separated list of such
        literals.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Scalar code. MISSING_CODE must not be formatted as a number:
        # '0x%04X' % -1 gives '0x-001', which is not valid Python.
        if t == MISSING_CODE:
            return 'None'
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise

def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines defining the dictionary varname.

        Keys and values of map may be plain codes or (code, comment)
        tuples. precisions gives the hex digits used for keys and
        values. Entries whose key is None or MISSING_CODE are left
        out; values that are None or MISSING_CODE are written as None.

        Note: a present 'IDENTITY' entry is removed from map in place;
        codegen() and marshalmap() depend on that.

    """
    l = []
    append = l.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # An undefined code point cannot be used as a key
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t# %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits = splits + 1
    if splits == 0:
        append('}')
    else:
        append('})')

    return l

def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines defining the decoding table varname.

        The table is written as adjacent one-character string
        literals inside parentheses, one per code starting at 0.
        Returns None if the largest key exceeds MAX_TABLE_SIZE or if
        a value is a tuple (1-n mapping).

        Note: a present 'IDENTITY' entry is removed from map in place.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        del map['IDENTITY']
    # Sort only after the 'IDENTITY' marker is gone: its string key
    # cannot be ordered against the integer code keys.
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t# %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    if maxchar < 256:
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l

def codegen(name, map, encodingname, comments=1):

    """ Returns Python source for the given map.

        name is the source of the mapping as quoted in the generated
        module docstring; encodingname is the codec name.

        Comments are included in the source, if comments is true (default).

        A decoding table is generated if possible, a pair of
        decoding/encoding maps otherwise.

    """
    # Generate code. The first call strips a possible 'IDENTITY' marker
    # from map, so the two calls after it work on plain code entries.
    decoding_map_code = python_mapdef_code(
        'decoding_map',
        map,
        comments=comments)
    decoding_table_code = python_tabledef_code(
        'decoding_table',
        map,
        comments=comments)
    encoding_map_code = python_mapdef_code(
        'encoding_map',
        codecs.make_encoding_map(map),
        comments=comments,
        precisions=(4, 2))

    if decoding_table_code:
        suffix = 'table'
    else:
        suffix = 'map'

    l = [
        '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)]
    l.append('''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' %
        (suffix, suffix))

    l.append('''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-'))

    # Add decoding table or map (with preference to the table)
    if not decoding_table_code:
        l.append('''
### Decoding Map
''')
        l.extend(decoding_map_code)
    else:
        l.append('''
### Decoding Table
''')
        l.extend(decoding_table_code)

    # Add encoding map
    if decoding_table_code:
        l.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        l.append('''
### Encoding Map
''')
        l.extend(encoding_map_code)

    # Final new-line
    l.append('')

    return '\n'.join(l).expandtabs()

def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated for map to pyfile.

    """
    code = codegen(name,map,encodingname,comments)
    with open(pyfile,'w') as f:
        f.write(code)

def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to marshalfile.

        The 'IDENTITY' marker, if still present, is not a
        (unicode, comment) entry and is left out.

    """
    d = {}
    for e, entry in map.items():
        if e == 'IDENTITY':
            continue
        (u,c) = entry
        d[e] = (u,c)
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)

def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir.

        For every regular file a codec module (.py) and a marshalled
        mapping (.mapping) are written; the output paths are dirprefix
        followed by nameprefix and the normalized map name. A
        ValueError during conversion is reported and re-raised.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue
        name = os.path.split(mapname)[1]
        name = name.replace('-','_')
        name = name.split('.')[0]
        name = name.lower()
        name = nameprefix + name
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            mapping = readmap(os.path.join(dir,mapname))
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mappathname, mapping, dirprefix + codefile,name,comments)
                marshalmap(mappathname, mapping, dirprefix + marshalfile)
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise

def rewritepythondir(dir, dirprefix='', comments=1):

    """ Regenerates the codec modules from the .mapping files in dir.

        A ValueError during conversion is reported and the file is
        skipped.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # Close the mapping file deterministically after loading
            with open(os.path.join(dir,mapname), 'rb') as f:
                mapping = marshal.load(f)
            if not mapping:
                print('* map is empty; skipping')
            else:
                pymap(mapname, mapping, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)

if __name__ == '__main__':

    import sys
    # Manual switch: change to 0 to regenerate from .mapping files instead
    if 1:
        convertdir(*sys.argv[1:])
    else:
        rewritepythondir(*sys.argv[1:])