1""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
22
23Table generation:
24(c) Copyright Marc-Andre Lemburg, 2005.
25    Licensed to PSF under a Contributor Agreement.
26
27"""#"
28
29import re, os, marshal, codecs
30
# Largest key a generated charmap decoding table may contain; maps with
# a bigger key are not emitted as tables
MAX_TABLE_SIZE = 8192

# Character stored in decoding tables for positions without a mapping
UNI_UNDEFINED = chr(0xFFFE)

# Sentinel standing in for a code point that is absent
MISSING_CODE = -1

# A mapping line has three columns: the encoded code(s), the Unicode
# code(s) and an optional trailing '#' comment.  Codes may be joined
# with '+'; the Unicode column may also hold meta-codes in angular
# brackets and may be empty.
# NOTE(review): the 'A-Z' range in the Unicode column looks like a typo
# for 'A-F'; it is kept as is so that malformed codes still reach
# parsecodes() instead of being silently matched as an empty column.
_ENCODED_COLUMN = r'((?:0x[0-9a-fA-F]+\+?)+)'
_UNICODE_COLUMN = r'((?:(?:0x[0-9a-fA-Z]+|<[A-Za-z]+>)\+?)*)'
_COMMENT_COLUMN = r'(#.+)?'

mapRE = re.compile(_ENCODED_COLUMN +
                   r'\s+' +
                   _UNICODE_COLUMN +
                   r'\s*' +
                   _COMMENT_COLUMN)
45
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined with '+', e.g.
        '0x0041' or '0x0041+0x0301'.

        Fragments that are not valid hexadecimal numbers are ignored;
        this covers meta-codes (in angular brackets, e.g. <LR> and
        <RL>) as well as the empty fragment left by a trailing '+'.

        Returns MISSING_CODE if codes is empty or no valid code
        remains, the integer itself if exactly one code remains, and
        a tuple of integers otherwise.

        len and range default to the builtins of the same name; range
        is not used by this implementation and is only kept so that
        the signature stays unchanged.

    """
    if not codes:
        return MISSING_CODE
    values = []
    for fragment in codes.split('+'):
        try:
            values.append(int(fragment, 16))
        except ValueError:
            # Meta-code, empty or otherwise illegal fragment: ignore it,
            # regardless of whether it stands alone or in a combination
            continue
    if not values:
        return MISSING_CODE
    if len(values) == 1:
        return values[0]
    return tuple(values)
72
def readmap(filename):

    """ Reads the mapping file filename and returns a decoding map.

        The result is a dictionary mapping each encoded code (an
        integer, or a tuple of integers for code combinations) to a
        tuple (unicode_code, comment).  unicode_code is what
        parsecodes() returned for the Unicode column, i.e. possibly
        MISSING_CODE or a tuple.

        If at least as many codes below 256 are identity-mapped as
        are left unmapped, the unmapped ones are added with
        MISSING_CODE as value and the extra key 'IDENTITY' is set to
        256; note that this entry is a plain integer, not a tuple.

        Blank lines, lines starting with '#' and lines not matching
        mapRE are skipped.

    """
    # The context manager closes the file even if reading fails
    with open(filename, 'r') as f:
        lines = f.readlines()

    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            # Not a mapping line
            continue
        enc, uni, comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        # Drop the leading '#' of the comment column
        comment = '' if comment is None else comment[1:].strip()
        if not isinstance(enc, tuple) and enc < 256:
            # Single codes below 256 take part in the identity statistics
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni, comment)

    # If there are at least as many identity-mapped entries as unmapped
    # entries, it pays to generate an identity dictionary first, and add
    # explicit MISSING_CODE mappings for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (MISSING_CODE, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
123
def hexrepr(t, precision=4):

    """ Returns Python source text for the code or code tuple t.

        None and the MISSING_CODE placeholder are both written as
        'None'; formatting the negative placeholder as a number would
        give '0x-001', which is not a valid Python literal.

        Integers are written as '0x' followed by at least precision
        uppercase hexadecimal digits.  Sized objects (tuples of codes)
        are written as a parenthesized, comma separated list of such
        literals.

        If an item of a sized t cannot be formatted, a diagnostic is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # Not a sequence: a single code or the missing placeholder
        if t == MISSING_CODE:
            return 'None'
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
138
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns the source lines defining the dictionary varname.

        map maps codes to codes; keys and values may also be tuples
        (code, comment), in which case the comment is written next to
        the entry if comments is true.  precisions gives the minimum
        number of hex digits for keys and for values.

        If map has an 'IDENTITY' entry, the dictionary is created with
        codecs.make_identity_dict() and only entries deviating from
        the identity are written.  NOTE: the 'IDENTITY' entry is
        removed from the caller's map as a side effect; other
        functions in this file (python_tabledef_code, marshalmap)
        cannot process a map that still contains it.

        Entries whose key is None or MISSING_CODE are skipped;
        MISSING_CODE values are written as None.

        The result is a list of lines without trailing newlines.

    """
    lines = []
    append = lines.append
    if "IDENTITY" in map:
        append("%s = codecs.make_identity_dict(range(%d))" %
               (varname, map["IDENTITY"]))
        append("%s.update({" % varname)
        splits = 1
        del map["IDENTITY"]
        identity = 1
    else:
        append("%s = {" % varname)
        splits = 0
        identity = 0

    mappings = sorted(map.items())
    i = 0
    key_precision, value_precision = precisions
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None or mapkey == MISSING_CODE:
            # Nothing to map from; same rule as python_tabledef_code
            continue
        if (identity and
            mapkey == mapvalue and
            mapkey < 256):
            # No need to include identity mappings, since these
            # are already set for the first 256 code points.
            continue
        if mapvalue == MISSING_CODE:
            # Write undefined targets as None rather than as a
            # negative hex number
            mapvalue = None
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if mapcomment and comments:
            append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        else:
            append('    %s: %s,' % (key, value))
        i += 1
        if i == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if splits == 0:
                append('}')
            else:
                append('})')
            append('%s.update({' % varname)
            i = 0
            splits += 1
    # Close the plain literal, or the last update() call
    if splits == 0:
        append('}')
    else:
        append('})')

    return lines
195
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns the source lines defining the decoding table varname.

        The table is written as a parenthesized sequence of one
        character literal per line, one for every key from 0 up to
        the largest key in map (at least 255).  Keys without a mapping
        and keys mapped to MISSING_CODE get UNI_UNDEFINED.

        map uses the same layout as for python_mapdef_code(); an
        'IDENTITY' entry pre-fills the keys 0-255 with identity
        mappings.  map itself is not modified.

        If comments is true, entries with a comment are annotated
        with their key (using key_precision hex digits) and comment.

        Returns None if the largest key exceeds MAX_TABLE_SIZE or if
        map contains a mapping to more than one code (a tuple).

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 255
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
    # The 'IDENTITY' marker has to be left out before sorting: its
    # string key cannot be ordered against the integer keys
    mappings = sorted(item for item in map.items()
                      if item[0] != 'IDENTITY')
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey == MISSING_CODE:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    maxchar = 0
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = MISSING_CODE
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue == MISSING_CODE:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = chr(mapvalue)
        maxchar = max(maxchar, ord(mapchar))
        if mapcomment and comments:
            append('    %a \t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %a' % mapchar)

    # If every character is below 256, add one UNI_UNDEFINED entry at
    # the end of the table (see the generated comment)
    if maxchar < 256:
        append('    %a \t## Widen to UCS2 for optimization' % UNI_UNDEFINED)
    append(')')
    return l
253
def codegen(name, map, encodingname, comments=1):

    """ Returns the Python source of a codec module for the given map.

        name is written into the module docstring as the origin of the
        mapping, encodingname as the codec's name; the name registered
        in getregentry() is encodingname with '_' replaced by '-'.

        The module uses a decoding table if python_tabledef_code()
        can build one for map, and decoding/encoding dictionaries
        otherwise.

        Comments are included in the source, if comments is true (default).

    """
    # The three definitions are always generated, in this order.
    # python_mapdef_code() removes the 'IDENTITY' entry from map, so
    # the two later calls work on the map without it.
    map_lines = python_mapdef_code(
        'decoding_map', map, comments=comments)
    table_lines = python_tabledef_code(
        'decoding_table', map, comments=comments)
    encoding_lines = python_mapdef_code(
        'encoding_map', codecs.make_encoding_map(map),
        comments=comments, precisions=(4, 2))

    # The generated classes refer to encoding_<suffix>/decoding_<suffix>
    suffix = 'table' if table_lines else 'map'

    codec_section = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self, input, errors='strict'):
        return codecs.charmap_encode(input, errors, encoding_%s)

    def decode(self, input, errors='strict'):
        return codecs.charmap_decode(input, errors, decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental_section = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input, self.errors, encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input, self.errors, decoding_%s)[0]''' % (
        suffix, suffix)

    registry_section = '''
class StreamWriter(Codec, codecs.StreamWriter):
    pass

class StreamReader(Codec, codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    parts = [codec_section, incremental_section, registry_section]

    if table_lines:
        # Decoding table, with the encoding table derived from it when
        # the generated module is imported
        parts.append('''
### Decoding Table
''')
        parts.extend(table_lines)
        parts.append('''
### Encoding table
encoding_table = codecs.charmap_build(decoding_table)
''')
    else:
        # No table available: write both dictionaries out
        parts.append('''
### Decoding Map
''')
        parts.extend(map_lines)
        parts.append('''
### Encoding Map
''')
        parts.extend(encoding_lines)

    # Final new-line
    parts.append('')

    return '\n'.join(parts).expandtabs()
358
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module generated for map to the file pyfile.

        name, map, encodingname and comments are passed on to
        codegen(), which creates the source text.  An existing pyfile
        is overwritten.

    """
    code = codegen(name,map,encodingname,comments)
    # The context manager closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
365
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        Every value of map has to be a pair (code, comment); the
        plain integer stored under 'IDENTITY' by readmap() does not
        qualify and causes a TypeError before the file is opened.
        name is not used.

    """
    d = {e: (u,c) for e,(u,c) in map.items()}
    # The context manager closes the file even if marshalling fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
374
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in the directory dir.

        For every regular file in dir a codec module (.py) and a
        marshalled mapping (.mapping) are written.  The base name of
        both is nameprefix plus the part of the file name before the
        first '.', lowercased and with hyphens replaced by
        underscores; dirprefix is prepended to it as a plain string.

        A ValueError raised during a conversion is reported and then
        re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue

        # Derive the codec name from the file name
        basename = os.path.split(mapname)[1]
        name = nameprefix + basename.replace('-','_').split('.')[0].lower()
        codepath = dirprefix + name + '.py'
        marshalpath = dirprefix + name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              codepath,
                                              marshalpath))
        try:
            mapping = readmap(mappathname)
            if mapping:
                pymap(mappathname, mapping, codepath, name, comments)
                marshalmap(mappathname, mapping, marshalpath)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
402
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Recreates the codec modules from the .mapping files in dir.

        Every file in dir whose name ends in '.mapping' is loaded
        with marshal and passed to pymap(); the module is written to
        dirprefix plus the file name with '.mapping' replaced by
        '.py'.

        A ValueError raised during a conversion is reported and the
        remaining files are still processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # The context manager closes the file even if loading fails
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
422
if __name__ == '__main__':

    import sys

    arguments = sys.argv[1:]

    # Source-level switch: the default converts mapping files with
    # convertdir(); set it to a true value to regenerate the modules
    # from existing .mapping files with rewritepythondir() instead.
    regenerate_from_mappings = 0
    if regenerate_from_mappings:
        rewritepythondir(*arguments)
    else:
        convertdir(*arguments)
430