1""" Unicode Mapping Parser and Codec Generator.
2
3This script parses Unicode mapping files as available from the Unicode
4site (ftp://ftp.unicode.org/Public/MAPPINGS/) and creates Python codec
5modules from them. The codecs use the standard character mapping codec
6to actually apply the mapping.
7
8Synopsis: gencodec.py dir codec_prefix
9
10All files in dir are scanned and those producing non-empty mappings
11will be written to <codec_prefix><mapname>.py with <mapname> being the
12first part of the map's filename ('a' in a.b.c.txt) converted to
13lowercase with hyphens replaced by underscores.
14
15The tool also writes marshalled versions of the mapping tables to the
16same location (with .mapping extension).
17
18Written by Marc-Andre Lemburg (mal@lemburg.com).
19
20(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
21(c) Copyright Guido van Rossum, 2000.
22
23Table generation:
24(c) Copyright Marc-Andre Lemburg, 2005.
25    Licensed to PSF under a Contributor Agreement.
26
27"""#"
28
29import re, os, marshal, codecs
30
# Maximum allowed size of charmap tables
MAX_TABLE_SIZE = 8192

# Standard undefined Unicode code point (U+FFFE)
UNI_UNDEFINED = u'\uFFFE'

# Matches one line of a Unicode mapping file:
#
#   group 1: encoded value, one or more '+'-joined hex codes (0x..)
#   group 2: Unicode value, zero or more '+'-joined hex codes or
#            meta-codes in angular brackets (e.g. <LR>); may be empty
#   group 3: optional trailing comment, including the leading '#'
#
# Raw strings keep the regex escapes (\s, \+) away from the string
# literal parser.  Hex digits are restricted to [0-9a-fA-F] in both
# code groups.
mapRE = re.compile(r'((?:0x[0-9a-fA-F]+\+?)+)'
                   r'\s+'
                   r'((?:(?:0x[0-9a-fA-F]+|<[A-Za-z]+>)\+?)*)'
                   r'\s*'
                   r'(#.+)?')
42
def parsecodes(codes, len=len, range=range):

    """ Converts code combinations to either a single code integer
        or a tuple of integers.

        codes is a string of hexadecimal codes joined by '+', e.g.
        '0x41' or '0x0041+0x0301'.

        Meta-codes (in angular brackets, e.g. <LR> and <RL>) and any
        other parts which do not parse as hexadecimal integers are
        ignored.

        Returns None if codes is empty or contains no legal code, a
        single integer if exactly one legal code remains and a tuple
        of integers otherwise.

        len and range default to the builtins of the same name and
        are kept in the signature for backwards compatibility.

    """
    if not codes:
        return None
    values = []
    for part in codes.split('+'):
        try:
            values.append(int(part, 16))
        except ValueError:
            # Meta-code, empty part (trailing '+') or malformed
            # number: skip it, as documented
            continue
    if not values:
        # Nothing legal was found, e.g. a lone meta-code
        return None
    if len(values) == 1:
        return values[0]
    return tuple(values)
69
def readmap(filename):

    """ Reads the Unicode mapping file filename and returns the
        mapping as dictionary.

        The dictionary maps encoded values (as returned by
        parsecodes()) to tuples (unicode_value, comment).  Lines
        which are empty, start with '#' or do not match mapRE are
        skipped.

        If at least as many single byte codes are identity-mapped as
        are left unmapped, the unmapped codes below 256 are added
        with a (None, '') value and the special key 'IDENTITY' is
        set to 256.

    """
    # The with-block closes the file even if reading fails
    with open(filename,'r') as f:
        lines = f.readlines()
    enc2uni = {}
    identity = []
    unmapped = list(range(256))

    # UTC mapping tables per convention don't include the identity
    # mappings for code points 0x00 - 0x1F and 0x7F, unless these are
    # explicitly mapped to different characters or undefined
    for i in list(range(32)) + [127]:
        identity.append(i)
        unmapped.remove(i)
        enc2uni[i] = (i, 'CONTROL CHARACTER')

    for line in lines:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        m = mapRE.match(line)
        if not m:
            #print '* not matched: %s' % repr(line)
            continue
        enc,uni,comment = m.groups()
        enc = parsecodes(enc)
        uni = parsecodes(uni)
        if comment is None:
            comment = ''
        else:
            # Strip the leading '#' and surrounding whitespace
            comment = comment[1:].strip()
        # Only single byte codes take part in the identity/unmapped
        # bookkeeping; multi-code (tuple) keys are merely recorded
        if not isinstance(enc, tuple) and enc < 256:
            if enc in unmapped:
                unmapped.remove(enc)
            if enc == uni:
                identity.append(enc)
        enc2uni[enc] = (uni,comment)

    # If there are more identity-mapped entries than unmapped entries,
    # it pays to generate an identity dictionary first, and add explicit
    # mappings to None for the rest
    if len(identity) >= len(unmapped):
        for enc in unmapped:
            enc2uni[enc] = (None, "")
        enc2uni['IDENTITY'] = 256

    return enc2uni
120
def hexrepr(t, precision=4):

    """ Returns a hexadecimal source code representation of t.

        t may be None (returned as 'None'), a single integer
        (returned as '0x...' with at least precision upper case hex
        digits) or a sequence of integers (returned as parenthesized,
        comma separated list of such literals).

        If an item of a sequence cannot be formatted, a message is
        printed and the TypeError is re-raised.

    """
    if t is None:
        return 'None'
    try:
        len(t)
    except TypeError:
        # No length, so t is not a sequence: format it as a single
        # value
        return '0x%0*X' % (precision, t)
    try:
        return '(' + ', '.join(['0x%0*X' % (precision, item)
                                for item in t]) + ')'
    except TypeError as why:
        print('* failed to convert %r: %s' % (t, why))
        raise
135
def python_mapdef_code(varname, map, comments=1, precisions=(2, 4)):

    """ Returns a list of Python source lines which define varname
        as dictionary with the entries of map.

        Keys and values of map may be tuples (value, comment); the
        comment is appended to the generated line if comments is
        true.  Entries with a None key are skipped.

        If map has an 'IDENTITY' key, the generated code starts out
        with codecs.make_identity_dict() and identity mappings below
        256 are left out.  Note that the 'IDENTITY' key is removed
        from map in this case.

        precisions is a tuple (key_precision, value_precision) which
        is passed on to hexrepr().

    """
    if "IDENTITY" in map:
        lines = ["%s = codecs.make_identity_dict(range(%d))" %
                 (varname, map["IDENTITY"]),
                 "%s.update({" % varname]
        del map["IDENTITY"]
        with_identity = True
        # The identity dict already counts as first part
        parts = 1
    else:
        lines = ["%s = {" % varname]
        with_identity = False
        parts = 0

    key_precision, value_precision = precisions
    count = 0
    for mapkey, mapvalue in sorted(map.items()):
        mapcomment = ''
        if isinstance(mapkey, tuple):
            mapkey, mapcomment = mapkey
        if isinstance(mapvalue, tuple):
            mapvalue, mapcomment = mapvalue
        if mapkey is None:
            continue
        if with_identity and mapkey == mapvalue and mapkey < 256:
            # Already provided by the identity dictionary covering
            # the first 256 code points
            continue
        key = hexrepr(mapkey, key_precision)
        value = hexrepr(mapvalue, value_precision)
        if not (mapcomment and comments):
            lines.append('    %s: %s,' % (key, value))
        else:
            lines.append('    %s: %s,\t#  %s' % (key, value, mapcomment))
        count += 1
        if count == 4096:
            # Split the definition into parts so that the Python
            # parser doesn't dump core
            if parts == 0:
                lines.append('}')
            else:
                lines.append('})')
            lines.append('%s.update({' % varname)
            count = 0
            parts += 1

    # Close the part which is still open
    if parts == 0:
        lines.append('}')
    else:
        lines.append('})')

    return lines
192
def python_tabledef_code(varname, map, comments=1, key_precision=2):

    """ Returns a list of Python source lines which define varname
        as decoding table for map, or None if no table can be
        generated.

        The generated lines consist of one Unicode string literal
        per code between parentheses, so that they add up to a
        single string indexed by the encoded value.  Codes without
        mapping are set to UNI_UNDEFINED.

        None is returned if the largest key exceeds MAX_TABLE_SIZE
        or if map contains 1-n mappings (tuple values).

        Keys and values of map may be tuples (value, comment); the
        comment is added to the generated line if comments is true.
        If map has an 'IDENTITY' key, the table is pre-filled with
        identity mappings for 0-255 and the key is removed from map.

    """
    l = []
    append = l.append
    append('%s = (' % varname)

    # Analyze map and create table dict
    table = {}
    maxkey = 0
    if 'IDENTITY' in map:
        for key in range(256):
            table[key] = (key, '')
        maxkey = 255
        del map['IDENTITY']
    # The items must be fetched only after the 'IDENTITY' marker has
    # been removed; otherwise the marker would be processed like a
    # regular mapping and end up as (non-integer) maxkey
    mappings = sorted(map.items())
    for mapkey, mapvalue in mappings:
        mapcomment = ''
        if isinstance(mapkey, tuple):
            (mapkey, mapcomment) = mapkey
        if isinstance(mapvalue, tuple):
            (mapvalue, mapcomment) = mapvalue
        if mapkey is None:
            continue
        table[mapkey] = (mapvalue, mapcomment)
        if mapkey > maxkey:
            maxkey = mapkey
    if maxkey > MAX_TABLE_SIZE:
        # Table too large
        return None

    # Create table code
    for key in range(maxkey + 1):
        if key not in table:
            mapvalue = None
            mapcomment = 'UNDEFINED'
        else:
            mapvalue, mapcomment = table[key]
        if mapvalue is None:
            mapchar = UNI_UNDEFINED
        else:
            if isinstance(mapvalue, tuple):
                # 1-n mappings not supported
                return None
            else:
                mapchar = unichr(mapvalue)
        if mapcomment and comments:
            append('    %r\t#  %s -> %s' % (mapchar,
                                            hexrepr(key, key_precision),
                                            mapcomment))
        else:
            append('    %r' % mapchar)

    append(')')
    return l
247
def codegen(name, map, encodingname, comments=1):

    """ Returns the Python source of a codec module for the given map
        as string.

        name is put into the module docstring as the source the
        codec was generated from, encodingname is used in the
        docstring and (with '_' replaced by '-') as codec name in
        getregentry().

        The module uses a decoding table if one can be generated for
        map and falls back to decoding/encoding dictionaries
        otherwise.  Comments are included in the source, if comments
        is true (default).

    """
    # Generate all code variants up front, in this order; note that
    # python_mapdef_code() removes the 'IDENTITY' key from map
    decoding_map_code = python_mapdef_code('decoding_map', map,
                                           comments=comments)
    decoding_table_code = python_tabledef_code('decoding_table', map,
                                               comments=comments)
    encoding_map_code = python_mapdef_code('encoding_map',
                                           codecs.make_encoding_map(map),
                                           comments=comments,
                                           precisions=(4, 2))

    # Name suffix of the objects the codec classes refer to
    suffix = 'map'
    if decoding_table_code:
        suffix = 'table'

    header = '''\
""" Python Character Mapping Codec %s generated from '%s' with gencodec.py.

"""#"

import codecs

### Codec APIs

class Codec(codecs.Codec):

    def encode(self,input,errors='strict'):
        return codecs.charmap_encode(input,errors,encoding_%s)

    def decode(self,input,errors='strict'):
        return codecs.charmap_decode(input,errors,decoding_%s)
''' % (encodingname, name, suffix, suffix)

    incremental = '''\
class IncrementalEncoder(codecs.IncrementalEncoder):
    def encode(self, input, final=False):
        return codecs.charmap_encode(input,self.errors,encoding_%s)[0]

class IncrementalDecoder(codecs.IncrementalDecoder):
    def decode(self, input, final=False):
        return codecs.charmap_decode(input,self.errors,decoding_%s)[0]''' % (
        suffix, suffix)

    registry = '''
class StreamWriter(Codec,codecs.StreamWriter):
    pass

class StreamReader(Codec,codecs.StreamReader):
    pass

### encodings module API

def getregentry():
    return codecs.CodecInfo(
        name=%r,
        encode=Codec().encode,
        decode=Codec().decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamreader=StreamReader,
        streamwriter=StreamWriter,
    )
''' % encodingname.replace('_', '-')

    chunks = [header, incremental, registry]

    if decoding_table_code:
        # Decoding table; the encoding table is built from it when
        # the generated module gets imported
        chunks.append('''
### Decoding Table
''')
        chunks.extend(decoding_table_code)
        chunks.append('''
### Encoding table
encoding_table=codecs.charmap_build(decoding_table)
''')
    else:
        # No table available: use the two dictionaries
        chunks.append('''
### Decoding Map
''')
        chunks.extend(decoding_map_code)
        chunks.append('''
### Encoding Map
''')
        chunks.extend(encoding_map_code)

    # Final new-line
    chunks.append('')

    source = '\n'.join(chunks)
    return source.expandtabs()
352
def pymap(name,map,pyfile,encodingname,comments=1):

    """ Writes the codec module source generated by codegen() for
        map to the file pyfile.

        name, encodingname and comments are passed on to codegen().
        The source is generated before pyfile is opened, so a failing
        code generation does not touch the file.

    """
    code = codegen(name,map,encodingname,comments)
    # The with-block closes the file even if writing fails
    with open(pyfile,'w') as f:
        f.write(code)
359
def marshalmap(name,map,marshalfile):

    """ Writes a marshalled copy of map to the file marshalfile.

        All values of map have to be pairs (unicode_value, comment);
        they are stored as tuples.  name is not used.

    """
    d = {}
    for e,(u,c) in map.items():
        d[e] = (u,c)
    # The with-block closes the file even if marshal.dump() fails
    with open(marshalfile,'wb') as f:
        marshal.dump(d,f)
368
def convertdir(dir, dirprefix='', nameprefix='', comments=1):

    """ Converts all mapping files found in dir to codec modules and
        marshalled mapping files.

        The output name of a mapping file is nameprefix plus the
        part of its filename up to the first dot, in lowercase and
        with hyphens replaced by underscores.  The results are
        written to dirprefix + <name>.py and dirprefix +
        <name>.mapping.  Directory entries which are not regular
        files are skipped.

        A ValueError raised during the conversion is reported and
        then re-raised.

    """
    for mapname in os.listdir(dir):
        mappathname = os.path.join(dir, mapname)
        if not os.path.isfile(mappathname):
            continue

        # Derive the codec name from the filename
        basename = os.path.split(mapname)[1]
        stem = basename.replace('-','_').split('.')[0]
        name = nameprefix + stem.lower()
        codefile = name + '.py'
        marshalfile = name + '.mapping'
        print('converting %s to %s and %s' % (mapname,
                                              dirprefix + codefile,
                                              dirprefix + marshalfile))
        try:
            map = readmap(mappathname)
            if map:
                pymap(mappathname, map, dirprefix + codefile,name,comments)
                marshalmap(mappathname, map, dirprefix + marshalfile)
            else:
                print('* map is empty; skipping')
        except ValueError as why:
            print('* conversion failed: %s' % why)
            raise
396
def rewritepythondir(dir, dirprefix='', comments=1):

    """ Recreates the codec modules from the marshalled mapping files
        (*.mapping) found in dir.

        For each <name>.mapping the module source is written to
        dirprefix + <name>.py.  Empty mappings are skipped.  A
        ValueError raised during the conversion is reported; the
        remaining files are still processed.

    """
    mapnames = os.listdir(dir)
    for mapname in mapnames:
        if not mapname.endswith('.mapping'):
            continue
        name = mapname[:-len('.mapping')]
        codefile = name + '.py'
        print('converting %s to %s' % (mapname,
                                       dirprefix + codefile))
        try:
            # NOTE: marshal is not safe against malformed or malicious
            # data; only use this on .mapping files written by this
            # tool.  The with-block closes the file in all cases.
            with open(os.path.join(dir,mapname), 'rb') as f:
                map = marshal.load(f)
            if not map:
                print('* map is empty; skipping')
            else:
                pymap(mapname, map, dirprefix + codefile,name,comments)
        except ValueError as why:
            print('* conversion failed: %s' % why)
416
if __name__ == '__main__':

    import sys

    # The command line arguments are passed on as positional
    # arguments.  The constant condition is a hard-wired switch
    # between the two modes of operation; as it stands, convertdir()
    # is always the one which gets called.
    if not 1:
        rewritepythondir(*sys.argv[1:])
    else:
        convertdir(*sys.argv[1:])
424