1#!/usr/bin/env python3
2""" Utility for parsing HTML entity definitions available from:
3
4      http://www.w3.org/ as e.g.
5      http://www.w3.org/TR/REC-html40/HTMLlat1.ent
6
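    Input is read from the file named by the first command-line argument
    (or stdin if none is given); output is written to the file named by
    the second argument (or stdout if none is given) in the form of a
    Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.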
10
11    Marc-Andre Lemburg, mal@lemburg.com, 1999.
12    Use as you like. NO WARRANTIES.
13
14"""
15import re,sys
16
# Matches one SGML entity declaration of the form
#     <!ENTITY name CDATA "value" -- comment -->
# Groups: (1) entity name, (2) quoted value, (3) comment text, which may
# span several lines; trailing spaces before "-->" are not captured.
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')

def parse(text,pos=0,endpos=None):
    """Extract entity declarations from *text*.

    Only the slice text[pos:endpos] is scanned; *pos* defaults to the
    start of the text and *endpos* (None) to its end.

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment) holding the quoted value and the raw comment
    of its declaration.  If a name is declared more than once, the
    last declaration wins.
    """
    if endpos is None:
        # Pattern.finditer() requires an integer end position.
        endpos = len(text)
    d = {}
    # NOTE: the start offset passed by the caller is honoured here; it
    # used to be reset to 0 unconditionally, which made *pos* a no-op.
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d
33
def writefile(f,defs):
    """Write *defs* to *f* as Python source defining ``entitydefs``.

    *defs* maps entity names to (charcode, comment) tuples.  Entries are
    written sorted by name, one per line.  A numeric reference of the
    form "&#NNN;" with NNN below 256 is written as a one-character
    string literal using an octal escape; every other value is written
    as the repr() of the original string.  Runs of whitespace in the
    comment are collapsed to single spaces.
    """
    emit = f.write
    emit("entitydefs = {\n")
    for entity, (value, remark) in sorted(defs.items()):
        literal = None
        if value[:2] == '&#':
            # Drop the leading "&#" and the trailing ";" to get the number.
            number = int(value[2:-1])
            if number < 256:
                literal = r"'\%o'" % number
        if literal is None:
            # Named/other values and code points >= 256 stay as-is.
            literal = repr(value)
        remark = ' '.join(remark.split())
        emit("    '%s':\t%s,  \t# %s\n" % (entity, literal, remark))
    emit('\n}\n')
50
if __name__ == '__main__':
    # Usage: script [infile [outfile]]
    # A missing infile means stdin; a missing outfile means stdout.
    args = sys.argv[1:]

    if args:
        with open(args[0]) as source:
            text = source.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    if len(args) >= 2:
        with open(args[1],'w') as target:
            writefile(target, defs)
    else:
        writefile(sys.stdout, defs)
65