#!/usr/bin/env python3
""" Utility for parsing HTML entity definitions available from:

      http://www.w3.org/ as e.g.
      http://www.w3.org/TR/REC-html40/HTMLlat1.ent

    Input is read from the file named by the first command line
    argument (or from stdin if no argument is given), output is written
    to the file named by the second argument (or to stdout) in form of
    a Python snippet defining a dictionary "entitydefs" mapping literal
    entity name to character or numeric entity.

    Marc-Andre Lemburg, mal@lemburg.com, 1999.
    Use as you like. NO WARRANTIES.

"""
import re
import sys

# Matches one SGML entity declaration of the form
#   <!ENTITY name CDATA "value" -- free-form comment -->
# Groups: (1) entity name, (2) quoted value, (3) comment text, which may
# span several lines and is matched non-greedily up to the closing "-->".
entityRE = re.compile(r'<!ENTITY +(\w+) +CDATA +"([^"]+)" +-- +((?:.|\n)+?) *-->')


def parse(text, pos=0, endpos=None):
    """Collect all entity declarations found in text[pos:endpos].

    text is the content of an entity definition file, pos the index at
    which scanning starts and endpos the index at which it stops
    (defaults to the end of text).

    Returns a dictionary mapping each entity name to a tuple
    (charcode, comment).  If a name is declared more than once, the
    last declaration wins.
    """
    # The start position supplied by the caller is honoured; it must not
    # be reset to 0 here.
    if endpos is None:
        endpos = len(text)
    d = {}
    for m in entityRE.finditer(text, pos, endpos):
        name, charcode, comment = m.groups()
        d[name] = charcode, comment
    return d


def writefile(f, defs):
    """Write defs to f as Python source defining the dict "entitydefs".

    f is any object with a write() method, defs a dictionary as
    returned by parse().  Entries are written sorted by entity name,
    one per line, with the whitespace-normalized comment appended as a
    trailing "#" comment.
    """
    f.write("entitydefs = {\n")
    for name, (charcode, comment) in sorted(defs.items()):
        if charcode[:2] == '&#':
            # Numeric character reference: strip the leading "&#" and
            # the trailing ";" to get the decimal code point.
            code = int(charcode[2:-1])
            if code < 256:
                # Codes below 256 are emitted as an octal string escape.
                charcode = r"'\%o'" % code
            else:
                # Larger codes stay as the literal entity reference.
                charcode = repr(charcode)
        else:
            charcode = repr(charcode)
        # Collapse newlines and runs of blanks so the comment fits on
        # one line of the generated source.
        comment = ' '.join(comment.split())
        f.write(" '%s':\t%s, \t# %s\n" % (name, charcode, comment))
    f.write('\n}\n')


if __name__ == '__main__':
    # Optional first argument: input file (default: stdin).
    if len(sys.argv) > 1:
        with open(sys.argv[1]) as infile:
            text = infile.read()
    else:
        text = sys.stdin.read()

    defs = parse(text)

    # Optional second argument: output file (default: stdout).
    if len(sys.argv) > 2:
        with open(sys.argv[2], 'w') as outfile:
            writefile(outfile, defs)
    else:
        writefile(sys.stdout, defs)