1"""
2General functions for HTML manipulation.
3"""
4
5import re as _re
6from html.entities import html5 as _html5
7
8
9__all__ = ['escape', 'unescape']
10
11
def escape(s, quote=True):
    """
    Return s with the characters "&", "<" and ">" replaced by the
    HTML-safe sequences "&amp;", "&lt;" and "&gt;".

    When the optional flag quote is true (the default), the double quote
    (") and single quote (') characters are replaced as well, by "&quot;"
    and "&#x27;" respectively.
    """
    # The ampersand must come first: every other replacement introduces
    # new "&" characters that must not be escaped a second time.
    substitutions = [("&", "&amp;"), ("<", "&lt;"), (">", "&gt;")]
    if quote:
        substitutions += [('"', "&quot;"), ("'", "&#x27;")]
    for char, entity in substitutions:
        s = s.replace(char, entity)
    return s
26
27
28# see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
29
# Numeric character references whose value must be replaced by a different
# character.  Keys are the numbers written in the reference, values are the
# text substituted for it.
_invalid_charrefs = {
    0x00: '\ufffd',  # REPLACEMENT CHARACTER
    0x0d: '\r',      # CARRIAGE RETURN
}
# The contiguous run 0x80..0x9F, listed in order (one entry per number).
_invalid_charrefs.update(zip(range(0x80, 0xa0), (
    '\u20ac',  # 0x80 EURO SIGN
    '\x81',    # 0x81 <control>
    '\u201a',  # 0x82 SINGLE LOW-9 QUOTATION MARK
    '\u0192',  # 0x83 LATIN SMALL LETTER F WITH HOOK
    '\u201e',  # 0x84 DOUBLE LOW-9 QUOTATION MARK
    '\u2026',  # 0x85 HORIZONTAL ELLIPSIS
    '\u2020',  # 0x86 DAGGER
    '\u2021',  # 0x87 DOUBLE DAGGER
    '\u02c6',  # 0x88 MODIFIER LETTER CIRCUMFLEX ACCENT
    '\u2030',  # 0x89 PER MILLE SIGN
    '\u0160',  # 0x8a LATIN CAPITAL LETTER S WITH CARON
    '\u2039',  # 0x8b SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    '\u0152',  # 0x8c LATIN CAPITAL LIGATURE OE
    '\x8d',    # 0x8d <control>
    '\u017d',  # 0x8e LATIN CAPITAL LETTER Z WITH CARON
    '\x8f',    # 0x8f <control>
    '\x90',    # 0x90 <control>
    '\u2018',  # 0x91 LEFT SINGLE QUOTATION MARK
    '\u2019',  # 0x92 RIGHT SINGLE QUOTATION MARK
    '\u201c',  # 0x93 LEFT DOUBLE QUOTATION MARK
    '\u201d',  # 0x94 RIGHT DOUBLE QUOTATION MARK
    '\u2022',  # 0x95 BULLET
    '\u2013',  # 0x96 EN DASH
    '\u2014',  # 0x97 EM DASH
    '\u02dc',  # 0x98 SMALL TILDE
    '\u2122',  # 0x99 TRADE MARK SIGN
    '\u0161',  # 0x9a LATIN SMALL LETTER S WITH CARON
    '\u203a',  # 0x9b SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    '\u0153',  # 0x9c LATIN SMALL LIGATURE OE
    '\x9d',    # 0x9d <control>
    '\u017e',  # 0x9e LATIN SMALL LETTER Z WITH CARON
    '\u0178',  # 0x9f LATIN CAPITAL LETTER Y WITH DIAERESIS
)))
66
# Code points that a numeric character reference is not allowed to produce.
# Built from ranges rather than spelled out one literal at a time.
_invalid_codepoints = set()
_invalid_codepoints.update(range(0x1, 0x9))        # 0x0001 to 0x0008
_invalid_codepoints.add(0xb)
_invalid_codepoints.update(range(0xe, 0x20))       # 0x000E to 0x001F
_invalid_codepoints.update(range(0x7f, 0xa0))      # 0x007F to 0x009F
_invalid_codepoints.update(range(0xfdd0, 0xfdf0))  # 0xFDD0 to 0xFDEF
# others: the last two code points (xFFFE and xFFFF) of every plane
# from 0x0 through 0x10
_invalid_codepoints.update(
    (plane << 16) | tail
    for plane in range(0x11)
    for tail in (0xfffe, 0xffff)
)
89
90
def _replace_charref(s):
    """
    Return the replacement text for one character reference.

    s is a match object whose group 1 holds the reference without its
    leading "&": either a numeric reference ("#123", "#x7b", optionally
    followed by ";") or a candidate entity name.

    Numeric references are mapped through _invalid_charrefs, replaced by
    U+FFFD when they denote a surrogate or a value above 0x10FFFF, dropped
    when listed in _invalid_codepoints, and otherwise converted with chr().
    Named references are looked up in the html5 entity table, falling back
    to the longest prefix that is a known name; unknown names are returned
    unchanged (with the "&" restored).
    """
    s = s.group(1)
    if s[0] == '#':
        # numeric charref
        if s[1] in 'xX':
            digits, base = s[2:], 16
        else:
            digits, base = s[1:], 10
        digits = digits.rstrip(';').lstrip('0')
        # 0x10FFFF needs 6 hex digits or 7 decimal digits, so anything with
        # more than 7 significant digits is out of range.  Answering here
        # avoids int() on arbitrarily long input, which is slow and raises
        # ValueError once the interpreter's int/str digit limit is exceeded.
        if len(digits) > 7:
            return '\uFFFD'
        # An all-zero reference leaves no digits after stripping.
        num = int(digits or '0', base)
        if num in _invalid_charrefs:
            return _invalid_charrefs[num]
        if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
            return '\uFFFD'
        if num in _invalid_codepoints:
            return ''
        return chr(num)
    else:
        # named charref
        if s in _html5:
            return _html5[s]
        # find the longest matching name (as defined by the standard)
        for x in range(len(s)-1, 1, -1):
            if s[:x] in _html5:
                # keep the unmatched tail as literal text
                return _html5[s[:x]] + s[x:]
        else:
            return '&' + s
116
117
# Matches one candidate character reference.  Group 1 is everything after
# the "&"; the trailing ";" is optional in all three forms.
_charref = _re.compile(
    r'&('
    r'#[0-9]+;?'                  # decimal numeric reference
    r'|#[xX][0-9a-fA-F]+;?'       # hexadecimal numeric reference
    r'|[^\t\n\f <&#;]{1,32};?'    # candidate entity name, 1 to 32 characters
    r')'
)
121
def unescape(s):
    """
    Return s with every named and numeric character reference (e.g. &gt;,
    &#62;, &#x3e;) replaced by the corresponding unicode characters.

    The HTML 5 rules are applied to both valid and invalid character
    references, and names are resolved against the list of HTML 5 named
    character references defined in html.entities.html5.
    """
    # A string without "&" cannot contain a reference, so it is returned
    # as-is without running the regular expression.
    return _charref.sub(_replace_charref, s) if '&' in s else s
133