"""
Tests for the html module functions.
"""

import html
import unittest


class HtmlTests(unittest.TestCase):
    """Exercise html.escape() and html.unescape()."""

    def test_escape(self):
        """escape() replaces <, > and &, and quotes unless quote is False."""
        # Default (quote=True): both single and double quotes are escaped.
        self.assertEqual(
            html.escape('\'<script>"&foo;"</script>\''),
            '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
        # quote=False: quote characters are passed through untouched.
        self.assertEqual(
            html.escape('\'<script>"&foo;"</script>\'', False),
            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

    def test_unescape(self):
        """unescape() decodes numeric and named character references."""
        numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
        errmsg = 'unescape(%r) should have returned %r'

        def check(text, expected):
            # Assert that a single input string decodes to `expected`.
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

        def check_num(num, expected):
            # Assert that `num`, written as a decimal and as a hex
            # reference, with and without the trailing ';', decodes to
            # `expected`.
            for fmt in numeric_formats:
                text = fmt % num
                self.assertEqual(html.unescape(text), expected,
                                 msg=errmsg % (text, expected))

        # check text with no character references
        check('no character references', 'no character references')
        # check & followed by invalid chars
        check('&\n&\t& &&', '&\n&\t& &&')
        # check & followed by numbers and letters
        check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
        # check incomplete entities at the end of the string
        for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
            check(x, x)
            check(x+';', x+';')
        # check several combinations of numeric character references,
        # possibly followed by different characters
        formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
                   '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                   '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
        for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
            for s in formats:
                check(s % num, char)
                for end in [' ', 'X']:
                    check((s+end) % num, char+end)
        # check invalid code points
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
            check_num(cp, '\uFFFD')
        # check more invalid code points
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
            check_num(cp, '')
        # check invalid numbers
        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
            check_num(num, ch)
        # check small numbers
        check_num(0, '\uFFFD')
        check_num(9, '\t')
        # check a big number
        check_num(1000000000000000000, '\uFFFD')
        # check that multiple trailing semicolons are handled correctly
        for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
            check(e, '";')
        # check that semicolons in the middle don't create problems
        for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
            check(e, '"quot;')
        # check triple adjacent charrefs
        for e in ['&quot', '&#34', '&#x22', '&#X22']:
            check(e*3, '"""')
            check((e+';')*3, '"""')
        # check that the case is respected
        for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
            check(e, '&')
        for e in ['&Amp', '&Amp;']:
            check(e, e)
        # check that non-existent named entities are returned unchanged
        check('&svadilfari;', '&svadilfari;')
        # the following examples are in the html5 specs
        check('&notit', '¬it')
        check('&notit;', '¬it;')
        check('&notin', '¬in')
        check('&notin;', '∉')
        # a similar example with a long name
        check('&notReallyAnExistingNamedCharacterReference;',
              '¬ReallyAnExistingNamedCharacterReference;')
        # longest valid name
        check('&CounterClockwiseContourIntegral;', '∳')
        # check a charref that maps to two unicode chars
        check('&acE;', '\u223E\u0333')
        check('&acE', '&acE')
        # see #12888
        check('&#123; ' * 1050, '{ ' * 1050)
        # see #15156
        check('&Eacute;ric&Eacute;ric&alphacentauri&alpha;centauri',
              'ÉricÉric&alphacentauriαcentauri')
        check('&co;', '&co;')


if __name__ == '__main__':
    unittest.main()