# test_html.py revision d3faf43f9ba7da0ae504c9186b10d0fa3a8eb300
"""
Tests for the html module functions.
"""

import html
import unittest

try:
    # Nothing in this module uses run_unittest.  The import is kept so the
    # name stays available, but it must not stop the tests from loading on
    # interpreters installed without CPython's internal ``test`` package.
    from test.support import run_unittest  # noqa: F401
except ImportError:
    run_unittest = None


class HtmlTests(unittest.TestCase):
    """Exercise html.escape() and html.unescape()."""

    def test_escape(self):
        """escape() replaces &, < and >, plus both quote characters unless
        its second argument is false."""
        self.assertEqual(
            html.escape('\'<script>"&foo;"</script>\''),
            '&#x27;&lt;script&gt;&quot;&amp;foo;&quot;&lt;/script&gt;&#x27;')
        self.assertEqual(
            html.escape('\'<script>"&foo;"</script>\'', False),
            '\'&lt;script&gt;"&amp;foo;"&lt;/script&gt;\'')

    def test_unescape(self):
        """unescape() converts named and numeric character references and
        leaves everything that is not a valid reference untouched."""
        # Every numeric check is run in decimal and hexadecimal form, each
        # with and without the terminating semicolon.
        numeric_formats = ['&#%d', '&#%d;', '&#x%x', '&#x%x;']
        errmsg = 'unescape(%r) should have returned %r'

        def check(text, expected):
            """Assert that unescape(text) returns expected."""
            self.assertEqual(html.unescape(text), expected,
                             msg=errmsg % (text, expected))

        def check_num(num, expected):
            """Assert that every numeric spelling of num yields expected."""
            for fmt in numeric_formats:
                text = fmt % num
                self.assertEqual(html.unescape(text), expected,
                                 msg=errmsg % (text, expected))

        # check text with no character references
        check('no character references', 'no character references')
        # check & followed by invalid chars
        check('&\n&\t& &&', '&\n&\t& &&')
        # check & followed by numbers and letters
        check('&0 &9 &a &0; &9; &a;', '&0 &9 &a &0; &9; &a;')
        # check incomplete entities at the end of the string
        for x in ['&', '&#', '&#x', '&#X', '&#y', '&#xy', '&#Xy']:
            check(x, x)
            check(x+';', x+';')
        # check several combinations of numeric character references,
        # possibly followed by different characters
        formats = ['&#%d', '&#%07d', '&#%d;', '&#%07d;',
                   '&#x%x', '&#x%06x', '&#x%x;', '&#x%06x;',
                   '&#x%X', '&#x%06X', '&#X%x;', '&#X%06x;']
        for num, char in zip([65, 97, 34, 38, 0x2603, 0x101234],
                             ['A', 'a', '"', '&', '\u2603', '\U00101234']):
            for s in formats:
                check(s % num, char)
                for end in [' ', 'X']:
                    check((s+end) % num, char+end)
        # check invalid code points
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000]:
            check_num(cp, '\uFFFD')
        # check more invalid code points
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff]:
            check_num(cp, '')
        # check invalid numbers
        for num, ch in zip([0x0d, 0x80, 0x95, 0x9d], '\r\u20ac\u2022\x9d'):
            check_num(num, ch)
        # check small numbers
        check_num(0, '\uFFFD')
        check_num(9, '\t')
        # check a big number
        check_num(1000000000000000000, '\uFFFD')
        # check that multiple trailing semicolons are handled correctly
        for e in ['&quot;;', '&#34;;', '&#x22;;', '&#X22;;']:
            check(e, '";')
        # check that semicolons in the middle don't create problems
        for e in ['&quot;quot;', '&#34;quot;', '&#x22;quot;', '&#X22;quot;']:
            check(e, '"quot;')
        # check triple adjacent charrefs
        for e in ['&quot', '&#34', '&#x22', '&#X22']:
            check(e*3, '"""')
            check((e+';')*3, '"""')
        # check that the case is respected
        for e in ['&amp', '&amp;', '&AMP', '&AMP;']:
            check(e, '&')
        for e in ['&Amp', '&Amp;']:
            check(e, e)
        # check that non-existent named entities are returned unchanged
        check('&svadilfari;', '&svadilfari;')
        # the following examples are in the html5 specs
        check('&notit', '¬it')
        check('&notit;', '¬it;')
        check('&notin', '¬in')
        check('&notin;', '∉')
        # a similar example with a long name
        check('&notReallyAnExistingNamedCharacterReference;',
              '¬ReallyAnExistingNamedCharacterReference;')
        # longest valid name
        check('&CounterClockwiseContourIntegral;', '∳')
        # check a charref that maps to two unicode chars
        check('&acE;', '\u223E\u0333')
        check('&acE', '&acE')
        # see #12888
        check('&#123; ' * 1050, '{ ' * 1050)
        # see #15156
        check('&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri',
              'ÉricÉric&alphacentauriαcentauri')
        check('&co;', '&co;')


if __name__ == '__main__':
    unittest.main()