1import codecs 2import html.entities 3import sys 4import test.support 5import unicodedata 6import unittest 7 8class PosReturn: 9 # this can be used for configurable callbacks 10 11 def __init__(self): 12 self.pos = 0 13 14 def handle(self, exc): 15 oldpos = self.pos 16 realpos = oldpos 17 if realpos<0: 18 realpos = len(exc.object) + realpos 19 # if we don't advance this time, terminate on the next call 20 # otherwise we'd get an endless loop 21 if realpos <= exc.start: 22 self.pos = len(exc.object) 23 return ("<?>", oldpos) 24 25# A UnicodeEncodeError object with a bad start attribute 26class BadStartUnicodeEncodeError(UnicodeEncodeError): 27 def __init__(self): 28 UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") 29 self.start = [] 30 31# A UnicodeEncodeError object with a bad object attribute 32class BadObjectUnicodeEncodeError(UnicodeEncodeError): 33 def __init__(self): 34 UnicodeEncodeError.__init__(self, "ascii", "", 0, 1, "bad") 35 self.object = [] 36 37# A UnicodeDecodeError object without an end attribute 38class NoEndUnicodeDecodeError(UnicodeDecodeError): 39 def __init__(self): 40 UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") 41 del self.end 42 43# A UnicodeDecodeError object with a bad object attribute 44class BadObjectUnicodeDecodeError(UnicodeDecodeError): 45 def __init__(self): 46 UnicodeDecodeError.__init__(self, "ascii", bytearray(b""), 0, 1, "bad") 47 self.object = [] 48 49# A UnicodeTranslateError object without a start attribute 50class NoStartUnicodeTranslateError(UnicodeTranslateError): 51 def __init__(self): 52 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 53 del self.start 54 55# A UnicodeTranslateError object without an end attribute 56class NoEndUnicodeTranslateError(UnicodeTranslateError): 57 def __init__(self): 58 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 59 del self.end 60 61# A UnicodeTranslateError object without an object attribute 62class NoObjectUnicodeTranslateError(UnicodeTranslateError): 63 def __init__(self): 64 UnicodeTranslateError.__init__(self, "", 0, 1, "bad") 65 del self.object 66 67class CodecCallbackTest(unittest.TestCase): 68 69 def test_xmlcharrefreplace(self): 70 # replace unencodable characters which numeric character entities. 71 # For ascii, latin-1 and charmaps this is completely implemented 72 # in C and should be reasonably fast. 73 s = "\u30b9\u30d1\u30e2 \xe4nd eggs" 74 self.assertEqual( 75 s.encode("ascii", "xmlcharrefreplace"), 76 b"スパモ änd eggs" 77 ) 78 self.assertEqual( 79 s.encode("latin-1", "xmlcharrefreplace"), 80 b"スパモ \xe4nd eggs" 81 ) 82 83 def test_xmlcharnamereplace(self): 84 # This time use a named character entity for unencodable 85 # characters, if one is available. 86 87 def xmlcharnamereplace(exc): 88 if not isinstance(exc, UnicodeEncodeError): 89 raise TypeError("don't know how to handle %r" % exc) 90 l = [] 91 for c in exc.object[exc.start:exc.end]: 92 try: 93 l.append("&%s;" % html.entities.codepoint2name[ord(c)]) 94 except KeyError: 95 l.append("&#%d;" % ord(c)) 96 return ("".join(l), exc.end) 97 98 codecs.register_error( 99 "test.xmlcharnamereplace", xmlcharnamereplace) 100 101 sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a" 102 sout = b"«ℜ» = ⟨ሴ€⟩" 103 self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout) 104 sout = b"\xabℜ\xbb = ⟨ሴ€⟩" 105 self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout) 106 sout = b"\xabℜ\xbb = ⟨ሴ\xa4⟩" 107 self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout) 108 109 def test_uninamereplace(self): 110 # We're using the names from the unicode database this time, 111 # and we're doing "syntax highlighting" here, i.e. we include 112 # the replaced text in ANSI escape sequences. For this it is 113 # useful that the error handler is not called for every single 114 # unencodable character, but for a complete sequence of 115 # unencodable characters, otherwise we would output many 116 # unnecessary escape sequences. 117 118 def uninamereplace(exc): 119 if not isinstance(exc, UnicodeEncodeError): 120 raise TypeError("don't know how to handle %r" % exc) 121 l = [] 122 for c in exc.object[exc.start:exc.end]: 123 l.append(unicodedata.name(c, "0x%x" % ord(c))) 124 return ("\033[1m%s\033[0m" % ", ".join(l), exc.end) 125 126 codecs.register_error( 127 "test.uninamereplace", uninamereplace) 128 129 sin = "\xac\u1234\u20ac\u8000" 130 sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 131 self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout) 132 133 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m" 134 self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout) 135 136 sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m" 137 self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout) 138 139 def test_backslashescape(self): 140 # Does the same as the "unicode-escape" encoding, but with different 141 # base encodings. 142 sin = "a\xac\u1234\u20ac\u8000\U0010ffff" 143 sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff" 144 self.assertEqual(sin.encode("ascii", "backslashreplace"), sout) 145 146 sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff" 147 self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout) 148 149 sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff" 150 self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout) 151 152 def test_nameescape(self): 153 # Does the same as backslashescape, but prefers ``\N{...}`` escape 154 # sequences. 155 sin = "a\xac\u1234\u20ac\u8000\U0010ffff" 156 sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' 157 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 158 self.assertEqual(sin.encode("ascii", "namereplace"), sout) 159 160 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}' 161 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 162 self.assertEqual(sin.encode("latin-1", "namereplace"), sout) 163 164 sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4' 165 b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff') 166 self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout) 167 168 def test_decoding_callbacks(self): 169 # This is a test for a decoding callback handler 170 # that allows the decoding of the invalid sequence 171 # "\xc0\x80" and returns "\x00" instead of raising an error. 172 # All other illegal sequences will be handled strictly. 173 def relaxedutf8(exc): 174 if not isinstance(exc, UnicodeDecodeError): 175 raise TypeError("don't know how to handle %r" % exc) 176 if exc.object[exc.start:exc.start+2] == b"\xc0\x80": 177 return ("\x00", exc.start+2) # retry after two bytes 178 else: 179 raise exc 180 181 codecs.register_error("test.relaxedutf8", relaxedutf8) 182 183 # all the "\xc0\x80" will be decoded to "\x00" 184 sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80" 185 sout = "a\x00b\x00c\xfc\x00\x00" 186 self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout) 187 188 # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised 189 sin = b"\xc0\x80\xc0\x81" 190 self.assertRaises(UnicodeDecodeError, sin.decode, 191 "utf-8", "test.relaxedutf8") 192 193 def test_charmapencode(self): 194 # For charmap encodings the replacement string will be 195 # mapped through the encoding again. This means, that 196 # to be able to use e.g. the "replace" handler, the 197 # charmap has to have a mapping for "?". 198 charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh") 199 sin = "abc" 200 sout = b"AABBCC" 201 self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout) 202 203 sin = "abcA" 204 self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap) 205 206 charmap[ord("?")] = b"XYZ" 207 sin = "abcDEF" 208 sout = b"AABBCCXYZXYZXYZ" 209 self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout) 210 211 charmap[ord("?")] = "XYZ" # wrong type in mapping 212 self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap) 213 214 def test_decodeunicodeinternal(self): 215 with test.support.check_warnings(('unicode_internal codec has been ' 216 'deprecated', DeprecationWarning)): 217 self.assertRaises( 218 UnicodeDecodeError, 219 b"\x00\x00\x00\x00\x00".decode, 220 "unicode-internal", 221 ) 222 if len('\0'.encode('unicode-internal')) == 4: 223 def handler_unicodeinternal(exc): 224 if not isinstance(exc, UnicodeDecodeError): 225 raise TypeError("don't know how to handle %r" % exc) 226 return ("\x01", 1) 227 228 self.assertEqual( 229 b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"), 230 "\u0000" 231 ) 232 233 self.assertEqual( 234 b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"), 235 "\u0000\ufffd" 236 ) 237 238 self.assertEqual( 239 b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"), 240 "\u0000\\x00" 241 ) 242 243 codecs.register_error("test.hui", handler_unicodeinternal) 244 245 self.assertEqual( 246 b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"), 247 "\u0000\u0001\u0000" 248 ) 249 250 def test_callbacks(self): 251 def handler1(exc): 252 r = range(exc.start, exc.end) 253 if isinstance(exc, UnicodeEncodeError): 254 l = ["<%d>" % ord(exc.object[pos]) for pos in r] 255 elif isinstance(exc, UnicodeDecodeError): 256 l = ["<%d>" % exc.object[pos] for pos in r] 257 else: 258 raise TypeError("don't know how to handle %r" % exc) 259 return ("[%s]" % "".join(l), exc.end) 260 261 codecs.register_error("test.handler1", handler1) 262 263 def handler2(exc): 264 if not isinstance(exc, UnicodeDecodeError): 265 raise TypeError("don't know how to handle %r" % exc) 266 l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)] 267 return ("[%s]" % "".join(l), exc.end+1) # skip one character 268 269 codecs.register_error("test.handler2", handler2) 270 271 s = b"\x00\x81\x7f\x80\xff" 272 273 self.assertEqual( 274 s.decode("ascii", "test.handler1"), 275 "\x00[<129>]\x7f[<128>][<255>]" 276 ) 277 self.assertEqual( 278 s.decode("ascii", "test.handler2"), 279 "\x00[<129>][<128>]" 280 ) 281 282 self.assertEqual( 283 b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"), 284 "\u3042[<92><117><51>]xxx" 285 ) 286 287 self.assertEqual( 288 b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"), 289 "\u3042[<92><117><51>]xx" 290 ) 291 292 self.assertEqual( 293 codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0], 294 "z[<98>][<99>]" 295 ) 296 297 self.assertEqual( 298 "g\xfc\xdfrk".encode("ascii", "test.handler1"), 299 b"g[<252><223>]rk" 300 ) 301 302 self.assertEqual( 303 "g\xfc\xdf".encode("ascii", "test.handler1"), 304 b"g[<252><223>]" 305 ) 306 307 def test_longstrings(self): 308 # test long strings to check for memory overflow problems 309 errors = [ "strict", "ignore", "replace", "xmlcharrefreplace", 310 "backslashreplace", "namereplace"] 311 # register the handlers under different names, 312 # to prevent the codec from recognizing the name 313 for err in errors: 314 codecs.register_error("test." + err, codecs.lookup_error(err)) 315 l = 1000 316 errors += [ "test." + err for err in errors ] 317 for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]: 318 for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15", 319 "utf-8", "utf-7", "utf-16", "utf-32"): 320 for err in errors: 321 try: 322 uni.encode(enc, err) 323 except UnicodeError: 324 pass 325 326 def check_exceptionobjectargs(self, exctype, args, msg): 327 # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion 328 # check with one missing argument 329 self.assertRaises(TypeError, exctype, *args[:-1]) 330 # check with one argument too much 331 self.assertRaises(TypeError, exctype, *(args + ["too much"])) 332 # check with one argument of the wrong type 333 wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ] 334 for i in range(len(args)): 335 for wrongarg in wrongargs: 336 if type(wrongarg) is type(args[i]): 337 continue 338 # build argument array 339 callargs = [] 340 for j in range(len(args)): 341 if i==j: 342 callargs.append(wrongarg) 343 else: 344 callargs.append(args[i]) 345 self.assertRaises(TypeError, exctype, *callargs) 346 347 # check with the correct number and type of arguments 348 exc = exctype(*args) 349 self.assertEqual(str(exc), msg) 350 351 def test_unicodeencodeerror(self): 352 self.check_exceptionobjectargs( 353 UnicodeEncodeError, 354 ["ascii", "g\xfcrk", 1, 2, "ouch"], 355 "'ascii' codec can't encode character '\\xfc' in position 1: ouch" 356 ) 357 self.check_exceptionobjectargs( 358 UnicodeEncodeError, 359 ["ascii", "g\xfcrk", 1, 4, "ouch"], 360 "'ascii' codec can't encode characters in position 1-3: ouch" 361 ) 362 self.check_exceptionobjectargs( 363 UnicodeEncodeError, 364 ["ascii", "\xfcx", 0, 1, "ouch"], 365 "'ascii' codec can't encode character '\\xfc' in position 0: ouch" 366 ) 367 self.check_exceptionobjectargs( 368 UnicodeEncodeError, 369 ["ascii", "\u0100x", 0, 1, "ouch"], 370 "'ascii' codec can't encode character '\\u0100' in position 0: ouch" 371 ) 372 self.check_exceptionobjectargs( 373 UnicodeEncodeError, 374 ["ascii", "\uffffx", 0, 1, "ouch"], 375 "'ascii' codec can't encode character '\\uffff' in position 0: ouch" 376 ) 377 self.check_exceptionobjectargs( 378 UnicodeEncodeError, 379 ["ascii", "\U00010000x", 0, 1, "ouch"], 380 "'ascii' codec can't encode character '\\U00010000' in position 0: ouch" 381 ) 382 383 def test_unicodedecodeerror(self): 384 self.check_exceptionobjectargs( 385 UnicodeDecodeError, 386 ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"], 387 "'ascii' codec can't decode byte 0xfc in position 1: ouch" 388 ) 389 self.check_exceptionobjectargs( 390 UnicodeDecodeError, 391 ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"], 392 "'ascii' codec can't decode bytes in position 1-2: ouch" 393 ) 394 395 def test_unicodetranslateerror(self): 396 self.check_exceptionobjectargs( 397 UnicodeTranslateError, 398 ["g\xfcrk", 1, 2, "ouch"], 399 "can't translate character '\\xfc' in position 1: ouch" 400 ) 401 self.check_exceptionobjectargs( 402 UnicodeTranslateError, 403 ["g\u0100rk", 1, 2, "ouch"], 404 "can't translate character '\\u0100' in position 1: ouch" 405 ) 406 self.check_exceptionobjectargs( 407 UnicodeTranslateError, 408 ["g\uffffrk", 1, 2, "ouch"], 409 "can't translate character '\\uffff' in position 1: ouch" 410 ) 411 self.check_exceptionobjectargs( 412 UnicodeTranslateError, 413 ["g\U00010000rk", 1, 2, "ouch"], 414 "can't translate character '\\U00010000' in position 1: ouch" 415 ) 416 self.check_exceptionobjectargs( 417 UnicodeTranslateError, 418 ["g\xfcrk", 1, 3, "ouch"], 419 "can't translate characters in position 1-2: ouch" 420 ) 421 422 def test_badandgoodstrictexceptions(self): 423 # "strict" complains about a non-exception passed in 424 self.assertRaises( 425 TypeError, 426 codecs.strict_errors, 427 42 428 ) 429 # "strict" complains about the wrong exception type 430 self.assertRaises( 431 Exception, 432 codecs.strict_errors, 433 Exception("ouch") 434 ) 435 436 # If the correct exception is passed in, "strict" raises it 437 self.assertRaises( 438 UnicodeEncodeError, 439 codecs.strict_errors, 440 UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch") 441 ) 442 self.assertRaises( 443 UnicodeDecodeError, 444 codecs.strict_errors, 445 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 446 ) 447 self.assertRaises( 448 UnicodeTranslateError, 449 codecs.strict_errors, 450 UnicodeTranslateError("\u3042", 0, 1, "ouch") 451 ) 452 453 def test_badandgoodignoreexceptions(self): 454 # "ignore" complains about a non-exception passed in 455 self.assertRaises( 456 TypeError, 457 codecs.ignore_errors, 458 42 459 ) 460 # "ignore" complains about the wrong exception type 461 self.assertRaises( 462 TypeError, 463 codecs.ignore_errors, 464 UnicodeError("ouch") 465 ) 466 # If the correct exception is passed in, "ignore" returns an empty replacement 467 self.assertEqual( 468 codecs.ignore_errors( 469 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), 470 ("", 2) 471 ) 472 self.assertEqual( 473 codecs.ignore_errors( 474 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), 475 ("", 2) 476 ) 477 self.assertEqual( 478 codecs.ignore_errors( 479 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), 480 ("", 2) 481 ) 482 483 def test_badandgoodreplaceexceptions(self): 484 # "replace" complains about a non-exception passed in 485 self.assertRaises( 486 TypeError, 487 codecs.replace_errors, 488 42 489 ) 490 # "replace" complains about the wrong exception type 491 self.assertRaises( 492 TypeError, 493 codecs.replace_errors, 494 UnicodeError("ouch") 495 ) 496 self.assertRaises( 497 TypeError, 498 codecs.replace_errors, 499 BadObjectUnicodeEncodeError() 500 ) 501 self.assertRaises( 502 TypeError, 503 codecs.replace_errors, 504 BadObjectUnicodeDecodeError() 505 ) 506 # With the correct exception, "replace" returns an "?" or "\ufffd" replacement 507 self.assertEqual( 508 codecs.replace_errors( 509 UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")), 510 ("?", 2) 511 ) 512 self.assertEqual( 513 codecs.replace_errors( 514 UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")), 515 ("\ufffd", 2) 516 ) 517 self.assertEqual( 518 codecs.replace_errors( 519 UnicodeTranslateError("a\u3042b", 1, 2, "ouch")), 520 ("\ufffd", 2) 521 ) 522 523 def test_badandgoodxmlcharrefreplaceexceptions(self): 524 # "xmlcharrefreplace" complains about a non-exception passed in 525 self.assertRaises( 526 TypeError, 527 codecs.xmlcharrefreplace_errors, 528 42 529 ) 530 # "xmlcharrefreplace" complains about the wrong exception types 531 self.assertRaises( 532 TypeError, 533 codecs.xmlcharrefreplace_errors, 534 UnicodeError("ouch") 535 ) 536 # "xmlcharrefreplace" can only be used for encoding 537 self.assertRaises( 538 TypeError, 539 codecs.xmlcharrefreplace_errors, 540 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 541 ) 542 self.assertRaises( 543 TypeError, 544 codecs.xmlcharrefreplace_errors, 545 UnicodeTranslateError("\u3042", 0, 1, "ouch") 546 ) 547 # Use the correct exception 548 cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000, 549 999999, 1000000) 550 cs += (0xd800, 0xdfff) 551 s = "".join(chr(c) for c in cs) 552 self.assertEqual( 553 codecs.xmlcharrefreplace_errors( 554 UnicodeEncodeError("ascii", "a" + s + "b", 555 1, 1 + len(s), "ouch") 556 ), 557 ("".join("&#%d;" % c for c in cs), 1 + len(s)) 558 ) 559 560 def test_badandgoodbackslashreplaceexceptions(self): 561 # "backslashreplace" complains about a non-exception passed in 562 self.assertRaises( 563 TypeError, 564 codecs.backslashreplace_errors, 565 42 566 ) 567 # "backslashreplace" complains about the wrong exception types 568 self.assertRaises( 569 TypeError, 570 codecs.backslashreplace_errors, 571 UnicodeError("ouch") 572 ) 573 # Use the correct exception 574 tests = [ 575 ("\u3042", "\\u3042"), 576 ("\n", "\\x0a"), 577 ("a", "\\x61"), 578 ("\x00", "\\x00"), 579 ("\xff", "\\xff"), 580 ("\u0100", "\\u0100"), 581 ("\uffff", "\\uffff"), 582 ("\U00010000", "\\U00010000"), 583 ("\U0010ffff", "\\U0010ffff"), 584 # Lone surrogates 585 ("\ud800", "\\ud800"), 586 ("\udfff", "\\udfff"), 587 ("\ud800\udfff", "\\ud800\\udfff"), 588 ] 589 for s, r in tests: 590 with self.subTest(str=s): 591 self.assertEqual( 592 codecs.backslashreplace_errors( 593 UnicodeEncodeError("ascii", "a" + s + "b", 594 1, 1 + len(s), "ouch")), 595 (r, 1 + len(s)) 596 ) 597 self.assertEqual( 598 codecs.backslashreplace_errors( 599 UnicodeTranslateError("a" + s + "b", 600 1, 1 + len(s), "ouch")), 601 (r, 1 + len(s)) 602 ) 603 tests = [ 604 (b"a", "\\x61"), 605 (b"\n", "\\x0a"), 606 (b"\x00", "\\x00"), 607 (b"\xff", "\\xff"), 608 ] 609 for b, r in tests: 610 with self.subTest(bytes=b): 611 self.assertEqual( 612 codecs.backslashreplace_errors( 613 UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"), 614 1, 2, "ouch")), 615 (r, 2) 616 ) 617 618 def test_badandgoodnamereplaceexceptions(self): 619 # "namereplace" complains about a non-exception passed in 620 self.assertRaises( 621 TypeError, 622 codecs.namereplace_errors, 623 42 624 ) 625 # "namereplace" complains about the wrong exception types 626 self.assertRaises( 627 TypeError, 628 codecs.namereplace_errors, 629 UnicodeError("ouch") 630 ) 631 # "namereplace" can only be used for encoding 632 self.assertRaises( 633 TypeError, 634 codecs.namereplace_errors, 635 UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch") 636 ) 637 self.assertRaises( 638 TypeError, 639 codecs.namereplace_errors, 640 UnicodeTranslateError("\u3042", 0, 1, "ouch") 641 ) 642 # Use the correct exception 643 tests = [ 644 ("\u3042", "\\N{HIRAGANA LETTER A}"), 645 ("\x00", "\\x00"), 646 ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH " 647 "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"), 648 ("\U000e007f", "\\N{CANCEL TAG}"), 649 ("\U0010ffff", "\\U0010ffff"), 650 # Lone surrogates 651 ("\ud800", "\\ud800"), 652 ("\udfff", "\\udfff"), 653 ("\ud800\udfff", "\\ud800\\udfff"), 654 ] 655 for s, r in tests: 656 with self.subTest(str=s): 657 self.assertEqual( 658 codecs.namereplace_errors( 659 UnicodeEncodeError("ascii", "a" + s + "b", 660 1, 1 + len(s), "ouch")), 661 (r, 1 + len(s)) 662 ) 663 664 def test_badandgoodsurrogateescapeexceptions(self): 665 surrogateescape_errors = codecs.lookup_error('surrogateescape') 666 # "surrogateescape" complains about a non-exception passed in 667 self.assertRaises( 668 TypeError, 669 surrogateescape_errors, 670 42 671 ) 672 # "surrogateescape" complains about the wrong exception types 673 self.assertRaises( 674 TypeError, 675 surrogateescape_errors, 676 UnicodeError("ouch") 677 ) 678 # "surrogateescape" can not be used for translating 679 self.assertRaises( 680 TypeError, 681 surrogateescape_errors, 682 UnicodeTranslateError("\udc80", 0, 1, "ouch") 683 ) 684 # Use the correct exception 685 for s in ("a", "\udc7f", "\udd00"): 686 with self.subTest(str=s): 687 self.assertRaises( 688 UnicodeEncodeError, 689 surrogateescape_errors, 690 UnicodeEncodeError("ascii", s, 0, 1, "ouch") 691 ) 692 self.assertEqual( 693 surrogateescape_errors( 694 UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")), 695 (b"\x80", 2) 696 ) 697 self.assertRaises( 698 UnicodeDecodeError, 699 surrogateescape_errors, 700 UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch") 701 ) 702 self.assertEqual( 703 surrogateescape_errors( 704 UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")), 705 ("\udc80", 2) 706 ) 707 708 def test_badandgoodsurrogatepassexceptions(self): 709 surrogatepass_errors = codecs.lookup_error('surrogatepass') 710 # "surrogatepass" complains about a non-exception passed in 711 self.assertRaises( 712 TypeError, 713 surrogatepass_errors, 714 42 715 ) 716 # "surrogatepass" complains about the wrong exception types 717 self.assertRaises( 718 TypeError, 719 surrogatepass_errors, 720 UnicodeError("ouch") 721 ) 722 # "surrogatepass" can not be used for translating 723 self.assertRaises( 724 TypeError, 725 surrogatepass_errors, 726 UnicodeTranslateError("\ud800", 0, 1, "ouch") 727 ) 728 # Use the correct exception 729 for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"): 730 with self.subTest(encoding=enc): 731 self.assertRaises( 732 UnicodeEncodeError, 733 surrogatepass_errors, 734 UnicodeEncodeError(enc, "a", 0, 1, "ouch") 735 ) 736 self.assertRaises( 737 UnicodeDecodeError, 738 surrogatepass_errors, 739 UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch") 740 ) 741 for s in ("\ud800", "\udfff", "\ud800\udfff"): 742 with self.subTest(str=s): 743 self.assertRaises( 744 UnicodeEncodeError, 745 surrogatepass_errors, 746 UnicodeEncodeError("ascii", s, 0, len(s), "ouch") 747 ) 748 tests = [ 749 ("utf-8", "\ud800", b'\xed\xa0\x80', 3), 750 ("utf-16le", "\ud800", b'\x00\xd8', 2), 751 ("utf-16be", "\ud800", b'\xd8\x00', 2), 752 ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4), 753 ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4), 754 ("utf-8", "\udfff", b'\xed\xbf\xbf', 3), 755 ("utf-16le", "\udfff", b'\xff\xdf', 2), 756 ("utf-16be", "\udfff", b'\xdf\xff', 2), 757 ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4), 758 ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4), 759 ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3), 760 ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2), 761 ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2), 762 ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4), 763 ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4), 764 ] 765 for enc, s, b, n in tests: 766 with self.subTest(encoding=enc, str=s, bytes=b): 767 self.assertEqual( 768 surrogatepass_errors( 769 UnicodeEncodeError(enc, "a" + s + "b", 770 1, 1 + len(s), "ouch")), 771 (b, 1 + len(s)) 772 ) 773 self.assertEqual( 774 surrogatepass_errors( 775 UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"), 776 1, 1 + n, "ouch")), 777 (s[:1], 1 + n) 778 ) 779 780 def test_badhandlerresults(self): 781 results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) ) 782 encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15") 783 784 for res in results: 785 codecs.register_error("test.badhandler", lambda x: res) 786 for enc in encs: 787 self.assertRaises( 788 TypeError, 789 "\u3042".encode, 790 enc, 791 "test.badhandler" 792 ) 793 for (enc, bytes) in ( 794 ("ascii", b"\xff"), 795 ("utf-8", b"\xff"), 796 ("utf-7", b"+x-"), 797 ("unicode-internal", b"\x00"), 798 ): 799 with test.support.check_warnings(): 800 # unicode-internal has been deprecated 801 self.assertRaises( 802 TypeError, 803 bytes.decode, 804 enc, 805 "test.badhandler" 806 ) 807 808 def test_lookup(self): 809 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 810 self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore")) 811 self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict")) 812 self.assertEqual( 813 codecs.xmlcharrefreplace_errors, 814 codecs.lookup_error("xmlcharrefreplace") 815 ) 816 self.assertEqual( 817 codecs.backslashreplace_errors, 818 codecs.lookup_error("backslashreplace") 819 ) 820 self.assertEqual( 821 codecs.namereplace_errors, 822 codecs.lookup_error("namereplace") 823 ) 824 825 def test_unencodablereplacement(self): 826 def unencrepl(exc): 827 if isinstance(exc, UnicodeEncodeError): 828 return ("\u4242", exc.end) 829 else: 830 raise TypeError("don't know how to handle %r" % exc) 831 codecs.register_error("test.unencreplhandler", unencrepl) 832 for enc in ("ascii", "iso-8859-1", "iso-8859-15"): 833 self.assertRaises( 834 UnicodeEncodeError, 835 "\u4242".encode, 836 enc, 837 "test.unencreplhandler" 838 ) 839 840 def test_badregistercall(self): 841 # enhance coverage of: 842 # Modules/_codecsmodule.c::register_error() 843 # Python/codecs.c::PyCodec_RegisterError() 844 self.assertRaises(TypeError, codecs.register_error, 42) 845 self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42) 846 847 def test_badlookupcall(self): 848 # enhance coverage of: 849 # Modules/_codecsmodule.c::lookup_error() 850 self.assertRaises(TypeError, codecs.lookup_error) 851 852 def test_unknownhandler(self): 853 # enhance coverage of: 854 # Modules/_codecsmodule.c::lookup_error() 855 self.assertRaises(LookupError, codecs.lookup_error, "test.unknown") 856 857 def test_xmlcharrefvalues(self): 858 # enhance coverage of: 859 # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors() 860 # and inline implementations 861 v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000, 862 500000, 1000000) 863 s = "".join([chr(x) for x in v]) 864 codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors) 865 for enc in ("ascii", "iso-8859-15"): 866 for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"): 867 s.encode(enc, err) 868 869 def test_decodehelper(self): 870 # enhance coverage of: 871 # Objects/unicodeobject.c::unicode_decode_call_errorhandler() 872 # and callers 873 self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown") 874 875 def baddecodereturn1(exc): 876 return 42 877 codecs.register_error("test.baddecodereturn1", baddecodereturn1) 878 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1") 879 self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1") 880 self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1") 881 self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1") 882 self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1") 883 self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1") 884 885 def baddecodereturn2(exc): 886 return ("?", None) 887 codecs.register_error("test.baddecodereturn2", baddecodereturn2) 888 self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2") 889 890 handler = PosReturn() 891 codecs.register_error("test.posreturn", handler.handle) 892 893 # Valid negative position 894 handler.pos = -1 895 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 896 897 # Valid negative position 898 handler.pos = -2 899 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>") 900 901 # Negative position out of bounds 902 handler.pos = -3 903 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 904 905 # Valid positive position 906 handler.pos = 1 907 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0") 908 909 # Largest valid positive position (one beyond end of input) 910 handler.pos = 2 911 self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>") 912 913 # Invalid positive position 914 handler.pos = 3 915 self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn") 916 917 # Restart at the "0" 918 handler.pos = 6 919 self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0") 920 921 class D(dict): 922 def __getitem__(self, key): 923 raise ValueError 924 self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None}) 925 self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D()) 926 self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1}) 927 928 def test_encodehelper(self): 929 # enhance coverage of: 930 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 931 # and callers 932 self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown") 933 934 def badencodereturn1(exc): 935 return 42 936 codecs.register_error("test.badencodereturn1", badencodereturn1) 937 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1") 938 939 def badencodereturn2(exc): 940 return ("?", None) 941 codecs.register_error("test.badencodereturn2", badencodereturn2) 942 self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2") 943 944 handler = PosReturn() 945 codecs.register_error("test.posreturn", handler.handle) 946 947 # Valid negative position 948 handler.pos = -1 949 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 950 951 # Valid negative position 952 handler.pos = -2 953 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>") 954 955 # Negative position out of bounds 956 handler.pos = -3 957 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 958 959 # Valid positive position 960 handler.pos = 1 961 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0") 962 963 # Largest valid positive position (one beyond end of input 964 handler.pos = 2 965 self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>") 966 967 # Invalid positive position 968 handler.pos = 3 969 self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn") 970 971 handler.pos = 0 972 973 class D(dict): 974 def __getitem__(self, key): 975 raise ValueError 976 for err in ("strict", "replace", "xmlcharrefreplace", 977 "backslashreplace", "namereplace", "test.posreturn"): 978 self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None}) 979 self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D()) 980 self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300}) 981 982 def test_translatehelper(self): 983 # enhance coverage of: 984 # Objects/unicodeobject.c::unicode_encode_call_errorhandler() 985 # and callers 986 # (Unfortunately the errors argument is not directly accessible 987 # from Python, so we can't test that much) 988 class D(dict): 989 def __getitem__(self, key): 990 raise ValueError 991 #self.assertRaises(ValueError, "\xff".translate, D()) 992 self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1}) 993 self.assertRaises(TypeError, "\xff".translate, {0xff: ()}) 994 995 def test_bug828737(self): 996 charmap = { 997 ord("&"): "&", 998 ord("<"): "<", 999 ord(">"): ">", 1000 ord('"'): """, 1001 } 1002 1003 for n in (1, 10, 100, 1000): 1004 text = 'abc<def>ghi'*n 1005 text.translate(charmap) 1006 1007 def test_mutatingdecodehandler(self): 1008 baddata = [ 1009 ("ascii", b"\xff"), 1010 ("utf-7", b"++"), 1011 ("utf-8", b"\xff"), 1012 ("utf-16", b"\xff"), 1013 ("utf-32", b"\xff"), 1014 ("unicode-escape", b"\\u123g"), 1015 ("raw-unicode-escape", b"\\u123g"), 1016 ("unicode-internal", b"\xff"), 1017 ] 1018 1019 def replacing(exc): 1020 if isinstance(exc, UnicodeDecodeError): 1021 exc.object = 42 1022 return ("\u4242", 0) 1023 else: 1024 raise TypeError("don't know how to handle %r" % exc) 1025 codecs.register_error("test.replacing", replacing) 1026 1027 with test.support.check_warnings(): 1028 # unicode-internal has been deprecated 1029 for (encoding, data) in baddata: 1030 with self.assertRaises(TypeError): 1031 data.decode(encoding, "test.replacing") 1032 1033 def mutating(exc): 1034 if isinstance(exc, UnicodeDecodeError): 1035 exc.object = b"" 1036 return ("\u4242", 0) 1037 else: 1038 raise TypeError("don't know how to handle %r" % exc) 1039 codecs.register_error("test.mutating", mutating) 1040 # If the decoder doesn't pick up the modified input the following 1041 # will lead to an endless loop 1042 with test.support.check_warnings(): 1043 # unicode-internal has been deprecated 1044 for (encoding, data) in baddata: 1045 self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242") 1046 1047 # issue32583 1048 def test_crashing_decode_handler(self): 1049 # better generating one more character to fill the extra space slot 1050 # so in debug build it can steadily fail 1051 def forward_shorter_than_end(exc): 1052 if isinstance(exc, UnicodeDecodeError): 1053 # size one character, 0 < forward < exc.end 1054 return ('\ufffd', exc.start+1) 1055 else: 1056 raise TypeError("don't know how to handle %r" % exc) 1057 codecs.register_error( 1058 "test.forward_shorter_than_end", forward_shorter_than_end) 1059 1060 self.assertEqual( 1061 b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode( 1062 'utf-16-le', 'test.forward_shorter_than_end'), 1063 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1064 ) 1065 self.assertEqual( 1066 b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode( 1067 'utf-16-be', 'test.forward_shorter_than_end'), 1068 '\ufffd\ufffd\ufffd\ufffd\xd8\x00' 1069 ) 1070 self.assertEqual( 1071 b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode( 1072 'utf-32-le', 'test.forward_shorter_than_end'), 1073 '\ufffd\ufffd\ufffd\u1111\x00' 1074 ) 1075 self.assertEqual( 1076 b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode( 1077 'utf-32-be', 'test.forward_shorter_than_end'), 1078 '\ufffd\ufffd\ufffd\u1111\x00' 1079 ) 1080 1081 def replace_with_long(exc): 1082 if isinstance(exc, UnicodeDecodeError): 1083 exc.object = b"\x00" * 8 1084 return ('\ufffd', exc.start) 1085 else: 1086 raise TypeError("don't know how to handle %r" % exc) 1087 codecs.register_error("test.replace_with_long", replace_with_long) 1088 1089 self.assertEqual( 1090 b'\x00'.decode('utf-16', 'test.replace_with_long'), 1091 '\ufffd\x00\x00\x00\x00' 1092 ) 1093 self.assertEqual( 1094 b'\x00'.decode('utf-32', 'test.replace_with_long'), 1095 '\ufffd\x00\x00' 1096 ) 1097 1098 1099 def test_fake_error_class(self): 1100 handlers = [ 1101 codecs.strict_errors, 1102 codecs.ignore_errors, 1103 codecs.replace_errors, 1104 codecs.backslashreplace_errors, 1105 codecs.namereplace_errors, 1106 codecs.xmlcharrefreplace_errors, 1107 codecs.lookup_error('surrogateescape'), 1108 codecs.lookup_error('surrogatepass'), 1109 ] 1110 for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError: 1111 class FakeUnicodeError(str): 1112 __class__ = cls 1113 for handler in handlers: 1114 with self.subTest(handler=handler, error_class=cls): 1115 self.assertRaises(TypeError, handler, FakeUnicodeError()) 1116 class FakeUnicodeError(Exception): 1117 __class__ = cls 1118 for handler in handlers: 1119 with self.subTest(handler=handler, error_class=cls): 1120 with self.assertRaises((TypeError, FakeUnicodeError)): 1121 handler(FakeUnicodeError()) 1122 1123 1124if __name__ == "__main__": 1125 unittest.main() 1126