1import codecs
2import html.entities
3import sys
4import test.support
5import unicodedata
6import unittest
7
class PosReturn:
    """Error-handler callback whose returned position is configurable.

    The position handed back to the codec is read from ``self.pos``.
    """

    def __init__(self):
        self.pos = 0

    def handle(self, exc):
        """Return ``("<?>", pos)`` where ``pos`` is the value stored on entry.

        A negative ``pos`` is interpreted relative to the end of
        ``exc.object`` when deciding whether the position advances.
        """
        reported = self.pos
        absolute = reported if reported >= 0 else len(exc.object) + reported
        # When this call does not move past the start of the error, make
        # the next call jump to the end of the input; otherwise the codec
        # would keep calling us forever.
        if exc.start >= absolute:
            self.pos = len(exc.object)
        return ("<?>", reported)
24
# A UnicodeEncodeError whose start attribute is assigned a list
class BadStartUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately store a non-integer in "start".
        self.start = []
30
# A UnicodeEncodeError whose object attribute is a list instead of a str
class BadObjectUnicodeEncodeError(UnicodeEncodeError):
    def __init__(self):
        super().__init__("ascii", "", 0, 1, "bad")
        # Deliberately replace the input object with a non-string.
        self.object = []
36
# A UnicodeDecodeError whose end attribute is deleted after construction
class NoEndUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately remove "end".
        del self.end
42
# A UnicodeDecodeError whose object attribute is a list instead of bytes
class BadObjectUnicodeDecodeError(UnicodeDecodeError):
    def __init__(self):
        super().__init__("ascii", bytearray(b""), 0, 1, "bad")
        # Deliberately replace the input object with a non-bytes value.
        self.object = []
48
# A UnicodeTranslateError whose start attribute is deleted after construction
class NoStartUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove "start".
        del self.start
54
# A UnicodeTranslateError whose end attribute is deleted after construction
class NoEndUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove "end".
        del self.end
60
# A UnicodeTranslateError whose object attribute is deleted after construction
class NoObjectUnicodeTranslateError(UnicodeTranslateError):
    def __init__(self):
        super().__init__("", 0, 1, "bad")
        # Deliberately remove "object".
        del self.object
66
67class CodecCallbackTest(unittest.TestCase):
68
69    def test_xmlcharrefreplace(self):
70        # replace unencodable characters which numeric character entities.
71        # For ascii, latin-1 and charmaps this is completely implemented
72        # in C and should be reasonably fast.
73        s = "\u30b9\u30d1\u30e2 \xe4nd eggs"
74        self.assertEqual(
75            s.encode("ascii", "xmlcharrefreplace"),
76            b"&#12473;&#12497;&#12514; &#228;nd eggs"
77        )
78        self.assertEqual(
79            s.encode("latin-1", "xmlcharrefreplace"),
80            b"&#12473;&#12497;&#12514; \xe4nd eggs"
81        )
82
83    def test_xmlcharnamereplace(self):
84        # This time use a named character entity for unencodable
85        # characters, if one is available.
86
87        def xmlcharnamereplace(exc):
88            if not isinstance(exc, UnicodeEncodeError):
89                raise TypeError("don't know how to handle %r" % exc)
90            l = []
91            for c in exc.object[exc.start:exc.end]:
92                try:
93                    l.append("&%s;" % html.entities.codepoint2name[ord(c)])
94                except KeyError:
95                    l.append("&#%d;" % ord(c))
96            return ("".join(l), exc.end)
97
98        codecs.register_error(
99            "test.xmlcharnamereplace", xmlcharnamereplace)
100
101        sin = "\xab\u211c\xbb = \u2329\u1234\u20ac\u232a"
102        sout = b"&laquo;&real;&raquo; = &lang;&#4660;&euro;&rang;"
103        self.assertEqual(sin.encode("ascii", "test.xmlcharnamereplace"), sout)
104        sout = b"\xab&real;\xbb = &lang;&#4660;&euro;&rang;"
105        self.assertEqual(sin.encode("latin-1", "test.xmlcharnamereplace"), sout)
106        sout = b"\xab&real;\xbb = &lang;&#4660;\xa4&rang;"
107        self.assertEqual(sin.encode("iso-8859-15", "test.xmlcharnamereplace"), sout)
108
109    def test_uninamereplace(self):
110        # We're using the names from the unicode database this time,
111        # and we're doing "syntax highlighting" here, i.e. we include
112        # the replaced text in ANSI escape sequences. For this it is
113        # useful that the error handler is not called for every single
114        # unencodable character, but for a complete sequence of
115        # unencodable characters, otherwise we would output many
116        # unnecessary escape sequences.
117
118        def uninamereplace(exc):
119            if not isinstance(exc, UnicodeEncodeError):
120                raise TypeError("don't know how to handle %r" % exc)
121            l = []
122            for c in exc.object[exc.start:exc.end]:
123                l.append(unicodedata.name(c, "0x%x" % ord(c)))
124            return ("\033[1m%s\033[0m" % ", ".join(l), exc.end)
125
126        codecs.register_error(
127            "test.uninamereplace", uninamereplace)
128
129        sin = "\xac\u1234\u20ac\u8000"
130        sout = b"\033[1mNOT SIGN, ETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
131        self.assertEqual(sin.encode("ascii", "test.uninamereplace"), sout)
132
133        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE, EURO SIGN, CJK UNIFIED IDEOGRAPH-8000\033[0m"
134        self.assertEqual(sin.encode("latin-1", "test.uninamereplace"), sout)
135
136        sout = b"\xac\033[1mETHIOPIC SYLLABLE SEE\033[0m\xa4\033[1mCJK UNIFIED IDEOGRAPH-8000\033[0m"
137        self.assertEqual(sin.encode("iso-8859-15", "test.uninamereplace"), sout)
138
139    def test_backslashescape(self):
140        # Does the same as the "unicode-escape" encoding, but with different
141        # base encodings.
142        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
143        sout = b"a\\xac\\u1234\\u20ac\\u8000\\U0010ffff"
144        self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)
145
146        sout = b"a\xac\\u1234\\u20ac\\u8000\\U0010ffff"
147        self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)
148
149        sout = b"a\xac\\u1234\xa4\\u8000\\U0010ffff"
150        self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
151
152    def test_nameescape(self):
153        # Does the same as backslashescape, but prefers ``\N{...}`` escape
154        # sequences.
155        sin = "a\xac\u1234\u20ac\u8000\U0010ffff"
156        sout = (b'a\\N{NOT SIGN}\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
157                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
158        self.assertEqual(sin.encode("ascii", "namereplace"), sout)
159
160        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\\N{EURO SIGN}'
161                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
162        self.assertEqual(sin.encode("latin-1", "namereplace"), sout)
163
164        sout = (b'a\xac\\N{ETHIOPIC SYLLABLE SEE}\xa4'
165                b'\\N{CJK UNIFIED IDEOGRAPH-8000}\\U0010ffff')
166        self.assertEqual(sin.encode("iso-8859-15", "namereplace"), sout)
167
168    def test_decoding_callbacks(self):
169        # This is a test for a decoding callback handler
170        # that allows the decoding of the invalid sequence
171        # "\xc0\x80" and returns "\x00" instead of raising an error.
172        # All other illegal sequences will be handled strictly.
173        def relaxedutf8(exc):
174            if not isinstance(exc, UnicodeDecodeError):
175                raise TypeError("don't know how to handle %r" % exc)
176            if exc.object[exc.start:exc.start+2] == b"\xc0\x80":
177                return ("\x00", exc.start+2) # retry after two bytes
178            else:
179                raise exc
180
181        codecs.register_error("test.relaxedutf8", relaxedutf8)
182
183        # all the "\xc0\x80" will be decoded to "\x00"
184        sin = b"a\x00b\xc0\x80c\xc3\xbc\xc0\x80\xc0\x80"
185        sout = "a\x00b\x00c\xfc\x00\x00"
186        self.assertEqual(sin.decode("utf-8", "test.relaxedutf8"), sout)
187
188        # "\xc0\x81" is not valid and a UnicodeDecodeError will be raised
189        sin = b"\xc0\x80\xc0\x81"
190        self.assertRaises(UnicodeDecodeError, sin.decode,
191                          "utf-8", "test.relaxedutf8")
192
193    def test_charmapencode(self):
194        # For charmap encodings the replacement string will be
195        # mapped through the encoding again. This means, that
196        # to be able to use e.g. the "replace" handler, the
197        # charmap has to have a mapping for "?".
198        charmap = dict((ord(c), bytes(2*c.upper(), 'ascii')) for c in "abcdefgh")
199        sin = "abc"
200        sout = b"AABBCC"
201        self.assertEqual(codecs.charmap_encode(sin, "strict", charmap)[0], sout)
202
203        sin = "abcA"
204        self.assertRaises(UnicodeError, codecs.charmap_encode, sin, "strict", charmap)
205
206        charmap[ord("?")] = b"XYZ"
207        sin = "abcDEF"
208        sout = b"AABBCCXYZXYZXYZ"
209        self.assertEqual(codecs.charmap_encode(sin, "replace", charmap)[0], sout)
210
211        charmap[ord("?")] = "XYZ" # wrong type in mapping
212        self.assertRaises(TypeError, codecs.charmap_encode, sin, "replace", charmap)
213
    def test_decodeunicodeinternal(self):
        # Exercise decoding error handlers with the "unicode-internal"
        # codec on an input of five NUL bytes.  All decodes run inside
        # check_warnings, which is given the codec's deprecation message
        # and DeprecationWarning as the expected warning.
        with test.support.check_warnings(('unicode_internal codec has been '
                                          'deprecated', DeprecationWarning)):
            # Without an error handler the five-byte input is rejected.
            self.assertRaises(
                UnicodeDecodeError,
                b"\x00\x00\x00\x00\x00".decode,
                "unicode-internal",
            )
            # The remaining checks only apply when the codec encodes one
            # character as 4 bytes.
            if len('\0'.encode('unicode-internal')) == 4:
                def handler_unicodeinternal(exc):
                    # Custom handler: substitute "\x01" and resume decoding
                    # at position 1.
                    if not isinstance(exc, UnicodeDecodeError):
                        raise TypeError("don't know how to handle %r" % exc)
                    return ("\x01", 1)

                # "ignore" drops the trailing byte.
                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "ignore"),
                    "\u0000"
                )

                # "replace" substitutes U+FFFD for the trailing byte.
                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "replace"),
                    "\u0000\ufffd"
                )

                # "backslashreplace" emits an escape for the trailing byte.
                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "backslashreplace"),
                    "\u0000\\x00"
                )

                codecs.register_error("test.hui", handler_unicodeinternal)

                # The custom handler's replacement and resume position
                # produce a third character.
                self.assertEqual(
                    b"\x00\x00\x00\x00\x00".decode("unicode-internal", "test.hui"),
                    "\u0000\u0001\u0000"
                )
249
250    def test_callbacks(self):
251        def handler1(exc):
252            r = range(exc.start, exc.end)
253            if isinstance(exc, UnicodeEncodeError):
254                l = ["<%d>" % ord(exc.object[pos]) for pos in r]
255            elif isinstance(exc, UnicodeDecodeError):
256                l = ["<%d>" % exc.object[pos] for pos in r]
257            else:
258                raise TypeError("don't know how to handle %r" % exc)
259            return ("[%s]" % "".join(l), exc.end)
260
261        codecs.register_error("test.handler1", handler1)
262
263        def handler2(exc):
264            if not isinstance(exc, UnicodeDecodeError):
265                raise TypeError("don't know how to handle %r" % exc)
266            l = ["<%d>" % exc.object[pos] for pos in range(exc.start, exc.end)]
267            return ("[%s]" % "".join(l), exc.end+1) # skip one character
268
269        codecs.register_error("test.handler2", handler2)
270
271        s = b"\x00\x81\x7f\x80\xff"
272
273        self.assertEqual(
274            s.decode("ascii", "test.handler1"),
275            "\x00[<129>]\x7f[<128>][<255>]"
276        )
277        self.assertEqual(
278            s.decode("ascii", "test.handler2"),
279            "\x00[<129>][<128>]"
280        )
281
282        self.assertEqual(
283            b"\\u3042\\u3xxx".decode("unicode-escape", "test.handler1"),
284            "\u3042[<92><117><51>]xxx"
285        )
286
287        self.assertEqual(
288            b"\\u3042\\u3xx".decode("unicode-escape", "test.handler1"),
289            "\u3042[<92><117><51>]xx"
290        )
291
292        self.assertEqual(
293            codecs.charmap_decode(b"abc", "test.handler1", {ord("a"): "z"})[0],
294            "z[<98>][<99>]"
295        )
296
297        self.assertEqual(
298            "g\xfc\xdfrk".encode("ascii", "test.handler1"),
299            b"g[<252><223>]rk"
300        )
301
302        self.assertEqual(
303            "g\xfc\xdf".encode("ascii", "test.handler1"),
304            b"g[<252><223>]"
305        )
306
307    def test_longstrings(self):
308        # test long strings to check for memory overflow problems
309        errors = [ "strict", "ignore", "replace", "xmlcharrefreplace",
310                   "backslashreplace", "namereplace"]
311        # register the handlers under different names,
312        # to prevent the codec from recognizing the name
313        for err in errors:
314            codecs.register_error("test." + err, codecs.lookup_error(err))
315        l = 1000
316        errors += [ "test." + err for err in errors ]
317        for uni in [ s*l for s in ("x", "\u3042", "a\xe4") ]:
318            for enc in ("ascii", "latin-1", "iso-8859-1", "iso-8859-15",
319                        "utf-8", "utf-7", "utf-16", "utf-32"):
320                for err in errors:
321                    try:
322                        uni.encode(enc, err)
323                    except UnicodeError:
324                        pass
325
326    def check_exceptionobjectargs(self, exctype, args, msg):
327        # Test UnicodeError subclasses: construction, attribute assignment and __str__ conversion
328        # check with one missing argument
329        self.assertRaises(TypeError, exctype, *args[:-1])
330        # check with one argument too much
331        self.assertRaises(TypeError, exctype, *(args + ["too much"]))
332        # check with one argument of the wrong type
333        wrongargs = [ "spam", b"eggs", b"spam", 42, 1.0, None ]
334        for i in range(len(args)):
335            for wrongarg in wrongargs:
336                if type(wrongarg) is type(args[i]):
337                    continue
338                # build argument array
339                callargs = []
340                for j in range(len(args)):
341                    if i==j:
342                        callargs.append(wrongarg)
343                    else:
344                        callargs.append(args[i])
345                self.assertRaises(TypeError, exctype, *callargs)
346
347        # check with the correct number and type of arguments
348        exc = exctype(*args)
349        self.assertEqual(str(exc), msg)
350
351    def test_unicodeencodeerror(self):
352        self.check_exceptionobjectargs(
353            UnicodeEncodeError,
354            ["ascii", "g\xfcrk", 1, 2, "ouch"],
355            "'ascii' codec can't encode character '\\xfc' in position 1: ouch"
356        )
357        self.check_exceptionobjectargs(
358            UnicodeEncodeError,
359            ["ascii", "g\xfcrk", 1, 4, "ouch"],
360            "'ascii' codec can't encode characters in position 1-3: ouch"
361        )
362        self.check_exceptionobjectargs(
363            UnicodeEncodeError,
364            ["ascii", "\xfcx", 0, 1, "ouch"],
365            "'ascii' codec can't encode character '\\xfc' in position 0: ouch"
366        )
367        self.check_exceptionobjectargs(
368            UnicodeEncodeError,
369            ["ascii", "\u0100x", 0, 1, "ouch"],
370            "'ascii' codec can't encode character '\\u0100' in position 0: ouch"
371        )
372        self.check_exceptionobjectargs(
373            UnicodeEncodeError,
374            ["ascii", "\uffffx", 0, 1, "ouch"],
375            "'ascii' codec can't encode character '\\uffff' in position 0: ouch"
376        )
377        self.check_exceptionobjectargs(
378            UnicodeEncodeError,
379            ["ascii", "\U00010000x", 0, 1, "ouch"],
380            "'ascii' codec can't encode character '\\U00010000' in position 0: ouch"
381        )
382
383    def test_unicodedecodeerror(self):
384        self.check_exceptionobjectargs(
385            UnicodeDecodeError,
386            ["ascii", bytearray(b"g\xfcrk"), 1, 2, "ouch"],
387            "'ascii' codec can't decode byte 0xfc in position 1: ouch"
388        )
389        self.check_exceptionobjectargs(
390            UnicodeDecodeError,
391            ["ascii", bytearray(b"g\xfcrk"), 1, 3, "ouch"],
392            "'ascii' codec can't decode bytes in position 1-2: ouch"
393        )
394
395    def test_unicodetranslateerror(self):
396        self.check_exceptionobjectargs(
397            UnicodeTranslateError,
398            ["g\xfcrk", 1, 2, "ouch"],
399            "can't translate character '\\xfc' in position 1: ouch"
400        )
401        self.check_exceptionobjectargs(
402            UnicodeTranslateError,
403            ["g\u0100rk", 1, 2, "ouch"],
404            "can't translate character '\\u0100' in position 1: ouch"
405        )
406        self.check_exceptionobjectargs(
407            UnicodeTranslateError,
408            ["g\uffffrk", 1, 2, "ouch"],
409            "can't translate character '\\uffff' in position 1: ouch"
410        )
411        self.check_exceptionobjectargs(
412            UnicodeTranslateError,
413            ["g\U00010000rk", 1, 2, "ouch"],
414            "can't translate character '\\U00010000' in position 1: ouch"
415        )
416        self.check_exceptionobjectargs(
417            UnicodeTranslateError,
418            ["g\xfcrk", 1, 3, "ouch"],
419            "can't translate characters in position 1-2: ouch"
420        )
421
422    def test_badandgoodstrictexceptions(self):
423        # "strict" complains about a non-exception passed in
424        self.assertRaises(
425            TypeError,
426            codecs.strict_errors,
427            42
428        )
429        # "strict" complains about the wrong exception type
430        self.assertRaises(
431            Exception,
432            codecs.strict_errors,
433            Exception("ouch")
434        )
435
436        # If the correct exception is passed in, "strict" raises it
437        self.assertRaises(
438            UnicodeEncodeError,
439            codecs.strict_errors,
440            UnicodeEncodeError("ascii", "\u3042", 0, 1, "ouch")
441        )
442        self.assertRaises(
443            UnicodeDecodeError,
444            codecs.strict_errors,
445            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
446        )
447        self.assertRaises(
448            UnicodeTranslateError,
449            codecs.strict_errors,
450            UnicodeTranslateError("\u3042", 0, 1, "ouch")
451        )
452
453    def test_badandgoodignoreexceptions(self):
454        # "ignore" complains about a non-exception passed in
455        self.assertRaises(
456           TypeError,
457           codecs.ignore_errors,
458           42
459        )
460        # "ignore" complains about the wrong exception type
461        self.assertRaises(
462           TypeError,
463           codecs.ignore_errors,
464           UnicodeError("ouch")
465        )
466        # If the correct exception is passed in, "ignore" returns an empty replacement
467        self.assertEqual(
468            codecs.ignore_errors(
469                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
470            ("", 2)
471        )
472        self.assertEqual(
473            codecs.ignore_errors(
474                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
475            ("", 2)
476        )
477        self.assertEqual(
478            codecs.ignore_errors(
479                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
480            ("", 2)
481        )
482
483    def test_badandgoodreplaceexceptions(self):
484        # "replace" complains about a non-exception passed in
485        self.assertRaises(
486           TypeError,
487           codecs.replace_errors,
488           42
489        )
490        # "replace" complains about the wrong exception type
491        self.assertRaises(
492           TypeError,
493           codecs.replace_errors,
494           UnicodeError("ouch")
495        )
496        self.assertRaises(
497            TypeError,
498            codecs.replace_errors,
499            BadObjectUnicodeEncodeError()
500        )
501        self.assertRaises(
502            TypeError,
503            codecs.replace_errors,
504            BadObjectUnicodeDecodeError()
505        )
506        # With the correct exception, "replace" returns an "?" or "\ufffd" replacement
507        self.assertEqual(
508            codecs.replace_errors(
509                UnicodeEncodeError("ascii", "a\u3042b", 1, 2, "ouch")),
510            ("?", 2)
511        )
512        self.assertEqual(
513            codecs.replace_errors(
514                UnicodeDecodeError("ascii", bytearray(b"a\xffb"), 1, 2, "ouch")),
515            ("\ufffd", 2)
516        )
517        self.assertEqual(
518            codecs.replace_errors(
519                UnicodeTranslateError("a\u3042b", 1, 2, "ouch")),
520            ("\ufffd", 2)
521        )
522
523    def test_badandgoodxmlcharrefreplaceexceptions(self):
524        # "xmlcharrefreplace" complains about a non-exception passed in
525        self.assertRaises(
526           TypeError,
527           codecs.xmlcharrefreplace_errors,
528           42
529        )
530        # "xmlcharrefreplace" complains about the wrong exception types
531        self.assertRaises(
532           TypeError,
533           codecs.xmlcharrefreplace_errors,
534           UnicodeError("ouch")
535        )
536        # "xmlcharrefreplace" can only be used for encoding
537        self.assertRaises(
538            TypeError,
539            codecs.xmlcharrefreplace_errors,
540            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
541        )
542        self.assertRaises(
543            TypeError,
544            codecs.xmlcharrefreplace_errors,
545            UnicodeTranslateError("\u3042", 0, 1, "ouch")
546        )
547        # Use the correct exception
548        cs = (0, 1, 9, 10, 99, 100, 999, 1000, 9999, 10000, 99999, 100000,
549              999999, 1000000)
550        cs += (0xd800, 0xdfff)
551        s = "".join(chr(c) for c in cs)
552        self.assertEqual(
553            codecs.xmlcharrefreplace_errors(
554                UnicodeEncodeError("ascii", "a" + s + "b",
555                                   1, 1 + len(s), "ouch")
556            ),
557            ("".join("&#%d;" % c for c in cs), 1 + len(s))
558        )
559
560    def test_badandgoodbackslashreplaceexceptions(self):
561        # "backslashreplace" complains about a non-exception passed in
562        self.assertRaises(
563           TypeError,
564           codecs.backslashreplace_errors,
565           42
566        )
567        # "backslashreplace" complains about the wrong exception types
568        self.assertRaises(
569           TypeError,
570           codecs.backslashreplace_errors,
571           UnicodeError("ouch")
572        )
573        # Use the correct exception
574        tests = [
575            ("\u3042", "\\u3042"),
576            ("\n", "\\x0a"),
577            ("a", "\\x61"),
578            ("\x00", "\\x00"),
579            ("\xff", "\\xff"),
580            ("\u0100", "\\u0100"),
581            ("\uffff", "\\uffff"),
582            ("\U00010000", "\\U00010000"),
583            ("\U0010ffff", "\\U0010ffff"),
584            # Lone surrogates
585            ("\ud800", "\\ud800"),
586            ("\udfff", "\\udfff"),
587            ("\ud800\udfff", "\\ud800\\udfff"),
588        ]
589        for s, r in tests:
590            with self.subTest(str=s):
591                self.assertEqual(
592                    codecs.backslashreplace_errors(
593                        UnicodeEncodeError("ascii", "a" + s + "b",
594                                           1, 1 + len(s), "ouch")),
595                    (r, 1 + len(s))
596                )
597                self.assertEqual(
598                    codecs.backslashreplace_errors(
599                        UnicodeTranslateError("a" + s + "b",
600                                              1, 1 + len(s), "ouch")),
601                    (r, 1 + len(s))
602                )
603        tests = [
604            (b"a", "\\x61"),
605            (b"\n", "\\x0a"),
606            (b"\x00", "\\x00"),
607            (b"\xff", "\\xff"),
608        ]
609        for b, r in tests:
610            with self.subTest(bytes=b):
611                self.assertEqual(
612                    codecs.backslashreplace_errors(
613                        UnicodeDecodeError("ascii", bytearray(b"a" + b + b"b"),
614                                           1, 2, "ouch")),
615                    (r, 2)
616                )
617
618    def test_badandgoodnamereplaceexceptions(self):
619        # "namereplace" complains about a non-exception passed in
620        self.assertRaises(
621           TypeError,
622           codecs.namereplace_errors,
623           42
624        )
625        # "namereplace" complains about the wrong exception types
626        self.assertRaises(
627           TypeError,
628           codecs.namereplace_errors,
629           UnicodeError("ouch")
630        )
631        # "namereplace" can only be used for encoding
632        self.assertRaises(
633            TypeError,
634            codecs.namereplace_errors,
635            UnicodeDecodeError("ascii", bytearray(b"\xff"), 0, 1, "ouch")
636        )
637        self.assertRaises(
638            TypeError,
639            codecs.namereplace_errors,
640            UnicodeTranslateError("\u3042", 0, 1, "ouch")
641        )
642        # Use the correct exception
643        tests = [
644            ("\u3042", "\\N{HIRAGANA LETTER A}"),
645            ("\x00", "\\x00"),
646            ("\ufbf9", "\\N{ARABIC LIGATURE UIGHUR KIRGHIZ YEH WITH "
647                       "HAMZA ABOVE WITH ALEF MAKSURA ISOLATED FORM}"),
648            ("\U000e007f", "\\N{CANCEL TAG}"),
649            ("\U0010ffff", "\\U0010ffff"),
650            # Lone surrogates
651            ("\ud800", "\\ud800"),
652            ("\udfff", "\\udfff"),
653            ("\ud800\udfff", "\\ud800\\udfff"),
654        ]
655        for s, r in tests:
656            with self.subTest(str=s):
657                self.assertEqual(
658                    codecs.namereplace_errors(
659                        UnicodeEncodeError("ascii", "a" + s + "b",
660                                           1, 1 + len(s), "ouch")),
661                    (r, 1 + len(s))
662                )
663
664    def test_badandgoodsurrogateescapeexceptions(self):
665        surrogateescape_errors = codecs.lookup_error('surrogateescape')
666        # "surrogateescape" complains about a non-exception passed in
667        self.assertRaises(
668           TypeError,
669           surrogateescape_errors,
670           42
671        )
672        # "surrogateescape" complains about the wrong exception types
673        self.assertRaises(
674           TypeError,
675           surrogateescape_errors,
676           UnicodeError("ouch")
677        )
678        # "surrogateescape" can not be used for translating
679        self.assertRaises(
680            TypeError,
681            surrogateescape_errors,
682            UnicodeTranslateError("\udc80", 0, 1, "ouch")
683        )
684        # Use the correct exception
685        for s in ("a", "\udc7f", "\udd00"):
686            with self.subTest(str=s):
687                self.assertRaises(
688                    UnicodeEncodeError,
689                    surrogateescape_errors,
690                    UnicodeEncodeError("ascii", s, 0, 1, "ouch")
691                )
692        self.assertEqual(
693            surrogateescape_errors(
694                UnicodeEncodeError("ascii", "a\udc80b", 1, 2, "ouch")),
695            (b"\x80", 2)
696        )
697        self.assertRaises(
698            UnicodeDecodeError,
699            surrogateescape_errors,
700            UnicodeDecodeError("ascii", bytearray(b"a"), 0, 1, "ouch")
701        )
702        self.assertEqual(
703            surrogateescape_errors(
704                UnicodeDecodeError("ascii", bytearray(b"a\x80b"), 1, 2, "ouch")),
705            ("\udc80", 2)
706        )
707
708    def test_badandgoodsurrogatepassexceptions(self):
709        surrogatepass_errors = codecs.lookup_error('surrogatepass')
710        # "surrogatepass" complains about a non-exception passed in
711        self.assertRaises(
712           TypeError,
713           surrogatepass_errors,
714           42
715        )
716        # "surrogatepass" complains about the wrong exception types
717        self.assertRaises(
718           TypeError,
719           surrogatepass_errors,
720           UnicodeError("ouch")
721        )
722        # "surrogatepass" can not be used for translating
723        self.assertRaises(
724            TypeError,
725            surrogatepass_errors,
726            UnicodeTranslateError("\ud800", 0, 1, "ouch")
727        )
728        # Use the correct exception
729        for enc in ("utf-8", "utf-16le", "utf-16be", "utf-32le", "utf-32be"):
730            with self.subTest(encoding=enc):
731                self.assertRaises(
732                    UnicodeEncodeError,
733                    surrogatepass_errors,
734                    UnicodeEncodeError(enc, "a", 0, 1, "ouch")
735                )
736                self.assertRaises(
737                    UnicodeDecodeError,
738                    surrogatepass_errors,
739                    UnicodeDecodeError(enc, "a".encode(enc), 0, 1, "ouch")
740                )
741        for s in ("\ud800", "\udfff", "\ud800\udfff"):
742            with self.subTest(str=s):
743                self.assertRaises(
744                    UnicodeEncodeError,
745                    surrogatepass_errors,
746                    UnicodeEncodeError("ascii", s, 0, len(s), "ouch")
747                )
748        tests = [
749            ("utf-8", "\ud800", b'\xed\xa0\x80', 3),
750            ("utf-16le", "\ud800", b'\x00\xd8', 2),
751            ("utf-16be", "\ud800", b'\xd8\x00', 2),
752            ("utf-32le", "\ud800", b'\x00\xd8\x00\x00', 4),
753            ("utf-32be", "\ud800", b'\x00\x00\xd8\x00', 4),
754            ("utf-8", "\udfff", b'\xed\xbf\xbf', 3),
755            ("utf-16le", "\udfff", b'\xff\xdf', 2),
756            ("utf-16be", "\udfff", b'\xdf\xff', 2),
757            ("utf-32le", "\udfff", b'\xff\xdf\x00\x00', 4),
758            ("utf-32be", "\udfff", b'\x00\x00\xdf\xff', 4),
759            ("utf-8", "\ud800\udfff", b'\xed\xa0\x80\xed\xbf\xbf', 3),
760            ("utf-16le", "\ud800\udfff", b'\x00\xd8\xff\xdf', 2),
761            ("utf-16be", "\ud800\udfff", b'\xd8\x00\xdf\xff', 2),
762            ("utf-32le", "\ud800\udfff", b'\x00\xd8\x00\x00\xff\xdf\x00\x00', 4),
763            ("utf-32be", "\ud800\udfff", b'\x00\x00\xd8\x00\x00\x00\xdf\xff', 4),
764        ]
765        for enc, s, b, n in tests:
766            with self.subTest(encoding=enc, str=s, bytes=b):
767                self.assertEqual(
768                    surrogatepass_errors(
769                        UnicodeEncodeError(enc, "a" + s + "b",
770                                           1, 1 + len(s), "ouch")),
771                    (b, 1 + len(s))
772                )
773                self.assertEqual(
774                    surrogatepass_errors(
775                        UnicodeDecodeError(enc, bytearray(b"a" + b[:n] + b"b"),
776                                           1, 1 + n, "ouch")),
777                    (s[:1], 1 + n)
778                )
779
780    def test_badhandlerresults(self):
781        results = ( 42, "foo", (1,2,3), ("foo", 1, 3), ("foo", None), ("foo",), ("foo", 1, 3), ("foo", None), ("foo",) )
782        encs = ("ascii", "latin-1", "iso-8859-1", "iso-8859-15")
783
784        for res in results:
785            codecs.register_error("test.badhandler", lambda x: res)
786            for enc in encs:
787                self.assertRaises(
788                    TypeError,
789                    "\u3042".encode,
790                    enc,
791                    "test.badhandler"
792                )
793            for (enc, bytes) in (
794                ("ascii", b"\xff"),
795                ("utf-8", b"\xff"),
796                ("utf-7", b"+x-"),
797                ("unicode-internal", b"\x00"),
798            ):
799                with test.support.check_warnings():
800                    # unicode-internal has been deprecated
801                    self.assertRaises(
802                        TypeError,
803                        bytes.decode,
804                        enc,
805                        "test.badhandler"
806                    )
807
808    def test_lookup(self):
809        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
810        self.assertEqual(codecs.ignore_errors, codecs.lookup_error("ignore"))
811        self.assertEqual(codecs.strict_errors, codecs.lookup_error("strict"))
812        self.assertEqual(
813            codecs.xmlcharrefreplace_errors,
814            codecs.lookup_error("xmlcharrefreplace")
815        )
816        self.assertEqual(
817            codecs.backslashreplace_errors,
818            codecs.lookup_error("backslashreplace")
819        )
820        self.assertEqual(
821            codecs.namereplace_errors,
822            codecs.lookup_error("namereplace")
823        )
824
825    def test_unencodablereplacement(self):
826        def unencrepl(exc):
827            if isinstance(exc, UnicodeEncodeError):
828                return ("\u4242", exc.end)
829            else:
830                raise TypeError("don't know how to handle %r" % exc)
831        codecs.register_error("test.unencreplhandler", unencrepl)
832        for enc in ("ascii", "iso-8859-1", "iso-8859-15"):
833            self.assertRaises(
834                UnicodeEncodeError,
835                "\u4242".encode,
836                enc,
837                "test.unencreplhandler"
838            )
839
840    def test_badregistercall(self):
841        # enhance coverage of:
842        # Modules/_codecsmodule.c::register_error()
843        # Python/codecs.c::PyCodec_RegisterError()
844        self.assertRaises(TypeError, codecs.register_error, 42)
845        self.assertRaises(TypeError, codecs.register_error, "test.dummy", 42)
846
847    def test_badlookupcall(self):
848        # enhance coverage of:
849        # Modules/_codecsmodule.c::lookup_error()
850        self.assertRaises(TypeError, codecs.lookup_error)
851
852    def test_unknownhandler(self):
853        # enhance coverage of:
854        # Modules/_codecsmodule.c::lookup_error()
855        self.assertRaises(LookupError, codecs.lookup_error, "test.unknown")
856
857    def test_xmlcharrefvalues(self):
858        # enhance coverage of:
859        # Python/codecs.c::PyCodec_XMLCharRefReplaceErrors()
860        # and inline implementations
861        v = (1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000,
862             500000, 1000000)
863        s = "".join([chr(x) for x in v])
864        codecs.register_error("test.xmlcharrefreplace", codecs.xmlcharrefreplace_errors)
865        for enc in ("ascii", "iso-8859-15"):
866            for err in ("xmlcharrefreplace", "test.xmlcharrefreplace"):
867                s.encode(enc, err)
868
869    def test_decodehelper(self):
870        # enhance coverage of:
871        # Objects/unicodeobject.c::unicode_decode_call_errorhandler()
872        # and callers
873        self.assertRaises(LookupError, b"\xff".decode, "ascii", "test.unknown")
874
875        def baddecodereturn1(exc):
876            return 42
877        codecs.register_error("test.baddecodereturn1", baddecodereturn1)
878        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn1")
879        self.assertRaises(TypeError, b"\\".decode, "unicode-escape", "test.baddecodereturn1")
880        self.assertRaises(TypeError, b"\\x0".decode, "unicode-escape", "test.baddecodereturn1")
881        self.assertRaises(TypeError, b"\\x0y".decode, "unicode-escape", "test.baddecodereturn1")
882        self.assertRaises(TypeError, b"\\Uffffeeee".decode, "unicode-escape", "test.baddecodereturn1")
883        self.assertRaises(TypeError, b"\\uyyyy".decode, "raw-unicode-escape", "test.baddecodereturn1")
884
885        def baddecodereturn2(exc):
886            return ("?", None)
887        codecs.register_error("test.baddecodereturn2", baddecodereturn2)
888        self.assertRaises(TypeError, b"\xff".decode, "ascii", "test.baddecodereturn2")
889
890        handler = PosReturn()
891        codecs.register_error("test.posreturn", handler.handle)
892
893        # Valid negative position
894        handler.pos = -1
895        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
896
897        # Valid negative position
898        handler.pos = -2
899        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?><?>")
900
901        # Negative position out of bounds
902        handler.pos = -3
903        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
904
905        # Valid positive position
906        handler.pos = 1
907        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>0")
908
909        # Largest valid positive position (one beyond end of input)
910        handler.pos = 2
911        self.assertEqual(b"\xff0".decode("ascii", "test.posreturn"), "<?>")
912
913        # Invalid positive position
914        handler.pos = 3
915        self.assertRaises(IndexError, b"\xff0".decode, "ascii", "test.posreturn")
916
917        # Restart at the "0"
918        handler.pos = 6
919        self.assertEqual(b"\\uyyyy0".decode("raw-unicode-escape", "test.posreturn"), "<?>0")
920
921        class D(dict):
922            def __getitem__(self, key):
923                raise ValueError
924        self.assertRaises(UnicodeError, codecs.charmap_decode, b"\xff", "strict", {0xff: None})
925        self.assertRaises(ValueError, codecs.charmap_decode, b"\xff", "strict", D())
926        self.assertRaises(TypeError, codecs.charmap_decode, b"\xff", "strict", {0xff: sys.maxunicode+1})
927
928    def test_encodehelper(self):
929        # enhance coverage of:
930        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
931        # and callers
932        self.assertRaises(LookupError, "\xff".encode, "ascii", "test.unknown")
933
934        def badencodereturn1(exc):
935            return 42
936        codecs.register_error("test.badencodereturn1", badencodereturn1)
937        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn1")
938
939        def badencodereturn2(exc):
940            return ("?", None)
941        codecs.register_error("test.badencodereturn2", badencodereturn2)
942        self.assertRaises(TypeError, "\xff".encode, "ascii", "test.badencodereturn2")
943
944        handler = PosReturn()
945        codecs.register_error("test.posreturn", handler.handle)
946
947        # Valid negative position
948        handler.pos = -1
949        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
950
951        # Valid negative position
952        handler.pos = -2
953        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?><?>")
954
955        # Negative position out of bounds
956        handler.pos = -3
957        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
958
959        # Valid positive position
960        handler.pos = 1
961        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>0")
962
963        # Largest valid positive position (one beyond end of input
964        handler.pos = 2
965        self.assertEqual("\xff0".encode("ascii", "test.posreturn"), b"<?>")
966
967        # Invalid positive position
968        handler.pos = 3
969        self.assertRaises(IndexError, "\xff0".encode, "ascii", "test.posreturn")
970
971        handler.pos = 0
972
973        class D(dict):
974            def __getitem__(self, key):
975                raise ValueError
976        for err in ("strict", "replace", "xmlcharrefreplace",
977                    "backslashreplace", "namereplace", "test.posreturn"):
978            self.assertRaises(UnicodeError, codecs.charmap_encode, "\xff", err, {0xff: None})
979            self.assertRaises(ValueError, codecs.charmap_encode, "\xff", err, D())
980            self.assertRaises(TypeError, codecs.charmap_encode, "\xff", err, {0xff: 300})
981
982    def test_translatehelper(self):
983        # enhance coverage of:
984        # Objects/unicodeobject.c::unicode_encode_call_errorhandler()
985        # and callers
986        # (Unfortunately the errors argument is not directly accessible
987        # from Python, so we can't test that much)
988        class D(dict):
989            def __getitem__(self, key):
990                raise ValueError
991        #self.assertRaises(ValueError, "\xff".translate, D())
992        self.assertRaises(ValueError, "\xff".translate, {0xff: sys.maxunicode+1})
993        self.assertRaises(TypeError, "\xff".translate, {0xff: ()})
994
995    def test_bug828737(self):
996        charmap = {
997            ord("&"): "&amp;",
998            ord("<"): "&lt;",
999            ord(">"): "&gt;",
1000            ord('"'): "&quot;",
1001        }
1002
1003        for n in (1, 10, 100, 1000):
1004            text = 'abc<def>ghi'*n
1005            text.translate(charmap)
1006
1007    def test_mutatingdecodehandler(self):
1008        baddata = [
1009            ("ascii", b"\xff"),
1010            ("utf-7", b"++"),
1011            ("utf-8",  b"\xff"),
1012            ("utf-16", b"\xff"),
1013            ("utf-32", b"\xff"),
1014            ("unicode-escape", b"\\u123g"),
1015            ("raw-unicode-escape", b"\\u123g"),
1016            ("unicode-internal", b"\xff"),
1017        ]
1018
1019        def replacing(exc):
1020            if isinstance(exc, UnicodeDecodeError):
1021                exc.object = 42
1022                return ("\u4242", 0)
1023            else:
1024                raise TypeError("don't know how to handle %r" % exc)
1025        codecs.register_error("test.replacing", replacing)
1026
1027        with test.support.check_warnings():
1028            # unicode-internal has been deprecated
1029            for (encoding, data) in baddata:
1030                with self.assertRaises(TypeError):
1031                    data.decode(encoding, "test.replacing")
1032
1033        def mutating(exc):
1034            if isinstance(exc, UnicodeDecodeError):
1035                exc.object = b""
1036                return ("\u4242", 0)
1037            else:
1038                raise TypeError("don't know how to handle %r" % exc)
1039        codecs.register_error("test.mutating", mutating)
1040        # If the decoder doesn't pick up the modified input the following
1041        # will lead to an endless loop
1042        with test.support.check_warnings():
1043            # unicode-internal has been deprecated
1044            for (encoding, data) in baddata:
1045                self.assertEqual(data.decode(encoding, "test.mutating"), "\u4242")
1046
1047    # issue32583
1048    def test_crashing_decode_handler(self):
1049        # better generating one more character to fill the extra space slot
1050        # so in debug build it can steadily fail
1051        def forward_shorter_than_end(exc):
1052            if isinstance(exc, UnicodeDecodeError):
1053                # size one character, 0 < forward < exc.end
1054                return ('\ufffd', exc.start+1)
1055            else:
1056                raise TypeError("don't know how to handle %r" % exc)
1057        codecs.register_error(
1058            "test.forward_shorter_than_end", forward_shorter_than_end)
1059
1060        self.assertEqual(
1061            b'\xd8\xd8\xd8\xd8\xd8\x00\x00\x00'.decode(
1062                'utf-16-le', 'test.forward_shorter_than_end'),
1063            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1064        )
1065        self.assertEqual(
1066            b'\xd8\xd8\xd8\xd8\x00\xd8\x00\x00'.decode(
1067                'utf-16-be', 'test.forward_shorter_than_end'),
1068            '\ufffd\ufffd\ufffd\ufffd\xd8\x00'
1069        )
1070        self.assertEqual(
1071            b'\x11\x11\x11\x11\x11\x00\x00\x00\x00\x00\x00'.decode(
1072                'utf-32-le', 'test.forward_shorter_than_end'),
1073            '\ufffd\ufffd\ufffd\u1111\x00'
1074        )
1075        self.assertEqual(
1076            b'\x11\x11\x11\x00\x00\x11\x11\x00\x00\x00\x00'.decode(
1077                'utf-32-be', 'test.forward_shorter_than_end'),
1078            '\ufffd\ufffd\ufffd\u1111\x00'
1079        )
1080
1081        def replace_with_long(exc):
1082            if isinstance(exc, UnicodeDecodeError):
1083                exc.object = b"\x00" * 8
1084                return ('\ufffd', exc.start)
1085            else:
1086                raise TypeError("don't know how to handle %r" % exc)
1087        codecs.register_error("test.replace_with_long", replace_with_long)
1088
1089        self.assertEqual(
1090            b'\x00'.decode('utf-16', 'test.replace_with_long'),
1091            '\ufffd\x00\x00\x00\x00'
1092        )
1093        self.assertEqual(
1094            b'\x00'.decode('utf-32', 'test.replace_with_long'),
1095            '\ufffd\x00\x00'
1096        )
1097
1098
1099    def test_fake_error_class(self):
1100        handlers = [
1101            codecs.strict_errors,
1102            codecs.ignore_errors,
1103            codecs.replace_errors,
1104            codecs.backslashreplace_errors,
1105            codecs.namereplace_errors,
1106            codecs.xmlcharrefreplace_errors,
1107            codecs.lookup_error('surrogateescape'),
1108            codecs.lookup_error('surrogatepass'),
1109        ]
1110        for cls in UnicodeEncodeError, UnicodeDecodeError, UnicodeTranslateError:
1111            class FakeUnicodeError(str):
1112                __class__ = cls
1113            for handler in handlers:
1114                with self.subTest(handler=handler, error_class=cls):
1115                    self.assertRaises(TypeError, handler, FakeUnicodeError())
1116            class FakeUnicodeError(Exception):
1117                __class__ = cls
1118            for handler in handlers:
1119                with self.subTest(handler=handler, error_class=cls):
1120                    with self.assertRaises((TypeError, FakeUnicodeError)):
1121                        handler(FakeUnicodeError())
1122
1123
# Run the tests in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
1126