1import codecs
2import contextlib
3import io
4import locale
5import sys
6import unittest
7import encodings
8from unittest import mock
9
10from test import support
11
# ctypes is optional (absent on some minimal builds); when unavailable,
# SIZEOF_WCHAR_T is set to -1 so wchar_t-size-dependent tests can detect
# that the size is unknown.
try:
    import ctypes
except ImportError:
    ctypes = None
    SIZEOF_WCHAR_T = -1
else:
    SIZEOF_WCHAR_T = ctypes.sizeof(ctypes.c_wchar)
19
def coding_checker(self, coder):
    """Return a checker asserting ``coder(input) == (expect, len(input))``.

    *self* is a TestCase (supplies ``assertEqual``); *coder* is a codec
    encode/decode function returning an ``(output, consumed)`` pair.
    """
    def check(input, expect):
        actual = coder(input)
        self.assertEqual(actual, (expect, len(input)))
    return check
24
25
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end

    Backed by a growable buffer of whatever type is handed to the
    constructor (bytes or str), so the same class serves both directions.
    """
    def __init__(self, buffer):
        self._buffer = buffer

    def write(self, chars):
        self._buffer += chars

    def read(self, size=-1):
        if size < 0:
            # Drain everything, leaving an empty buffer of the same type.
            data, self._buffer = self._buffer, self._buffer[:0]
        else:
            data, self._buffer = self._buffer[:size], self._buffer[size:]
        return data
45
46
class MixInCheckStateHandling:
    """Mix-in verifying getstate()/setstate() of incremental codecs.

    Host classes must also derive from unittest.TestCase (the checks use
    its assertion methods).
    """

    def check_state_handling_decode(self, encoding, u, s):
        """Split *s* at every position; state hand-off must still yield *u*."""
        for i in range(len(s) + 1):
            decoder = codecs.getincrementaldecoder(encoding)()
            part1 = decoder.decode(s[:i])
            state = decoder.getstate()
            self.assertIsInstance(state[1], int)
            # Check that the condition stated in the documentation for
            # IncrementalDecoder.getstate() holds: with a zero integer
            # state, re-feeding the buffered bytes must produce no output
            # and restore the very same state.
            if not state[1]:
                # reset decoder to the default state without anything buffered
                decoder.setstate((state[0][:0], 0))
                # Feeding the previous input may not produce any output
                self.assertFalse(decoder.decode(state[0]))
                # The decoder must return to the same state
                self.assertEqual(state, decoder.getstate())
            # A fresh decoder primed with the captured state must finish
            # the job identically.
            decoder = codecs.getincrementaldecoder(encoding)()
            decoder.setstate(state)
            part2 = decoder.decode(s[i:], True)
            self.assertEqual(u, part1 + part2)

    def check_state_handling_encode(self, encoding, u, s):
        """Split *u* at every position; state hand-off must still yield *s*."""
        for i in range(len(u) + 1):
            encoder = codecs.getincrementalencoder(encoding)()
            part1 = encoder.encode(u[:i])
            state = encoder.getstate()
            encoder = codecs.getincrementalencoder(encoding)()
            encoder.setstate(state)
            part2 = encoder.encode(u[i:], True)
            self.assertEqual(s, part1 + part2)
79
80
class ReadTest(MixInCheckStateHandling):
    """Mix-in exercising StreamReader and incremental-decoder behaviour.

    Concrete subclasses must set ``encoding``; subclasses running
    test_lone_surrogates must also set ``ill_formed_sequence`` (the
    encoded form of a lone surrogate for that encoding).
    """

    def check_partial(self, input, partialresults):
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue(b"")
        r = codecs.getreader(self.encoding)(q)
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(bytes([c]))
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), "")
        self.assertEqual(r.bytebuffer, b"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # Check whether the reset method works properly
        d.reset()
        result = ""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(bytes([c]))
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode(b"", True), "")
        self.assertEqual(d.buffer, b"")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            "".join(codecs.iterdecode([bytes([c]) for c in encoded], self.encoding))
        )

    def test_readline(self):
        """readline() must split on \\n, \\r\\n, \\r and \\u2028 alike."""
        def getreader(input):
            stream = io.BytesIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Join all lines with "|" so missing/extra splits are visible.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = "foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = "foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = "foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", "\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*"\u3042" + lineend)
            vwo.append((i*200+200)*"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in range(80):
            for lineend in lineends:
                s = 10*(size*"a" + lineend + "xxx\n")
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for i in range(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        """Interleaving readline()/read()/readlines() must not lose data."""
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            stream = io.BytesIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        """Iterating a reader over many \\r\\n lines yields them verbatim."""
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = io.BytesIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() across writes: a trailing \\r must not eat the
        following \\n from a later write."""
        q = Queue(b"")
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=False), "foo")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=False), "")
        self.assertEqual(reader.readline(keepends=False), "bar")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=False), "baz")
        self.assertEqual(reader.readline(keepends=False), "")

        # Lineends
        writer.write("foo\r")
        self.assertEqual(reader.readline(keepends=True), "foo\r")
        writer.write("\nbar\r")
        self.assertEqual(reader.readline(keepends=True), "\n")
        self.assertEqual(reader.readline(keepends=True), "bar\r")
        writer.write("baz")
        self.assertEqual(reader.readline(keepends=True), "baz")
        self.assertEqual(reader.readline(keepends=True), "")
        writer.write("foo\r\n")
        self.assertEqual(reader.readline(keepends=True), "foo\r\n")

    def test_bug1098990_a(self):
        """readline() with a long line that forces multiple internal reads."""
        s1 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = "offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = "next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), "")

    def test_bug1098990_b(self):
        """Like test_bug1098990_a, with different line-length patterns."""
        s1 = "aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = "bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = "stillokay:bbbbxx\r\n"
        s4 = "broken!!!!badbad\r\n"
        s5 = "againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = io.BytesIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), "")

    # What decoding self.ill_formed_sequence with errors="replace" should
    # produce; subclasses override when one sequence maps to several U+FFFDs.
    ill_formed_sequence_replace = "\ufffd"

    def test_lone_surrogates(self):
        """Lone surrogates must fail to encode strictly, and each error
        handler must produce its documented substitution."""
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, self.encoding)
        self.assertEqual("[\uDC80]".encode(self.encoding, "backslashreplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "namereplace"),
                         "[\\udc80]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "xmlcharrefreplace"),
                         "[&#56448;]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uDC80]".encode(self.encoding, "replace"),
                         "[?]".encode(self.encoding))

        # sequential surrogate characters
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "ignore"),
                         "[]".encode(self.encoding))
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "replace"),
                         "[??]".encode(self.encoding))

        # Encoding "" yields just the BOM (if the encoding emits one).
        bom = "".encode(self.encoding)
        for before, after in [("\U00010fff", "A"), ("[", "]"),
                              ("A", "\U00010fff")]:
            before_sequence = before.encode(self.encoding)[len(bom):]
            after_sequence = after.encode(self.encoding)[len(bom):]
            test_string = before + "\uDC80" + after
            test_sequence = (bom + before_sequence +
                             self.ill_formed_sequence + after_sequence)
            self.assertRaises(UnicodeDecodeError, test_sequence.decode,
                              self.encoding)
            self.assertEqual(test_string.encode(self.encoding,
                                                "surrogatepass"),
                             test_sequence)
            self.assertEqual(test_sequence.decode(self.encoding,
                                                  "surrogatepass"),
                             test_string)
            self.assertEqual(test_sequence.decode(self.encoding, "ignore"),
                             before + after)
            self.assertEqual(test_sequence.decode(self.encoding, "replace"),
                             before + self.ill_formed_sequence_replace + after)
            backslashreplace = ''.join('\\x%02x' % b
                                       for b in self.ill_formed_sequence)
            self.assertEqual(test_sequence.decode(self.encoding, "backslashreplace"),
                             before + backslashreplace + after)
403
404
class UTF32Test(ReadTest, unittest.TestCase):
    """UTF-32 (BOM-prefixed, native byte order) codec tests."""

    encoding = "utf-32"
    # Encoded form of a lone surrogate (U+DC80) in native byte order,
    # used by ReadTest.test_lone_surrogates.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc\x00\x00"
    else:
        ill_formed_sequence = b"\x00\x00\xdc\x80"

    # "spamspam" encoded with a little-endian / big-endian BOM.
    spamle = (b'\xff\xfe\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              b's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = (b'\x00\x00\xfe\xff'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              b'\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """A StreamWriter must emit the BOM once, not once per write()."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = io.BytesIO(4*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(8*b"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte feeding: output appears only after each complete
        4-byte unit (the first four bytes are the BOM)."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read
                "", # third byte of BOM read
                "", # fourth byte of BOM read => byteorder known
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """'replace' and 'ignore' must consume the bad byte either way."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_32_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_32_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated unit in strict mode must raise UnicodeDecodeError."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Incremental decoder state must round-trip for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = b'\xff\xfe\x00\x00' + b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = b'\x00\x00\xfe\xff' + b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
499
500
class UTF32LETest(ReadTest, unittest.TestCase):
    """UTF-32-LE (fixed byte order, no BOM) codec tests."""

    encoding = "utf-32-le"
    # Little-endian encoding of a lone surrogate (U+DC80).
    ill_formed_sequence = b"\x80\xdc\x00\x00"

    def test_partial(self):
        """Byte-by-byte feeding: no BOM, so output appears after every
        complete 4-byte unit."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A non-BMP character encodes to its little-endian byte sequence."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x03\x02\x01\x00")

    def test_errors(self):
        """A truncated unit in strict mode must raise UnicodeDecodeError."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x00\x01\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
545
546
class UTF32BETest(ReadTest, unittest.TestCase):
    """UTF-32-BE (fixed byte order, no BOM) codec tests."""

    encoding = "utf-32-be"
    # Big-endian encoding of a lone surrogate (U+DC80).
    ill_formed_sequence = b"\x00\x00\xdc\x80"

    def test_partial(self):
        """Byte-by-byte feeding: no BOM, so output appears after every
        complete 4-byte unit."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "",
                "",
                "\x00",
                "\x00",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        """A non-BMP character encodes to its big-endian byte sequence."""
        self.assertEqual("\U00010203".encode(self.encoding), b"\x00\x01\x02\x03")

    def test_errors(self):
        """A truncated unit in strict mode must raise UnicodeDecodeError."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          b"\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = b'\x00\x01\x00\x00' * 1024
        self.assertEqual('\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
591
592
class UTF16Test(ReadTest, unittest.TestCase):
    """UTF-16 (BOM-prefixed, native byte order) codec tests."""

    encoding = "utf-16"
    # Encoded form of a lone surrogate (U+DC80) in native byte order.
    if sys.byteorder == 'little':
        ill_formed_sequence = b"\x80\xdc"
    else:
        ill_formed_sequence = b"\xdc\x80"

    # "spamspam" encoded with a little-endian / big-endian BOM.
    spamle = b'\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = b'\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """A StreamWriter must emit the BOM once, not once per write()."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = io.BytesIO()
        f = writer(s)
        f.write("spam")
        f.write("spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = io.BytesIO(d)
        f = reader(s)
        self.assertEqual(f.read(), "spamspam")

    def test_badbom(self):
        """Input starting with an invalid BOM must raise UnicodeError."""
        s = io.BytesIO(b"\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = io.BytesIO(b"\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Byte-by-byte feeding: output appears only after each complete
        code unit (the first two bytes are the BOM; the non-BMP character
        needs two units)."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "", # first byte of BOM read
                "", # second byte of BOM read => byteorder known
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        """'replace' and 'ignore' must consume the bad byte either way."""
        self.assertEqual(('\ufffd', 1),
                         codecs.utf_16_decode(b'\x01', 'replace', True))
        self.assertEqual(('', 1),
                         codecs.utf_16_decode(b'\x01', 'ignore', True))

    def test_errors(self):
        """A truncated unit in strict mode must raise UnicodeDecodeError."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode,
                          b"\xff", "strict", True)

    def test_decoder_state(self):
        """Incremental decoder state must round-trip for both byte orders."""
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spamle)
        self.check_state_handling_decode(self.encoding,
                                         "spamspam", self.spambe)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = 'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(support.unlink, support.TESTFN)
        with open(support.TESTFN, 'wb') as fp:
            fp.write(s)
        with support.check_warnings(('', DeprecationWarning)):
            reader = codecs.open(support.TESTFN, 'U', encoding=self.encoding)
        with reader:
            self.assertEqual(reader.read(), s1)
678
class UTF16LETest(ReadTest, unittest.TestCase):
    """UTF-16-LE (fixed byte order, no BOM) codec tests."""

    encoding = "utf-16-le"
    # Little-endian encoding of a lone surrogate (U+DC80).
    ill_formed_sequence = b"\x80\xdc"

    def test_partial(self):
        """Byte-by-byte feeding: no BOM, so output appears after every
        complete code unit (two units for the non-BMP character)."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Ill-formed input: strict mode raises; 'replace' yields exactly
        one U+FFFD per bad unit."""
        tests = [
            (b'\xff', '\ufffd'),
            (b'A\x00Z', 'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', 'ABCD\ufffd'),
            (b'\x00\xd8', '\ufffd'),
            (b'\x00\xd8A', '\ufffd'),
            (b'\x00\xd8A\x00', '\ufffdA'),
            (b'\x00\xdcA\x00', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16le', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through its surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\x00\xd8\x03\xde')
        self.assertEqual(b'\x00\xd8\x03\xde'.decode(self.encoding),
                         "\U00010203")
722
class UTF16BETest(ReadTest, unittest.TestCase):
    """UTF-16-BE (fixed byte order, no BOM) codec tests."""

    encoding = "utf-16-be"
    # Big-endian encoding of a lone surrogate (U+DC80).
    ill_formed_sequence = b"\xdc\x80"

    def test_partial(self):
        """Byte-by-byte feeding: no BOM, so output appears after every
        complete code unit (two units for the non-BMP character)."""
        self.check_partial(
            "\x00\xff\u0100\uffff\U00010000",
            [
                "",
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u0100",
                "\x00\xff\u0100",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff",
                "\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        """Ill-formed input: strict mode raises; 'replace' yields exactly
        one U+FFFD per bad unit."""
        tests = [
            (b'\xff', '\ufffd'),
            (b'\x00A\xff', 'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', 'ABCD\ufffd'),
            (b'\xd8\x00', '\ufffd'),
            (b'\xd8\x00\xdc', '\ufffd'),
            (b'\xd8\x00\x00A', '\ufffdA'),
            (b'\xdc\x00\x00A', '\ufffdA'),
        ]
        for raw, expected in tests:
            self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode,
                              raw, 'strict', True)
            self.assertEqual(raw.decode('utf-16be', 'replace'), expected)

    def test_nonbmp(self):
        """A non-BMP character round-trips through its surrogate pair."""
        self.assertEqual("\U00010203".encode(self.encoding),
                         b'\xd8\x00\xde\x03')
        self.assertEqual(b'\xd8\x00\xde\x03'.decode(self.encoding),
                         "\U00010203")
766
class UTF8Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-8 codec (also reused by UTF8SigTest via subclassing)."""
    encoding = "utf-8"
    # U+DC80 encoded as if it were a valid scalar value: ill-formed in
    # strict UTF-8, decodable only with 'surrogatepass'.
    ill_formed_sequence = b"\xed\xb2\x80"
    # With 'replace', each of the three ill-formed bytes becomes U+FFFD.
    ill_formed_sequence_replace = "\ufffd" * 3
    # No byte order mark for plain UTF-8; UTF8SigTest overrides this.
    BOM = b''

    def test_partial(self):
        # Each list entry is the text expected from an incremental decoder
        # after one more byte of input (see ReadTest.check_partial).  The
        # characters need 1, 2, 2, 3, 3 and 4 bytes respectively.
        self.check_partial(
            "\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "\x00",
                "\x00",
                "\x00\xff",
                "\x00\xff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff",
                "\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_decoder_state(self):
        # Exercise the incremental decoder's getstate()/setstate() contract
        # (see MixInCheckStateHandling.check_state_handling_decode).
        u = "\x00\x7f\x80\xff\u0100\u07ff\u0800\uffff\U0010ffff"
        self.check_state_handling_decode(self.encoding,
                                         u, u.encode(self.encoding))

    def test_decode_error(self):
        # Invalid bytes must be handled per the requested error handler.
        for data, error_handler, expected in (
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ):
            with self.subTest(data=data, error_handler=error_handler,
                              expected=expected):
                self.assertEqual(data.decode(self.encoding, error_handler),
                                 expected)

    def test_lone_surrogates(self):
        super().test_lone_surrogates()
        # not sure if this is making sense for
        # UTF-16 and UTF-32
        self.assertEqual("[\uDC80]".encode(self.encoding, "surrogateescape"),
                         self.BOM + b'[\x80]')

        # surrogateescape cannot encode surrogates outside the U+DC80..U+DCFF
        # range; the exception must point at the offending pair.
        with self.assertRaises(UnicodeEncodeError) as cm:
            "[\uDC80\uD800\uDFFF]".encode(self.encoding, "surrogateescape")
        exc = cm.exception
        self.assertEqual(exc.object[exc.start:exc.end], '\uD800\uDFFF')

    def test_surrogatepass_handler(self):
        # 'surrogatepass' encodes/decodes lone surrogates as CESU-8-style
        # three-byte sequences.
        self.assertEqual("abc\ud800def".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"abc\xed\xa0\x80def")
        self.assertEqual("\U00010fff\uD800".encode(self.encoding, "surrogatepass"),
                         self.BOM + b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual("[\uD800\uDC80]".encode(self.encoding, "surrogatepass"),
                         self.BOM + b'[\xed\xa0\x80\xed\xb2\x80]')

        self.assertEqual(b"abc\xed\xa0\x80def".decode(self.encoding, "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode(self.encoding, "surrogatepass"),
                         "\U00010fff\uD800")

        self.assertTrue(codecs.lookup_error("surrogatepass"))
        # Truncated or otherwise broken surrogate sequences still fail.
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0".decode(self.encoding, "surrogatepass")
        with self.assertRaises(UnicodeDecodeError):
            b"abc\xed\xa0z".decode(self.encoding, "surrogatepass")
842
843
@unittest.skipUnless(sys.platform == 'win32',
                     'cp65001 is a Windows-only codec')
class CP65001Test(ReadTest, unittest.TestCase):
    """Tests for cp65001, the Windows code page mapped to UTF-8."""
    encoding = "cp65001"

    def test_encode(self):
        # (text, error handler, expected bytes); expected=None means the
        # encode must raise UnicodeEncodeError.
        tests = [
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xc3\xa9\xe2\x82\xac'),
            ('\U0010ffff', 'strict', b'\xf4\x8f\xbf\xbf'),
            ('\udc80', 'strict', None),
            ('\udc80', 'ignore', b''),
            ('\udc80', 'replace', b'?'),
            ('\udc80', 'backslashreplace', b'\\udc80'),
            ('\udc80', 'namereplace', b'\\udc80'),
            ('\udc80', 'surrogatepass', b'\xed\xb2\x80'),
        ]
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = text.encode('cp65001', errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to cp65001 with '
                              'errors=%r: %s' % (text, errors, err))
                self.assertEqual(encoded, expected,
                    '%a.encode("cp65001", %r)=%a != %a'
                    % (text, errors, encoded, expected))
            else:
                self.assertRaises(UnicodeEncodeError,
                    text.encode, "cp65001", errors)

    def test_decode(self):
        # (raw bytes, error handler, expected text); expected=None means
        # the decode must raise UnicodeDecodeError.
        tests = [
            (b'abc', 'strict', 'abc'),
            (b'\xc3\xa9\xe2\x82\xac', 'strict', '\xe9\u20ac'),
            (b'\xf4\x8f\xbf\xbf', 'strict', '\U0010ffff'),
            (b'\xef\xbf\xbd', 'strict', '\ufffd'),
            (b'[\xc3\xa9]', 'strict', '[\xe9]'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xed\xb2\x80]', 'strict', None),
            (b'[\xed\xb2\x80]', 'ignore', '[]'),
            (b'[\xed\xb2\x80]', 'replace', '[\ufffd\ufffd\ufffd]'),
        ]
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = raw.decode('cp65001', errors)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from cp65001 with '
                              'errors=%r: %s' % (raw, errors, err))
                self.assertEqual(decoded, expected,
                    '%a.decode("cp65001", %r)=%a != %a'
                    % (raw, errors, decoded, expected))
            else:
                self.assertRaises(UnicodeDecodeError,
                    raw.decode, 'cp65001', errors)

    def test_lone_surrogates(self):
        # Lone surrogates are rejected in strict mode and handled by each
        # of the standard error handlers.
        self.assertRaises(UnicodeEncodeError, "\ud800".encode, "cp65001")
        self.assertRaises(UnicodeDecodeError, b"\xed\xa0\x80".decode, "cp65001")
        self.assertEqual("[\uDC80]".encode("cp65001", "backslashreplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "namereplace"),
                         b'[\\udc80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "xmlcharrefreplace"),
                         b'[&#56448;]')
        self.assertEqual("[\uDC80]".encode("cp65001", "surrogateescape"),
                         b'[\x80]')
        self.assertEqual("[\uDC80]".encode("cp65001", "ignore"),
                         b'[]')
        self.assertEqual("[\uDC80]".encode("cp65001", "replace"),
                         b'[?]')

    def test_surrogatepass_handler(self):
        # 'surrogatepass' round-trips lone surrogates (same behavior as the
        # utf-8 codec).
        self.assertEqual("abc\ud800def".encode("cp65001", "surrogatepass"),
                         b"abc\xed\xa0\x80def")
        self.assertEqual(b"abc\xed\xa0\x80def".decode("cp65001", "surrogatepass"),
                         "abc\ud800def")
        self.assertEqual("\U00010fff\uD800".encode("cp65001", "surrogatepass"),
                         b"\xf0\x90\xbf\xbf\xed\xa0\x80")
        self.assertEqual(b"\xf0\x90\xbf\xbf\xed\xa0\x80".decode("cp65001", "surrogatepass"),
                         "\U00010fff\uD800")
        self.assertTrue(codecs.lookup_error("surrogatepass"))
931
932
class UTF7Test(ReadTest, unittest.TestCase):
    """Tests for the UTF-7 codec (RFC 2152)."""
    encoding = "utf-7"

    def test_ascii(self):
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d.encode('ascii'))
        self.assertEqual(set_d.encode('ascii').decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o.encode('ascii'))
        self.assertEqual(set_o.encode('ascii').decode(self.encoding), set_o)
        # + introduces a base64 run, so a literal '+' is escaped as '+-'
        self.assertEqual('a+b'.encode(self.encoding), b'a+-b')
        self.assertEqual(b'a+-b'.decode(self.encoding), 'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws.encode('ascii'))
        self.assertEqual(ws.encode('ascii').decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(bytes(range(0x80)).decode()) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         b'+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         b'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # Each list entry is the text expected from an incremental decoder
        # after one more byte of input (see ReadTest.check_partial); no
        # output may appear while a base64 run is still incomplete.
        self.check_partial(
            'a+-b\x00c\x80d\u0100e\U00010000f',
            [
                'a',
                'a',
                'a+',
                'a+-',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b',
                'a+-b\x00',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c',
                'a+-b\x00c\x80',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d',
                'a+-b\x00c\x80d\u0100',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e',
                'a+-b\x00c\x80d\u0100e\U00010000',
                'a+-b\x00c\x80d\u0100e\U00010000f',
            ]
        )

    def test_errors(self):
        # Malformed base64 runs: strict decoding must raise, and 'replace'
        # must substitute U+FFFD for exactly the ill-formed part.
        tests = [
            (b'\xffb', '\ufffdb'),
            (b'a\xffb', 'a\ufffdb'),
            (b'a\xff\xffb', 'a\ufffd\ufffdb'),
            (b'a+IK', 'a\ufffd'),
            (b'a+IK-b', 'a\ufffdb'),
            (b'a+IK,b', 'a\ufffdb'),
            (b'a+IKx', 'a\u20ac\ufffd'),
            (b'a+IKx-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr', 'a\u20ac\ufffd'),
            (b'a+IKwgr-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgr,', 'a\u20ac\ufffd'),
            (b'a+IKwgr,-b', 'a\u20ac\ufffd-b'),
            (b'a+IKwgrB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrB-b', 'a\u20ac\u20ac\ufffdb'),
            (b'a+/,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+//,+IKw-b', 'a\ufffd\u20acb'),
            (b'a+///,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+////,+IKw-b', 'a\uffff\ufffd\u20acb'),
            (b'a+IKw-b\xff', 'a\u20acb\ufffd'),
            (b'a+IKw\xffb', 'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertRaises(UnicodeDecodeError, codecs.utf_7_decode,
                                raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)

    def test_nonbmp(self):
        # Non-BMP characters travel through UTF-7 as a base64-encoded
        # UTF-16 surrogate pair; the trailing '-' terminator is optional.
        self.assertEqual('\U000104A0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual('\ud801\udca0'.encode(self.encoding), b'+2AHcoA-')
        self.assertEqual(b'+2AHcoA-'.decode(self.encoding), '\U000104A0')
        self.assertEqual(b'+2AHcoA'.decode(self.encoding), '\U000104A0')
        self.assertEqual('\u20ac\U000104A0'.encode(self.encoding), b'+IKzYAdyg-')
        self.assertEqual(b'+IKzYAdyg-'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual(b'+IKzYAdyg'.decode(self.encoding), '\u20ac\U000104A0')
        self.assertEqual('\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         b'+IKwgrNgB3KA-')
        self.assertEqual(b'+IKwgrNgB3KA-'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')
        self.assertEqual(b'+IKwgrNgB3KA'.decode(self.encoding),
                         '\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Unpaired surrogates inside a base64 run decode to the surrogate
        # itself when complete, or to U+FFFD (with 'replace') when the run
        # is truncated or malformed.
        tests = [
            (b'a+2AE-b', 'a\ud801b'),
            (b'a+2AE\xffb', 'a\ufffdb'),
            (b'a+2AE', 'a\ufffd'),
            (b'a+2AEA-b', 'a\ufffdb'),
            (b'a+2AH-b', 'a\ufffdb'),
            (b'a+IKzYAQ-b', 'a\u20ac\ud801b'),
            (b'a+IKzYAQ\xffb', 'a\u20ac\ufffdb'),
            (b'a+IKzYAQA-b', 'a\u20ac\ufffdb'),
            (b'a+IKzYAd-b', 'a\u20ac\ufffdb'),
            (b'a+IKwgrNgB-b', 'a\u20ac\u20ac\ud801b'),
            (b'a+IKwgrNgB\xffb', 'a\u20ac\u20ac\ufffdb'),
            (b'a+IKwgrNgB', 'a\u20ac\u20ac\ufffd'),
            (b'a+IKwgrNgBA-b', 'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            with self.subTest(raw=raw):
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
1064
1065
class UTF16ExTest(unittest.TestCase):
    """Tests for the low-level codecs.utf_16_ex_decode() helper."""

    def test_errors(self):
        # A lone 0xff byte can never form a complete UTF-16 code unit.
        with self.assertRaises(UnicodeDecodeError):
            codecs.utf_16_ex_decode(b"\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without the mandatory arguments must be rejected.
        with self.assertRaises(TypeError):
            codecs.utf_16_ex_decode()
1073
class ReadBufferTest(unittest.TestCase):
    """Tests for codecs.readbuffer_encode(), which copies buffer-like
    input to a bytes object."""

    def test_array(self):
        # Buffer-protocol objects other than bytes are accepted as well.
        import array
        buf = array.array("b", b"spam")
        self.assertEqual(codecs.readbuffer_encode(buf), (b"spam", 4))

    def test_empty(self):
        # The empty string maps to empty bytes with a consumed length of 0.
        self.assertEqual(codecs.readbuffer_encode(""), (b"", 0))

    def test_bad_args(self):
        # A missing argument and a non-buffer argument must both fail.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
1089
class UTF8SigTest(UTF8Test, unittest.TestCase):
    """Tests for utf-8-sig: UTF-8 that writes a BOM and strips a leading
    BOM on decode.  Reuses all UTF8Test cases with BOM set accordingly."""
    encoding = "utf-8-sig"
    BOM = codecs.BOM_UTF8

    def test_partial(self):
        # Each list entry is the text expected from an incremental decoder
        # after one more byte of input (see ReadTest.check_partial).  Only
        # the very first BOM is treated as a signature and skipped.
        self.check_partial(
            "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                "",
                "",
                "", # First BOM has been read and skipped
                "",
                "",
                "\ufeff", # Second BOM has been read and emitted
                "\ufeff\x00", # "\x00" read and emitted
                "\ufeff\x00", # First byte of encoded "\xff" read
                "\ufeff\x00\xff", # Second byte of encoded "\xff" read
                "\ufeff\x00\xff", # First byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff", # Second byte of encoded "\u07ff" read
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff",
                "\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        self.assertEqual(str(b"\xef\xbb\xbf", "utf-8-sig"), "")

    def test_bom(self):
        # The incremental decoder must strip the signature BOM.
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = "spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def test_stream_bom(self):
        # Stream reading with a leading BOM: the BOM must be stripped for
        # every read chunk size.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bare(self):
        # Stream reading without a BOM must work identically.
        unistring = "ABC\u00A1\u2200XYZ"
        bytestring = b"ABC\xC2\xA1\xE2\x88\x80XYZ"

        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + list(range(1, 11)) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(io.BytesIO(bytestring))
            ostream = io.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)
1174
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode(), the bytes string-escape decoder."""
    def test_empty(self):
        self.assertEqual(codecs.escape_decode(b""), (b"", 0))
        self.assertEqual(codecs.escape_decode(bytearray()), (b"", 0))

    def test_raw(self):
        # Every byte except backslash passes through unchanged.
        decode = codecs.escape_decode
        for b in range(256):
            b = bytes([b])
            if b != b'\\':
                self.assertEqual(decode(b + b'0'), (b + b'0', 2))

    def test_escape(self):
        # Recognized escapes: quote/backslash, C control escapes, octal
        # (up to three digits) and hex (\xNN).
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        check(b"[\\\n]", b"[]")
        check(br'[\"]', b'["]')
        check(br"[\']", b"[']")
        check(br"[\\]", b"[\\]")
        check(br"[\a]", b"[\x07]")
        check(br"[\b]", b"[\x08]")
        check(br"[\t]", b"[\x09]")
        check(br"[\n]", b"[\x0a]")
        check(br"[\v]", b"[\x0b]")
        check(br"[\f]", b"[\x0c]")
        check(br"[\r]", b"[\x0d]")
        check(br"[\7]", b"[\x07]")
        check(br"[\78]", b"[\x078]")
        check(br"[\41]", b"[!]")
        check(br"[\418]", b"[!8]")
        check(br"[\101]", b"[A]")
        check(br"[\1010]", b"[A0]")
        check(br"[\501]", b"[A]")
        check(br"[\x41]", b"[A]")
        check(br"[\x410]", b"[A0]")
        # Unrecognized escapes are kept verbatim but now warn.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, b"\\" + b)
            with self.assertWarns(DeprecationWarning):
                check(b"\\" + b.upper(), b"\\" + b.upper())
        with self.assertWarns(DeprecationWarning):
            check(br"\8", b"\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", b"\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", b"\\\xfa")

    def test_errors(self):
        # Truncated \x escapes: strict raises; 'ignore' drops the escape,
        # 'replace' substitutes '?'.
        decode = codecs.escape_decode
        self.assertRaises(ValueError, decode, br"\x")
        self.assertRaises(ValueError, decode, br"[\x]")
        self.assertEqual(decode(br"[\x]\x", "ignore"), (b"[]", 6))
        self.assertEqual(decode(br"[\x]\x", "replace"), (b"[?]?", 6))
        self.assertRaises(ValueError, decode, br"\x0")
        self.assertRaises(ValueError, decode, br"[\x0]")
        self.assertEqual(decode(br"[\x0]\x0", "ignore"), (b"[]", 8))
        self.assertEqual(decode(br"[\x0]\x0", "replace"), (b"[?]?", 8))
1234
1235
class RecodingTest(unittest.TestCase):
    """Regression test for transparent recoding with codecs.EncodedFile."""

    def test_recoding(self):
        # Python used to crash at interpreter exit after this sequence
        # because of a refcount bug in _codecsmodule.c.
        raw = io.BytesIO()
        recoder = codecs.EncodedFile(raw, "unicode_internal", "utf-8")
        recoder.write("a")
        recoder.close()

        # Closing the wrapper must close the underlying stream too.
        self.assertTrue(raw.closed)
1246
1247# From RFC 3492
# From RFC 3492
# Each entry pairs a Unicode string with its expected Punycode encoding
# (as ASCII bytes).  The labels (A)-(S) match the sample strings in
# RFC 3492 section 7.1.
punycode_testcases = [
    # A Arabic (Egyptian):
    ("\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     "\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     b"egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    ("\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     b"ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    ("\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     b"ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    ("\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     "\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     "\u0065\u0073\u006B\u0079",
     b"Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    ("\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     "\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     "\u05D1\u05E8\u05D9\u05EA",
     b"4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    ("\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     "\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     "\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     "\u0939\u0948\u0902",
     b"i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    ("\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     "\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     b"n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    ("\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     "\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     "\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     b"989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     b"psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    ("\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     "\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     "\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     "\u0438",
     b"b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    ("\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     "\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     "\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     "\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     "\u0061\u00F1\u006F\u006C",
     b"PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    ("\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     "\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     "\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     "\u0056\u0069\u1EC7\u0074",
     b"TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    ("\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     b"3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    ("\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     "\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     "\u004F\u004E\u004B\u0045\u0059\u0053",
     b"-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    ("\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     "\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     "\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     b"Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    ("\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     b"2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    ("\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     "\u308B\u0035\u79D2\u524D",
     b"MajiKoi5-783gue6qz075azm5e"),

     # (Q) <pafii>de<runba>
    ("\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     b"de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    ("\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     b"d9juau41awczczp"),

    # (S) -> $1.00 <-
    ("\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     "\u003C\u002D",
     b"-> $1.00 <--")
    ]

# Sanity check: flag (but don't fail on) any malformed entry that is not
# a (unicode, punycode) pair.
for i in punycode_testcases:
    if len(i)!=2:
        print(repr(i))
1354
1355
class PunycodeTest(unittest.TestCase):
    """Round-trip tests for the punycode codec against the RFC 3492 samples."""

    def test_encode(self):
        for uni, puny in punycode_testcases:
            # Compare case-insensitively: some reference encodings use
            # upper case where our encoder only emits lower case, and
            # lowering just the reference is not enough because several
            # inputs contain upper-case basic code points.
            produced = str(uni.encode("punycode"), "ascii")
            reference = str(puny, "ascii")
            self.assertEqual(produced.lower(), reference.lower())

    def test_decode(self):
        for uni, puny in punycode_testcases:
            self.assertEqual(uni, puny.decode("punycode"))
            # A str->bytes round trip through ASCII must decode the same.
            puny = puny.decode("ascii").encode("ascii")
            self.assertEqual(uni, puny.decode("punycode"))
1374
1375
class UnicodeInternalTest(unittest.TestCase):
    """Tests for the deprecated unicode_internal codec.

    Every use is wrapped in support.check_warnings() because the codec
    emits a DeprecationWarning.  Most cases only apply on builds with a
    4-byte wchar_t.
    """
    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            # The test vectors are written big-endian; the codec uses
            # native byte order, so flip them on little-endian machines.
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings():
                self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")
        # Code point 0x110000 (just above the Unicode range) in native order.
        if sys.byteorder == "little":
            invalid = b"\x00\x00\x11\x00"
            invalid_backslashreplace = r"\x00\x00\x11\x00"
        else:
            invalid = b"\x00\x11\x00\x00"
            invalid_backslashreplace = r"\x00\x11\x00\x00"
        with support.check_warnings():
            self.assertRaises(UnicodeDecodeError,
                              invalid.decode, "unicode_internal")
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "replace"),
                             '\ufffd')
        with support.check_warnings():
            self.assertEqual(invalid.decode("unicode_internal", "backslashreplace"),
                             invalid_backslashreplace)

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must carry the codec name, the
        # original bytes and the offending 4-byte range.
        try:
            with support.check_warnings(('unicode_internal codec has been '
                                         'deprecated', DeprecationWarning)):
                b"\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
        except UnicodeDecodeError as ex:
            self.assertEqual("unicode_internal", ex.encoding)
            self.assertEqual(b"\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
            self.assertEqual(4, ex.start)
            self.assertEqual(8, ex.end)
        else:
            self.fail()

    @unittest.skipUnless(SIZEOF_WCHAR_T == 4, 'specific to 32-bit wchar_t')
    def test_decode_callback(self):
        # A registered error handler (here: ignore) must be honored; the
        # returned length counts all 12 consumed input bytes.
        codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
        decoder = codecs.getdecoder("unicode_internal")
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            ab = "ab".encode("unicode_internal").decode()
            ignored = decoder(bytes("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                                    "ascii"),
                              "UnicodeInternalTest")
        self.assertEqual(("ab", 12), ignored)

    def test_encode_length(self):
        with support.check_warnings(('unicode_internal codec has been '
                                     'deprecated', DeprecationWarning)):
            # Issue 3739
            encoder = codecs.getencoder("unicode_internal")
            self.assertEqual(encoder("a")[1], 1)
            self.assertEqual(encoder("\xe9\u0142")[1], 2)

            self.assertEqual(codecs.escape_encode(br'\x00')[1], 4)
1456
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Each entry is an (input, expected_output) pair of UTF-8 encoded byte
# strings.  expected_output is None when nameprep() must reject the input
# (prohibited character, bad bidi structure, ...).  An entry of (None, None)
# marks a vector that is deliberately skipped; see the comment above it.
nameprep_tests = [
    # 3.1 Map to nothing.
    (b'foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     b'\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     b'\xb8\x8f\xef\xbb\xbf',
     b'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    (b'CAFE',
     b'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    (b'\xc3\x9f',
     b'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    (b'\xc4\xb0',
     b'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    (b'\xc5\x83\xcd\xba',
     b'\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    (b'j\xcc\x8c\xc2\xa0\xc2\xaa',
     b'\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    (b'\xe1\xbe\xb7',
     b'\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    (b'\xc7\xb0',
     b'\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    (b'\xce\x90',
     b'\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    (b'\xce\xb0',
     b'\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    (b'\xe1\xba\x96',
     b'\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    (b'\xe1\xbd\x96',
     b'\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (b' ',
     b' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    (b'\xc2\xa0',
     b' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    (b'\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    (b'\xe2\x80\x80',
     b' '),
    # 3.18 Zero Width Space U+200b.
    (b'\xe2\x80\x8b',
     b''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    (b'\xe3\x80\x80',
     b' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    (b'\x10\x7f',
     b'\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    (b'\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    (b'\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    (b'\xef\xbb\xbf',
     b''),
    # 3.24 Non-ASCII control character U+1D175.
    (b'\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    (b'\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    (b'\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    (b'\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    (b'\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    (b'\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    (b'\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    (b'\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    (b'\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    (b'\xcd\x81',
     b'\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    (b'\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    (b'\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    (b'\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    (b'\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    (b'foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    (b'foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    (b'foo\xef\xb9\xb6bar',
     b'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    (b'\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    (b'\xd8\xa71\xd8\xa8',
     b'\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #(b'\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    (b'X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     b'\xaa\xce\xb0\xe2\x80\x80',
     b'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    (b'X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     b'\x80',
     b'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     b'\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     b'\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1609
1610
class NameprepTest(unittest.TestCase):
    """Run the RFC 3454 nameprep test vectors in nameprep_tests."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Vector deliberately skipped (see nameprep_tests comments)
                continue
            # The Unicode strings are given in UTF-8
            orig = str(orig, "utf-8", "surrogatepass")
            if prepped is None:
                # Input contains prohibited characters
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = str(prepped, "utf-8", "surrogatepass")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Report a proper test *failure* carrying the vector
                    # number; raising support.TestFailed turned mismatches
                    # into errors and dropped the assertion context.
                    self.fail("Test 3.%d: %s" % (pos + 1, str(e)))
1629
1630
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec (ACE <-> Unicode domain names)."""

    def test_builtin_decode(self):
        # bytes -> str through the codec registry, with and without a
        # trailing dot, for plain and internationalized names.
        for raw, expected in [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org", "pyth\xf6n.org"),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]:
            self.assertEqual(str(raw, "idna"), expected)

    def test_builtin_encode(self):
        # str -> bytes: non-ASCII labels get the "xn--" ACE prefix.
        for text, expected in [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org", b"xn--pythn-mua.org"),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]:
            self.assertEqual(text.encode("idna"), expected)

    def test_stream(self):
        reader = codecs.getreader("idna")(io.BytesIO(b"abc"))
        reader.read(3)
        # The stream is exhausted; a further read yields the empty string.
        self.assertEqual(reader.read(), "")

    def test_incremental_decode(self):
        # iterdecode() fed one byte at a time must agree with a one-shot
        # decode of the whole input.
        for raw, expected in [
            (b"python.org", "python.org"),
            (b"python.org.", "python.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
            (b"xn--pythn-mua.org.", "pyth\xf6n.org."),
        ]:
            one_byte_chunks = (bytes([byte]) for byte in raw)
            self.assertEqual(
                "".join(codecs.iterdecode(one_byte_chunks, "idna")),
                expected)

        # A label is only emitted once its terminating dot (or final=True)
        # has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg"), "")
        self.assertEqual(decoder.decode(b"", True), "org")

        # reset() must discard any buffered partial label.
        decoder.reset()
        self.assertEqual(decoder.decode(b"xn--xam"), "")
        self.assertEqual(decoder.decode(b"ple-9ta.o"), "\xe4xample.")
        self.assertEqual(decoder.decode(b"rg."), "org.")
        self.assertEqual(decoder.decode(b"", True), "")

    def test_incremental_encode(self):
        # iterencode() must agree with a one-shot encode.
        for text, expected in [
            ("python.org", b"python.org"),
            ("python.org.", b"python.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
            ("pyth\xf6n.org.", b"xn--pythn-mua.org."),
        ]:
            self.assertEqual(b"".join(codecs.iterencode(text, "idna")),
                             expected)

        # Labels are encoded lazily, once complete (dot seen or final=True).
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org"), b"xn--xample-9ta.")
        self.assertEqual(encoder.encode("", True), b"org")

        encoder.reset()
        self.assertEqual(encoder.encode("\xe4x"), b"")
        self.assertEqual(encoder.encode("ample.org."), b"xn--xample-9ta.org.")
        self.assertEqual(encoder.encode("", True), b"")

    def test_errors(self):
        """Only supports "strict" error handler"""
        "python.org".encode("idna", "strict")
        b"python.org".decode("idna", "strict")
        for errors in ("ignore", "replace", "backslashreplace",
                       "surrogateescape"):
            self.assertRaises(Exception, "python.org".encode, "idna", errors)
            self.assertRaises(Exception,
                              b"python.org".decode, "idna", errors)
1716
1717
1718class CodecsModuleTest(unittest.TestCase):
1719
1720    def test_decode(self):
1721        self.assertEqual(codecs.decode(b'\xe4\xf6\xfc', 'latin-1'),
1722                         '\xe4\xf6\xfc')
1723        self.assertRaises(TypeError, codecs.decode)
1724        self.assertEqual(codecs.decode(b'abc'), 'abc')
1725        self.assertRaises(UnicodeDecodeError, codecs.decode, b'\xff', 'ascii')
1726
1727        # test keywords
1728        self.assertEqual(codecs.decode(obj=b'\xe4\xf6\xfc', encoding='latin-1'),
1729                         '\xe4\xf6\xfc')
1730        self.assertEqual(codecs.decode(b'[\xff]', 'ascii', errors='ignore'),
1731                         '[]')
1732
1733    def test_encode(self):
1734        self.assertEqual(codecs.encode('\xe4\xf6\xfc', 'latin-1'),
1735                         b'\xe4\xf6\xfc')
1736        self.assertRaises(TypeError, codecs.encode)
1737        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
1738        self.assertEqual(codecs.encode('abc'), b'abc')
1739        self.assertRaises(UnicodeEncodeError, codecs.encode, '\xffff', 'ascii')
1740
1741        # test keywords
1742        self.assertEqual(codecs.encode(obj='\xe4\xf6\xfc', encoding='latin-1'),
1743                         b'\xe4\xf6\xfc')
1744        self.assertEqual(codecs.encode('[\xff]', 'ascii', errors='ignore'),
1745                         b'[]')
1746
1747    def test_register(self):
1748        self.assertRaises(TypeError, codecs.register)
1749        self.assertRaises(TypeError, codecs.register, 42)
1750
1751    def test_lookup(self):
1752        self.assertRaises(TypeError, codecs.lookup)
1753        self.assertRaises(LookupError, codecs.lookup, "__spam__")
1754        self.assertRaises(LookupError, codecs.lookup, " ")
1755
1756    def test_getencoder(self):
1757        self.assertRaises(TypeError, codecs.getencoder)
1758        self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1759
1760    def test_getdecoder(self):
1761        self.assertRaises(TypeError, codecs.getdecoder)
1762        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1763
1764    def test_getreader(self):
1765        self.assertRaises(TypeError, codecs.getreader)
1766        self.assertRaises(LookupError, codecs.getreader, "__spam__")
1767
1768    def test_getwriter(self):
1769        self.assertRaises(TypeError, codecs.getwriter)
1770        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
1771
1772    def test_lookup_issue1813(self):
1773        # Issue #1813: under Turkish locales, lookup of some codecs failed
1774        # because 'I' is lowercased as "ı" (dotless i)
1775        oldlocale = locale.setlocale(locale.LC_CTYPE)
1776        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1777        try:
1778            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1779        except locale.Error:
1780            # Unsupported locale on this system
1781            self.skipTest('test needs Turkish locale')
1782        c = codecs.lookup('ASCII')
1783        self.assertEqual(c.name, 'ascii')
1784
1785    def test_all(self):
1786        api = (
1787            "encode", "decode",
1788            "register", "CodecInfo", "Codec", "IncrementalEncoder",
1789            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1790            "getencoder", "getdecoder", "getincrementalencoder",
1791            "getincrementaldecoder", "getreader", "getwriter",
1792            "register_error", "lookup_error",
1793            "strict_errors", "replace_errors", "ignore_errors",
1794            "xmlcharrefreplace_errors", "backslashreplace_errors",
1795            "namereplace_errors",
1796            "open", "EncodedFile",
1797            "iterencode", "iterdecode",
1798            "BOM", "BOM_BE", "BOM_LE",
1799            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1800            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1801            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
1802            "StreamReaderWriter", "StreamRecoder",
1803        )
1804        self.assertCountEqual(api, codecs.__all__)
1805        for api in codecs.__all__:
1806            getattr(codecs, api)
1807
1808    def test_open(self):
1809        self.addCleanup(support.unlink, support.TESTFN)
1810        for mode in ('w', 'r', 'r+', 'w+', 'a', 'a+'):
1811            with self.subTest(mode), \
1812                    codecs.open(support.TESTFN, mode, 'ascii') as file:
1813                self.assertIsInstance(file, codecs.StreamReaderWriter)
1814
1815    def test_undefined(self):
1816        self.assertRaises(UnicodeError, codecs.encode, 'abc', 'undefined')
1817        self.assertRaises(UnicodeError, codecs.decode, b'abc', 'undefined')
1818        self.assertRaises(UnicodeError, codecs.encode, '', 'undefined')
1819        self.assertRaises(UnicodeError, codecs.decode, b'', 'undefined')
1820        for errors in ('strict', 'ignore', 'replace', 'backslashreplace'):
1821            self.assertRaises(UnicodeError,
1822                codecs.encode, 'abc', 'undefined', errors)
1823            self.assertRaises(UnicodeError,
1824                codecs.decode, b'abc', 'undefined', errors)
1825
1826
class StreamReaderTest(unittest.TestCase):
    """StreamReader behaviour on multi-byte (UTF-8) input."""

    def setUp(self):
        self.reader = codecs.getreader('utf-8')
        # Two Hangul syllables separated by a newline, UTF-8 encoded.
        self.stream = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        stream_reader = self.reader(self.stream)
        lines = stream_reader.readlines()
        self.assertEqual(lines, ['\ud55c\n', '\uae00'])
1836
1837
class EncodedFileTest(unittest.TestCase):
    """codecs.EncodedFile transcodes between a data and a file encoding."""

    def test_basic(self):
        # Reading: bytes from the stream are decoded with the file
        # encoding (utf-8) and re-encoded with the data encoding
        # (utf-16-le) before being returned.
        raw = io.BytesIO(b'\xed\x95\x9c\n\xea\xb8\x80')
        wrapped = codecs.EncodedFile(raw, 'utf-16-le', 'utf-8')
        self.assertEqual(wrapped.read(), b'\\\xd5\n\x00\x00\xae')

        # Writing: the reverse transcoding, utf-8 -> latin-1.
        raw = io.BytesIO()
        wrapped = codecs.EncodedFile(raw, 'utf-8', 'latin-1')
        wrapped.write(b'\xc3\xbc')
        self.assertEqual(raw.getvalue(), b'\xfc')
1849
# Every codec that is expected to round-trip plain ASCII text; exercised by
# BasicUnicodeTest below through the stateless, stream and incremental APIs.
all_unicode_encodings = [
    "ascii",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1125",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_t",
    "koi8_u",
    "kz1048",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]

# "mbcs" and "oem" only exist on Windows builds; detect them by probing the
# codecs module rather than checking the platform.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")
if hasattr(codecs, "oem_encode"):
    all_unicode_encodings.append("oem")

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_stateful = [
    "punycode",
    "unicode_internal"
]
1969
1970
class BasicUnicodeTest(unittest.TestCase, MixInCheckStateHandling):
    """Run every codec in all_unicode_encodings through the stateless,
    stream and incremental codec APIs and check that "abc123" round-trips.
    """

    def test_basics(self):
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                # "*_codec" aliases resolve to the bare name; restore the
                # suffix so the normalized comparison below still matches.
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            # Codec names use "-" where our list uses "_"; normalize both.
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))

            with support.check_warnings():
                # unicode-internal has been deprecated
                (b, size) = codecs.getencoder(encoding)(s)
                self.assertEqual(size, len(s), "encoding=%r" % encoding)
                (chars, size) = codecs.getdecoder(encoding)(b)
                self.assertEqual(chars, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check stream reader/writer: write one character at a time
                # and collect whatever bytes the writer emits.
                q = Queue(b"")
                writer = codecs.getwriter(encoding)(q)
                encodedresult = b""
                for c in s:
                    writer.write(c)
                    chunk = q.read()
                    self.assertTrue(type(chunk) is bytes, type(chunk))
                    encodedresult += chunk
                # Feed the encoded bytes back one byte at a time and check
                # the reader reassembles the original string.
                q = Queue(b"")
                reader = codecs.getreader(encoding)(q)
                decodedresult = ""
                for c in encodedresult:
                    q.write(bytes([c]))
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "encoding=%r" % encoding)

            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder
                    encodedresult = b""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode("", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += decoder.decode(bytes([c]))
                    decodedresult += decoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                    # check iterencode()/iterdecode()
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "encoding=%r" % encoding)

                    # check iterencode()/iterdecode() with empty string
                    result = "".join(codecs.iterdecode(
                            codecs.iterencode("", encoding), encoding))
                    self.assertEqual(result, "")

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = "".join(decoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    @support.cpython_only
    def test_basics_capi(self):
        # Same incremental round-trip as test_basics, but with codec objects
        # obtained through the C API (_testcapi wrappers).
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        s = "abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                # check incremental decoder/encoder (fetched via the C API)
                try:
                    cencoder = codec_incrementalencoder(encoding)
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    # check C API
                    encodedresult = b""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode("", True)
                    cdecoder = codec_incrementaldecoder(encoding)
                    decodedresult = ""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(bytes([c]))
                    decodedresult += cdecoder.decode(b"", True)
                    self.assertEqual(decodedresult, s,
                                     "encoding=%r" % encoding)

                if encoding not in ("idna", "mbcs"):
                    # check incremental decoder/encoder with errors argument
                    try:
                        cencoder = codec_incrementalencoder(encoding, "ignore")
                    except LookupError:  # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = b"".join(cencoder.encode(c) for c in s)
                        cdecoder = codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = "".join(cdecoder.decode(bytes([c]))
                                                for c in encodedresult)
                        self.assertEqual(decodedresult, s,
                                         "encoding=%r" % encoding)

    def test_seek(self):
        # all codecs should be able to encode these
        s = "%s\n%s\n" % (100*"abc123", 100*"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_stateful:
                continue
            reader = codecs.getreader(encoding)(io.BytesIO(s.encode(encoding)))
            for t in range(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                data = reader.read()
                self.assertEqual(s, data)

    def test_bad_decode_args(self):
        # Calling a decoder with no arguments, or with a non-bytes argument,
        # must raise TypeError ("idna"/"punycode" accept str input too).
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Calling an encoder with no arguments must raise TypeError.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            with support.check_warnings():
                # unicode-internal has been deprecated
                self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)

    def test_decoder_state(self):
        # Check that getstate() and setstate() handle the state properly
        u = "abc123"
        for encoding in all_unicode_encodings:
            if encoding not in broken_unicode_with_stateful:
                self.check_state_handling_decode(encoding, u, u.encode(encoding))
                self.check_state_handling_encode(encoding, u, u.encode(encoding))
2132
2133
class CharmapTest(unittest.TestCase):
    """codecs.charmap_decode with its three mapping flavours: a str map,
    an int -> str map, and an int -> int map.
    """

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode
        # A str map translates byte value i to map[i].
        self.assertEqual(decode(b"\x00\x01\x02", "strict", "abc"),
                         ("abc", 3))
        # Astral characters in the map are fine.
        self.assertEqual(decode(b"\x00\x01\x02", "strict", "\U0010FFFFbc"),
                         ("\U0010FFFFbc", 3))

        # A byte past the end of the map, or one mapped to U+FFFE, is a
        # strict-mode decoding error...
        for bad_map in ("ab", "ab\ufffe"):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", bad_map)

        # ...and is substituted / escaped / dropped by the other handlers.
        for handler, expected in [("replace", "ab\ufffd"),
                                  ("backslashreplace", "ab\\x02"),
                                  ("ignore", "ab")]:
            for bad_map in ("ab", "ab\ufffe"):
                self.assertEqual(decode(b"\x00\x01\x02", handler, bad_map),
                                 (expected, 3))

        # An empty map with "ignore" consumes all input, producing nothing.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", ""),
                         ("", len(allbytes)))

    def test_decode_with_int2str_map(self):
        decode = codecs.charmap_decode
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: 'a', 1: 'b', 2: 'c'}),
                         ("abc", 3))
        # A byte may expand to more than one character...
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: 'Aa', 1: 'Bb', 2: 'Cc'}),
                         ("AaBbCc", 3))
        # ...including astral ones...
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: '\U0010FFFF', 1: 'b', 2: 'c'}),
                         ("\U0010FFFFbc", 3))
        # ...or to nothing at all (empty string simply drops the byte).
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: 'a', 1: 'b', 2: ''}),
                         ("ab", 3))

        # A missing key, a None value, or (Issue #14850) a U+FFFE value
        # is a strict-mode decoding error...
        for bad_map in ({0: 'a', 1: 'b'},
                        {0: 'a', 1: 'b', 2: None},
                        {0: 'a', 1: 'b', 2: '\ufffe'}):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", bad_map)

        # ...and is substituted / escaped / dropped by the other handlers.
        for handler, expected in [("replace", "ab\ufffd"),
                                  ("backslashreplace", "ab\\x02"),
                                  ("ignore", "ab")]:
            for bad_map in ({0: 'a', 1: 'b'},
                            {0: 'a', 1: 'b', 2: None},
                            {0: 'a', 1: 'b', 2: '\ufffe'}):
                self.assertEqual(decode(b"\x00\x01\x02", handler, bad_map),
                                 (expected, 3))

        # An empty dict with "ignore" consumes all input, producing nothing.
        allbytes = bytes(range(256))
        self.assertEqual(decode(allbytes, "ignore", {}),
                         ("", len(allbytes)))

    def test_decode_with_int2int_map(self):
        decode = codecs.charmap_decode
        a, b, c = map(ord, "abc")

        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: a, 1: b, 2: c}),
                         ("abc", 3))

        # Issue #15379: values above the BMP are accepted, up to and
        # including sys.maxunicode...
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: 0x10FFFF, 1: b, 2: c}),
                         ("\U0010FFFFbc", 3))
        self.assertEqual(decode(b"\x00\x01\x02", "strict",
                                {0: sys.maxunicode, 1: b, 2: c}),
                         (chr(sys.maxunicode) + "bc", 3))
        # ...while one past the maximum is a TypeError, not a decode error.
        self.assertRaises(TypeError,
                          decode, b"\x00\x01\x02", "strict",
                          {0: sys.maxunicode + 1, 1: b, 2: c})

        # A missing key or a 0xFFFE value is a strict-mode decoding error...
        for bad_map in ({0: a, 1: b},
                        {0: a, 1: b, 2: 0xFFFE}):
            self.assertRaises(UnicodeDecodeError,
                              decode, b"\x00\x01\x02", "strict", bad_map)

        # ...and is substituted / escaped / dropped by the other handlers.
        for handler, expected in [("replace", "ab\ufffd"),
                                  ("backslashreplace", "ab\\x02"),
                                  ("ignore", "ab")]:
            for bad_map in ({0: a, 1: b},
                            {0: a, 1: b, 2: 0xFFFE}):
                self.assertEqual(decode(b"\x00\x01\x02", handler, bad_map),
                                 (expected, 3))
2368
2369
class WithStmtTest(unittest.TestCase):
    """Codec stream wrappers must work as context managers."""

    def test_encodedfile(self):
        # EncodedFile transcodes the utf-8 bytes to latin-1 and must close
        # the underlying stream when the with-block exits.
        raw = io.BytesIO(b"\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as transcoder:
            self.assertEqual(transcoder.read(), b"\xfc")
        self.assertTrue(raw.closed)

    def test_streamreaderwriter(self):
        # StreamReaderWriter decodes the underlying bytes on read.
        raw = io.BytesIO(b"\xc3\xbc")
        utf8 = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(raw, utf8.streamreader,
                                       utf8.streamwriter, 'strict') as wrapped:
            self.assertEqual(wrapped.read(), "\xfc")
2383
2384
class TypesTest(unittest.TestCase):
    """Input-type acceptance of the low-level decoder entry points."""

    def test_decode_unicode(self):
        # Most decoders refuse str input outright: decoding is bytes -> str.
        decoders = [
            codecs.utf_7_decode,
            codecs.utf_8_decode,
            codecs.utf_16_le_decode,
            codecs.utf_16_be_decode,
            codecs.utf_16_ex_decode,
            codecs.utf_32_decode,
            codecs.utf_32_le_decode,
            codecs.utf_32_be_decode,
            codecs.utf_32_ex_decode,
            codecs.latin_1_decode,
            codecs.ascii_decode,
            codecs.charmap_decode,
        ]
        # mbcs only exists on Windows builds.
        if hasattr(codecs, "mbcs_decode"):
            decoders.append(codecs.mbcs_decode)
        for decode in decoders:
            self.assertRaises(TypeError, decode, "xxx")

    def test_unicode_escape(self):
        # Both escape codecs accept str as well as bytes, and decoding a
        # str gives the same result as decoding the equivalent ASCII bytes.
        for decode in (codecs.unicode_escape_decode,
                       codecs.raw_unicode_escape_decode):
            self.assertEqual(decode(r"\u1234"), ("\u1234", 6))
            self.assertEqual(decode(br"\u1234"), ("\u1234", 6))
            # \U00110000 is beyond the Unicode range.
            self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
            self.assertEqual(decode(r"\U00110000", "replace"),
                             ("\ufffd", 10))
            self.assertEqual(
                decode(r"\U00110000", "backslashreplace"),
                (r"\x5c\x55\x30\x30\x31\x31\x30\x30\x30\x30", 10))
2424
2425
class UnicodeEscapeTest(unittest.TestCase):
    """Tests for the 'unicode_escape' codec (str <-> Python escape syntax)."""

    def test_empty(self):
        # Empty input round-trips and reports zero units consumed.
        self.assertEqual(codecs.unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Printable ASCII (except the backslash itself) is emitted as-is.
        encode = codecs.unicode_escape_encode
        for b in range(32, 127):
            if b != b'\\'[0]:
                self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        decode = codecs.unicode_escape_decode
        for b in range(256):
            if b != b'\\'[0]:
                self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.unicode_escape_encode
        check = coding_checker(self, encode)
        # Characters with dedicated short escapes.
        check('\t', br'\t')
        check('\n', br'\n')
        check('\r', br'\r')
        check('\\', br'\\')
        # Remaining controls and all non-ASCII use \xNN (or \uNNNN /
        # \UNNNNNNNN for the larger code points below).
        for b in range(32):
            if chr(b) not in '\t\n\r':
                check(chr(b), ('\\x%02x' % b).encode())
        for b in range(127, 256):
            check(chr(b), ('\\x%02x' % b).encode())
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.unicode_escape_decode
        check = coding_checker(self, decode)
        # A backslash-newline pair is a line continuation: it disappears.
        check(b"[\\\n]", "[]")
        check(br'[\"]', '["]')
        check(br"[\']", "[']")
        check(br"[\\]", r"[\]")
        check(br"[\a]", "[\x07]")
        check(br"[\b]", "[\x08]")
        check(br"[\t]", "[\x09]")
        check(br"[\n]", "[\x0a]")
        check(br"[\v]", "[\x0b]")
        check(br"[\f]", "[\x0c]")
        check(br"[\r]", "[\x0d]")
        # Octal escapes consume at most three digits; following digits are
        # literal text.
        check(br"[\7]", "[\x07]")
        check(br"[\78]", "[\x078]")
        check(br"[\41]", "[!]")
        check(br"[\418]", "[!8]")
        check(br"[\101]", "[A]")
        check(br"[\1010]", "[A0]")
        # \x escapes consume exactly two hex digits.
        check(br"[\x41]", "[A]")
        check(br"[\x410]", "[A0]")
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")
        # Unrecognized escape sequences pass through unchanged but raise a
        # DeprecationWarning.
        for i in range(97, 123):
            b = bytes([i])
            if b not in b'abfnrtuvx':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b, "\\" + chr(i))
            if b.upper() not in b'UN':
                with self.assertWarns(DeprecationWarning):
                    check(b"\\" + b.upper(), "\\" + chr(i-32))
        with self.assertWarns(DeprecationWarning):
            check(br"\8", "\\8")
        with self.assertWarns(DeprecationWarning):
            check(br"\9", "\\9")
        with self.assertWarns(DeprecationWarning):
            check(b"\\\xfa", "\\\xfa")

    def test_decode_errors(self):
        decode = codecs.unicode_escape_decode
        # Truncated \x, \u and \U escapes must raise under "strict" and be
        # skipped/substituted under "ignore"/"replace".
        for c, d in (b'x', 2), (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U00110000 names a code point above the Unicode range.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2512
2513
class RawUnicodeEscapeTest(unittest.TestCase):
    r"""Tests for 'raw_unicode_escape': only \uXXXX and \UXXXXXXXX are special."""

    def test_empty(self):
        # Empty input round-trips and reports zero units consumed.
        self.assertEqual(codecs.raw_unicode_escape_encode(""), (b"", 0))
        self.assertEqual(codecs.raw_unicode_escape_decode(b""), ("", 0))

    def test_raw_encode(self):
        # Every latin-1 character encodes to its own byte, unescaped.
        encode = codecs.raw_unicode_escape_encode
        for b in range(256):
            self.assertEqual(encode(chr(b)), (bytes([b]), 1))

    def test_raw_decode(self):
        # Any byte other than a backslash decodes to the same code point.
        decode = codecs.raw_unicode_escape_decode
        for b in range(256):
            self.assertEqual(decode(bytes([b]) + b'0'), (chr(b) + '0', 2))

    def test_escape_encode(self):
        encode = codecs.raw_unicode_escape_encode
        check = coding_checker(self, encode)
        # Backslash followed by anything but u/U passes through untouched.
        for b in range(256):
            if b not in b'uU':
                check('\\' + chr(b), b'\\' + bytes([b]))
        check('\u20ac', br'\u20ac')
        check('\U0001d120', br'\U0001d120')

    def test_escape_decode(self):
        decode = codecs.raw_unicode_escape_decode
        check = coding_checker(self, decode)
        # Backslash followed by anything but u/U passes through untouched.
        for b in range(256):
            if b not in b'uU':
                check(b'\\' + bytes([b]), '\\' + chr(b))
        check(br"\u20ac", "\u20ac")
        check(br"\U0001d120", "\U0001d120")

    def test_decode_errors(self):
        decode = codecs.raw_unicode_escape_decode
        # Truncated \u and \U escapes must raise under "strict" and be
        # skipped/substituted under "ignore"/"replace".
        for c, d in (b'u', 4), (b'U', 4):
            for i in range(d):
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"\\" + c + b"0"*i)
                self.assertRaises(UnicodeDecodeError, decode,
                                  b"[\\" + c + b"0"*i + b"]")
                data = b"[\\" + c + b"0"*i + b"]\\" + c + b"0"*i
                self.assertEqual(decode(data, "ignore"), ("[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 ("[\ufffd]\ufffd", len(data)))
        # \U00110000 names a code point above the Unicode range.
        self.assertRaises(UnicodeDecodeError, decode, br"\U00110000")
        self.assertEqual(decode(br"\U00110000", "ignore"), ("", 10))
        self.assertEqual(decode(br"\U00110000", "replace"), ("\ufffd", 10))
2562
2563
class EscapeEncodeTest(unittest.TestCase):
    """Tests for codecs.escape_encode (bytes -> escaped bytes)."""

    def test_escape_encode(self):
        # Each case pairs raw input with (escaped output, bytes consumed).
        cases = [
            (b'', (b'', 0)),
            (b'foobar', (b'foobar', 6)),
            (b'spam\0eggs', (b'spam\\x00eggs', 9)),
            (b'a\'b', (b"a\\'b", 3)),
            (b'b\\c', (b'b\\\\c', 3)),
            (b'c\nd', (b'c\\nd', 3)),
            (b'd\re', (b'd\\re', 3)),
            (b'f\x7fg', (b'f\\x7fg', 3)),
        ]
        for raw, expected in cases:
            with self.subTest(data=raw):
                self.assertEqual(codecs.escape_encode(raw), expected)
        # Only bytes is accepted -- neither str nor bytearray.
        self.assertRaises(TypeError, codecs.escape_encode, 'spam')
        self.assertRaises(TypeError, codecs.escape_encode, bytearray(b'spam'))
2582
2583
class SurrogateEscapeTest(unittest.TestCase):
    """The surrogateescape error handler round-trips undecodable bytes."""

    def check_roundtrip(self, encoding, raw, text):
        # Assert raw decodes to text, and text encodes back to raw.
        self.assertEqual(raw.decode(encoding, "surrogateescape"), text)
        self.assertEqual(text.encode(encoding, "surrogateescape"), raw)

    def test_utf8(self):
        # A stray invalid byte becomes the matching low surrogate.
        self.check_roundtrip("utf-8", b"foo\x80bar", "foo\udc80bar")
        # An ill-formed UTF-8-encoded surrogate escapes byte by byte.
        self.check_roundtrip("utf-8", b"\xed\xb0\x80", "\udced\udcb0\udc80")

    def test_ascii(self):
        # Any non-ASCII byte is escaped to a low surrogate.
        self.check_roundtrip("ascii", b"foo\x80bar", "foo\udc80bar")

    def test_charmap(self):
        # \xa5 has no mapping in iso-8859-3.
        self.check_roundtrip("iso-8859-3", b"foo\xa5bar", "foo\udca5bar")

    def test_latin1(self):
        # Issue6373
        self.assertEqual(
            "\udce4\udceb\udcef\udcf6\udcfc".encode("latin-1",
                                                    "surrogateescape"),
            b"\xe4\xeb\xef\xf6\xfc")
2616
2617
class BomTest(unittest.TestCase):
    """BOM emission rules for codecs.open() streams that seek back to 0."""

    def test_seek0(self):
        data = "1234567890"
        # All the UTF encodings whose writers may emit a BOM (or are the
        # explicit-endianness variants that must not).
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(support.unlink, support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                # Position must be non-zero once a char (plus any BOM the
                # encoding emits) has been written.
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
2673
2674
# bytes-to-bytes transform codecs that every TransformCodecTest case
# iterates over; the zlib/bz2 variants are appended below only when the
# backing compression modules are importable.
bytes_transform_encodings = [
    "base64_codec",
    "uu_codec",
    "quopri_codec",
    "hex_codec",
]

# Aliases that must resolve to the same CodecInfo as the canonical name;
# exercised by TransformCodecTest.test_aliases (see issue #7475).
transform_aliases = {
    "base64_codec": ["base64", "base_64"],
    "uu_codec": ["uu"],
    "quopri_codec": ["quopri", "quoted_printable", "quotedprintable"],
    "hex_codec": ["hex"],
    "rot_13": ["rot13"],
}

try:
    import zlib
except ImportError:
    # Keep the name bound to a falsy value: it is also the skip condition
    # for TransformCodecTest.test_custom_zlib_error_is_wrapped.
    zlib = None
else:
    bytes_transform_encodings.append("zlib_codec")
    transform_aliases["zlib_codec"] = ["zip", "zlib"]
try:
    import bz2
except ImportError:
    pass
else:
    bytes_transform_encodings.append("bz2_codec")
    transform_aliases["bz2_codec"] = ["bz2"]
2704
2705
class TransformCodecTest(unittest.TestCase):
    """Tests for the bytes-to-bytes transform codecs (base64, hex, ...)."""

    def test_basics(self):
        # Every transform must round-trip all 256 byte values through the
        # generic encoder/decoder lookup interface.
        binput = bytes(range(256))
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                # generic codecs interface
                (o, size) = codecs.getencoder(encoding)(binput)
                self.assertEqual(size, len(binput))
                (i, size) = codecs.getdecoder(encoding)(o)
                self.assertEqual(size, len(o))
                self.assertEqual(i, binput)

    def test_read(self):
        # StreamReader.read() must decode transform output back to bytes.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.read()
                self.assertEqual(sout, b"\x80")

    def test_readline(self):
        # StreamReader.readline() must also work on bytes output.
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                sin = codecs.encode(b"\x80", encoding)
                reader = codecs.getreader(encoding)(io.BytesIO(sin))
                sout = reader.readline()
                self.assertEqual(sout, b"\x80")

    def test_buffer_api_usage(self):
        # We check all the transform codecs accept memoryview input
        # for encoding and decoding
        # and also that they roundtrip correctly
        original = b"12345\x80"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                data = original
                view = memoryview(data)
                data = codecs.encode(data, encoding)
                view_encoded = codecs.encode(view, encoding)
                self.assertEqual(view_encoded, data)
                view = memoryview(data)
                data = codecs.decode(data, encoding)
                self.assertEqual(data, original)
                view_decoded = codecs.decode(view, encoding)
                self.assertEqual(view_decoded, data)

    def test_text_to_binary_blacklists_binary_transforms(self):
        # Check binary -> binary codecs give a good error for str input
        bad_input = "bad input type"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.encode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.encode(encoding)
                # The LookupError must not be chained to an internal error.
                self.assertIsNone(failure.exception.__cause__)

    def test_text_to_binary_blacklists_text_transforms(self):
        # Check str.encode gives a good error message for str -> str codecs
        msg = (r"^'rot_13' is not a text encoding; "
               r"use codecs.encode\(\) to handle arbitrary codecs")
        with self.assertRaisesRegex(LookupError, msg):
            "just an example message".encode("rot_13")

    def test_binary_to_text_blacklists_binary_transforms(self):
        # Check bytes.decode and bytearray.decode give a good error
        # message for binary -> binary codecs
        data = b"encode first to ensure we meet any format restrictions"
        for encoding in bytes_transform_encodings:
            with self.subTest(encoding=encoding):
                encoded_data = codecs.encode(data, encoding)
                fmt = (r"{!r} is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                msg = fmt.format(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    encoded_data.decode(encoding)
                with self.assertRaisesRegex(LookupError, msg):
                    bytearray(encoded_data).decode(encoding)

    def test_binary_to_text_blacklists_text_transforms(self):
        # Check str -> str codec gives a good error for binary input
        for bad_input in (b"immutable", bytearray(b"mutable")):
            with self.subTest(bad_input=bad_input):
                msg = (r"^'rot_13' is not a text encoding; "
                       r"use codecs.decode\(\) to handle arbitrary codecs")
                with self.assertRaisesRegex(LookupError, msg) as failure:
                    bad_input.decode("rot_13")
                self.assertIsNone(failure.exception.__cause__)

    @unittest.skipUnless(zlib, "Requires zlib support")
    def test_custom_zlib_error_is_wrapped(self):
        # Check zlib codec gives a good error for malformed input
        msg = "^decoding with 'zlib_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "zlib_codec")
        # The wrapper exception chains to the original via __cause__.
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    def test_custom_hex_error_is_wrapped(self):
        # Check hex codec gives a good error for malformed input
        msg = "^decoding with 'hex_codec' codec failed"
        with self.assertRaisesRegex(Exception, msg) as failure:
            codecs.decode(b"hello", "hex_codec")
        self.assertIsInstance(failure.exception.__cause__,
                                                type(failure.exception))

    # Unfortunately, the bz2 module throws OSError, which the codec
    # machinery currently can't wrap :(

    # Ensure codec aliases from http://bugs.python.org/issue7475 work
    def test_aliases(self):
        for codec_name, aliases in transform_aliases.items():
            expected_name = codecs.lookup(codec_name).name
            for alias in aliases:
                with self.subTest(alias=alias):
                    info = codecs.lookup(alias)
                    self.assertEqual(info.name, expected_name)

    def test_quopri_stateless(self):
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        # Missing "begin" line
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2837
2838
2839# The codec system tries to wrap exceptions in order to ensure the error
2840# mentions the operation being performed and the codec involved. We
2841# currently *only* want this to happen for relatively stateless
2842# exceptions, where the only significant information they contain is their
2843# type and a single str argument.
2844
# Use a local codec registry to avoid appearing to leak objects when
# registering multiple search functions
_TEST_CODECS = {}

def _get_test_codec(codec_name):
    """Search function: look up *codec_name* in the local test registry."""
    return _TEST_CODECS.get(codec_name)
codecs.register(_get_test_codec) # Returns None, not usable as a decorator

try:
    # Issue #22166: Also need to clear the internal cache in CPython
    from _codecs import _forget_codec
except ImportError:
    # No internal cache to clear on this implementation: no-op fallback.
    def _forget_codec(codec_name):
        pass
2859
2860
class ExceptionChainingTest(unittest.TestCase):
    """Check which exceptions raised inside a codec get wrapped.

    The codec machinery wraps "stateless" exceptions (type plus a single
    str argument) so the message names the codec and the operation; any
    richer exception object must be propagated untouched.
    """

    def setUp(self):
        # There's no way to unregister a codec search function, so we just
        # ensure we render this one fairly harmless after the test
        # case finishes by using the test case repr as the codec name
        # The codecs module normalizes codec names, although this doesn't
        # appear to be formally documented...
        # We also make sure we use a truly unique id for the custom codec
        # to avoid issues with the codec cache when running these tests
        # multiple times (e.g. when hunting for refleaks)
        unique_id = repr(self) + str(id(self))
        self.codec_name = encodings.normalize_encoding(unique_id).lower()

        # We store the object to raise on the instance because of a bad
        # interaction between the codec caching (which means we can't
        # recreate the codec entry) and regrtest refleak hunting (which
        # runs the same test instance multiple times). This means we
        # need to ensure the codecs call back in to the instance to find
        # out which exception to raise rather than binding them in a
        # closure to an object that may change on the next run
        self.obj_to_raise = RuntimeError

    def tearDown(self):
        # Drop this test's codec from the local registry and the caches.
        _TEST_CODECS.pop(self.codec_name, None)
        # Issue #22166: Also pop from caches to avoid appearance of ref leaks
        encodings._cache.pop(self.codec_name, None)
        try:
            _forget_codec(self.codec_name)
        except KeyError:
            pass

    def set_codec(self, encode, decode):
        """Register the given callables under this test's unique codec name."""
        codec_info = codecs.CodecInfo(encode, decode,
                                      name=self.codec_name)
        _TEST_CODECS[self.codec_name] = codec_info

    @contextlib.contextmanager
    def assertWrapped(self, operation, exc_type, msg):
        """Assert the with-block raises a wrapped exc_type.

        The message must name the codec and operation, and __cause__ must
        carry the original exception with its traceback.
        """
        full_msg = r"{} with {!r} codec failed \({}: {}\)".format(
                  operation, self.codec_name, exc_type.__name__, msg)
        with self.assertRaisesRegex(exc_type, full_msg) as caught:
            yield caught
        self.assertIsInstance(caught.exception.__cause__, exc_type)
        self.assertIsNotNone(caught.exception.__cause__.__traceback__)

    def raise_obj(self, *args, **kwds):
        # Helper to dynamically change the object raised by a test codec
        raise self.obj_to_raise

    def check_wrapped(self, obj_to_raise, msg, exc_type=RuntimeError):
        # All four encode/decode entry points must wrap the raised object.
        self.obj_to_raise = obj_to_raise
        self.set_codec(self.raise_obj, self.raise_obj)
        with self.assertWrapped("encoding", exc_type, msg):
            "str_input".encode(self.codec_name)
        with self.assertWrapped("encoding", exc_type, msg):
            codecs.encode("str_input", self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertWrapped("decoding", exc_type, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_raise_by_type(self):
        self.check_wrapped(RuntimeError, "")

    def test_raise_by_value(self):
        msg = "This should be wrapped"
        self.check_wrapped(RuntimeError(msg), msg)

    def test_raise_grandchild_subclass_exact_size(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            __slots__ = ()
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def test_raise_subclass_with_weakref_support(self):
        msg = "This should be wrapped"
        class MyRuntimeError(RuntimeError):
            pass
        self.check_wrapped(MyRuntimeError(msg), msg, MyRuntimeError)

    def check_not_wrapped(self, obj_to_raise, msg):
        # The four entry points must propagate the exception unchanged.
        def raise_obj(*args, **kwds):
            raise obj_to_raise
        self.set_codec(raise_obj, raise_obj)
        with self.assertRaisesRegex(RuntimeError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(RuntimeError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_init_override_is_not_wrapped(self):
        class CustomInit(RuntimeError):
            def __init__(self):
                pass
        self.check_not_wrapped(CustomInit, "")

    def test_new_override_is_not_wrapped(self):
        class CustomNew(RuntimeError):
            def __new__(cls):
                return super().__new__(cls)
        self.check_not_wrapped(CustomNew, "")

    def test_instance_attribute_is_not_wrapped(self):
        msg = "This should NOT be wrapped"
        exc = RuntimeError(msg)
        exc.attr = 1
        self.check_not_wrapped(exc, "^{}$".format(msg))

    def test_non_str_arg_is_not_wrapped(self):
        self.check_not_wrapped(RuntimeError(1), "1")

    def test_multiple_args_is_not_wrapped(self):
        msg_re = r"^\('a', 'b', 'c'\)$"
        self.check_not_wrapped(RuntimeError('a', 'b', 'c'), msg_re)

    # http://bugs.python.org/issue19609
    def test_codec_lookup_failure_not_wrapped(self):
        msg = "^unknown encoding: {}$".format(self.codec_name)
        # The initial codec lookup should not be wrapped
        with self.assertRaisesRegex(LookupError, msg):
            "str input".encode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.encode("str input", self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            b"bytes input".decode(self.codec_name)
        with self.assertRaisesRegex(LookupError, msg):
            codecs.decode(b"bytes input", self.codec_name)

    def test_unflagged_non_text_codec_handling(self):
        # The stdlib non-text codecs are now marked so they're
        # pre-emptively skipped by the text model related methods
        # However, third party codecs won't be flagged, so we still make
        # sure the case where an inappropriate output type is produced is
        # handled appropriately
        def encode_to_str(*args, **kwds):
            return "not bytes!", 0
        def decode_to_bytes(*args, **kwds):
            return b"not str!", 0
        self.set_codec(encode_to_str, decode_to_bytes)
        # No input or output type checks on the codecs module functions
        encoded = codecs.encode(None, self.codec_name)
        self.assertEqual(encoded, "not bytes!")
        decoded = codecs.decode(None, self.codec_name)
        self.assertEqual(decoded, b"not str!")
        # Text model methods should complain
        fmt = (r"^{!r} encoder returned 'str' instead of 'bytes'; "
               r"use codecs.encode\(\) to encode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            "str_input".encode(self.codec_name)
        fmt = (r"^{!r} decoder returned 'bytes' instead of 'str'; "
               r"use codecs.decode\(\) to decode to arbitrary types$")
        msg = fmt.format(self.codec_name)
        with self.assertRaisesRegex(TypeError, msg):
            b"bytes input".decode(self.codec_name)
3020
3021
3022
@unittest.skipUnless(sys.platform == 'win32',
                     'code pages are specific to Windows')
class CodePageTest(unittest.TestCase):
    """Exercise the Windows-only codecs.code_page_encode()/code_page_decode()
    C APIs against several code pages (cp932, cp1252, UTF-7, UTF-8).
    """
    # CP_UTF8 is already tested by CP65001Test
    CP_UTF8 = 65001

    def test_invalid_code_page(self):
        """Invalid and unknown code page numbers must raise."""
        # Negative numbers are rejected by argument validation (ValueError),
        # while plausible-but-unknown pages fail at the OS level (OSError).
        self.assertRaises(ValueError, codecs.code_page_encode, -1, 'a')
        self.assertRaises(ValueError, codecs.code_page_decode, -1, b'a')
        self.assertRaises(OSError, codecs.code_page_encode, 123, 'a')
        self.assertRaises(OSError, codecs.code_page_decode, 123, b'a')

    def test_code_page_name(self):
        """The code page name must appear in codec error messages."""
        self.assertRaisesRegex(UnicodeEncodeError, 'cp932',
            codecs.code_page_encode, 932, '\xff')
        self.assertRaisesRegex(UnicodeDecodeError, 'cp932',
            codecs.code_page_decode, 932, b'\x81\x00', 'strict', True)
        self.assertRaisesRegex(UnicodeDecodeError, 'CP_UTF8',
            codecs.code_page_decode, self.CP_UTF8, b'\xff', 'strict', True)

    def check_decode(self, cp, tests):
        """Decode each (raw, errors, expected) vector with code page *cp*.

        expected=None means the decode must raise UnicodeDecodeError;
        otherwise the decoded text must equal *expected* and the consumed
        byte count must be within [0, len(raw)].
        """
        for raw, errors, expected in tests:
            if expected is not None:
                try:
                    decoded = codecs.code_page_decode(cp, raw, errors, True)
                except UnicodeDecodeError as err:
                    self.fail('Unable to decode %a from "cp%s" with '
                              'errors=%r: %s' % (raw, cp, errors, err))
                self.assertEqual(decoded[0], expected,
                    '%a.decode("cp%s", %r)=%a != %a'
                    % (raw, cp, errors, decoded[0], expected))
                # assert 0 <= decoded[1] <= len(raw)
                self.assertGreaterEqual(decoded[1], 0)
                self.assertLessEqual(decoded[1], len(raw))
            else:
                self.assertRaises(UnicodeDecodeError,
                    codecs.code_page_decode, cp, raw, errors, True)

    def check_encode(self, cp, tests):
        """Encode each (text, errors, expected) vector with code page *cp*.

        expected=None means the encode must raise UnicodeEncodeError;
        otherwise the encoded bytes must equal *expected* and all input
        characters must have been consumed.
        """
        for text, errors, expected in tests:
            if expected is not None:
                try:
                    encoded = codecs.code_page_encode(cp, text, errors)
                except UnicodeEncodeError as err:
                    self.fail('Unable to encode %a to "cp%s" with '
                              'errors=%r: %s' % (text, cp, errors, err))
                self.assertEqual(encoded[0], expected,
                    '%a.encode("cp%s", %r)=%a != %a'
                    % (text, cp, errors, encoded[0], expected))
                self.assertEqual(encoded[1], len(text))
            else:
                self.assertRaises(UnicodeEncodeError,
                    codecs.code_page_encode, cp, text, errors)

    def test_cp932(self):
        """Round-trips and error handlers for cp932 (Shift JIS variant)."""
        self.check_encode(932, (
            ('abc', 'strict', b'abc'),
            ('\uff44\u9a3e', 'strict', b'\x82\x84\xe9\x80'),
            # test error handlers
            ('\xff', 'strict', None),
            ('[\xff]', 'ignore', b'[]'),
            ('[\xff]', 'replace', b'[y]'),
            ('[\u20ac]', 'replace', b'[?]'),
            ('[\xff]', 'backslashreplace', b'[\\xff]'),
            ('[\xff]', 'namereplace',
             b'[\\N{LATIN SMALL LETTER Y WITH DIAERESIS}]'),
            ('[\xff]', 'xmlcharrefreplace', b'[&#255;]'),
            ('\udcff', 'strict', None),
            ('[\udcff]', 'surrogateescape', b'[\xff]'),
            ('[\udcff]', 'surrogatepass', None),
        ))
        self.check_decode(932, (
            (b'abc', 'strict', 'abc'),
            (b'\x82\x84\xe9\x80', 'strict', '\uff44\u9a3e'),
            # invalid bytes
            (b'[\xff]', 'strict', None),
            (b'[\xff]', 'ignore', '[]'),
            (b'[\xff]', 'replace', '[\ufffd]'),
            (b'[\xff]', 'backslashreplace', '[\\xff]'),
            (b'[\xff]', 'surrogateescape', '[\udcff]'),
            (b'[\xff]', 'surrogatepass', None),
            (b'\x81\x00abc', 'strict', None),
            (b'\x81\x00abc', 'ignore', '\x00abc'),
            (b'\x81\x00abc', 'replace', '\ufffd\x00abc'),
            (b'\x81\x00abc', 'backslashreplace', '\\x81\x00abc'),
        ))

    def test_cp1252(self):
        """Round-trips and error handlers for cp1252 (Western European)."""
        self.check_encode(1252, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'\xe9\x80'),
            ('\xff', 'strict', b'\xff'),
            # test error handlers
            ('\u0141', 'strict', None),
            ('\u0141', 'ignore', b''),
            ('\u0141', 'replace', b'L'),
            ('\udc98', 'surrogateescape', b'\x98'),
            ('\udc98', 'surrogatepass', None),
        ))
        self.check_decode(1252, (
            (b'abc', 'strict', 'abc'),
            (b'\xe9\x80', 'strict', '\xe9\u20ac'),
            (b'\xff', 'strict', '\xff'),
        ))

    def test_cp_utf7(self):
        """Round-trips for code page 65000 (the Windows UTF-7 codec)."""
        cp = 65000
        self.check_encode(cp, (
            ('abc', 'strict', b'abc'),
            ('\xe9\u20ac', 'strict',  b'+AOkgrA-'),
            ('\U0010ffff', 'strict',  b'+2//f/w-'),
            ('\udc80', 'strict', b'+3IA-'),
            ('\ufffd', 'strict', b'+//0-'),
        ))
        self.check_decode(cp, (
            (b'abc', 'strict', 'abc'),
            (b'+AOkgrA-', 'strict', '\xe9\u20ac'),
            (b'+2//f/w-', 'strict', '\U0010ffff'),
            (b'+3IA-', 'strict', '\udc80'),
            (b'+//0-', 'strict', '\ufffd'),
            # invalid bytes
            (b'[+/]', 'strict', '[]'),
            (b'[\xff]', 'strict', '[\xff]'),
        ))

    def test_multibyte_encoding(self):
        """Error handlers on truncated/invalid multibyte sequences."""
        self.check_decode(932, (
            (b'\x84\xe9\x80', 'ignore', '\u9a3e'),
            (b'\x84\xe9\x80', 'replace', '\ufffd\u9a3e'),
        ))
        self.check_decode(self.CP_UTF8, (
            (b'\xff\xf4\x8f\xbf\xbf', 'ignore', '\U0010ffff'),
            (b'\xff\xf4\x8f\xbf\xbf', 'replace', '\ufffd\U0010ffff'),
        ))
        self.check_encode(self.CP_UTF8, (
            ('[\U0010ffff\uDC80]', 'ignore', b'[\xf4\x8f\xbf\xbf]'),
            ('[\U0010ffff\uDC80]', 'replace', b'[\xf4\x8f\xbf\xbf?]'),
        ))

    def test_incremental(self):
        """With final=False, trailing incomplete sequences are held back:
        the returned count reflects only fully decoded bytes.
        """
        # Lone lead byte: nothing decoded, nothing consumed.
        decoded = codecs.code_page_decode(932, b'\x82', 'strict', False)
        self.assertEqual(decoded, ('', 0))

        # One complete pair plus a dangling lead byte: consume only 2 bytes.
        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e', 2))

        decoded = codecs.code_page_decode(932,
                                          b'\xe9\x80\xe9\x80', 'strict',
                                          False)
        self.assertEqual(decoded, ('\u9a3e\u9a3e', 4))

        decoded = codecs.code_page_decode(932,
                                          b'abc', 'strict',
                                          False)
        self.assertEqual(decoded, ('abc', 3))

    def test_mbcs_alias(self):
        # Check that looking up our 'default' codepage will return
        # mbcs when we don't have a more specific one available
        with mock.patch('_winapi.GetACP', return_value=123):
            codec = codecs.lookup('cp123')
            self.assertEqual(codec.name, 'mbcs')

    @support.bigmemtest(size=2**31, memuse=7, dry_run=False)
    def test_large_input(self):
        # Test input longer than INT_MAX.
        # Input should contain undecodable bytes before and after
        # the INT_MAX limit.
        encoded = (b'01234567' * (2**28-1) +
                   b'\x85\x86\xea\xeb\xec\xef\xfc\xfd\xfe\xff')
        self.assertEqual(len(encoded), 2**31+2)
        decoded = codecs.code_page_decode(932, encoded, 'surrogateescape', True)
        self.assertEqual(decoded[1], len(encoded))
        # Free the source buffer before checking the (equally large) result
        # to keep peak memory within the bigmemtest budget.
        del encoded
        self.assertEqual(len(decoded[0]), decoded[1])
        self.assertEqual(decoded[0][:10], '0123456701')
        self.assertEqual(decoded[0][-20:],
                         '6701234567'
                         '\udc85\udc86\udcea\udceb\udcec'
                         '\udcef\udcfc\udcfd\udcfe\udcff')
3205
3206
class ASCIITest(unittest.TestCase):
    """Tests for the built-in 'ascii' codec and its error handlers."""

    def test_encode(self):
        """Pure-ASCII text encodes to the identical byte sequence."""
        self.assertEqual('abc123'.encode('ascii'), b'abc123')

    def test_encode_error(self):
        """Each error handler maps non-ASCII input to its documented output."""
        cases = [
            ('[\x80\xff\u20ac]', 'ignore', b'[]'),
            ('[\x80\xff\u20ac]', 'replace', b'[???]'),
            ('[\x80\xff\u20ac]', 'xmlcharrefreplace', b'[&#128;&#255;&#8364;]'),
            ('[\x80\xff\u20ac\U000abcde]', 'backslashreplace',
             b'[\\x80\\xff\\u20ac\\U000abcde]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ]
        for text, handler, want in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=want):
                self.assertEqual(text.encode('ascii', handler), want)

    def test_encode_surrogateescape_error(self):
        # surrogateescape only rescues lone surrogates; a genuine
        # non-ASCII character after one must still raise.
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\xff'.encode, 'ascii', 'surrogateescape')

    def test_decode(self):
        """ASCII bytes decode to the identical text."""
        self.assertEqual(b'abc'.decode('ascii'), 'abc')

    def test_decode_error(self):
        """Each error handler maps non-ASCII bytes to its documented output."""
        cases = [
            (b'[\x80\xff]', 'ignore', '[]'),
            (b'[\x80\xff]', 'replace', '[\ufffd\ufffd]'),
            (b'[\x80\xff]', 'surrogateescape', '[\udc80\udcff]'),
            (b'[\x80\xff]', 'backslashreplace', '[\\x80\\xff]'),
        ]
        for raw, handler, want in cases:
            with self.subTest(data=raw, error_handler=handler,
                              expected=want):
                self.assertEqual(raw.decode('ascii', handler), want)
3244
3245
class Latin1Test(unittest.TestCase):
    """Tests for the built-in 'latin1' (ISO 8859-1) codec."""

    def test_encode(self):
        """Code points up to U+00FF encode to the same byte values."""
        cases = [
            ('abc', b'abc'),
            ('\x80\xe9\xff', b'\x80\xe9\xff'),
        ]
        for text, want in cases:
            with self.subTest(data=text, expected=want):
                self.assertEqual(text.encode('latin1'), want)

    def test_encode_errors(self):
        """Each error handler maps unencodable input to its documented output."""
        cases = [
            ('[\u20ac\udc80]', 'ignore', b'[]'),
            ('[\u20ac\udc80]', 'replace', b'[??]'),
            ('[\u20ac\U000abcde]', 'backslashreplace',
             b'[\\u20ac\\U000abcde]'),
            ('[\u20ac\udc80]', 'xmlcharrefreplace', b'[&#8364;&#56448;]'),
            ('[\udc80\udcff]', 'surrogateescape', b'[\x80\xff]'),
        ]
        for text, handler, want in cases:
            with self.subTest(data=text, error_handler=handler,
                              expected=want):
                self.assertEqual(text.encode('latin1', handler), want)

    def test_encode_surrogateescape_error(self):
        # surrogateescape only rescues lone surrogates; an ordinary
        # unencodable character after one must still raise.
        self.assertRaises(UnicodeEncodeError,
                          '\udc80\u20ac'.encode, 'latin1', 'surrogateescape')

    def test_decode(self):
        """Every byte decodes to the code point of the same value."""
        cases = [
            (b'abc', 'abc'),
            (b'[\x80\xff]', '[\x80\xff]'),
        ]
        for raw, want in cases:
            with self.subTest(data=raw, expected=want):
                self.assertEqual(raw.decode('latin1'), want)
3281
3282
# Run the whole suite via unittest's CLI when executed as a script.
if __name__ == "__main__":
    unittest.main()
3285