1from test import test_support
2import unittest
3import codecs
4import locale
5import sys, StringIO
6
def coding_checker(self, coder):
    """Build a checker that runs ``coder`` and verifies its result.

    ``self`` is the test case whose ``assertEqual`` reports failures and
    ``coder`` is a callable taking one argument.  The returned
    ``check(input, expect)`` asserts that ``coder(input)`` equals the pair
    ``(expect, len(input))``.
    """
    def check(input, expect):
        self.assertEqual(coder(input), (expect, len(input)))
    return check
11
class Queue(object):
    """
    queue: write bytes at one end, read bytes from the other end
    """
    def __init__(self):
        # Data that has been written but not yet read.
        self._buffer = ""

    def write(self, chars):
        """Append ``chars`` to the end of the pending data."""
        self._buffer += chars

    def read(self, size=-1):
        """Remove and return pending data from the front of the queue.

        A negative ``size`` (the default) drains everything that is queued;
        otherwise at most ``size`` items are returned and the rest stays
        queued for later reads.
        """
        if size < 0:
            s = self._buffer
            self._buffer = ""
            return s
        s = self._buffer[:size]
        self._buffer = self._buffer[size:]
        return s
31
class ReadTest(unittest.TestCase):
    """Reader tests shared by the concrete codec test classes.

    Every method looks the codec up through ``self.encoding``, which this
    class does not define itself; subclasses are expected to set it.
    """

    def check_partial(self, input, partialresults):
        """Decode ``input`` one byte at a time and check each partial result.

        ``input`` is encoded with ``self.encoding`` and its bytes are paired
        (via ``zip``) with ``partialresults``: after byte *i* has been fed,
        the text decoded so far must equal ``partialresults[i]``.  The check
        is run with a StreamReader, with an incremental decoder, with the
        same incremental decoder after ``reset()``, and finally the whole
        input is round-tripped through ``codecs.iterdecode``.
        """
        # All passes below consume the same byte string, so encode only once.
        encoded = input.encode(self.encoding)

        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(encoded, partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        # readline() must split on every line end in ``lineends``, with and
        # without keepends, for short lines, long lines and lines whose
        # first chunk may end in "\r".
        def getreader(input):
            # StreamReader over the encoded form of ``input``.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Read ``input`` line by line until readline() returns an empty
            # result, and return the lines joined with "|".
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        lineends = ("\n", "\r\n", "\r", u"\u2028")
        # Test long lines (multiple calls to read() in readline())
        vw = []   # lines with their line end
        vwo = []  # the same lines without the line end
        for (i, lineend) in enumerate(lineends):
            vw.append((i*200+200)*u"\u3042" + lineend)
            vwo.append((i*200+200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "|".join(vw))
        self.assertEqual(readalllines("".join(vw), False), "|".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in lineends:
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for _ in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                    self.assertEqual(
                        reader.readline(keepends=True),
                        "xxx\n",
                    )
                reader = getreader(s)
                for _ in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )
                    self.assertEqual(
                        reader.readline(keepends=False),
                        "xxx",
                    )

    def test_mixed_readline_and_read(self):
        # Mixing readline(), read(), read(n) and readlines() on one reader
        # must neither lose nor duplicate data.
        lines = ["Humpty Dumpty sat on a wall,\n",
                 "Humpty Dumpty had a great fall.\r\n",
                 "All the king's horses and all the king's men\r",
                 "Couldn't put Humpty together again."]
        data = ''.join(lines)
        def getreader():
            # Fresh StreamReader positioned at the start of ``data``.
            stream = StringIO.StringIO(data.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        # Issue #8260: Test readline() followed by read()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(), ''.join(lines[1:]))
        self.assertEqual(f.read(), '')

        # Issue #32110: Test readline() followed by read(n)
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.read(1), lines[1][0])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[len(lines[0]) + 1:][:100])

        # Issue #16636: Test readline() followed by readlines()
        f = getreader()
        self.assertEqual(f.readline(), lines[0])
        self.assertEqual(f.readlines(), lines[1:])
        self.assertEqual(f.read(), '')

        # Test read(n) followed by read()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(), data[5:])
        self.assertEqual(f.read(), '')

        # Issue #32110: Test read(n) followed by read(n)
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.read(1), data[5])
        self.assertEqual(f.read(0), '')
        self.assertEqual(f.read(100), data[6:106])

        # Issue #12446: Test read(n) followed by readlines()
        f = getreader()
        self.assertEqual(f.read(size=40, chars=5), data[:5])
        self.assertEqual(f.readlines(), [lines[0][5:]] + lines[1:])
        self.assertEqual(f.read(), '')

    def test_bug1175396(self):
        # Iterating over a StreamReader must give back each "\r\n"
        # terminated line of this sample exactly as it was written.
        s = [
            '<%!--===================================================\r\n',
            '    BLOG index page: show recent articles,\r\n',
            '    today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            '    entryids=storageEngine.listBlogEntries(date)\r\n',
            '    entryids.reverse() # descending\r\n',
            '    if count:\r\n',
            '        entryids=entryids[:count]\r\n',
            '    try:\r\n',
            '        return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            '    except StorageError,x:\r\n',
            '        log.error("Error loading articles: "+str(x))\r\n',
            '        self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            '    #-------------------- TODAY\'S ARTICLES\r\n',
            '    self.write("<h2>Today\'s articles</h2>")\r\n',
            '    showdate = frog.util.isodatestr() \r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            '    #-------------------- ACTIVE ARTICLES redirect\r\n',
            '    self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            '    #-------------------- LOGIN PAGE redirect\r\n',
            '    self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            '    #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            '    showdate = self.Request.getParameter("date")\r\n',
            '    self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            '    entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            '    #-------------------- RECENT ARTICLES\r\n',
            '    self.write("<h2>Recent articles</h2>")\r\n',
            '    dates=storageEngine.listBlogEntryDates()\r\n',
            '    if dates:\r\n',
            '        entries=[]\r\n',
            '        SHOWAMOUNT=10\r\n',
            '        for showdate in dates:\r\n',
            '            entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            '            if len(entries)>=SHOWAMOUNT:\r\n',
            '                break\r\n',
            '                \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        # readline() on a stream that is still being written to: a writer
        # and a reader share one Queue, and data arrives between reads.
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        # Three "\r\n" terminated lines must come back unchanged from
        # readline(), followed by an empty result at end of input.
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        # Same check as test_bug1098990_a with five shorter lines.
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
309
class UTF32Test(ReadTest):
    """Shared reader tests plus BOM and error handling checks for utf-32."""
    encoding = "utf-32"

    # u"spamspam" preceded by exactly one BOM, in little-endian and in
    # big-endian byte order.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        # Two writes through one StreamWriter must emit a single BOM.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input made only of 0xff bytes must make read() raise UnicodeError.
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte (BOM included).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A lone byte with final=True: 'replace' yields U+FFFD, 'ignore'
        # yields nothing; both report one byte consumed.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        # The same kind of truncated input must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
394
class UTF32LETest(ReadTest):
    """Shared reader tests plus encoding and error checks for utf-32-le."""
    encoding = "utf-32-le"

    def test_partial(self):
        # One expected partial result per encoded byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Least significant byte comes first in the encoded form.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # A single byte with final=True must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_le_decode(encoded)[0])
438
class UTF32BETest(ReadTest):
    """Shared reader tests plus encoding and error checks for utf-32-be."""
    encoding = "utf-32-be"

    def test_partial(self):
        # One expected partial result per encoded byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_simple(self):
        # Most significant byte comes first in the encoded form.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # A single byte with final=True must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded = '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_be_decode(encoded)[0])
482
483
class UTF16Test(ReadTest):
    """Shared reader tests plus BOM, error and file checks for utf-16."""
    encoding = "utf-16"

    # u"spamspam" preceded by exactly one BOM, in little-endian and in
    # big-endian byte order.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        # Two writes through one StreamWriter must emit a single BOM.
        _, _, reader, writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        # Input made only of 0xff bytes must make read() raise UnicodeError.
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        # One expected partial result per encoded byte (BOM included).
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_handlers(self):
        # A lone byte with final=True: 'replace' yields U+FFFD, 'ignore'
        # yields nothing; both report one byte consumed.
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        # The same kind of truncated input must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified.  This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        # Register the cleanup first so the file is removed even if a later
        # step fails.
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
556
class UTF16LETest(ReadTest):
    """Shared reader tests plus malformed-input checks for utf-16-le."""
    encoding = "utf-16-le"

    def test_partial(self):
        # One expected partial result per encoded byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # The same input must raise UnicodeDecodeError in strict mode.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'A\x00Z', u'A\ufffd'),
            (b'A\x00B\x00C\x00D\x00Z', u'ABCD\ufffd'),
            (b'\x00\xd8', u'\ufffd'),
            (b'\x00\xd8A', u'\ufffd'),
            (b'\x00\xd8A\x00', u'\ufffdA'),
            (b'\x00\xdcA\x00', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_le_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16le', 'replace'), expected)
            except:
                # Deliberately broad: name the failing input, then re-raise
                # whatever went wrong unchanged.
                print('raw=%r' % raw)
                raise
597
class UTF16BETest(ReadTest):
    """Shared reader tests plus malformed-input checks for utf-16-be."""
    encoding = "utf-16-be"

    def test_partial(self):
        # One expected partial result per encoded byte.
        self.check_partial(
            u"\x00\xff\u0100\uffff\U00010000",
            [
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff",
                u"\x00\xff\u0100\uffff\U00010000",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # The same input must raise UnicodeDecodeError in strict mode.
        tests = [
            (b'\xff', u'\ufffd'),
            (b'\x00A\xff', u'A\ufffd'),
            (b'\x00A\x00B\x00C\x00DZ', u'ABCD\ufffd'),
            (b'\xd8\x00', u'\ufffd'),
            (b'\xd8\x00\xdc', u'\ufffd'),
            (b'\xd8\x00\x00A', u'\ufffdA'),
            (b'\xdc\x00\x00A', u'\ufffdA'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_16_be_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-16be', 'replace'), expected)
            except:
                # Deliberately broad: name the failing input, then re-raise
                # whatever went wrong unchanged.
                print('raw=%r' % raw)
                raise
638
class UTF8Test(ReadTest):
    """Shared reader tests plus a byte-by-byte decoding check for utf-8."""
    encoding = "utf-8"

    def test_partial(self):
        # One expected partial result per encoded byte; a character shows
        # up only in the entry for its last byte.
        self.check_partial(
            u"\x00\xff\u07ff\u0800\uffff\U00010000",
            [
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff",
                u"\x00\xff\u07ff\u0800\uffff\U00010000",
            ]
        )
663
class UTF7Test(ReadTest):
    """Shared reader tests plus encoding and decoding checks for utf-7."""
    encoding = "utf-7"

    def test_ascii(self):
        # Which ASCII characters pass through unchanged and which are
        # escaped by the encoder.
        # Set D (directly encoded characters)
        set_d = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                 'abcdefghijklmnopqrstuvwxyz'
                 '0123456789'
                 '\'(),-./:?')
        self.assertEqual(set_d.encode(self.encoding), set_d)
        self.assertEqual(set_d.decode(self.encoding), set_d)
        # Set O (optional direct characters)
        set_o = ' !"#$%&*;<=>@[]^_`{|}'
        self.assertEqual(set_o.encode(self.encoding), set_o)
        self.assertEqual(set_o.decode(self.encoding), set_o)
        # +
        self.assertEqual(u'a+b'.encode(self.encoding), 'a+-b')
        self.assertEqual('a+-b'.decode(self.encoding), u'a+b')
        # White spaces
        ws = ' \t\n\r'
        self.assertEqual(ws.encode(self.encoding), ws)
        self.assertEqual(ws.decode(self.encoding), ws)
        # Other ASCII characters
        other_ascii = ''.join(sorted(set(chr(i) for i in range(0x80)) -
                                     set(set_d + set_o + '+' + ws)))
        self.assertEqual(other_ascii.encode(self.encoding),
                         '+AAAAAQACAAMABAAFAAYABwAIAAsADAAOAA8AEAARABIAEwAU'
                         'ABUAFgAXABgAGQAaABsAHAAdAB4AHwBcAH4Afw-')

    def test_partial(self):
        # One expected partial result per encoded byte.
        self.check_partial(
            u"a+-b",
            [
                u"a",
                u"a",
                u"a+",
                u"a+-",
                u"a+-b",
            ]
        )

    def test_errors(self):
        # Each entry is (malformed input, expected result with 'replace').
        # The same input must raise UnicodeDecodeError in strict mode.
        tests = [
            ('\xe1b', u'\ufffdb'),
            ('a\xe1b', u'a\ufffdb'),
            ('a\xe1\xe1b', u'a\ufffd\ufffdb'),
            ('a+IK', u'a\ufffd'),
            ('a+IK-b', u'a\ufffdb'),
            ('a+IK,b', u'a\ufffdb'),
            ('a+IKx', u'a\u20ac\ufffd'),
            ('a+IKx-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr', u'a\u20ac\ufffd'),
            ('a+IKwgr-b', u'a\u20ac\ufffdb'),
            ('a+IKwgr,', u'a\u20ac\ufffd'),
            ('a+IKwgr,-b', u'a\u20ac\ufffd-b'),
            ('a+IKwgrB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrB-b', u'a\u20ac\u20ac\ufffdb'),
            ('a+/,+IKw-b', u'a\ufffd\u20acb'),
            ('a+//,+IKw-b', u'a\ufffd\u20acb'),
            ('a+///,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+////,+IKw-b', u'a\uffff\ufffd\u20acb'),
            ('a+IKw-b\xe1', u'a\u20acb\ufffd'),
            ('a+IKw\xe1b', u'a\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                with self.assertRaises(UnicodeDecodeError):
                    codecs.utf_7_decode(raw, 'strict', True)
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                # Deliberately broad: name the failing input, then re-raise
                # whatever went wrong unchanged.
                print('raw=%r' % raw)
                raise

    def test_nonbmp(self):
        # A non-BMP character and its surrogate pair encode identically,
        # and the encoded forms decode with or without the closing "-".
        self.assertEqual(u'\U000104A0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual(u'\ud801\udca0'.encode(self.encoding), '+2AHcoA-')
        self.assertEqual('+2AHcoA-'.decode(self.encoding), u'\U000104A0')
        self.assertEqual('+2AHcoA'.decode(self.encoding), u'\U000104A0')
        self.assertEqual(u'\u20ac\U000104A0'.encode(self.encoding), '+IKzYAdyg-')
        self.assertEqual('+IKzYAdyg-'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual('+IKzYAdyg'.decode(self.encoding), u'\u20ac\U000104A0')
        self.assertEqual(u'\u20ac\u20ac\U000104A0'.encode(self.encoding),
                         '+IKwgrNgB3KA-')
        self.assertEqual('+IKwgrNgB3KA-'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')
        self.assertEqual('+IKwgrNgB3KA'.decode(self.encoding),
                         u'\u20ac\u20ac\U000104A0')

    def test_lone_surrogates(self):
        # Each entry is (input, expected result with 'replace').
        tests = [
            ('a+2AE-b', u'a\ud801b'),
            ('a+2AE\xe1b', u'a\ufffdb'),
            ('a+2AE', u'a\ufffd'),
            ('a+2AEA-b', u'a\ufffdb'),
            ('a+2AH-b', u'a\ufffdb'),
            ('a+IKzYAQ-b', u'a\u20ac\ud801b'),
            ('a+IKzYAQ\xe1b', u'a\u20ac\ufffdb'),
            ('a+IKzYAQA-b', u'a\u20ac\ufffdb'),
            ('a+IKzYAd-b', u'a\u20ac\ufffdb'),
            ('a+IKwgrNgB-b', u'a\u20ac\u20ac\ud801b'),
            ('a+IKwgrNgB\xe1b', u'a\u20ac\u20ac\ufffdb'),
            ('a+IKwgrNgB', u'a\u20ac\u20ac\ufffd'),
            ('a+IKwgrNgBA-b', u'a\u20ac\u20ac\ufffdb'),
        ]
        for raw, expected in tests:
            try:
                self.assertEqual(raw.decode('utf-7', 'replace'), expected)
            except:
                # Deliberately broad: name the failing input, then re-raise
                # whatever went wrong unchanged.
                print('raw=%r' % raw)
                raise
774
class UTF16ExTest(unittest.TestCase):
    """Error handling checks for ``codecs.utf_16_ex_decode``."""

    def test_errors(self):
        # A single byte with final=True must raise in strict mode.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)

    def test_bad_args(self):
        # Calling without any argument is a TypeError.
        self.assertRaises(TypeError, codecs.utf_16_ex_decode)
782
class ReadBufferTest(unittest.TestCase):
    """Checks for ``codecs.readbuffer_encode``."""

    def test_array(self):
        # An array of characters is returned as the equivalent string,
        # together with its length.
        import array
        self.assertEqual(
            codecs.readbuffer_encode(array.array("c", "spam")),
            ("spam", 4)
        )

    def test_empty(self):
        # Empty input gives an empty result of length 0.
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # A missing argument and an integer argument are both TypeErrors.
        self.assertRaises(TypeError, codecs.readbuffer_encode)
        self.assertRaises(TypeError, codecs.readbuffer_encode, 42)
798
799class CharBufferTest(unittest.TestCase):
800
801    def test_string(self):
802        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))
803
804    def test_empty(self):
805        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))
806
807    def test_bad_args(self):
808        self.assertRaises(TypeError, codecs.charbuffer_encode)
809        self.assertRaises(TypeError, codecs.charbuffer_encode, 42)
810
class UTF8SigTest(ReadTest):
    """Tests for the "utf-8-sig" codec, with and without a leading BOM."""

    encoding = "utf-8-sig"

    def _check_stream(self, bytestring, unistring):
        # Decode bytestring through a utf-8-sig StreamReader using a range of
        # read sizes and check that the pieces always add up to unistring.
        make_reader = codecs.getreader("utf-8-sig")
        sizehints = [None] + range(1, 11) + [64, 128, 256, 512, 1024]
        for sizehint in sizehints:
            istream = make_reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            # None stands for a plain read() call without a size argument.
            read_args = () if sizehint is None else (sizehint,)
            while True:
                data = istream.read(*read_args)
                if not data:
                    break
                ostream.write(data)
            self.assertEqual(ostream.getvalue(), unistring)

    def test_partial(self):
        text = u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000"
        # One entry per encoded byte: the text decoded so far after that
        # byte has been fed to the decoder.
        expected = [
            u"",
            u"",
            u"", # First BOM has been read and skipped
            u"",
            u"",
            u"\ufeff", # Second BOM has been read and emitted
            u"\ufeff\x00", # "\x00" read and emitted
            u"\ufeff\x00", # First byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
            u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            u"\ufeff\x00\xff\u07ff\u0800\uffff\U00010000",
        ]
        self.check_partial(text, expected)

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        # An incremental decoder gives back the text that was encoded with
        # the same codec.
        decoder = codecs.getincrementaldecoder("utf-8-sig")()
        text = u"spam"
        encoded = text.encode("utf-8-sig")
        self.assertEqual(decoder.decode(encoded), text)

    def test_stream_bom(self):
        # Input that starts with the UTF-8 BOM.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)

    def test_stream_bare(self):
        # Same text, but without a BOM in front.
        unistring = u"ABC\u00A1\u2200XYZ"
        bytestring = "ABC\xC2\xA1\xE2\x88\x80XYZ"
        self._check_stream(bytestring, unistring)
894
class EscapeDecodeTest(unittest.TestCase):
    """Tests for codecs.escape_decode()."""

    def test_empty(self):
        outcome = codecs.escape_decode("")
        self.assertEqual(outcome, ("", 0))

    def test_raw(self):
        # Two-character input that does not start with a backslash comes
        # back unchanged.
        decode = codecs.escape_decode
        for code in range(256):
            char = chr(code)
            if char == '\\':
                continue
            data = char + '0'
            self.assertEqual(decode(data), (data, 2))

    def test_escape(self):
        decode = codecs.escape_decode
        check = coding_checker(self, decode)
        # (escaped input, expected decoded output), checked in this order.
        pairs = [
            (b"[\\\n]", b"[]"),
            (br'[\"]', b'["]'),
            (br"[\']", b"[']"),
            (br"[\\]", br"[\]"),
            (br"[\a]", b"[\x07]"),
            (br"[\b]", b"[\x08]"),
            (br"[\t]", b"[\x09]"),
            (br"[\n]", b"[\x0a]"),
            (br"[\v]", b"[\x0b]"),
            (br"[\f]", b"[\x0c]"),
            (br"[\r]", b"[\x0d]"),
            (br"[\7]", b"[\x07]"),
            (br"[\8]", br"[\8]"),
            (br"[\78]", b"[\x078]"),
            (br"[\41]", b"[!]"),
            (br"[\418]", b"[!8]"),
            (br"[\101]", b"[A]"),
            (br"[\1010]", b"[A0]"),
            (br"[\501]", b"[A]"),
            (br"[\x41]", b"[A]"),
            (br"[\X41]", br"[\X41]"),
            (br"[\x410]", b"[A0]"),
        ]
        for escaped, plain in pairs:
            check(escaped, plain)
        # A backslash in front of any other character is left alone.
        recognized = '\n"\'\\abtnvfr01234567x'
        for code in range(256):
            char = chr(code)
            if char not in recognized:
                unknown = '\\' + char
                check(unknown, unknown)

    def test_errors(self):
        decode = codecs.escape_decode
        # Incomplete \x escapes are rejected by default ...
        for broken in (br"\x", br"[\x]", br"\x0", br"[\x0]"):
            with self.assertRaises(ValueError):
                decode(broken)
        # ... and dropped or turned into "?" by the other error handlers.
        lenient = (
            (br"[\x]\x", "ignore", (b"[]", 6)),
            (br"[\x]\x", "replace", (b"[?]?", 6)),
            (br"[\x0]\x0", "ignore", (b"[]", 8)),
            (br"[\x0]\x0", "replace", (b"[?]?", 8)),
        )
        for broken, errors, outcome in lenient:
            self.assertEqual(decode(broken, errors), outcome)
946
947class RecodingTest(unittest.TestCase):
948    def test_recoding(self):
949        f = StringIO.StringIO()
950        f2 = codecs.EncodedFile(f, "unicode_internal", "utf-8")
951        f2.write(u"a")
952        f2.close()
953        # Python used to crash on this at exit because of a refcount
954        # bug in _codecsmodule.c
955
# Sample strings taken from RFC 3492.
# Every entry is a pair (unicode text, Punycode byte string for that text).
# The loop right after the list prints any entry that is not a pair.
punycode_testcases = [
    # (A) Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # (B) Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # (C) Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # (D) Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # (E) Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # (F) Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
    u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
    u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
    u"\u0939\u0948\u0902",
    "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    # (G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
    u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    #  T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    #   <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    # (L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
    ]

# Sanity check of the table above, run at import time: print every entry
# that does not consist of exactly two items.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
1063
class PunycodeTest(unittest.TestCase):
    """Runs the "punycode" codec over the punycode_testcases pairs."""

    def test_encode(self):
        for text, reference in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just puny to
            # lower is also insufficient, since some of the input characters
            # are upper case.
            encoded = text.encode("punycode")
            self.assertEqual(encoded.lower(), reference.lower())

    def test_decode(self):
        for text, reference in punycode_testcases:
            decoded = reference.decode("punycode")
            self.assertEqual(text, decoded)
1077
1078class UnicodeInternalTest(unittest.TestCase):
1079    def test_bug1251300(self):
1080        # Decoding with unicode_internal used to not correctly handle "code
1081        # points" above 0x10ffff on UCS-4 builds.
1082        if sys.maxunicode > 0xffff:
1083            ok = [
1084                ("\x00\x10\xff\xff", u"\U0010ffff"),
1085                ("\x00\x00\x01\x01", u"\U00000101"),
1086                ("", u""),
1087            ]
1088            not_ok = [
1089                "\x7f\xff\xff\xff",
1090                "\x80\x00\x00\x00",
1091                "\x81\x00\x00\x00",
1092                "\x00",
1093                "\x00\x00\x00\x00\x00",
1094            ]
1095            for internal, uni in ok:
1096                if sys.byteorder == "little":
1097                    internal = "".join(reversed(internal))
1098                self.assertEqual(uni, internal.decode("unicode_internal"))
1099            for internal in not_ok:
1100                if sys.byteorder == "little":
1101                    internal = "".join(reversed(internal))
1102                self.assertRaises(UnicodeDecodeError, internal.decode,
1103                    "unicode_internal")
1104
1105    def test_decode_error_attributes(self):
1106        if sys.maxunicode > 0xffff:
1107            try:
1108                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
1109            except UnicodeDecodeError, ex:
1110                self.assertEqual("unicode_internal", ex.encoding)
1111                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
1112                self.assertEqual(4, ex.start)
1113                self.assertEqual(8, ex.end)
1114            else:
1115                self.fail()
1116
1117    def test_decode_callback(self):
1118        if sys.maxunicode > 0xffff:
1119            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
1120            decoder = codecs.getdecoder("unicode_internal")
1121            ab = u"ab".encode("unicode_internal")
1122            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
1123                "UnicodeInternalTest")
1124            self.assertEqual((u"ab", 12), ignored)
1125
1126    def test_encode_length(self):
1127        # Issue 3739
1128        encoder = codecs.getencoder("unicode_internal")
1129        self.assertEqual(encoder(u"a")[1], 1)
1130        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)
1131
1132        encoder = codecs.getencoder("string-escape")
1133        self.assertEqual(encoder(r'\x00')[1], 4)
1134
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
#
# Every entry is a pair (input, expected output), both given as UTF-8 encoded
# byte strings.  An expected output of None means that nameprep has to reject
# the input.  (None, None) stands in for a vector that is skipped, so that the
# position of an entry in the list keeps matching its "3.N" number.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
    ]
1287
1288
1289class NameprepTest(unittest.TestCase):
1290    def test_nameprep(self):
1291        from encodings.idna import nameprep
1292        for pos, (orig, prepped) in enumerate(nameprep_tests):
1293            if orig is None:
1294                # Skipped
1295                continue
1296            # The Unicode strings are given in UTF-8
1297            orig = unicode(orig, "utf-8")
1298            if prepped is None:
1299                # Input contains prohibited characters
1300                self.assertRaises(UnicodeError, nameprep, orig)
1301            else:
1302                prepped = unicode(prepped, "utf-8")
1303                try:
1304                    self.assertEqual(nameprep(orig), prepped)
1305                except Exception,e:
1306                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1307
1308class IDNACodecTest(unittest.TestCase):
1309    def test_builtin_decode(self):
1310        self.assertEqual(unicode("python.org", "idna"), u"python.org")
1311        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
1312        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
1313        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")
1314
1315    def test_builtin_encode(self):
1316        self.assertEqual(u"python.org".encode("idna"), "python.org")
1317        self.assertEqual("python.org.".encode("idna"), "python.org.")
1318        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
1319        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")
1320
1321    def test_stream(self):
1322        import StringIO
1323        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
1324        r.read(3)
1325        self.assertEqual(r.read(), u"")
1326
1327    def test_incremental_decode(self):
1328        self.assertEqual(
1329            "".join(codecs.iterdecode("python.org", "idna")),
1330            u"python.org"
1331        )
1332        self.assertEqual(
1333            "".join(codecs.iterdecode("python.org.", "idna")),
1334            u"python.org."
1335        )
1336        self.assertEqual(
1337            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1338            u"pyth\xf6n.org."
1339        )
1340        self.assertEqual(
1341            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
1342            u"pyth\xf6n.org."
1343        )
1344
1345        decoder = codecs.getincrementaldecoder("idna")()
1346        self.assertEqual(decoder.decode("xn--xam", ), u"")
1347        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1348        self.assertEqual(decoder.decode(u"rg"), u"")
1349        self.assertEqual(decoder.decode(u"", True), u"org")
1350
1351        decoder.reset()
1352        self.assertEqual(decoder.decode("xn--xam", ), u"")
1353        self.assertEqual(decoder.decode("ple-9ta.o", ), u"\xe4xample.")
1354        self.assertEqual(decoder.decode("rg."), u"org.")
1355        self.assertEqual(decoder.decode("", True), u"")
1356
1357    def test_incremental_encode(self):
1358        self.assertEqual(
1359            "".join(codecs.iterencode(u"python.org", "idna")),
1360            "python.org"
1361        )
1362        self.assertEqual(
1363            "".join(codecs.iterencode(u"python.org.", "idna")),
1364            "python.org."
1365        )
1366        self.assertEqual(
1367            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1368            "xn--pythn-mua.org."
1369        )
1370        self.assertEqual(
1371            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
1372            "xn--pythn-mua.org."
1373        )
1374
1375        encoder = codecs.getincrementalencoder("idna")()
1376        self.assertEqual(encoder.encode(u"\xe4x"), "")
1377        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
1378        self.assertEqual(encoder.encode(u"", True), "org")
1379
1380        encoder.reset()
1381        self.assertEqual(encoder.encode(u"\xe4x"), "")
1382        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
1383        self.assertEqual(encoder.encode(u"", True), "")
1384
1385class CodecsModuleTest(unittest.TestCase):
1386
1387    def test_decode(self):
1388        self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1389                          u'\xe4\xf6\xfc')
1390        self.assertRaises(TypeError, codecs.decode)
1391        self.assertEqual(codecs.decode('abc'), u'abc')
1392        self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1393
1394    def test_encode(self):
1395        self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1396                          '\xe4\xf6\xfc')
1397        self.assertRaises(TypeError, codecs.encode)
1398        self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
1399        self.assertEqual(codecs.encode(u'abc'), 'abc')
1400        self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1401
1402    def test_register(self):
1403        self.assertRaises(TypeError, codecs.register)
1404        self.assertRaises(TypeError, codecs.register, 42)
1405
1406    def test_lookup(self):
1407        self.assertRaises(TypeError, codecs.lookup)
1408        self.assertRaises(LookupError, codecs.lookup, "__spam__")
1409        self.assertRaises(LookupError, codecs.lookup, " ")
1410
1411    def test_getencoder(self):
1412        self.assertRaises(TypeError, codecs.getencoder)
1413        self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1414
1415    def test_getdecoder(self):
1416        self.assertRaises(TypeError, codecs.getdecoder)
1417        self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1418
1419    def test_getreader(self):
1420        self.assertRaises(TypeError, codecs.getreader)
1421        self.assertRaises(LookupError, codecs.getreader, "__spam__")
1422
1423    def test_getwriter(self):
1424        self.assertRaises(TypeError, codecs.getwriter)
1425        self.assertRaises(LookupError, codecs.getwriter, "__spam__")
1426
1427    def test_lookup_issue1813(self):
1428        # Issue #1813: under Turkish locales, lookup of some codecs failed
1429        # because 'I' is lowercased as a dotless "i"
1430        oldlocale = locale.getlocale(locale.LC_CTYPE)
1431        self.addCleanup(locale.setlocale, locale.LC_CTYPE, oldlocale)
1432        try:
1433            locale.setlocale(locale.LC_CTYPE, 'tr_TR')
1434        except locale.Error:
1435            # Unsupported locale on this system
1436            self.skipTest('test needs Turkish locale')
1437        c = codecs.lookup('ASCII')
1438        self.assertEqual(c.name, 'ascii')
1439
1440    def test_all(self):
1441        api = (
1442            "encode", "decode",
1443            "register", "CodecInfo", "Codec", "IncrementalEncoder",
1444            "IncrementalDecoder", "StreamReader", "StreamWriter", "lookup",
1445            "getencoder", "getdecoder", "getincrementalencoder",
1446            "getincrementaldecoder", "getreader", "getwriter",
1447            "register_error", "lookup_error",
1448            "strict_errors", "replace_errors", "ignore_errors",
1449            "xmlcharrefreplace_errors", "backslashreplace_errors",
1450            "open", "EncodedFile",
1451            "iterencode", "iterdecode",
1452            "BOM", "BOM_BE", "BOM_LE",
1453            "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_BE", "BOM_UTF16_LE",
1454            "BOM_UTF32", "BOM_UTF32_BE", "BOM_UTF32_LE",
1455            "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",  # Undocumented
1456            "StreamReaderWriter", "StreamRecoder",
1457        )
1458        self.assertEqual(sorted(api), sorted(codecs.__all__))
1459        for api in codecs.__all__:
1460            getattr(codecs, api)
1461
1462class StreamReaderTest(unittest.TestCase):
1463
1464    def setUp(self):
1465        self.reader = codecs.getreader('utf-8')
1466        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1467
1468    def test_readlines(self):
1469        f = self.reader(self.stream)
1470        self.assertEqual(f.readlines(), [u'\ud55c\n', u'\uae00'])
1471
1472class EncodedFileTest(unittest.TestCase):
1473
1474    def test_basic(self):
1475        f = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
1476        ef = codecs.EncodedFile(f, 'utf-16-le', 'utf-8')
1477        self.assertEqual(ef.read(), '\\\xd5\n\x00\x00\xae')
1478
1479        f = StringIO.StringIO()
1480        ef = codecs.EncodedFile(f, 'utf-8', 'latin1')
1481        ef.write('\xc3\xbc')
1482        self.assertEqual(f.getvalue(), '\xfc')
1483
1484class Str2StrTest(unittest.TestCase):
1485
1486    def test_read(self):
1487        sin = codecs.encode("\x80", "base64_codec")
1488        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1489        sout = reader.read()
1490        self.assertEqual(sout, "\x80")
1491        self.assertIsInstance(sout, str)
1492
1493    def test_readline(self):
1494        sin = codecs.encode("\x80", "base64_codec")
1495        reader = codecs.getreader("base64_codec")(StringIO.StringIO(sin))
1496        sout = reader.readline()
1497        self.assertEqual(sout, "\x80")
1498        self.assertIsInstance(sout, str)
1499
# Names of the codecs that BasicUnicodeTest.test_basics loops over.  The list
# is extended further down in the file, depending on what is available.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
1604
# "mbcs" is only added when this build of the codecs module has mbcs_encode.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
#    "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# Copy taken at this point: the names appended to
# broken_unicode_with_streams below do not end up in this list.
broken_incremental_coders = broken_unicode_with_streams[:]

# rot_13 is additionally excluded from the stream checks when Python runs
# with py3k warnings enabled.
if sys.flags.py3k_warning:
    broken_unicode_with_streams.append("rot_13")

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2_codec and zlib_codec are only registered for testing when the
# corresponding module can be imported; both skip the stream checks.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1653
class BasicUnicodeTest(unittest.TestCase):
    """Generic checks run against every codec in all_unicode_encodings."""

    def test_basics(self):
        """Round-trip a short ASCII text through each codec interface.

        Every codec goes through the stateless encoder/decoder.  The stream
        writer/reader check is skipped for broken_unicode_with_streams, and
        the incremental and iterencode()/iterdecode() checks are skipped
        for broken_incremental_coders.
        """
        text = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            note = "encoding=%r" % encoding

            # The name reported by the registry must match the list entry
            # once "_" and "-" are treated alike; "*_codec" entries and
            # "latin_1" are adjusted first.
            registered = codecs.lookup(encoding).name
            if encoding.endswith("_codec"):
                registered += "_codec"
            elif encoding == "latin_1":
                registered = "latin_1"
            self.assertEqual(encoding.replace("_", "-"),
                             registered.replace("_", "-"))

            # check stateless encoder/decoder
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text), note)
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, text, note)

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer, feeding one item at a time
                sink = Queue()
                writer = codecs.getwriter(encoding)(sink)
                written = []
                for char in text:
                    writer.write(char)
                    written.append(sink.read())
                source = Queue()
                reader = codecs.getreader(encoding)(source)
                pieces = []
                for byte in "".join(written):
                    source.write(byte)
                    pieces.append(reader.read())
                self.assertEqual(u"".join(pieces), text, note)

            if encoding in broken_incremental_coders:
                continue

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                encoder = codecs.getincrementalencoder(encoding)()
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check incremental decoder/encoder
                chunks = [encoder.encode(char) for char in text]
                chunks.append(encoder.encode(u"", True))
                decoder = codecs.getincrementaldecoder(encoding)()
                pieces = [decoder.decode(byte) for byte in "".join(chunks)]
                pieces.append(decoder.decode("", True))
                self.assertEqual(u"".join(pieces), text, note)

                # check iterencode()/iterdecode()
                roundtrip = codecs.iterdecode(
                    codecs.iterencode(text, encoding), encoding)
                self.assertEqual(u"".join(roundtrip), text, note)

                # check iterencode()/iterdecode() with empty string
                roundtrip = codecs.iterdecode(
                    codecs.iterencode(u"", encoding), encoding)
                self.assertEqual(u"".join(roundtrip), u"")

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    encoder = codecs.getincrementalencoder(encoding)("ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = ""
                    for char in text:
                        encoded += encoder.encode(char)
                    decoder = codecs.getincrementaldecoder(encoding)("ignore")
                    decoded = u""
                    for byte in encoded:
                        decoded += decoder.decode(byte)
                    self.assertEqual(decoded, text, note)

    @test_support.cpython_only
    def test_basics_capi(self):
        """Repeat the incremental round trip through the _testcapi helpers."""
        from _testcapi import codec_incrementalencoder, codec_incrementaldecoder
        text = u"abc123"  # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            if encoding in broken_incremental_coders:
                continue
            note = "encoding=%r" % encoding

            # check incremental decoder/encoder and iterencode()/iterdecode()
            try:
                cencoder = codec_incrementalencoder(encoding)
            except LookupError:  # no IncrementalEncoder
                pass
            else:
                # check C API
                chunks = [cencoder.encode(char) for char in text]
                chunks.append(cencoder.encode(u"", True))
                cdecoder = codec_incrementaldecoder(encoding)
                pieces = [cdecoder.decode(byte) for byte in "".join(chunks)]
                pieces.append(cdecoder.decode("", True))
                self.assertEqual(u"".join(pieces), text, note)

            if encoding not in only_strict_mode:
                # check incremental decoder/encoder with errors argument
                try:
                    cencoder = codec_incrementalencoder(encoding, "ignore")
                except LookupError:  # no IncrementalEncoder
                    pass
                else:
                    encoded = ""
                    for char in text:
                        encoded += cencoder.encode(char)
                    cdecoder = codec_incrementaldecoder(encoding, "ignore")
                    decoded = u""
                    for byte in encoded:
                        decoded += cdecoder.decode(byte)
                    self.assertEqual(decoded, text, note)

    def test_seek(self):
        """seek(0, 0) on a StreamReader must allow re-reading the first line."""
        # all codecs should be able to encode these
        text = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            # FIXME: "idna" is skipped, see SF bug #1163178
            if encoding == "idna" or encoding in broken_unicode_with_streams:
                continue
            stream = StringIO.StringIO(text.encode(encoding))
            reader = codecs.getreader(encoding)(stream)
            for _ in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(text[:len(line)], line)

    def test_bad_decode_args(self):
        """Stateless decoders must raise TypeError for bad arguments."""
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # "idna" and "punycode" are exempt from the int-argument check.
            if encoding in ("idna", "punycode"):
                continue
            self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        """Stateless encoders must raise TypeError when called with no input."""
        for encoding in all_unicode_encodings:
            self.assertRaises(TypeError, codecs.getencoder(encoding))

    def test_encoding_map_type_initialized(self):
        """Taking type() of a codec's encoding_table must not crash."""
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        map_type = type(cp1140.encoding_table)
        self.assertEqual(map_type, map_type)
1800
class BasicStrTest(unittest.TestCase):
    """Stateless round-trip check for the codecs in all_string_encodings."""

    def test_basics(self):
        """Encode then decode a short str and expect the original back."""
        text = "abc123"
        for encoding in all_string_encodings:
            encoded, consumed = codecs.getencoder(encoding)(text)
            self.assertEqual(consumed, len(text))
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            failure = "%r != %r (encoding=%r)" % (decoded, text, encoding)
            self.assertEqual(decoded, text, failure)
1809
class CharmapTest(unittest.TestCase):
    """Tests for codecs.charmap_decode() with each kind of mapping argument.

    Every input is written as a b"..." literal so the byte-string argument
    is marked the same way in all calls.
    """

    def test_decode_with_string_map(self):
        """Decode with a unicode string used as the map, indexed by byte value."""
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict", u"abc"),
            (u"abc", 3)
        )

        # Byte 2 has no entry in a two-character map.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab"
        )

        # An entry of U+FFFE is rejected in strict mode as well.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict", u"ab\ufffe"
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace", u"ab\ufffe"),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", u"ab"),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore", u"ab\ufffe"),
            (u"ab", 3)
        )

        # All 256 byte values against an empty map: everything is ignored
        # but still counted as consumed.
        allbytes = bytes(bytearray(range(256)))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", u""),
            (u"", len(allbytes))
        )

    def test_decode_with_int2str_map(self):
        """Decode with a dict mapping byte values to unicode strings or None."""
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u'c'}),
            (u"abc", 3)
        )

        # One byte may map to several characters.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'Aa', 1: u'Bb', 2: u'Cc'}),
            (u"AaBbCc", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'\U0010FFFF', 1: u'b', 2: u'c'}),
            (u"\U0010FFFFbc", 3)
        )

        # Mapping to the empty string drops the byte without an error.
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: u'a', 1: u'b', 2: u''}),
            (u"ab", 3)
        )

        # A missing key is an error in strict mode.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b'}
        )

        # So is an explicit None value.
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: None}
        )

        # Issue #14850
        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: u'a', 1: u'b', 2: u'\ufffe'}
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab\ufffd", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b'}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: None}),
            (u"ab", 3)
        )

        # Issue #14850
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: u'a', 1: u'b', 2: u'\ufffe'}),
            (u"ab", 3)
        )

        # All 256 byte values against an empty dict: everything is ignored
        # but still counted as consumed.
        allbytes = bytes(bytearray(range(256)))
        self.assertEqual(
            codecs.charmap_decode(allbytes, "ignore", {}),
            (u"", len(allbytes))
        )

    def test_decode_with_int2int_map(self):
        """Decode with a dict mapping byte values to integer code points."""
        a = ord(u'a')
        b = ord(u'b')
        c = ord(u'c')

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: a, 1: b, 2: c}),
            (u"abc", 3)
        )

        # Issue #15379
        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "strict",
                                  {0: 0x10FFFF, 1: b, 2: c}),
            (u"\U0010FFFFbc", 3)
        )

        # 0x110000 is one past the value accepted just above; it is
        # reported as a TypeError rather than a decode error.
        self.assertRaises(TypeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: 0x110000, 1: b, 2: c}
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b},
        )

        self.assertRaises(UnicodeDecodeError,
            codecs.charmap_decode, b"\x00\x01\x02", "strict",
                                   {0: a, 1: b, 2: 0xFFFE},
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "replace",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab\ufffd", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b}),
            (u"ab", 3)
        )

        self.assertEqual(
            codecs.charmap_decode(b"\x00\x01\x02", "ignore",
                                  {0: a, 1: b, 2: 0xFFFE}),
            (u"ab", 3)
        )
1992
1993
1994class WithStmtTest(unittest.TestCase):
1995    def test_encodedfile(self):
1996        f = StringIO.StringIO("\xc3\xbc")
1997        with codecs.EncodedFile(f, "latin-1", "utf-8") as ef:
1998            self.assertEqual(ef.read(), "\xfc")
1999
2000    def test_streamreaderwriter(self):
2001        f = StringIO.StringIO("\xc3\xbc")
2002        info = codecs.lookup("utf-8")
2003        with codecs.StreamReaderWriter(f, info.streamreader,
2004                                       info.streamwriter, 'strict') as srw:
2005            self.assertEqual(srw.read(), u"\xfc")
2006
2007
2008class UnicodeEscapeTest(unittest.TestCase):
2009    def test_empty(self):
2010        self.assertEqual(codecs.unicode_escape_encode(u""), ("", 0))
2011        self.assertEqual(codecs.unicode_escape_decode(""), (u"", 0))
2012
2013    def test_raw_encode(self):
2014        encode = codecs.unicode_escape_encode
2015        for b in range(32, 127):
2016            if b != ord('\\'):
2017                self.assertEqual(encode(unichr(b)), (chr(b), 1))
2018
2019    def test_raw_decode(self):
2020        decode = codecs.unicode_escape_decode
2021        for b in range(256):
2022            if b != ord('\\'):
2023                self.assertEqual(decode(chr(b) + '0'), (unichr(b) + u'0', 2))
2024
2025    def test_escape_encode(self):
2026        encode = codecs.unicode_escape_encode
2027        check = coding_checker(self, encode)
2028        check(u'\t', r'\t')
2029        check(u'\n', r'\n')
2030        check(u'\r', r'\r')
2031        check(u'\\', r'\\')
2032        for b in range(32):
2033            if chr(b) not in '\t\n\r':
2034                check(unichr(b), '\\x%02x' % b)
2035        for b in range(127, 256):
2036            check(unichr(b), '\\x%02x' % b)
2037        check(u'\u20ac', r'\u20ac')
2038        check(u'\U0001d120', r'\U0001d120')
2039
2040    def test_escape_decode(self):
2041        decode = codecs.unicode_escape_decode
2042        check = coding_checker(self, decode)
2043        check("[\\\n]", u"[]")
2044        check(r'[\"]', u'["]')
2045        check(r"[\']", u"[']")
2046        check(r"[\\]", ur"[\]")
2047        check(r"[\a]", u"[\x07]")
2048        check(r"[\b]", u"[\x08]")
2049        check(r"[\t]", u"[\x09]")
2050        check(r"[\n]", u"[\x0a]")
2051        check(r"[\v]", u"[\x0b]")
2052        check(r"[\f]", u"[\x0c]")
2053        check(r"[\r]", u"[\x0d]")
2054        check(r"[\7]", u"[\x07]")
2055        check(r"[\8]", ur"[\8]")
2056        check(r"[\78]", u"[\x078]")
2057        check(r"[\41]", u"[!]")
2058        check(r"[\418]", u"[!8]")
2059        check(r"[\101]", u"[A]")
2060        check(r"[\1010]", u"[A0]")
2061        check(r"[\x41]", u"[A]")
2062        check(r"[\x410]", u"[A0]")
2063        check(r"\u20ac", u"\u20ac")
2064        check(r"\U0001d120", u"\U0001d120")
2065        for b in range(256):
2066            if chr(b) not in '\n"\'\\abtnvfr01234567xuUN':
2067                check('\\' + chr(b), u'\\' + unichr(b))
2068
2069    def test_decode_errors(self):
2070        decode = codecs.unicode_escape_decode
2071        for c, d in ('x', 2), ('u', 4), ('U', 4):
2072            for i in range(d):
2073                self.assertRaises(UnicodeDecodeError, decode,
2074                                  "\\" + c + "0"*i)
2075                self.assertRaises(UnicodeDecodeError, decode,
2076                                  "[\\" + c + "0"*i + "]")
2077                data = "[\\" + c + "0"*i + "]\\" + c + "0"*i
2078                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
2079                self.assertEqual(decode(data, "replace"),
2080                                 (u"[\ufffd]\ufffd", len(data)))
2081        self.assertRaises(UnicodeDecodeError, decode, r"\U00110000")
2082        self.assertEqual(decode(r"\U00110000", "ignore"), (u"", 10))
2083        self.assertEqual(decode(r"\U00110000", "replace"), (u"\ufffd", 10))
2084
2085
class RawUnicodeEscapeTest(unittest.TestCase):
    """Tests for codecs.raw_unicode_escape_encode() / _decode()."""

    def test_empty(self):
        """Empty input gives empty output with zero items consumed."""
        encoded = codecs.raw_unicode_escape_encode(u"")
        self.assertEqual(encoded, ("", 0))
        decoded = codecs.raw_unicode_escape_decode("")
        self.assertEqual(decoded, (u"", 0))

    def test_raw_encode(self):
        """Each code point 0..255 encodes to the byte of the same value."""
        encode = codecs.raw_unicode_escape_encode
        for code in range(256):
            expected = (chr(code), 1)
            self.assertEqual(encode(unichr(code)), expected)

    def test_raw_decode(self):
        """Each byte 0..255 decodes to the code point of the same value."""
        decode = codecs.raw_unicode_escape_decode
        for code in range(256):
            expected = (unichr(code) + u'0', 2)
            self.assertEqual(decode(chr(code) + '0'), expected)

    def test_escape_encode(self):
        """Backslash pairs pass through; non-Latin-1 becomes \\u / \\U."""
        check = coding_checker(self, codecs.raw_unicode_escape_encode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check(u'\\' + unichr(code), '\\' + chr(code))
        check(u'\u20ac', r'\u20ac')
        check(u'\U0001d120', r'\U0001d120')

    def test_escape_decode(self):
        """Only \\u and \\U are interpreted; other backslash pairs are kept."""
        check = coding_checker(self, codecs.raw_unicode_escape_decode)
        for code in range(256):
            if chr(code) in 'uU':
                continue
            check('\\' + chr(code), u'\\' + unichr(code))
        check(r"\u20ac", u"\u20ac")
        check(r"\U0001d120", u"\U0001d120")

    def test_decode_errors(self):
        """Truncated \\u, \\U escapes and out-of-range \\U are errors."""
        decode = codecs.raw_unicode_escape_decode
        for marker, limit in (('u', 4), ('U', 4)):
            for count in range(limit):
                truncated = "\\" + marker + "0"*count
                self.assertRaises(UnicodeDecodeError, decode, truncated)
                self.assertRaises(UnicodeDecodeError, decode,
                                  "[" + truncated + "]")
                data = "[" + truncated + "]" + truncated
                self.assertEqual(decode(data, "ignore"), (u"[]", len(data)))
                self.assertEqual(decode(data, "replace"),
                                 (u"[\ufffd]\ufffd", len(data)))
        too_big = r"\U00110000"
        self.assertRaises(UnicodeDecodeError, decode, too_big)
        self.assertEqual(decode(too_big, "ignore"), (u"", 10))
        self.assertEqual(decode(too_big, "replace"), (u"\ufffd", 10))
2134
2135
class BomTest(unittest.TestCase):
    """BOM handling around seek() for the UTF-16 and UTF-32 codecs."""

    def test_seek0(self):
        """Write, seek and read back a 'w+' file for each encoding."""
        data = u"1234567890"
        doubled = data * 2
        path = test_support.TESTFN
        self.addCleanup(test_support.unlink, path)

        def reopen(encoding):
            # Each scenario reopens the scratch file in 'w+' mode.
            return codecs.open(path, 'w+', encoding=encoding)

        encodings = ("utf-16",
                     "utf-16-le",
                     "utf-16-be",
                     "utf-32",
                     "utf-32-le",
                     "utf-32-be")
        for encoding in encodings:
            # Check if the BOM is written only once
            with reopen(encoding) as f:
                f.write(data)
                f.write(data)
                for _ in (1, 2):
                    f.seek(0)
                    self.assertEqual(f.read(), doubled)

            # Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data[0])
                self.assertNotEqual(writer.tell(), 0)
                writer.seek(0)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with reopen(encoding) as f:
                f.write(data)
                position = f.tell()
                f.seek(position)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with reopen(encoding) as f:
                writer = f.writer
                writer.write(data)
                position = writer.tell()
                writer.seek(position)
                writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), doubled)
2191
2192
class TransformCodecTest(unittest.TestCase):
    """Tests for the str-to-str transform codecs (quopri, uu)."""

    def test_quopri_stateless(self):
        """quopri-codec escapes spaces and tabs on encode, accepts them raw."""
        # Should encode with quotetabs=True
        encoded = codecs.encode(b"space tab\teol \n", "quopri-codec")
        self.assertEqual(encoded, b"space=20tab=09eol=20\n")
        # But should still support unescaped tabs and spaces
        unescaped = b"space tab eol\n"
        self.assertEqual(codecs.decode(unescaped, "quopri-codec"), unescaped)

    def test_uu_invalid(self):
        """uu-codec rejects input that has no "begin" line."""
        # Missing "begin" line.  The input is a bytes literal, like the
        # codec inputs in test_quopri_stateless.
        self.assertRaises(ValueError, codecs.decode, b"", "uu-codec")
2206
2207
def test_main():
    """Run every TestCase class of this module via test_support.run_unittest."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        UnicodeEscapeTest,
        RawUnicodeEscapeTest,
        BomTest,
        TransformCodecTest,
    )
    test_support.run_unittest(*test_classes)


# Allow running this test module directly as a script.
if __name__ == "__main__":
    test_main()
2245