1""" Test script for the Unicode implementation.
2
3Written by Marc-Andre Lemburg (mal@lemburg.com).
4
5(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.
6
7"""#"
8import sys
9import struct
10import codecs
11import unittest
12from test import test_support, string_tests
13
# Decorator for tests that only make sense on a wide build: the test is
# skipped whenever sys.maxunicode equals 0xFFFF (65535).
requires_wide_build = unittest.skipIf(
    sys.maxunicode == 0xFFFF, 'requires wide build')
17
18# Error handling (bad decoder return)
def search_function(encoding):
    """Codec search hook that serves two deliberately broken test codecs.

    "test.unicode1" yields an encoder/decoder pair returning a bare int
    instead of a tuple; "test.unicode2" yields a pair returning a tuple
    whose items are ints rather than strings.  Both entries leave the
    stream reader and writer slots as None.  Any other name yields None.
    """
    def scalar_encoder(input, errors="strict"):
        return 42  # not a tuple

    def scalar_decoder(input, errors="strict"):
        return 42  # not a tuple

    def int_pair_encoder(input, errors="strict"):
        return (42, 42)  # no unicode

    def int_pair_decoder(input, errors="strict"):
        return (42, 42)  # no unicode

    if encoding == "test.unicode1":
        return (scalar_encoder, scalar_decoder, None, None)
    if encoding == "test.unicode2":
        return (int_pair_encoder, int_pair_decoder, None, None)
    return None
codecs.register(search_function)
35
class UnicodeSubclass(unicode):
    """Bare subclass of unicode that adds no attributes or methods."""
38
39class UnicodeTest(
40    string_tests.CommonTest,
41    string_tests.MixinStrUnicodeUserStringTest,
42    string_tests.MixinStrUnicodeTest,
43    ):
44    type2test = unicode
45
46    def assertEqual(self, first, second, msg=None):
47        # strict assertEqual method: reject implicit bytes/unicode equality
48        super(UnicodeTest, self).assertEqual(first, second, msg)
49        if isinstance(first, unicode) or isinstance(second, unicode):
50            self.assertIsInstance(first, unicode)
51            self.assertIsInstance(second, unicode)
52        elif isinstance(first, str) or isinstance(second, str):
53            self.assertIsInstance(first, str)
54            self.assertIsInstance(second, str)
55
56    def checkequalnofix(self, result, object, methodname, *args):
57        method = getattr(object, methodname)
58        realresult = method(*args)
59        self.assertEqual(realresult, result)
60        self.assertTrue(type(realresult) is type(result))
61
62        # if the original is returned make sure that
63        # this doesn't happen with subclasses
64        if realresult is object:
65            class usub(unicode):
66                def __repr__(self):
67                    return 'usub(%r)' % unicode.__repr__(self)
68            object = usub(object)
69            method = getattr(object, methodname)
70            realresult = method(*args)
71            self.assertEqual(realresult, result)
72            self.assertTrue(object is not realresult)
73
74    def test_literals(self):
75        self.assertEqual(u'\xff', u'\u00ff')
76        self.assertEqual(u'\uffff', u'\U0000ffff')
77        self.assertRaises(SyntaxError, eval, 'u\'\\Ufffffffe\'')
78        self.assertRaises(SyntaxError, eval, 'u\'\\Uffffffff\'')
79        self.assertRaises(SyntaxError, eval, 'u\'\\U%08x\'' % 0x110000)
80
81    def test_repr(self):
82        if not sys.platform.startswith('java'):
83            # Test basic sanity of repr()
84            self.assertEqual(repr(u'abc'), "u'abc'")
85            self.assertEqual(repr(u'ab\\c'), "u'ab\\\\c'")
86            self.assertEqual(repr(u'ab\\'), "u'ab\\\\'")
87            self.assertEqual(repr(u'\\c'), "u'\\\\c'")
88            self.assertEqual(repr(u'\\'), "u'\\\\'")
89            self.assertEqual(repr(u'\n'), "u'\\n'")
90            self.assertEqual(repr(u'\r'), "u'\\r'")
91            self.assertEqual(repr(u'\t'), "u'\\t'")
92            self.assertEqual(repr(u'\b'), "u'\\x08'")
93            self.assertEqual(repr(u"'\""), """u'\\'"'""")
94            self.assertEqual(repr(u"'\""), """u'\\'"'""")
95            self.assertEqual(repr(u"'"), '''u"'"''')
96            self.assertEqual(repr(u'"'), """u'"'""")
97            latin1repr = (
98                "u'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07\\x08\\t\\n\\x0b\\x0c\\r"
99                "\\x0e\\x0f\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17\\x18\\x19\\x1a"
100                "\\x1b\\x1c\\x1d\\x1e\\x1f !\"#$%&\\'()*+,-./0123456789:;<=>?@ABCDEFGHI"
101                "JKLMNOPQRSTUVWXYZ[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\x7f"
102                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87\\x88\\x89\\x8a\\x8b\\x8c\\x8d"
103                "\\x8e\\x8f\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97\\x98\\x99\\x9a\\x9b"
104                "\\x9c\\x9d\\x9e\\x9f\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7\\xa8\\xa9"
105                "\\xaa\\xab\\xac\\xad\\xae\\xaf\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
106                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5"
107                "\\xc6\\xc7\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf\\xd0\\xd1\\xd2\\xd3"
108                "\\xd4\\xd5\\xd6\\xd7\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf\\xe0\\xe1"
109                "\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
110                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd"
111                "\\xfe\\xff'")
112            testrepr = repr(u''.join(map(unichr, xrange(256))))
113            self.assertEqual(testrepr, latin1repr)
114            # Test repr works on wide unicode escapes without overflow.
115            self.assertEqual(repr(u"\U00010000" * 39 + u"\uffff" * 4096),
116                             repr(u"\U00010000" * 39 + u"\uffff" * 4096))
117
118
119    def test_count(self):
120        string_tests.CommonTest.test_count(self)
121        # check mixed argument types
122        self.checkequalnofix(3,  'aaa', 'count', u'a')
123        self.checkequalnofix(0,  'aaa', 'count', u'b')
124        self.checkequalnofix(3, u'aaa', 'count',  'a')
125        self.checkequalnofix(0, u'aaa', 'count',  'b')
126        self.checkequalnofix(0, u'aaa', 'count',  'b')
127        self.checkequalnofix(1, u'aaa', 'count',  'a', -1)
128        self.checkequalnofix(3, u'aaa', 'count',  'a', -10)
129        self.checkequalnofix(2, u'aaa', 'count',  'a', 0, -1)
130        self.checkequalnofix(0, u'aaa', 'count',  'a', 0, -10)
131
132    def test_find(self):
133        self.checkequalnofix(0,  u'abcdefghiabc', 'find', u'abc')
134        self.checkequalnofix(9,  u'abcdefghiabc', 'find', u'abc', 1)
135        self.checkequalnofix(-1, u'abcdefghiabc', 'find', u'def', 4)
136
137        self.assertRaises(TypeError, u'hello'.find)
138        self.assertRaises(TypeError, u'hello'.find, 42)
139
140    def test_rfind(self):
141        string_tests.CommonTest.test_rfind(self)
142        # check mixed argument types
143        self.checkequalnofix(9,   'abcdefghiabc', 'rfind', u'abc')
144        self.checkequalnofix(12,  'abcdefghiabc', 'rfind', u'')
145        self.checkequalnofix(12, u'abcdefghiabc', 'rfind',  '')
146
147    def test_index(self):
148        string_tests.CommonTest.test_index(self)
149        # check mixed argument types
150        for (t1, t2) in ((str, unicode), (unicode, str)):
151            self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2(''))
152            self.checkequalnofix(3, t1('abcdefghiabc'), 'index',  t2('def'))
153            self.checkequalnofix(0, t1('abcdefghiabc'), 'index',  t2('abc'))
154            self.checkequalnofix(9, t1('abcdefghiabc'), 'index',  t2('abc'), 1)
155            self.assertRaises(ValueError, t1('abcdefghiabc').index, t2('hib'))
156            self.assertRaises(ValueError, t1('abcdefghiab').index,  t2('abc'), 1)
157            self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), 8)
158            self.assertRaises(ValueError, t1('abcdefghi').index,  t2('ghi'), -1)
159
160    def test_rindex(self):
161        string_tests.CommonTest.test_rindex(self)
162        # check mixed argument types
163        for (t1, t2) in ((str, unicode), (unicode, str)):
164            self.checkequalnofix(12, t1('abcdefghiabc'), 'rindex',  t2(''))
165            self.checkequalnofix(3,  t1('abcdefghiabc'), 'rindex',  t2('def'))
166            self.checkequalnofix(9,  t1('abcdefghiabc'), 'rindex',  t2('abc'))
167            self.checkequalnofix(0,  t1('abcdefghiabc'), 'rindex',  t2('abc'), 0, -1)
168
169            self.assertRaises(ValueError, t1('abcdefghiabc').rindex,  t2('hib'))
170            self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('def'), 1)
171            self.assertRaises(ValueError, t1('defghiabc').rindex,  t2('abc'), 0, -1)
172            self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, 8)
173            self.assertRaises(ValueError, t1('abcdefghi').rindex,  t2('ghi'), 0, -1)
174
175    def test_translate(self):
176        self.checkequalnofix(u'bbbc', u'abababc', 'translate', {ord('a'):None})
177        self.checkequalnofix(u'iiic', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i')})
178        self.checkequalnofix(u'iiix', u'abababc', 'translate', {ord('a'):None, ord('b'):ord('i'), ord('c'):u'x'})
179        self.checkequalnofix(u'<i><i><i>c', u'abababc', 'translate', {ord('a'):None, ord('b'):u'<i>'})
180        self.checkequalnofix(u'c', u'abababc', 'translate', {ord('a'):None, ord('b'):u''})
181        self.checkequalnofix(u'xyyx', u'xzx', 'translate', {ord('z'):u'yy'})
182
183        self.assertRaises(TypeError, u'hello'.translate)
184        self.assertRaises(TypeError, u'abababc'.translate, {ord('a'):''})
185
186    def test_split(self):
187        string_tests.CommonTest.test_split(self)
188
189        # Mixed arguments
190        self.checkequalnofix([u'a', u'b', u'c', u'd'], u'a//b//c//d', 'split', '//')
191        self.checkequalnofix([u'a', u'b', u'c', u'd'], 'a//b//c//d', 'split', u'//')
192        self.checkequalnofix([u'endcase ', u''], u'endcase test', 'split', 'test')
193
194    def test_join(self):
195        string_tests.MixinStrUnicodeUserStringTest.test_join(self)
196
197        # mixed arguments
198        self.checkequalnofix(u'a b c d', u' ', 'join', ['a', 'b', u'c', u'd'])
199        self.checkequalnofix(u'abcd', u'', 'join', (u'a', u'b', u'c', u'd'))
200        self.checkequalnofix(u'w x y z', u' ', 'join', string_tests.Sequence('wxyz'))
201        self.checkequalnofix(u'a b c d', ' ', 'join', [u'a', u'b', u'c', u'd'])
202        self.checkequalnofix(u'a b c d', ' ', 'join', ['a', 'b', u'c', u'd'])
203        self.checkequalnofix(u'abcd', '', 'join', (u'a', u'b', u'c', u'd'))
204        self.checkequalnofix(u'w x y z', ' ', 'join', string_tests.Sequence(u'wxyz'))
205
206    def test_strip(self):
207        string_tests.CommonTest.test_strip(self)
208        self.assertRaises(UnicodeError, u"hello".strip, "\xff")
209
210    def test_replace(self):
211        string_tests.CommonTest.test_replace(self)
212
213        # method call forwarded from str implementation because of unicode argument
214        self.checkequalnofix(u'one@two!three!', 'one!two!three!', 'replace', u'!', u'@', 1)
215        self.assertRaises(TypeError, 'replace'.replace, u"r", 42)
216
217    def test_comparison(self):
218        # Comparisons:
219        self.assertTrue(u'abc' == 'abc')
220        self.assertTrue('abc' == u'abc')
221        self.assertTrue(u'abc' == u'abc')
222        self.assertTrue(u'abcd' > 'abc')
223        self.assertTrue('abcd' > u'abc')
224        self.assertTrue(u'abcd' > u'abc')
225        self.assertTrue(u'abc' < 'abcd')
226        self.assertTrue('abc' < u'abcd')
227        self.assertTrue(u'abc' < u'abcd')
228
229        if 0:
230            # Move these tests to a Unicode collation module test...
231            # Testing UTF-16 code point order comparisons...
232
233            # No surrogates, no fixup required.
234            self.assertTrue(u'\u0061' < u'\u20ac')
235            # Non surrogate below surrogate value, no fixup required
236            self.assertTrue(u'\u0061' < u'\ud800\udc02')
237
238            # Non surrogate above surrogate value, fixup required
239            def test_lecmp(s, s2):
240                self.assertTrue(s < s2)
241
242            def test_fixup(s):
243                s2 = u'\ud800\udc01'
244                test_lecmp(s, s2)
245                s2 = u'\ud900\udc01'
246                test_lecmp(s, s2)
247                s2 = u'\uda00\udc01'
248                test_lecmp(s, s2)
249                s2 = u'\udb00\udc01'
250                test_lecmp(s, s2)
251                s2 = u'\ud800\udd01'
252                test_lecmp(s, s2)
253                s2 = u'\ud900\udd01'
254                test_lecmp(s, s2)
255                s2 = u'\uda00\udd01'
256                test_lecmp(s, s2)
257                s2 = u'\udb00\udd01'
258                test_lecmp(s, s2)
259                s2 = u'\ud800\ude01'
260                test_lecmp(s, s2)
261                s2 = u'\ud900\ude01'
262                test_lecmp(s, s2)
263                s2 = u'\uda00\ude01'
264                test_lecmp(s, s2)
265                s2 = u'\udb00\ude01'
266                test_lecmp(s, s2)
267                s2 = u'\ud800\udfff'
268                test_lecmp(s, s2)
269                s2 = u'\ud900\udfff'
270                test_lecmp(s, s2)
271                s2 = u'\uda00\udfff'
272                test_lecmp(s, s2)
273                s2 = u'\udb00\udfff'
274                test_lecmp(s, s2)
275
276                test_fixup(u'\ue000')
277                test_fixup(u'\uff61')
278
279        # Surrogates on both sides, no fixup required
280        self.assertTrue(u'\ud800\udc02' < u'\ud84d\udc56')
281
282    def test_capitalize(self):
283        string_tests.CommonTest.test_capitalize(self)
284        # check that titlecased chars are lowered correctly
285        # \u1ffc is the titlecased char
286        self.checkequal(u'\u1ffc\u1ff3\u1ff3\u1ff3',
287                        u'\u1ff3\u1ff3\u1ffc\u1ffc', 'capitalize')
288        # check with cased non-letter chars
289        self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
290                        u'\u24c5\u24ce\u24c9\u24bd\u24c4\u24c3', 'capitalize')
291        self.checkequal(u'\u24c5\u24e8\u24e3\u24d7\u24de\u24dd',
292                        u'\u24df\u24e8\u24e3\u24d7\u24de\u24dd', 'capitalize')
293        self.checkequal(u'\u2160\u2171\u2172',
294                        u'\u2160\u2161\u2162', 'capitalize')
295        self.checkequal(u'\u2160\u2171\u2172',
296                        u'\u2170\u2171\u2172', 'capitalize')
297        # check with Ll chars with no upper - nothing changes here
298        self.checkequal(u'\u019b\u1d00\u1d86\u0221\u1fb7',
299                        u'\u019b\u1d00\u1d86\u0221\u1fb7', 'capitalize')
300
301    def test_islower(self):
302        string_tests.MixinStrUnicodeUserStringTest.test_islower(self)
303        self.checkequalnofix(False, u'\u1FFc', 'islower')
304
305    @requires_wide_build
306    def test_islower_non_bmp(self):
307        # non-BMP, uppercase
308        self.assertFalse(u'\U00010401'.islower())
309        self.assertFalse(u'\U00010427'.islower())
310        # non-BMP, lowercase
311        self.assertTrue(u'\U00010429'.islower())
312        self.assertTrue(u'\U0001044E'.islower())
313        # non-BMP, non-cased
314        self.assertFalse(u'\U0001F40D'.islower())
315        self.assertFalse(u'\U0001F46F'.islower())
316
317    def test_isupper(self):
318        string_tests.MixinStrUnicodeUserStringTest.test_isupper(self)
319        if not sys.platform.startswith('java'):
320            self.checkequalnofix(False, u'\u1FFc', 'isupper')
321
322    @requires_wide_build
323    def test_isupper_non_bmp(self):
324        # non-BMP, uppercase
325        self.assertTrue(u'\U00010401'.isupper())
326        self.assertTrue(u'\U00010427'.isupper())
327        # non-BMP, lowercase
328        self.assertFalse(u'\U00010429'.isupper())
329        self.assertFalse(u'\U0001044E'.isupper())
330        # non-BMP, non-cased
331        self.assertFalse(u'\U0001F40D'.isupper())
332        self.assertFalse(u'\U0001F46F'.isupper())
333
334    def test_istitle(self):
335        string_tests.MixinStrUnicodeUserStringTest.test_istitle(self)
336        self.checkequalnofix(True, u'\u1FFc', 'istitle')
337        self.checkequalnofix(True, u'Greek \u1FFcitlecases ...', 'istitle')
338
339    @requires_wide_build
340    def test_istitle_non_bmp(self):
341        # non-BMP, uppercase + lowercase
342        self.assertTrue(u'\U00010401\U00010429'.istitle())
343        self.assertTrue(u'\U00010427\U0001044E'.istitle())
344        # apparently there are no titlecased (Lt) non-BMP chars in Unicode 6
345        for ch in [u'\U00010429', u'\U0001044E', u'\U0001F40D', u'\U0001F46F']:
346            self.assertFalse(ch.istitle(), '{!r} is not title'.format(ch))
347
348    def test_isspace(self):
349        string_tests.MixinStrUnicodeUserStringTest.test_isspace(self)
350        self.checkequalnofix(True, u'\u2000', 'isspace')
351        self.checkequalnofix(True, u'\u200a', 'isspace')
352        self.checkequalnofix(False, u'\u2014', 'isspace')
353
354    @requires_wide_build
355    def test_isspace_non_bmp(self):
356        # apparently there are no non-BMP spaces chars in Unicode 6
357        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
358                   u'\U0001F40D', u'\U0001F46F']:
359            self.assertFalse(ch.isspace(), '{!r} is not space.'.format(ch))
360
361    @requires_wide_build
362    def test_isalnum_non_bmp(self):
363        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
364                   u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
365            self.assertTrue(ch.isalnum(), '{!r} is alnum.'.format(ch))
366
367    def test_isalpha(self):
368        string_tests.MixinStrUnicodeUserStringTest.test_isalpha(self)
369        self.checkequalnofix(True, u'\u1FFc', 'isalpha')
370
371    @requires_wide_build
372    def test_isalpha_non_bmp(self):
373        # non-BMP, cased
374        self.assertTrue(u'\U00010401'.isalpha())
375        self.assertTrue(u'\U00010427'.isalpha())
376        self.assertTrue(u'\U00010429'.isalpha())
377        self.assertTrue(u'\U0001044E'.isalpha())
378        # non-BMP, non-cased
379        self.assertFalse(u'\U0001F40D'.isalpha())
380        self.assertFalse(u'\U0001F46F'.isalpha())
381
382    def test_isdecimal(self):
383        self.checkequalnofix(False, u'', 'isdecimal')
384        self.checkequalnofix(False, u'a', 'isdecimal')
385        self.checkequalnofix(True, u'0', 'isdecimal')
386        self.checkequalnofix(False, u'\u2460', 'isdecimal') # CIRCLED DIGIT ONE
387        self.checkequalnofix(False, u'\xbc', 'isdecimal') # VULGAR FRACTION ONE QUARTER
388        self.checkequalnofix(True, u'\u0660', 'isdecimal') # ARABIC-INDIC DIGIT ZERO
389        self.checkequalnofix(True, u'0123456789', 'isdecimal')
390        self.checkequalnofix(False, u'0123456789a', 'isdecimal')
391
392        self.checkraises(TypeError, 'abc', 'isdecimal', 42)
393
394    @requires_wide_build
395    def test_isdecimal_non_bmp(self):
396        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
397                   u'\U0001F40D', u'\U0001F46F', u'\U00011065', u'\U0001F107']:
398            self.assertFalse(ch.isdecimal(), '{!r} is not decimal.'.format(ch))
399        for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0']:
400            self.assertTrue(ch.isdecimal(), '{!r} is decimal.'.format(ch))
401
402    def test_isdigit(self):
403        string_tests.MixinStrUnicodeUserStringTest.test_isdigit(self)
404        self.checkequalnofix(True, u'\u2460', 'isdigit')
405        self.checkequalnofix(False, u'\xbc', 'isdigit')
406        self.checkequalnofix(True, u'\u0660', 'isdigit')
407
408    @requires_wide_build
409    def test_isdigit_non_bmp(self):
410        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
411                   u'\U0001F40D', u'\U0001F46F', u'\U00011065']:
412            self.assertFalse(ch.isdigit(), '{!r} is not a digit.'.format(ch))
413        for ch in [u'\U0001D7F6', u'\U000104A0', u'\U000104A0', u'\U0001F107']:
414            self.assertTrue(ch.isdigit(), '{!r} is a digit.'.format(ch))
415
416    def test_isnumeric(self):
417        self.checkequalnofix(False, u'', 'isnumeric')
418        self.checkequalnofix(False, u'a', 'isnumeric')
419        self.checkequalnofix(True, u'0', 'isnumeric')
420        self.checkequalnofix(True, u'\u2460', 'isnumeric')
421        self.checkequalnofix(True, u'\xbc', 'isnumeric')
422        self.checkequalnofix(True, u'\u0660', 'isnumeric')
423        self.checkequalnofix(True, u'0123456789', 'isnumeric')
424        self.checkequalnofix(False, u'0123456789a', 'isnumeric')
425
426        self.assertRaises(TypeError, u"abc".isnumeric, 42)
427
428    @requires_wide_build
429    def test_isnumeric_non_bmp(self):
430        for ch in [u'\U00010401', u'\U00010427', u'\U00010429', u'\U0001044E',
431                   u'\U0001F40D', u'\U0001F46F']:
432            self.assertFalse(ch.isnumeric(), '{!r} is not numeric.'.format(ch))
433        for ch in [u'\U00010107', u'\U0001D7F6', u'\U00023b1b',
434                   u'\U000104A0', u'\U0001F107']:
435            self.assertTrue(ch.isnumeric(), '{!r} is numeric.'.format(ch))
436
437    @requires_wide_build
438    def test_surrogates(self):
439        # this test actually passes on narrow too, but it's just by accident.
440        # Surrogates are seen as non-cased chars, so u'X\uD800X' is as
441        # uppercase as 'X X'
442        for s in (u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
443                  u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
444            self.assertTrue(s.islower())
445            self.assertFalse(s.isupper())
446            self.assertFalse(s.istitle())
447        for s in (u'A\uD800B\uDFFF', u'A\uDFFFB\uD800',
448                  u'A\uD800B\uDFFFA', u'A\uDFFFB\uD800A'):
449            self.assertFalse(s.islower())
450            self.assertTrue(s.isupper())
451            self.assertTrue(s.istitle())
452
453        for meth_name in ('islower', 'isupper', 'istitle'):
454            meth = getattr(unicode, meth_name)
455            for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF'):
456                self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
457
458        for meth_name in ('isalpha', 'isalnum', 'isdigit', 'isspace',
459                          'isdecimal', 'isnumeric'):
460            meth = getattr(unicode, meth_name)
461            for s in (u'\uD800', u'\uDFFF', u'\uD800\uD800', u'\uDFFF\uDFFF',
462                      u'a\uD800b\uDFFF', u'a\uDFFFb\uD800',
463                      u'a\uD800b\uDFFFa', u'a\uDFFFb\uD800a'):
464                self.assertFalse(meth(s), '%r.%s() is False' % (s, meth_name))
465
466
467    @requires_wide_build
468    def test_lower(self):
469        string_tests.CommonTest.test_lower(self)
470        self.assertEqual(u'\U00010427'.lower(), u'\U0001044F')
471        self.assertEqual(u'\U00010427\U00010427'.lower(),
472                         u'\U0001044F\U0001044F')
473        self.assertEqual(u'\U00010427\U0001044F'.lower(),
474                         u'\U0001044F\U0001044F')
475        self.assertEqual(u'X\U00010427x\U0001044F'.lower(),
476                         u'x\U0001044Fx\U0001044F')
477
478    @requires_wide_build
479    def test_upper(self):
480        string_tests.CommonTest.test_upper(self)
481        self.assertEqual(u'\U0001044F'.upper(), u'\U00010427')
482        self.assertEqual(u'\U0001044F\U0001044F'.upper(),
483                         u'\U00010427\U00010427')
484        self.assertEqual(u'\U00010427\U0001044F'.upper(),
485                         u'\U00010427\U00010427')
486        self.assertEqual(u'X\U00010427x\U0001044F'.upper(),
487                         u'X\U00010427X\U00010427')
488
489    @requires_wide_build
490    def test_capitalize_wide_build(self):
491        string_tests.CommonTest.test_capitalize(self)
492        self.assertEqual(u'\U0001044F'.capitalize(), u'\U00010427')
493        self.assertEqual(u'\U0001044F\U0001044F'.capitalize(),
494                         u'\U00010427\U0001044F')
495        self.assertEqual(u'\U00010427\U0001044F'.capitalize(),
496                         u'\U00010427\U0001044F')
497        self.assertEqual(u'\U0001044F\U00010427'.capitalize(),
498                         u'\U00010427\U0001044F')
499        self.assertEqual(u'X\U00010427x\U0001044F'.capitalize(),
500                         u'X\U0001044Fx\U0001044F')
501
502    @requires_wide_build
503    def test_title(self):
504        string_tests.MixinStrUnicodeUserStringTest.test_title(self)
505        self.assertEqual(u'\U0001044F'.title(), u'\U00010427')
506        self.assertEqual(u'\U0001044F\U0001044F'.title(),
507                         u'\U00010427\U0001044F')
508        self.assertEqual(u'\U0001044F\U0001044F \U0001044F\U0001044F'.title(),
509                         u'\U00010427\U0001044F \U00010427\U0001044F')
510        self.assertEqual(u'\U00010427\U0001044F \U00010427\U0001044F'.title(),
511                         u'\U00010427\U0001044F \U00010427\U0001044F')
512        self.assertEqual(u'\U0001044F\U00010427 \U0001044F\U00010427'.title(),
513                         u'\U00010427\U0001044F \U00010427\U0001044F')
514        self.assertEqual(u'X\U00010427x\U0001044F X\U00010427x\U0001044F'.title(),
515                         u'X\U0001044Fx\U0001044F X\U0001044Fx\U0001044F')
516
517    @requires_wide_build
518    def test_swapcase(self):
519        string_tests.CommonTest.test_swapcase(self)
520        self.assertEqual(u'\U0001044F'.swapcase(), u'\U00010427')
521        self.assertEqual(u'\U00010427'.swapcase(), u'\U0001044F')
522        self.assertEqual(u'\U0001044F\U0001044F'.swapcase(),
523                         u'\U00010427\U00010427')
524        self.assertEqual(u'\U00010427\U0001044F'.swapcase(),
525                         u'\U0001044F\U00010427')
526        self.assertEqual(u'\U0001044F\U00010427'.swapcase(),
527                         u'\U00010427\U0001044F')
528        self.assertEqual(u'X\U00010427x\U0001044F'.swapcase(),
529                         u'x\U0001044FX\U00010427')
530
531    def test_contains(self):
532        # Testing Unicode contains method
533        self.assertIn('a', u'abdb')
534        self.assertIn('a', u'bdab')
535        self.assertIn('a', u'bdaba')
536        self.assertIn('a', u'bdba')
537        self.assertIn('a', u'bdba')
538        self.assertIn(u'a', u'bdba')
539        self.assertNotIn(u'a', u'bdb')
540        self.assertNotIn(u'a', 'bdb')
541        self.assertIn(u'a', 'bdba')
542        self.assertIn(u'a', ('a',1,None))
543        self.assertIn(u'a', (1,None,'a'))
544        self.assertIn(u'a', (1,None,u'a'))
545        self.assertIn('a', ('a',1,None))
546        self.assertIn('a', (1,None,'a'))
547        self.assertIn('a', (1,None,u'a'))
548        self.assertNotIn('a', ('x',1,u'y'))
549        self.assertNotIn('a', ('x',1,None))
550        self.assertNotIn(u'abcd', u'abcxxxx')
551        self.assertIn(u'ab', u'abcd')
552        self.assertIn('ab', u'abc')
553        self.assertIn(u'ab', 'abc')
554        self.assertIn(u'ab', (1,None,u'ab'))
555        self.assertIn(u'', u'abc')
556        self.assertIn('', u'abc')
557
558        # If the following fails either
559        # the contains operator does not propagate UnicodeErrors or
560        # someone has changed the default encoding
561        self.assertRaises(UnicodeDecodeError, 'g\xe2teau'.__contains__, u'\xe2')
562        self.assertRaises(UnicodeDecodeError, u'g\xe2teau'.__contains__, '\xe2')
563
564        self.assertIn(u'', '')
565        self.assertIn('', u'')
566        self.assertIn(u'', u'')
567        self.assertIn(u'', 'abc')
568        self.assertIn('', u'abc')
569        self.assertIn(u'', u'abc')
570        self.assertNotIn(u'\0', 'abc')
571        self.assertNotIn('\0', u'abc')
572        self.assertNotIn(u'\0', u'abc')
573        self.assertIn(u'\0', '\0abc')
574        self.assertIn('\0', u'\0abc')
575        self.assertIn(u'\0', u'\0abc')
576        self.assertIn(u'\0', 'abc\0')
577        self.assertIn('\0', u'abc\0')
578        self.assertIn(u'\0', u'abc\0')
579        self.assertIn(u'a', '\0abc')
580        self.assertIn('a', u'\0abc')
581        self.assertIn(u'a', u'\0abc')
582        self.assertIn(u'asdf', 'asdf')
583        self.assertIn('asdf', u'asdf')
584        self.assertIn(u'asdf', u'asdf')
585        self.assertNotIn(u'asdf', 'asd')
586        self.assertNotIn('asdf', u'asd')
587        self.assertNotIn(u'asdf', u'asd')
588        self.assertNotIn(u'asdf', '')
589        self.assertNotIn('asdf', u'')
590        self.assertNotIn(u'asdf', u'')
591
592        self.assertRaises(TypeError, u"abc".__contains__)
593        self.assertRaises(TypeError, u"abc".__contains__, object())
594
595    def test_formatting(self):
596        string_tests.MixinStrUnicodeUserStringTest.test_formatting(self)
597        # Testing Unicode formatting strings...
598        self.assertEqual(u"%s, %s" % (u"abc", "abc"), u'abc, abc')
599        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, 2, 3), u'abc, abc, 1, 2.000000,  3.00')
600        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", 1, -2, 3), u'abc, abc, 1, -2.000000,  3.00')
601        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.5), u'abc, abc, -1, -2.000000,  3.50')
602        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 3.57), u'abc, abc, -1, -2.000000,  3.57')
603        self.assertEqual(u"%s, %s, %i, %f, %5.2f" % (u"abc", "abc", -1, -2, 1003.57), u'abc, abc, -1, -2.000000, 1003.57')
604        if not sys.platform.startswith('java'):
605            self.assertEqual(u"%r, %r" % (u"abc", "abc"), u"u'abc', 'abc'")
606        self.assertEqual(u"%(x)s, %(y)s" % {'x':u"abc", 'y':"def"}, u'abc, def')
607        self.assertEqual(u"%(x)s, %(\xfc)s" % {'x':u"abc", u'\xfc':"def"}, u'abc, def')
608
609        self.assertEqual(u'%c' % 0x1234, u'\u1234')
610        self.assertRaises(OverflowError, u"%c".__mod__, (sys.maxunicode+1,))
611        self.assertRaises(ValueError, u"%.1\u1032f".__mod__, (1.0/3))
612
613        for num in range(0x00,0x80):
614            char = chr(num)
615            self.assertEqual(u"%c" % char, unicode(char))
616            self.assertEqual(u"%c" % num, unicode(char))
617            self.assertTrue(char == u"%c" % char)
618            self.assertTrue(char == u"%c" % num)
619        # Issue 7649
620        for num in range(0x80,0x100):
621            uchar = unichr(num)
622            self.assertEqual(uchar, u"%c" % num)   # works only with ints
623            self.assertEqual(uchar, u"%c" % uchar) # and unicode chars
624            # the implicit decoding should fail for non-ascii chars
625            self.assertRaises(UnicodeDecodeError, u"%c".__mod__, chr(num))
626            self.assertRaises(UnicodeDecodeError, u"%s".__mod__, chr(num))
627
628        # formatting jobs delegated from the string implementation:
629        self.assertEqual('...%(foo)s...' % {'foo':u"abc"}, u'...abc...')
630        self.assertEqual('...%(foo)s...' % {'foo':"abc"}, '...abc...')
631        self.assertEqual('...%(foo)s...' % {u'foo':"abc"}, '...abc...')
632        self.assertEqual('...%(foo)s...' % {u'foo':u"abc"}, u'...abc...')
633        self.assertEqual('...%(foo)s...' % {u'foo':u"abc",'def':123},  u'...abc...')
634        self.assertEqual('...%(foo)s...' % {u'foo':u"abc",u'def':123}, u'...abc...')
635        self.assertEqual('...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...1...2...3...abc...')
636        self.assertEqual('...%%...%%s...%s...%s...%s...%s...' % (1,2,3,u"abc"), u'...%...%s...1...2...3...abc...')
637        self.assertEqual('...%s...' % u"abc", u'...abc...')
638        self.assertEqual('%*s' % (5,u'abc',), u'  abc')
639        self.assertEqual('%*s' % (-5,u'abc',), u'abc  ')
640        self.assertEqual('%*.*s' % (5,2,u'abc',), u'   ab')
641        self.assertEqual('%*.*s' % (5,3,u'abc',), u'  abc')
642        self.assertEqual('%i %*.*s' % (10, 5,3,u'abc',), u'10   abc')
643        self.assertEqual('%i%s %*.*s' % (10, 3, 5, 3, u'abc',), u'103   abc')
644        self.assertEqual('%c' % u'a', u'a')
645        class Wrapper:
646            def __str__(self):
647                return u'\u1234'
648        self.assertEqual('%s' % Wrapper(), u'\u1234')
649
650    def test_formatting_huge_precision(self):
651        format_string = u"%.{}f".format(sys.maxsize + 1)
652        with self.assertRaises(ValueError):
653            result = format_string % 2.34
654
655    @test_support.cpython_only
656    def test_formatting_huge_precision_c_limits(self):
657        from _testcapi import INT_MAX
658        format_string = u"%.{}f".format(INT_MAX + 1)
659        with self.assertRaises(ValueError):
660            result = format_string % 2.34
661
662    def test_formatting_huge_width(self):
663        format_string = u"%{}f".format(sys.maxsize + 1)
664        with self.assertRaises(ValueError):
665            result = format_string % 2.34
666
667    def test_startswith_endswith_errors(self):
668        for meth in (u'foo'.startswith, u'foo'.endswith):
669            with self.assertRaises(UnicodeDecodeError):
670                meth('\xff')
671            with self.assertRaises(TypeError) as cm:
672                meth(['f'])
673            exc = str(cm.exception)
674            self.assertIn('unicode', exc)
675            self.assertIn('str', exc)
676            self.assertIn('tuple', exc)
677
678    @test_support.run_with_locale('LC_ALL', 'de_DE', 'fr_FR')
679    def test_format_float(self):
680        # should not format with a comma, but always with C locale
681        self.assertEqual(u'1.0', u'%.1f' % 1.0)
682
683    def test_constructor(self):
684        # unicode(obj) tests (this maps to PyObject_Unicode() at C level)
685
686        self.assertEqual(
687            unicode(u'unicode remains unicode'),
688            u'unicode remains unicode'
689        )
690
691        self.assertEqual(
692            unicode(UnicodeSubclass('unicode subclass becomes unicode')),
693            u'unicode subclass becomes unicode'
694        )
695
696        self.assertEqual(
697            unicode('strings are converted to unicode'),
698            u'strings are converted to unicode'
699        )
700
701        class UnicodeCompat:
702            def __init__(self, x):
703                self.x = x
704            def __unicode__(self):
705                return self.x
706
707        self.assertEqual(
708            unicode(UnicodeCompat('__unicode__ compatible objects are recognized')),
709            u'__unicode__ compatible objects are recognized')
710
711        class StringCompat:
712            def __init__(self, x):
713                self.x = x
714            def __str__(self):
715                return self.x
716
717        self.assertEqual(
718            unicode(StringCompat('__str__ compatible objects are recognized')),
719            u'__str__ compatible objects are recognized'
720        )
721
722        # unicode(obj) is compatible to str():
723
724        o = StringCompat('unicode(obj) is compatible to str()')
725        self.assertEqual(unicode(o), u'unicode(obj) is compatible to str()')
726        self.assertEqual(str(o), 'unicode(obj) is compatible to str()')
727
728        # %-formatting and .__unicode__()
729        self.assertEqual(u'%s' %
730                         UnicodeCompat(u"u'%s' % obj uses obj.__unicode__()"),
731                         u"u'%s' % obj uses obj.__unicode__()")
732        self.assertEqual(u'%s' %
733                         UnicodeCompat(u"u'%s' % obj falls back to obj.__str__()"),
734                         u"u'%s' % obj falls back to obj.__str__()")
735
736        for obj in (123, 123.45, 123L):
737            self.assertEqual(unicode(obj), unicode(str(obj)))
738
739        # unicode(obj, encoding, error) tests (this maps to
740        # PyUnicode_FromEncodedObject() at C level)
741
742        if not sys.platform.startswith('java'):
743            self.assertRaises(
744                TypeError,
745                unicode,
746                u'decoding unicode is not supported',
747                'utf-8',
748                'strict'
749            )
750
751        self.assertEqual(
752            unicode('strings are decoded to unicode', 'utf-8', 'strict'),
753            u'strings are decoded to unicode'
754        )
755
756        if not sys.platform.startswith('java'):
757            with test_support.check_py3k_warnings():
758                buf = buffer('character buffers are decoded to unicode')
759            self.assertEqual(
760                unicode(
761                    buf,
762                    'utf-8',
763                    'strict'
764                ),
765                u'character buffers are decoded to unicode'
766            )
767
768        self.assertRaises(TypeError, unicode, 42, 42, 42)
769
770    def test_codecs_utf7(self):
771        utfTests = [
772            (u'A\u2262\u0391.', 'A+ImIDkQ.'),             # RFC2152 example
773            (u'Hi Mom -\u263a-!', 'Hi Mom -+Jjo--!'),     # RFC2152 example
774            (u'\u65E5\u672C\u8A9E', '+ZeVnLIqe-'),        # RFC2152 example
775            (u'Item 3 is \u00a31.', 'Item 3 is +AKM-1.'), # RFC2152 example
776            (u'+', '+-'),
777            (u'+-', '+--'),
778            (u'+?', '+-?'),
779            (u'\?', '+AFw?'),
780            (u'+?', '+-?'),
781            (ur'\\?', '+AFwAXA?'),
782            (ur'\\\?', '+AFwAXABc?'),
783            (ur'++--', '+-+---'),
784            (u'\U000abcde', '+2m/c3g-'),                  # surrogate pairs
785            (u'/', '/'),
786        ]
787
788        for (x, y) in utfTests:
789            self.assertEqual(x.encode('utf-7'), y)
790
791        # Unpaired surrogates are passed through
792        self.assertEqual(u'\uD801'.encode('utf-7'), '+2AE-')
793        self.assertEqual(u'\uD801x'.encode('utf-7'), '+2AE-x')
794        self.assertEqual(u'\uDC01'.encode('utf-7'), '+3AE-')
795        self.assertEqual(u'\uDC01x'.encode('utf-7'), '+3AE-x')
796        self.assertEqual('+2AE-'.decode('utf-7'), u'\uD801')
797        self.assertEqual('+2AE-x'.decode('utf-7'), u'\uD801x')
798        self.assertEqual('+3AE-'.decode('utf-7'), u'\uDC01')
799        self.assertEqual('+3AE-x'.decode('utf-7'), u'\uDC01x')
800
801        self.assertEqual(u'\uD801\U000abcde'.encode('utf-7'), '+2AHab9ze-')
802        self.assertEqual('+2AHab9ze-'.decode('utf-7'), u'\uD801\U000abcde')
803
804        # Direct encoded characters
805        set_d = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?"
806        # Optional direct characters
807        set_o = '!"#$%&*;<=>@[]^_`{|}'
808        for c in set_d:
809            self.assertEqual(c.encode('utf7'), c.encode('ascii'))
810            self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
811            self.assertTrue(c == c.encode('ascii').decode('utf7'))
812        for c in set_o:
813            self.assertEqual(c.encode('ascii').decode('utf7'), unicode(c))
814            self.assertTrue(c == c.encode('ascii').decode('utf7'))
815
816    def test_codecs_utf8(self):
817        self.assertEqual(u''.encode('utf-8'), '')
818        self.assertEqual(u'\u20ac'.encode('utf-8'), '\xe2\x82\xac')
819        self.assertEqual(u'\ud800\udc02'.encode('utf-8'), '\xf0\x90\x80\x82')
820        self.assertEqual(u'\ud84d\udc56'.encode('utf-8'), '\xf0\xa3\x91\x96')
821        self.assertEqual(u'\ud800'.encode('utf-8'), '\xed\xa0\x80')
822        self.assertEqual(u'\udc00'.encode('utf-8'), '\xed\xb0\x80')
823        self.assertEqual(
824            (u'\ud800\udc02'*1000).encode('utf-8'),
825            '\xf0\x90\x80\x82'*1000
826        )
827        self.assertEqual(
828            u'\u6b63\u78ba\u306b\u8a00\u3046\u3068\u7ffb\u8a33\u306f'
829            u'\u3055\u308c\u3066\u3044\u307e\u305b\u3093\u3002\u4e00'
830            u'\u90e8\u306f\u30c9\u30a4\u30c4\u8a9e\u3067\u3059\u304c'
831            u'\u3001\u3042\u3068\u306f\u3067\u305f\u3089\u3081\u3067'
832            u'\u3059\u3002\u5b9f\u969b\u306b\u306f\u300cWenn ist das'
833            u' Nunstuck git und'.encode('utf-8'),
834            '\xe6\xad\xa3\xe7\xa2\xba\xe3\x81\xab\xe8\xa8\x80\xe3\x81'
835            '\x86\xe3\x81\xa8\xe7\xbf\xbb\xe8\xa8\xb3\xe3\x81\xaf\xe3'
836            '\x81\x95\xe3\x82\x8c\xe3\x81\xa6\xe3\x81\x84\xe3\x81\xbe'
837            '\xe3\x81\x9b\xe3\x82\x93\xe3\x80\x82\xe4\xb8\x80\xe9\x83'
838            '\xa8\xe3\x81\xaf\xe3\x83\x89\xe3\x82\xa4\xe3\x83\x84\xe8'
839            '\xaa\x9e\xe3\x81\xa7\xe3\x81\x99\xe3\x81\x8c\xe3\x80\x81'
840            '\xe3\x81\x82\xe3\x81\xa8\xe3\x81\xaf\xe3\x81\xa7\xe3\x81'
841            '\x9f\xe3\x82\x89\xe3\x82\x81\xe3\x81\xa7\xe3\x81\x99\xe3'
842            '\x80\x82\xe5\xae\x9f\xe9\x9a\x9b\xe3\x81\xab\xe3\x81\xaf'
843            '\xe3\x80\x8cWenn ist das Nunstuck git und'
844        )
845
846        # UTF-8 specific decoding tests
847        self.assertEqual(unicode('\xf0\xa3\x91\x96', 'utf-8'), u'\U00023456')
848        self.assertEqual(unicode('\xf0\x90\x80\x82', 'utf-8'), u'\U00010002')
849        self.assertEqual(unicode('\xe2\x82\xac', 'utf-8'), u'\u20ac')
850
851        # Other possible utf-8 test cases:
852        # * strict decoding testing for all of the
853        #   UTF8_ERROR cases in PyUnicode_DecodeUTF8
854
855    def test_utf8_decode_valid_sequences(self):
856        sequences = [
857            # single byte
858            ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
859            # 2 bytes
860            ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
861            # 3 bytes
862            ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
863            ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
864            # 4 bytes
865            ('\xF0\x90\x80\x80', u'\U00010000'),
866            ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
867        ]
868        for seq, res in sequences:
869            self.assertEqual(seq.decode('utf-8'), res)
870
871        for ch in map(unichr, range(0, sys.maxunicode)):
872            self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
873
874    def test_utf8_decode_invalid_sequences(self):
875        # continuation bytes in a sequence of 2, 3, or 4 bytes
876        continuation_bytes = map(chr, range(0x80, 0xC0))
877        # start bytes of a 2-byte sequence equivalent to code points < 0x7F
878        invalid_2B_seq_start_bytes = map(chr, range(0xC0, 0xC2))
879        # start bytes of a 4-byte sequence equivalent to code points > 0x10FFFF
880        invalid_4B_seq_start_bytes = map(chr, range(0xF5, 0xF8))
881        invalid_start_bytes = (
882            continuation_bytes + invalid_2B_seq_start_bytes +
883            invalid_4B_seq_start_bytes + map(chr, range(0xF7, 0x100))
884        )
885
886        for byte in invalid_start_bytes:
887            self.assertRaises(UnicodeDecodeError, byte.decode, 'utf-8')
888
889        for sb in invalid_2B_seq_start_bytes:
890            for cb in continuation_bytes:
891                self.assertRaises(UnicodeDecodeError, (sb+cb).decode, 'utf-8')
892
893        for sb in invalid_4B_seq_start_bytes:
894            for cb1 in continuation_bytes[:3]:
895                for cb3 in continuation_bytes[:3]:
896                    self.assertRaises(UnicodeDecodeError,
897                                      (sb+cb1+'\x80'+cb3).decode, 'utf-8')
898
899        for cb in map(chr, range(0x80, 0xA0)):
900            self.assertRaises(UnicodeDecodeError,
901                              ('\xE0'+cb+'\x80').decode, 'utf-8')
902            self.assertRaises(UnicodeDecodeError,
903                              ('\xE0'+cb+'\xBF').decode, 'utf-8')
904        # XXX: surrogates shouldn't be valid UTF-8!
905        # see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
906        # (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
907        #for cb in map(chr, range(0xA0, 0xC0)):
908            #self.assertRaises(UnicodeDecodeError,
909                              #('\xED'+cb+'\x80').decode, 'utf-8')
910            #self.assertRaises(UnicodeDecodeError,
911                              #('\xED'+cb+'\xBF').decode, 'utf-8')
912        # but since they are valid on Python 2 add a test for that:
913        for cb, surrogate in zip(map(chr, range(0xA0, 0xC0)),
914                                 map(unichr, range(0xd800, 0xe000, 64))):
915            encoded = '\xED'+cb+'\x80'
916            self.assertEqual(encoded.decode('utf-8'), surrogate)
917            self.assertEqual(surrogate.encode('utf-8'), encoded)
918
919        for cb in map(chr, range(0x80, 0x90)):
920            self.assertRaises(UnicodeDecodeError,
921                              ('\xF0'+cb+'\x80\x80').decode, 'utf-8')
922            self.assertRaises(UnicodeDecodeError,
923                              ('\xF0'+cb+'\xBF\xBF').decode, 'utf-8')
924        for cb in map(chr, range(0x90, 0xC0)):
925            self.assertRaises(UnicodeDecodeError,
926                              ('\xF4'+cb+'\x80\x80').decode, 'utf-8')
927            self.assertRaises(UnicodeDecodeError,
928                              ('\xF4'+cb+'\xBF\xBF').decode, 'utf-8')
929
    def test_issue8271(self):
        # Issue #8271: during the decoding of an invalid UTF-8 byte sequence,
        # only the start byte and the continuation byte(s) are now considered
        # invalid, instead of the number of bytes specified by the start byte.
        # See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (page 95,
        # table 3-8, Row 2) for more information about the algorithm used.
        #
        # Each entry below pairs a malformed byte string with the text that
        # decoding it with the 'replace' error handler is expected to give.
        # In the comments, "cb" stands for "continuation byte".
        FFFD = u'\ufffd'
        sequences = [
            # invalid start bytes
            ('\x80', FFFD), # continuation byte
            ('\x80\x80', FFFD*2), # 2 continuation bytes
            ('\xc0', FFFD),
            ('\xc0\xc0', FFFD*2),
            ('\xc1', FFFD),
            ('\xc1\xc0', FFFD*2),
            ('\xc0\xc1', FFFD*2),
            # with start byte of a 2-byte sequence
            ('\xc2', FFFD), # only the start byte
            ('\xc2\xc2', FFFD*2), # 2 start bytes
            ('\xc2\xc2\xc2', FFFD*3), # 3 start bytes
            ('\xc2\x41', FFFD+'A'), # invalid continuation byte
            # with start byte of a 3-byte sequence
            ('\xe1', FFFD), # only the start byte
            ('\xe1\xe1', FFFD*2), # 2 start bytes
            ('\xe1\xe1\xe1', FFFD*3), # 3 start bytes
            ('\xe1\xe1\xe1\xe1', FFFD*4), # 4 start bytes
            ('\xe1\x80', FFFD), # only 1 continuation byte
            ('\xe1\x41', FFFD+'A'), # invalid continuation byte
            ('\xe1\x41\x80', FFFD+'A'+FFFD), # invalid cb followed by valid cb
            ('\xe1\x41\x41', FFFD+'AA'), # 2 invalid continuation bytes
            ('\xe1\x80\x41', FFFD+'A'), # only 1 valid continuation byte
            ('\xe1\x80\xe1\x41', FFFD*2+'A'), # 1 valid and the other invalid
            ('\xe1\x41\xe1\x80', FFFD+'A'+FFFD), # 1 invalid and the other valid
            # with start byte of a 4-byte sequence
            ('\xf1', FFFD), # only the start byte
            ('\xf1\xf1', FFFD*2), # 2 start bytes
            ('\xf1\xf1\xf1', FFFD*3), # 3 start bytes
            ('\xf1\xf1\xf1\xf1', FFFD*4), # 4 start bytes
            ('\xf1\xf1\xf1\xf1\xf1', FFFD*5), # 5 start bytes
            ('\xf1\x80', FFFD), # only 1 continuation byte
            ('\xf1\x80\x80', FFFD), # only 2 continuation bytes
            ('\xf1\x80\x41', FFFD+'A'), # 1 valid cb and 1 invalid
            ('\xf1\x80\x41\x41', FFFD+'AA'), # 1 valid cb and 2 invalid
            ('\xf1\x80\x80\x41', FFFD+'A'), # 2 valid cb and 1 invalid
            ('\xf1\x41\x80', FFFD+'A'+FFFD), # 1 invalid cb and 1 valid
            ('\xf1\x41\x80\x80', FFFD+'A'+FFFD*2), # 1 invalid cb and 2 valid
            ('\xf1\x41\x80\x41', FFFD+'A'+FFFD+'A'), # 2 invalid cb and 1 valid
            ('\xf1\x41\x41\x80', FFFD+'AA'+FFFD), # 2 invalid cb and 1 valid
            ('\xf1\x41\xf1\x80', FFFD+'A'+FFFD),
            ('\xf1\x41\x80\xf1', FFFD+'A'+FFFD*2),
            ('\xf1\xf1\x80\x41', FFFD*2+'A'),
            ('\xf1\x41\xf1\xf1', FFFD+'A'+FFFD*2),
            # with invalid start byte of a 4-byte sequence (rfc2279)
            ('\xf5', FFFD), # only the start byte
            ('\xf5\xf5', FFFD*2), # 2 start bytes
            ('\xf5\x80', FFFD*2), # only 1 continuation byte
            ('\xf5\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xf5\x80\x80\x80', FFFD*4), # 3 continuation bytes
            ('\xf5\x80\x41', FFFD*2+'A'), #  1 valid cb and 1 invalid
            ('\xf5\x80\x41\xf5', FFFD*2+'A'+FFFD),
            ('\xf5\x41\x80\x80\x41', FFFD+'A'+FFFD*2+'A'),
            # with invalid start byte of a 5-byte sequence (rfc2279)
            ('\xf8', FFFD), # only the start byte
            ('\xf8\xf8', FFFD*2), # 2 start bytes
            ('\xf8\x80', FFFD*2), # only one continuation byte
            ('\xf8\x80\x41', FFFD*2 + 'A'), # 1 valid cb and 1 invalid
            ('\xf8\x80\x80\x80\x80', FFFD*5), # invalid 5 bytes seq with 5 bytes
            # with invalid start byte of a 6-byte sequence (rfc2279)
            ('\xfc', FFFD), # only the start byte
            ('\xfc\xfc', FFFD*2), # 2 start bytes
            ('\xfc\x80\x80', FFFD*3), # only 2 continuation bytes
            ('\xfc\x80\x80\x80\x80\x80', FFFD*6), # 6 continuation bytes
            # invalid start byte
            ('\xfe', FFFD),
            ('\xfe\x80\x80', FFFD*3),
            # other sequences
            ('\xf1\x80\x41\x42\x43', u'\ufffd\x41\x42\x43'),
            ('\xf1\x80\xff\x42\x43', u'\ufffd\ufffd\x42\x43'),
            ('\xf1\x80\xc2\x81\x43', u'\ufffd\x81\x43'),
            ('\x61\xF1\x80\x80\xE1\x80\xC2\x62\x80\x63\x80\xBF\x64',
             u'\x61\uFFFD\uFFFD\uFFFD\x62\uFFFD\x63\uFFFD\uFFFD\x64'),
        ]
        # NOTE: the index n produced by enumerate() is not used in the loop.
        for n, (seq, res) in enumerate(sequences):
            # every sequence must be refused by strict decoding
            self.assertRaises(UnicodeDecodeError, seq.decode, 'utf-8', 'strict')
            self.assertEqual(seq.decode('utf-8', 'replace'), res)
            # a valid byte appended after the sequence must still be decoded
            self.assertEqual((seq+'b').decode('utf-8', 'replace'), res+'b')
            # 'ignore' gives the same text without the replacement characters
            self.assertEqual(seq.decode('utf-8', 'ignore'),
                             res.replace(u'\uFFFD', ''))
1018
1019    def test_codecs_idna(self):
1020        # Test whether trailing dot is preserved
1021        self.assertEqual(u"www.python.org.".encode("idna"), "www.python.org.")
1022
1023    def test_codecs_errors(self):
1024        # Error handling (encoding)
1025        self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii')
1026        self.assertRaises(UnicodeError, u'Andr\202 x'.encode, 'ascii','strict')
1027        self.assertEqual(u'Andr\202 x'.encode('ascii','ignore'), "Andr x")
1028        self.assertEqual(u'Andr\202 x'.encode('ascii','replace'), "Andr? x")
1029        self.assertEqual(u'Andr\202 x'.encode('ascii', 'replace'),
1030                         u'Andr\202 x'.encode('ascii', errors='replace'))
1031        self.assertEqual(u'Andr\202 x'.encode('ascii', 'ignore'),
1032                         u'Andr\202 x'.encode(encoding='ascii', errors='ignore'))
1033
1034        # Error handling (decoding)
1035        self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii')
1036        self.assertRaises(UnicodeError, unicode, 'Andr\202 x', 'ascii','strict')
1037        self.assertEqual(unicode('Andr\202 x','ascii','ignore'), u"Andr x")
1038        self.assertEqual(unicode('Andr\202 x','ascii','replace'), u'Andr\uFFFD x')
1039        self.assertEqual(unicode('\202 x', 'ascii', 'replace'), u'\uFFFD x')
1040        with test_support.check_py3k_warnings():
1041            self.assertEqual(u'abcde'.decode('ascii', 'ignore'),
1042                             u'abcde'.decode('ascii', errors='ignore'))
1043        with test_support.check_py3k_warnings():
1044            self.assertEqual(u'abcde'.decode('ascii', 'replace'),
1045                             u'abcde'.decode(encoding='ascii', errors='replace'))
1046
1047        # Error handling (unknown character names)
1048        self.assertEqual("\\N{foo}xx".decode("unicode-escape", "ignore"), u"xx")
1049
1050        # Error handling (truncated escape sequence)
1051        self.assertRaises(UnicodeError, "\\".decode, "unicode-escape")
1052
1053        self.assertRaises(TypeError, "hello".decode, "test.unicode1")
1054        self.assertRaises(TypeError, unicode, "hello", "test.unicode2")
1055        self.assertRaises(TypeError, u"hello".encode, "test.unicode1")
1056        self.assertRaises(TypeError, u"hello".encode, "test.unicode2")
1057        # executes PyUnicode_Encode()
1058        import imp
1059        self.assertRaises(
1060            ImportError,
1061            imp.find_module,
1062            "non-existing module",
1063            [u"non-existing dir"]
1064        )
1065
1066        # Error handling (wrong arguments)
1067        self.assertRaises(TypeError, u"hello".encode, 42, 42, 42)
1068
1069        # Error handling (PyUnicode_EncodeDecimal())
1070        self.assertRaises(UnicodeError, int, u"\u0200")
1071
1072    def test_codecs(self):
1073        # Encoding
1074        self.assertEqual(u'hello'.encode('ascii'), 'hello')
1075        self.assertEqual(u'hello'.encode('utf-7'), 'hello')
1076        self.assertEqual(u'hello'.encode('utf-8'), 'hello')
1077        self.assertEqual(u'hello'.encode('utf8'), 'hello')
1078        self.assertEqual(u'hello'.encode('utf-16-le'), 'h\000e\000l\000l\000o\000')
1079        self.assertEqual(u'hello'.encode('utf-16-be'), '\000h\000e\000l\000l\000o')
1080        self.assertEqual(u'hello'.encode('latin-1'), 'hello')
1081
1082        # Roundtrip safety for BMP (just the first 1024 chars)
1083        for c in xrange(1024):
1084            u = unichr(c)
1085            for encoding in ('utf-7', 'utf-8', 'utf-16', 'utf-16-le',
1086                             'utf-16-be', 'raw_unicode_escape',
1087                             'unicode_escape', 'unicode_internal'):
1088                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1089
1090        # Roundtrip safety for BMP (just the first 256 chars)
1091        for c in xrange(256):
1092            u = unichr(c)
1093            for encoding in ('latin-1',):
1094                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1095
1096        # Roundtrip safety for BMP (just the first 128 chars)
1097        for c in xrange(128):
1098            u = unichr(c)
1099            for encoding in ('ascii',):
1100                self.assertEqual(unicode(u.encode(encoding),encoding), u)
1101
1102        # Roundtrip safety for non-BMP (just a few chars)
1103        u = u'\U00010001\U00020002\U00030003\U00040004\U00050005'
1104        for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
1105                         #'raw_unicode_escape',
1106                         'unicode_escape', 'unicode_internal'):
1107            self.assertEqual(unicode(u.encode(encoding),encoding), u)
1108
1109        # UTF-8 must be roundtrip safe for all UCS-2 code points
1110        # This excludes surrogates: in the full range, there would be
1111        # a surrogate pair (\udbff\udc00), which gets converted back
1112        # to a non-BMP character (\U0010fc00)
1113        u = u''.join(map(unichr, range(0,0xd800)+range(0xe000,0x10000)))
1114        for encoding in ('utf-8',):
1115            self.assertEqual(unicode(u.encode(encoding),encoding), u)
1116
1117    def test_codecs_charmap(self):
1118        # 0-127
1119        s = ''.join(map(chr, xrange(128)))
1120        for encoding in (
1121            'cp037', 'cp1026',
1122            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1123            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1124            'cp863', 'cp865', 'cp866',
1125            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1126            'iso8859_2', 'iso8859_3', 'iso8859_4', 'iso8859_5', 'iso8859_6',
1127            'iso8859_7', 'iso8859_9', 'koi8_r', 'latin_1',
1128            'mac_cyrillic', 'mac_latin2',
1129
1130            'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1131            'cp1256', 'cp1257', 'cp1258',
1132            'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1133
1134            'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1135            'cp1006', 'iso8859_8',
1136
1137            ### These have undefined mappings:
1138            #'cp424',
1139
1140            ### These fail the round-trip:
1141            #'cp875'
1142
1143            ):
1144            self.assertEqual(unicode(s, encoding).encode(encoding), s)
1145
1146        # 128-255
1147        s = ''.join(map(chr, xrange(128, 256)))
1148        for encoding in (
1149            'cp037', 'cp1026',
1150            'cp437', 'cp500', 'cp720', 'cp737', 'cp775', 'cp850',
1151            'cp852', 'cp855', 'cp858', 'cp860', 'cp861', 'cp862',
1152            'cp863', 'cp865', 'cp866',
1153            'iso8859_10', 'iso8859_13', 'iso8859_14', 'iso8859_15',
1154            'iso8859_2', 'iso8859_4', 'iso8859_5',
1155            'iso8859_9', 'koi8_r', 'latin_1',
1156            'mac_cyrillic', 'mac_latin2',
1157
1158            ### These have undefined mappings:
1159            #'cp1250', 'cp1251', 'cp1252', 'cp1253', 'cp1254', 'cp1255',
1160            #'cp1256', 'cp1257', 'cp1258',
1161            #'cp424', 'cp856', 'cp857', 'cp864', 'cp869', 'cp874',
1162            #'iso8859_3', 'iso8859_6', 'iso8859_7',
1163            #'mac_greek', 'mac_iceland','mac_roman', 'mac_turkish',
1164
1165            ### These fail the round-trip:
1166            #'cp1006', 'cp875', 'iso8859_8',
1167
1168            ):
1169            self.assertEqual(unicode(s, encoding).encode(encoding), s)
1170
1171    def test_concatenation(self):
1172        self.assertEqual((u"abc" u"def"), u"abcdef")
1173        self.assertEqual(("abc" u"def"), u"abcdef")
1174        self.assertEqual((u"abc" "def"), u"abcdef")
1175        self.assertEqual((u"abc" u"def" "ghi"), u"abcdefghi")
1176        self.assertEqual(("abc" "def" u"ghi"), u"abcdefghi")
1177
    def test_printing(self):
        # Smoke test for the print statement with unicode arguments: the
        # method contains no assertions, so it only fails if one of the
        # print statements raises.

        # file-like sink whose write() discards whatever it is given
        class BitBucket:
            def write(self, text):
                pass

        out = BitBucket()
        # unicode alone, and mixed with byte strings in either order
        print >>out, u'abc'
        print >>out, u'abc', u'def'
        print >>out, u'abc', 'def'
        print >>out, 'abc', u'def'
        # text ending in a newline, with and without a trailing comma
        print >>out, u'abc\n'
        print >>out, u'abc\n',
        print >>out, u'abc\n',
        print >>out, u'def\n'
        print >>out, u'def\n'
1193
1194    def test_ucs4(self):
1195        x = u'\U00100000'
1196        y = x.encode("raw-unicode-escape").decode("raw-unicode-escape")
1197        self.assertEqual(x, y)
1198
1199        y = r'\U00100000'
1200        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1201        self.assertEqual(x, y)
1202        y = r'\U00010000'
1203        x = y.decode("raw-unicode-escape").encode("raw-unicode-escape")
1204        self.assertEqual(x, y)
1205
1206        try:
1207            '\U11111111'.decode("raw-unicode-escape")
1208        except UnicodeDecodeError as e:
1209            self.assertEqual(e.start, 0)
1210            self.assertEqual(e.end, 10)
1211        else:
1212            self.fail("Should have raised UnicodeDecodeError")
1213
1214    def test_conversion(self):
1215        # Make sure __unicode__() works properly
1216        class Foo0:
1217            def __str__(self):
1218                return "foo"
1219
1220        class Foo1:
1221            def __unicode__(self):
1222                return u"foo"
1223
1224        class Foo2(object):
1225            def __unicode__(self):
1226                return u"foo"
1227
1228        class Foo3(object):
1229            def __unicode__(self):
1230                return "foo"
1231
1232        class Foo4(str):
1233            def __unicode__(self):
1234                return "foo"
1235
1236        class Foo5(unicode):
1237            def __unicode__(self):
1238                return "foo"
1239
1240        class Foo6(str):
1241            def __str__(self):
1242                return "foos"
1243
1244            def __unicode__(self):
1245                return u"foou"
1246
1247        class Foo7(unicode):
1248            def __str__(self):
1249                return "foos"
1250            def __unicode__(self):
1251                return u"foou"
1252
1253        class Foo8(unicode):
1254            def __new__(cls, content=""):
1255                return unicode.__new__(cls, 2*content)
1256            def __unicode__(self):
1257                return self
1258
1259        class Foo9(unicode):
1260            def __str__(self):
1261                return "string"
1262            def __unicode__(self):
1263                return "not unicode"
1264
1265        self.assertEqual(unicode(Foo0()), u"foo")
1266        self.assertEqual(unicode(Foo1()), u"foo")
1267        self.assertEqual(unicode(Foo2()), u"foo")
1268        self.assertEqual(unicode(Foo3()), u"foo")
1269        self.assertEqual(unicode(Foo4("bar")), u"foo")
1270        self.assertEqual(unicode(Foo5("bar")), u"foo")
1271        self.assertEqual(unicode(Foo6("bar")), u"foou")
1272        self.assertEqual(unicode(Foo7("bar")), u"foou")
1273        self.assertEqual(unicode(Foo8("foo")), u"foofoo")
1274        self.assertIs(type(unicode(Foo8("foo"))), Foo8)
1275        self.assertEqual(UnicodeSubclass(Foo8("foo")), u"foofoo")
1276        self.assertIs(type(UnicodeSubclass(Foo8("foo"))), UnicodeSubclass)
1277        self.assertEqual(str(Foo9("foo")), "string")
1278        self.assertEqual(unicode(Foo9("foo")), u"not unicode")
1279
1280    def test_unicode_repr(self):
1281        class s1:
1282            def __repr__(self):
1283                return '\\n'
1284
1285        class s2:
1286            def __repr__(self):
1287                return u'\\n'
1288
1289        self.assertEqual(repr(s1()), '\\n')
1290        self.assertEqual(repr(s2()), '\\n')
1291
1292    # This test only affects 32-bit platforms because expandtabs can only take
1293    # an int as the max value, not a 64-bit C long.  If expandtabs is changed
1294    # to take a 64-bit long, this test should apply to all platforms.
1295    @unittest.skipIf(sys.maxint > (1 << 32) or struct.calcsize('P') != 4,
1296                     'only applies to 32-bit platforms')
1297    def test_expandtabs_overflows_gracefully(self):
1298        self.assertRaises(OverflowError, u't\tt\t'.expandtabs, sys.maxint)
1299
1300    def test__format__(self):
1301        def test(value, format, expected):
1302            # test both with and without the trailing 's'
1303            self.assertEqual(value.__format__(format), expected)
1304            self.assertEqual(value.__format__(format + u's'), expected)
1305
1306        test(u'', u'', u'')
1307        test(u'abc', u'', u'abc')
1308        test(u'abc', u'.3', u'abc')
1309        test(u'ab', u'.3', u'ab')
1310        test(u'abcdef', u'.3', u'abc')
1311        test(u'abcdef', u'.0', u'')
1312        test(u'abc', u'3.3', u'abc')
1313        test(u'abc', u'2.3', u'abc')
1314        test(u'abc', u'2.2', u'ab')
1315        test(u'abc', u'3.2', u'ab ')
1316        test(u'result', u'x<0', u'result')
1317        test(u'result', u'x<5', u'result')
1318        test(u'result', u'x<6', u'result')
1319        test(u'result', u'x<7', u'resultx')
1320        test(u'result', u'x<8', u'resultxx')
1321        test(u'result', u' <7', u'result ')
1322        test(u'result', u'<7', u'result ')
1323        test(u'result', u'>7', u' result')
1324        test(u'result', u'>8', u'  result')
1325        test(u'result', u'^8', u' result ')
1326        test(u'result', u'^9', u' result  ')
1327        test(u'result', u'^10', u'  result  ')
1328        test(u'a', u'10000', u'a' + u' ' * 9999)
1329        test(u'', u'10000', u' ' * 10000)
1330        test(u'', u'10000000', u' ' * 10000000)
1331
1332        # test mixing unicode and str
1333        self.assertEqual(u'abc'.__format__('s'), u'abc')
1334        self.assertEqual(u'abc'.__format__('->10s'), u'-------abc')
1335
    def test_format(self):
        # Exercise unicode.format(): brace escaping, positional / keyword /
        # attribute / index field lookup, format specs, !r and !s
        # conversions, __format__ dispatch on user-defined classes, nested
        # (computed) specs, malformed templates, and str/unicode mixing.

        # templates without replacement fields; doubled braces are escapes
        self.assertEqual(u''.format(), u'')
        self.assertEqual(u'a'.format(), u'a')
        self.assertEqual(u'ab'.format(), u'ab')
        self.assertEqual(u'a{{'.format(), u'a{')
        self.assertEqual(u'a}}'.format(), u'a}')
        self.assertEqual(u'{{b'.format(), u'{b')
        self.assertEqual(u'}}b'.format(), u'}b')
        self.assertEqual(u'a{{b'.format(), u'a{b')

        # examples from the PEP:
        import datetime
        self.assertEqual(u"My name is {0}".format(u'Fred'), u"My name is Fred")
        self.assertEqual(u"My name is {0[name]}".format(dict(name=u'Fred')),
                         u"My name is Fred")
        self.assertEqual(u"My name is {0} :-{{}}".format(u'Fred'),
                         u"My name is Fred :-{}")

        # datetime.__format__ doesn't work with unicode
        #d = datetime.date(2007, 8, 18)
        #self.assertEqual("The year is {0.year}".format(d),
        #                 "The year is 2007")

        # classes we'll use for testing

        # __format__ returns the format spec itself; _x is read via '{..._x}'
        class C:
            def __init__(self, x=100):
                self._x = x
            def __format__(self, spec):
                return spec

        # __format__ ignores the spec and returns str(self.x)
        class D:
            def __init__(self, x):
                self.x = x
            def __format__(self, spec):
                return str(self.x)

        # class with __str__, but no __format__
        class E:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return u'E(' + self.x + u')'

        # class with __repr__, but no __format__ or __str__
        class F:
            def __init__(self, x):
                self.x = x
            def __repr__(self):
                return u'F(' + self.x + u')'

        # class with __format__ that forwards to string, for some format_spec's
        class G:
            def __init__(self, x):
                self.x = x
            def __str__(self):
                return u"string is " + self.x
            def __format__(self, format_spec):
                # only the 'd' spec is handled here; any other spec is
                # delegated to object.__format__
                if format_spec == 'd':
                    return u'G(' + self.x + u')'
                return object.__format__(self, format_spec)

        # class that returns a bad type from __format__
        class H:
            def __format__(self, format_spec):
                return 1.0

        # date subclass that passes the format spec to strftime
        class I(datetime.date):
            def __format__(self, format_spec):
                return self.strftime(format_spec)

        # int subclass whose __format__ formats twice its value
        class J(int):
            def __format__(self, format_spec):
                return int.__format__(self * 2, format_spec)


        # positional fields and literal text around them
        self.assertEqual(u''.format(), u'')
        self.assertEqual(u'abc'.format(), u'abc')
        self.assertEqual(u'{0}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:}'.format(u'abc'), u'abc')
        self.assertEqual(u'X{0}'.format(u'abc'), u'Xabc')
        self.assertEqual(u'{0}X'.format(u'abc'), u'abcX')
        self.assertEqual(u'X{0}Y'.format(u'abc'), u'XabcY')
        self.assertEqual(u'{1}'.format(1, u'abc'), u'abc')
        self.assertEqual(u'X{1}'.format(1, u'abc'), u'Xabc')
        self.assertEqual(u'{1}X'.format(1, u'abc'), u'abcX')
        self.assertEqual(u'X{1}Y'.format(1, u'abc'), u'XabcY')
        self.assertEqual(u'{0}'.format(-15), u'-15')
        self.assertEqual(u'{0}{1}'.format(-15, u'abc'), u'-15abc')
        self.assertEqual(u'{0}X{1}'.format(-15, u'abc'), u'-15Xabc')
        self.assertEqual(u'{{'.format(), u'{')
        self.assertEqual(u'}}'.format(), u'}')
        self.assertEqual(u'{{}}'.format(), u'{}')
        self.assertEqual(u'{{x}}'.format(), u'{x}')
        self.assertEqual(u'{{{0}}}'.format(123), u'{123}')
        self.assertEqual(u'{{{{0}}}}'.format(), u'{{0}}')
        self.assertEqual(u'}}{{'.format(), u'}{')
        self.assertEqual(u'}}x{{'.format(), u'}x{')

        # weird field names
        self.assertEqual(u"{0[foo-bar]}".format({u'foo-bar':u'baz'}), u'baz')
        self.assertEqual(u"{0[foo bar]}".format({u'foo bar':u'baz'}), u'baz')
        self.assertEqual(u"{0[ ]}".format({u' ':3}), u'3')

        # attribute and index lookups, including chained ones
        self.assertEqual(u'{foo._x}'.format(foo=C(20)), u'20')
        self.assertEqual(u'{1}{0}'.format(D(10), D(20)), u'2010')
        self.assertEqual(u'{0._x.x}'.format(C(D(u'abc'))), u'abc')
        self.assertEqual(u'{0[0]}'.format([u'abc', u'def']), u'abc')
        self.assertEqual(u'{0[1]}'.format([u'abc', u'def']), u'def')
        self.assertEqual(u'{0[1][0]}'.format([u'abc', [u'def']]), u'def')
        self.assertEqual(u'{0[1][0].x}'.format(['abc', [D(u'def')]]), u'def')

        # strings
        self.assertEqual(u'{0:.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:.3s}'.format(u'ab'), u'ab')
        self.assertEqual(u'{0:.3s}'.format(u'abcdef'), u'abc')
        self.assertEqual(u'{0:.0s}'.format(u'abcdef'), u'')
        self.assertEqual(u'{0:3.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:2.3s}'.format(u'abc'), u'abc')
        self.assertEqual(u'{0:2.2s}'.format(u'abc'), u'ab')
        self.assertEqual(u'{0:3.2s}'.format(u'abc'), u'ab ')
        self.assertEqual(u'{0:x<0s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<5s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<6s}'.format(u'result'), u'result')
        self.assertEqual(u'{0:x<7s}'.format(u'result'), u'resultx')
        self.assertEqual(u'{0:x<8s}'.format(u'result'), u'resultxx')
        self.assertEqual(u'{0: <7s}'.format(u'result'), u'result ')
        self.assertEqual(u'{0:<7s}'.format(u'result'), u'result ')
        self.assertEqual(u'{0:>7s}'.format(u'result'), u' result')
        self.assertEqual(u'{0:>8s}'.format(u'result'), u'  result')
        self.assertEqual(u'{0:^8s}'.format(u'result'), u' result ')
        self.assertEqual(u'{0:^9s}'.format(u'result'), u' result  ')
        self.assertEqual(u'{0:^10s}'.format(u'result'), u'  result  ')
        self.assertEqual(u'{0:10000}'.format(u'a'), u'a' + u' ' * 9999)
        self.assertEqual(u'{0:10000}'.format(u''), u' ' * 10000)
        self.assertEqual(u'{0:10000000}'.format(u''), u' ' * 10000000)

        # issue 12546: use \x00 as a fill character
        # NOTE(review): every template and expected value in this group is a
        # plain str literal (no u prefix), so these lines go through
        # str.format rather than unicode.format -- confirm whether unicode
        # variants were intended here.
        self.assertEqual('{0:\x00<6s}'.format('foo'), 'foo\x00\x00\x00')
        self.assertEqual('{0:\x01<6s}'.format('foo'), 'foo\x01\x01\x01')
        self.assertEqual('{0:\x00^6s}'.format('foo'), '\x00foo\x00\x00')
        self.assertEqual('{0:^6s}'.format('foo'), ' foo  ')

        self.assertEqual('{0:\x00<6}'.format(3), '3\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3), '3\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3), '\x00\x003\x00\x00\x00')
        self.assertEqual('{0:<6}'.format(3), '3     ')

        self.assertEqual('{0:\x00<6}'.format(3.14), '3.14\x00\x00')
        self.assertEqual('{0:\x01<6}'.format(3.14), '3.14\x01\x01')
        self.assertEqual('{0:\x00^6}'.format(3.14), '\x003.14\x00')
        self.assertEqual('{0:^6}'.format(3.14), ' 3.14 ')

        self.assertEqual('{0:\x00<12}'.format(3+2.0j), '(3+2j)\x00\x00\x00\x00\x00\x00')
        self.assertEqual('{0:\x01<12}'.format(3+2.0j), '(3+2j)\x01\x01\x01\x01\x01\x01')
        self.assertEqual('{0:\x00^12}'.format(3+2.0j), '\x00\x00\x00(3+2j)\x00\x00\x00')
        self.assertEqual('{0:^12}'.format(3+2.0j), '   (3+2j)   ')

        # format specifiers for user defined type
        self.assertEqual(u'{0:abc}'.format(C()), u'abc')

        # !r and !s coercions
        self.assertEqual(u'{0!s}'.format(u'Hello'), u'Hello')
        self.assertEqual(u'{0!s:}'.format(u'Hello'), u'Hello')
        self.assertEqual(u'{0!s:15}'.format(u'Hello'), u'Hello          ')
        self.assertEqual(u'{0!s:15s}'.format(u'Hello'), u'Hello          ')
        self.assertEqual(u'{0!r}'.format(u'Hello'), u"u'Hello'")
        self.assertEqual(u'{0!r:}'.format(u'Hello'), u"u'Hello'")
        self.assertEqual(u'{0!r}'.format(F(u'Hello')), u'F(Hello)')

        # test fallback to object.__format__
        self.assertEqual(u'{0}'.format({}), u'{}')
        self.assertEqual(u'{0}'.format([]), u'[]')
        self.assertEqual(u'{0}'.format([1]), u'[1]')
        self.assertEqual(u'{0}'.format(E(u'data')), u'E(data)')
        self.assertEqual(u'{0:d}'.format(G(u'data')), u'G(data)')
        self.assertEqual(u'{0!s}'.format(G(u'data')), u'string is data')

        # non-empty specs on classes without their own handling of them;
        # presumably check_warnings requires the PendingDeprecationWarning
        # named below to be raised inside the block -- verify against
        # test_support
        msg = 'object.__format__ with a non-empty format string is deprecated'
        with test_support.check_warnings((msg, PendingDeprecationWarning)):
            self.assertEqual(u'{0:^10}'.format(E(u'data')), u' E(data)  ')
            self.assertEqual(u'{0:^10s}'.format(E(u'data')), u' E(data)  ')
            self.assertEqual(u'{0:>15s}'.format(G(u'data')), u' string is data')

        # the whole spec after the colon reaches I.__format__ / strftime
        self.assertEqual(u"{0:date: %Y-%m-%d}".format(I(year=2007,
                                                        month=8,
                                                        day=27)),
                         u"date: 2007-08-27")

        # test deriving from a builtin type and overriding __format__
        self.assertEqual(u"{0}".format(J(10)), u"20")


        # string format specifiers
        self.assertEqual(u'{0:}'.format('a'), u'a')

        # computed format specifiers
        self.assertEqual(u"{0:.{1}}".format(u'hello world', 5), u'hello')
        self.assertEqual(u"{0:.{1}s}".format(u'hello world', 5), u'hello')
        self.assertEqual(u"{0:.{precision}s}".format('hello world', precision=5), u'hello')
        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width=10, precision=5), u'hello     ')
        self.assertEqual(u"{0:{width}.{precision}s}".format('hello world', width='10', precision='5'), u'hello     ')

        # test various errors
        self.assertRaises(ValueError, u'{'.format)
        self.assertRaises(ValueError, u'}'.format)
        self.assertRaises(ValueError, u'a{'.format)
        self.assertRaises(ValueError, u'a}'.format)
        self.assertRaises(ValueError, u'{a'.format)
        self.assertRaises(ValueError, u'}a'.format)
        self.assertRaises(IndexError, u'{0}'.format)
        self.assertRaises(IndexError, u'{1}'.format, u'abc')
        self.assertRaises(KeyError,   u'{x}'.format)
        self.assertRaises(ValueError, u"}{".format)
        self.assertRaises(ValueError, u"{".format)
        self.assertRaises(ValueError, u"}".format)
        self.assertRaises(ValueError, u"abc{0:{}".format)
        self.assertRaises(ValueError, u"{0".format)
        self.assertRaises(IndexError, u"{0.}".format)
        self.assertRaises(ValueError, u"{0.}".format, 0)
        self.assertRaises(IndexError, u"{0[}".format)
        self.assertRaises(ValueError, u"{0[}".format, [])
        self.assertRaises(KeyError,   u"{0]}".format)
        self.assertRaises(ValueError, u"{0.[]}".format, 0)
        self.assertRaises(ValueError, u"{0..foo}".format, 0)
        self.assertRaises(ValueError, u"{0[0}".format, 0)
        self.assertRaises(ValueError, u"{0[0:foo}".format, 0)
        self.assertRaises(KeyError,   u"{c]}".format)
        self.assertRaises(ValueError, u"{{ {{{0}}".format, 0)
        self.assertRaises(ValueError, u"{0}}".format, 0)
        self.assertRaises(KeyError,   u"{foo}".format, bar=3)
        self.assertRaises(ValueError, u"{0!x}".format, 3)
        self.assertRaises(ValueError, u"{0!}".format, 0)
        self.assertRaises(ValueError, u"{0!rs}".format, 0)
        self.assertRaises(ValueError, u"{!}".format)
        self.assertRaises(IndexError, u"{:}".format)
        self.assertRaises(IndexError, u"{:s}".format)
        self.assertRaises(IndexError, u"{}".format)
        # a 44-digit number used as a positional index and as a subscript
        big = u"23098475029384702983476098230754973209482573"
        self.assertRaises(ValueError, (u"{" + big + u"}").format)
        self.assertRaises(ValueError, (u"{[" + big + u"]}").format, [0])

        # issue 6089
        self.assertRaises(ValueError, u"{0[0]x}".format, [None])
        self.assertRaises(ValueError, u"{0[0](10)}".format, [None])

        # can't have a replacement on the field name portion
        self.assertRaises(TypeError, u'{0[{1}]}'.format, u'abcdefg', 4)

        # exceed maximum recursion depth
        self.assertRaises(ValueError, u"{0:{1:{2}}}".format, u'abc', u's', u'')
        self.assertRaises(ValueError, u"{0:{1:{2:{3:{4:{5:{6}}}}}}}".format,
                          0, 1, 2, 3, 4, 5, 6, 7)

        # string format spec errors
        self.assertRaises(ValueError, u"{0:-s}".format, u'')
        self.assertRaises(ValueError, format, u"", u"-")
        self.assertRaises(ValueError, u"{0:=s}".format, u'')

        # test combining string and unicode
        self.assertEqual(u"foo{0}".format('bar'), u'foobar')
        # This will try to convert the argument from unicode to str, which
        #  will succeed
        self.assertEqual("foo{0}".format(u'bar'), 'foobar')
        # This will try to convert the argument from unicode to str, which
        #  will fail
        self.assertRaises(UnicodeEncodeError, "foo{0}".format, u'\u1000bar')
1602
1603    def test_format_huge_precision(self):
1604        format_string = u".{}f".format(sys.maxsize + 1)
1605        with self.assertRaises(ValueError):
1606            result = format(2.34, format_string)
1607
1608    def test_format_huge_width(self):
1609        format_string = u"{}f".format(sys.maxsize + 1)
1610        with self.assertRaises(ValueError):
1611            result = format(2.34, format_string)
1612
1613    def test_format_huge_item_number(self):
1614        format_string = u"{{{}:.6f}}".format(sys.maxsize + 1)
1615        with self.assertRaises(ValueError):
1616            result = format_string.format(2.34)
1617
1618    def test_format_auto_numbering(self):
1619        class C:
1620            def __init__(self, x=100):
1621                self._x = x
1622            def __format__(self, spec):
1623                return spec
1624
1625        self.assertEqual(u'{}'.format(10), u'10')
1626        self.assertEqual(u'{:5}'.format('s'), u's    ')
1627        self.assertEqual(u'{!r}'.format('s'), u"'s'")
1628        self.assertEqual(u'{._x}'.format(C(10)), u'10')
1629        self.assertEqual(u'{[1]}'.format([1, 2]), u'2')
1630        self.assertEqual(u'{[a]}'.format({'a':4, 'b':2}), u'4')
1631        self.assertEqual(u'a{}b{}c'.format(0, 1), u'a0b1c')
1632
1633        self.assertEqual(u'a{:{}}b'.format('x', '^10'), u'a    x     b')
1634        self.assertEqual(u'a{:{}x}b'.format(20, '#'), u'a0x14b')
1635
1636        # can't mix and match numbering and auto-numbering
1637        self.assertRaises(ValueError, u'{}{1}'.format, 1, 2)
1638        self.assertRaises(ValueError, u'{1}{}'.format, 1, 2)
1639        self.assertRaises(ValueError, u'{:{1}}'.format, 1, 2)
1640        self.assertRaises(ValueError, u'{0:{}}'.format, 1, 2)
1641
1642        # can mix and match auto-numbering and named
1643        self.assertEqual(u'{f}{}'.format(4, f='test'), u'test4')
1644        self.assertEqual(u'{}{f}'.format(4, f='test'), u'4test')
1645        self.assertEqual(u'{:{f}}{g}{}'.format(1, 3, g='g', f=2), u' 1g3')
1646        self.assertEqual(u'{f:{}}{}{g}'.format(2, 4, f=1, g='g'), u' 14g')
1647
1648    def test_raiseMemError(self):
1649        # Ensure that the freelist contains a consistent object, even
1650        # when a string allocation fails with a MemoryError.
1651        # This used to crash the interpreter,
1652        # or leak references when the number was smaller.
1653        charwidth = 4 if sys.maxunicode >= 0x10000 else 2
1654        # Note: sys.maxsize is half of the actual max allocation because of
1655        # the signedness of Py_ssize_t.
1656        alloc = lambda: u"a" * (sys.maxsize // charwidth * 2)
1657        self.assertRaises(MemoryError, alloc)
1658        self.assertRaises(MemoryError, alloc)
1659
1660    def test_format_subclass(self):
1661        class U(unicode):
1662            def __unicode__(self):
1663                return u'__unicode__ overridden'
1664        u = U(u'xxx')
1665        self.assertEqual("%s" % u, u'__unicode__ overridden')
1666        self.assertEqual("{}".format(u), '__unicode__ overridden')
1667
1668    def test_free_after_iterating(self):
1669        test_support.check_free_after_iterating(self, iter, unicode)
1670        test_support.check_free_after_iterating(self, reversed, unicode)
1671
1672
class CAPITest(unittest.TestCase):

    # Test PyUnicode_FromFormat()
    def test_from_format(self):
        test_support.import_module('ctypes')
        from ctypes import (
            pythonapi, py_object, sizeof,
            c_int, c_long, c_longlong, c_ssize_t,
            c_uint, c_ulong, c_ulonglong, c_size_t, c_void_p)

        # the symbol looked up depends on the unicode width of this build
        wide_build = sys.maxunicode > 0xffff
        symbol = ("PyUnicodeUCS2_FromFormat" if sys.maxunicode == 0xffff
                  else "PyUnicodeUCS4_FromFormat")
        raw_from_format = getattr(pythonapi, symbol)
        raw_from_format.restype = py_object

        def PyUnicode_FromFormat(format, *args):
            # unicode arguments are wrapped in py_object; every other
            # argument is forwarded untouched
            converted = []
            for arg in args:
                if isinstance(arg, unicode):
                    arg = py_object(arg)
                converted.append(arg)
            return raw_from_format(format, *converted)

        def check_format(expected, format, *args):
            self.assertEqual(expected, PyUnicode_FromFormat(format, *args))

        # ascii format, non-ascii argument
        check_format(u'ascii\x7f=unicode\xe9', b'ascii\x7f=%U', u'unicode\xe9')

        # non-ascii format, ascii argument: ensure that PyUnicode_FromFormatV()
        # raises an error
        #self.assertRaisesRegex(ValueError,
        #    '^PyUnicode_FromFormatV\(\) expects an ASCII-encoded format '
        #    'string, got a non-ASCII byte: 0xe9$',
        #    PyUnicode_FromFormat, b'unicode\xe9=%s', u'ascii')

        # test "%c"
        check_format(u'\uabcd', b'%c', c_int(0xabcd))
        if wide_build:
            check_format(u'\U0010ffff', b'%c', c_int(0x10ffff))
        else:
            self.assertRaises(OverflowError,
                              PyUnicode_FromFormat, b'%c', c_int(0x10000))
        self.assertRaises(OverflowError,
                          PyUnicode_FromFormat, b'%c', c_int(0x110000))
        # Issue #18183
        if wide_build:
            check_format(u'\U00010000\U00100000',
                         b'%c%c', c_int(0x10000), c_int(0x100000))

        # test "%"
        for expected, fmt in ((u'%', b'%'),
                              (u'%', b'%%'),
                              (u'%s', b'%%s'),
                              (u'[%]', b'[%%]')):
            check_format(expected, fmt)
        check_format(u'%abc', b'%%%s', b'abc')

        # test %S
        check_format(u"repr=abc", b'repr=%S', u'abc')

        # test %R
        check_format(u"repr=u'abc'", b'repr=%R', u'abc')

        # test integer formats (%i, %d, %u)
        check_format(u'010', b'%03i', c_int(10))
        check_format(u'0010', b'%0.4i', c_int(10))
        check_format(u'-123', b'%i', c_int(-123))

        for fmt, ctype in ((b'%d', c_int),
                           (b'%ld', c_long),
                           (b'%zd', c_ssize_t)):
            check_format(u'-123', fmt, ctype(-123))

        for fmt, ctype in ((b'%u', c_uint),
                           (b'%lu', c_ulong),
                           (b'%zu', c_size_t)):
            check_format(u'123', fmt, ctype(123))

        # test long output
        min_long = -(2 ** (8 * sizeof(c_long) - 1))
        max_long = -min_long - 1
        for limit in (min_long, max_long):
            check_format(unicode(limit), b'%ld', c_long(limit))
        max_ulong = 2 ** (8 * sizeof(c_ulong)) - 1
        check_format(unicode(max_ulong), b'%lu', c_ulong(max_ulong))
        # result deliberately not compared with anything
        PyUnicode_FromFormat(b'%p', c_void_p(-1))

        # test padding (width and/or precision); the same four layouts are
        # checked for the i, u and x conversions in turn
        for conv, ctype, number in ((b'i', c_int, 123),
                                    (b'u', c_uint, 123),
                                    (b'x', c_int, 0x123)):
            check_format(u'123'.rjust(10, u'0'),
                         b'%010' + conv, ctype(number))
            check_format(u'123'.rjust(100),
                         b'%100' + conv, ctype(number))
            check_format(u'123'.rjust(100, u'0'),
                         b'%.100' + conv, ctype(number))
            check_format(u'123'.rjust(80, u'0').rjust(100),
                         b'%100.80' + conv, ctype(number))

        # test %V
        check_format(u'repr=abc', b'repr=%V', u'abc', b'xyz')
        check_format(u'repr=\xe4\xba\xba\xe6\xb0\x91',
                     b'repr=%V', None, b'\xe4\xba\xba\xe6\xb0\x91')
        check_format(u'repr=abc\xff', b'repr=%V', None, b'abc\xff')

        # not supported: copy the raw format string. these tests are just here
        # to check for crashes and should not be considered as specifications
        check_format(u'%s', b'%1%s', b'abc')
        check_format(u'%1abc', b'%1abc')
        check_format(u'%+i', b'%+i', c_int(10))
        check_format(u'%s', b'%.%s', b'abc')

    @test_support.cpython_only
    def test_encode_decimal(self):
        from _testcapi import unicode_encodedecimal

        # calls without an explicit error handler: (input, expected bytes)
        for text, wanted in ((u'123', b'123'),
                             (u'\u0663.\u0661\u0664', b'3.14'),
                             (u"\N{EM SPACE}3.14\N{EN SPACE}", b' 3.14 ')):
            self.assertEqual(unicode_encodedecimal(text), wanted)

        self.assertRaises(UnicodeEncodeError,
                          unicode_encodedecimal, u"123\u20ac", "strict")

        # explicit error handlers: (input, handler name, expected bytes)
        for text, errors, wanted in (
                (u"123\u20ac", "replace", b'123?'),
                (u"123\u20ac", "ignore", b'123'),
                (u"123\u20ac", "xmlcharrefreplace", b'123&#8364;'),
                (u"123\u20ac", "backslashreplace", b'123\\u20ac'),
                (u"123\u20ac\N{EM SPACE}", "replace", b'123? '),
                (u"123\u20ac\u20ac", "replace", b'123??'),
                (u"123\u20ac\u0660", "replace", b'123?0')):
            self.assertEqual(unicode_encodedecimal(text, errors), wanted)

    @test_support.cpython_only
    def test_encode_decimal_with_surrogates(self):
        from _testcapi import unicode_encodedecimal

        # (input suffix, expected character references)
        cases = [(u'\U0001f49d', '&#128157;'),
                 (u'\ud83d', '&#55357;'),
                 (u'\udc9d', '&#56477;')]
        # the explicit surrogate pair is only a separate case when it does
        # not compare equal to the single \U escape
        if u'\ud83d\udc9d' != u'\U0001f49d':
            cases.append((u'\ud83d\udc9d', '&#55357;&#56477;'))
        for suffix, references in cases:
            encoded = unicode_encodedecimal(u"123" + suffix,
                                            "xmlcharrefreplace")
            self.assertEqual(encoded, '123' + references)
1866
# Run the tests of this module by handing its name to
# test_support.run_unittest.
def test_main():
    test_support.run_unittest(__name__)

# only run the tests when this file is executed directly, not on import
if __name__ == "__main__":
    test_main()
1872