# test_multibytecodec.py
#   Unit test for multibytecodec itself
#

from test import test_support
from test.test_support import TESTFN
import unittest, StringIO, codecs, sys, os
import _multibytecodec

# Every CJK codec name exercised by the per-encoding loops below, grouped by
# the extension module that provides it.
ALL_CJKENCODINGS = [
# _codecs_cn
    'gb2312', 'gbk', 'gb18030', 'hz',
# _codecs_hk
    'big5hkscs',
# _codecs_jp
    'cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213',
    'euc_jis_2004', 'shift_jis_2004',
# _codecs_kr
    'cp949', 'euc_kr', 'johab',
# _codecs_tw
    'big5', 'cp950',
# _codecs_iso2022
    'iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
    'iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr',
]


class Test_MultibyteCodec(unittest.TestCase):
    """Codec-level checks, most of them looped over ALL_CJKENCODINGS."""

    def test_nullcoding(self):
        # Empty input must round-trip to empty output for every codec.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(''.decode(enc), u'')
            self.assertEqual(unicode('', enc), u'')
            self.assertEqual(u''.encode(enc), '')

    def test_str_decode(self):
        # Calling encode() on a byte string of plain ASCII must give the
        # same bytes back.
        for enc in ALL_CJKENCODINGS:
            self.assertEqual('abcd'.encode(enc), 'abcd')

    def test_errorcallback_longindex(self):
        # The error handler returns a resume position of sys.maxint+1;
        # the decoder is expected to reject it with IndexError.
        def myreplace(exc):
            return (u'', sys.maxint+1)

        dec = codecs.getdecoder('euc-kr')
        codecs.register_error('test.cjktest', myreplace)
        self.assertRaises(IndexError, dec,
                          'apple\x92ham\x93spam', 'test.cjktest')

    def test_errorcallback_custom_ignore(self):
        # Issue #23215: MemoryError with custom error handlers and multibyte codecs
        data = 100 * unichr(0xdc00)
        codecs.register_error("test.ignore", codecs.ignore_errors)
        for enc in ALL_CJKENCODINGS:
            self.assertEqual(data.encode(enc, "test.ignore"), b'')

    def test_codingspec(self):
        # Source text consisting only of a coding declaration must be
        # executable for every codec.
        for enc in ALL_CJKENCODINGS:
            code = '# coding: {}\n'.format(enc)
            # Parenthesised form: still the exec statement on Python 2.
            exec(code)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamReader, None)
        self.assertRaises(AttributeError,
                          _multibytecodec.MultibyteStreamWriter, None)


class Test_IncrementalEncoder(unittest.TestCase):
    """Incremental encoder behaviour for stateless and stateful codecs."""

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        encoder = codecs.getincrementalencoder('cp949')()
        self.assertEqual(encoder.encode(u'\ud30c\uc774\uc36c \ub9c8\uc744'),
                         '\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertEqual(encoder.reset(), None)
        self.assertEqual(encoder.encode(u'\u2606\u223c\u2606', True),
                         '\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertEqual(encoder.reset(), None)
        self.assertEqual(encoder.encode(u'', True), '')
        self.assertEqual(encoder.encode(u'', False), '')
        self.assertEqual(encoder.reset(), None)

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few code points. eg)
        #   U+00E6 => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300 => ABDC

        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode(u'\u00e6\u0300'), '\xab\xc4')
        # U+00E6 alone is held back until the next character is known.
        self.assertEqual(encoder.encode(u'\u00e6'), '')
        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xc4')
        self.assertEqual(encoder.encode(u'\u00e6', True), '\xa9\xdc')

        self.assertEqual(encoder.reset(), None)
        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')

        # A final call with empty input flushes the pending character.
        self.assertEqual(encoder.encode(u'\u00e6'), '')
        self.assertEqual(encoder.encode('', True), '\xa9\xdc')
        self.assertEqual(encoder.encode('', True), '')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the pending U+00E6 in place.
        encoder = codecs.getincrementalencoder('jisx0213')()
        self.assertEqual(encoder.encode(u'\u00e6'), '')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertEqual(encoder.encode(u'\u0300\u00e6'), '\xab\xc4')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertEqual(encoder.reset(), None)
        self.assertEqual(encoder.encode(u'\u0300'), '\xab\xdc')
        self.assertEqual(encoder.encode(u'\u00e6'), '')
        self.assertRaises(UnicodeEncodeError, encoder.encode, u'\u0123')
        self.assertEqual(encoder.encode(u'', True), '\xa9\xdc')

    def test_issue5640(self):
        # After a 'backslashreplace' substitution the encoder must still
        # encode following input normally.
        encoder = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        self.assertEqual(encoder.encode(u'\xff'), b'\\xff')
        self.assertEqual(encoder.encode(u'\n'), b'\n')


class Test_IncrementalDecoder(unittest.TestCase):
    """Incremental decoder behaviour when input is split mid-sequence."""

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode('\xc6\xc4\xc0\xcc\xbd'),
                         u'\ud30c\uc774')
        self.assertEqual(decoder.decode('\xe3 \xb8\xb6\xc0\xbb'),
                         u'\uc36c \ub9c8\uc744')
        self.assertEqual(decoder.decode(''), u'')

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the pending lead byte in
        # place so that the next call can complete the character.
        decoder = codecs.getincrementaldecoder('cp949')()
        self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
        self.assertEqual(decoder.decode('\xcc'), u'\uc774')

        self.assertEqual(decoder.decode('\xc6\xc4\xc0'), u'\ud30c')
        self.assertRaises(UnicodeDecodeError, decoder.decode, '\xcc\xbd', True)
        self.assertEqual(decoder.decode('\xcc'), u'\uc774')

    def test_iso2022(self):
        # Escape sequences and two-byte characters are fed in pieces that
        # are split across decode() calls.
        decoder = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = '\x1b'
        self.assertEqual(decoder.decode(ESC + '('), u'')
        self.assertEqual(decoder.decode('B', True), u'')
        self.assertEqual(decoder.decode(ESC + '$'), u'')
        self.assertEqual(decoder.decode('B@$'), u'\u4e16')
        self.assertEqual(decoder.decode('@$@'), u'\u4e16')
        self.assertEqual(decoder.decode('$', True), u'\u4e16')
        self.assertEqual(decoder.reset(), None)
        self.assertEqual(decoder.decode('@$'), u'@$')
        self.assertEqual(decoder.decode(ESC + '$'), u'')
        self.assertRaises(UnicodeDecodeError, decoder.decode, '', True)
        self.assertEqual(decoder.decode('B@$'), u'\u4e16')


class Test_StreamReader(unittest.TestCase):
    """Regression tests for the codec stream reader."""

    def test_bug1728403(self):
        # Reading a file that holds only the single byte 0xA1 as cp949
        # must raise UnicodeDecodeError.
        try:
            # Close the writer explicitly so the byte is on disk before
            # the file is reopened for reading.
            with open(TESTFN, 'wb') as f:
                f.write(b'\xa1')
            f = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
            finally:
                f.close()
        finally:
            # Tolerates a missing file, so a failure while creating it is
            # not masked by an error from the cleanup.
            test_support.unlink(TESTFN)


class Test_StreamWriter(unittest.TestCase):
    """Stream writer checks, including surrogate handling on narrow builds."""

    @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    def test_gb18030(self):
        s = StringIO.StringIO()
        c = codecs.getwriter('gb18030')(s)
        c.write(u'123')
        self.assertEqual(s.getvalue(), '123')
        c.write(u'\U00012345')
        self.assertEqual(s.getvalue(), '123\x907\x959')
        # A lone high surrogate is buffered: nothing is written yet.
        c.write(u'\U00012345'[0])
        self.assertEqual(s.getvalue(), '123\x907\x959')
        c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
        expected = '123\x907\x959\x907\x959\x907\x959\x827\xcf5\x810\x851'
        self.assertEqual(s.getvalue(), expected)
        c.write(u'\U00012345'[0])
        self.assertEqual(s.getvalue(), expected)
        # reset() with a dangling high surrogate fails and writes nothing.
        self.assertRaises(UnicodeError, c.reset)
        self.assertEqual(s.getvalue(), expected)

    @unittest.skipUnless(len(u'\U00012345') == 2, 'need a narrow build')
    def test_utf_8(self):
        s= StringIO.StringIO()
        c = codecs.getwriter('utf-8')(s)
        c.write(u'123')
        self.assertEqual(s.getvalue(), '123')
        c.write(u'\U00012345')
        self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')

        # Python utf-8 codec can't buffer surrogate pairs yet.
        if 0:
            c.write(u'\U00012345'[0])
            self.assertEqual(s.getvalue(), '123\xf0\x92\x8d\x85')
            c.write(u'\U00012345'[1] + u'\U00012345' + u'\uac00\u00ac')
            self.assertEqual(s.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac')
            c.write(u'\U00012345'[0])
            self.assertEqual(s.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac')
            c.reset()
            self.assertEqual(s.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac\xed\xa0\x88')
            c.write(u'\U00012345'[1])
            self.assertEqual(s.getvalue(),
                '123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                '\xea\xb0\x80\xc2\xac\xed\xa0\x88\xed\xbd\x85')

    def test_streamwriter_strwrite(self):
        # Writing a byte string (not unicode) through the writer must work.
        s = StringIO.StringIO()
        wr = codecs.getwriter('gb18030')(s)
        wr.write('abcd')
        self.assertEqual(s.getvalue(), 'abcd')


class Test_ISO2022(unittest.TestCase):
    """Checks specific to the ISO 2022 family of codecs."""

    def test_g2(self):
        iso2022jp2 = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        uni = u':hu4:unit\xe9 de famille'
        self.assertEqual(iso2022jp2.decode('iso2022-jp-2'), uni)

    def test_iso2022_jp_g0(self):
        # The encoded output must not contain a 0x0E byte, nor any byte
        # at or above 0x80.
        self.assertNotIn('\x0e', u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2'))
        for encoding in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            e = u'\u3406'.encode(encoding)
            self.assertFalse(filter(lambda x: x >= '\x80', e))

    def test_bug1572832(self):
        if sys.maxunicode >= 0x10000:
            myunichr = unichr
        else:
            # unichr() cannot be used directly here, so spell the code
            # point out as a surrogate pair.
            def myunichr(x):
                return unichr(0xD7C0+(x>>10)) + unichr(0xDC00+(x&0x3FF))

        for x in xrange(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            myunichr(x).encode('iso_2022_jp', 'ignore')


class TestStateful(unittest.TestCase):
    """Encoder output with and without the trailing reset sequence.

    Subclasses override the four class attributes to run the same checks
    against another encoding.
    """

    text = u'\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    # Output of the incremental encoder when no call is marked final.
    expected = b'\x1b$B@$@$'
    # Output of a one-shot encode, or of an incremental encode whose last
    # call is marked final.
    expected_reset = b'\x1b$B@$@$\x1b(B'

    def test_encode(self):
        self.assertEqual(self.text.encode(self.encoding), self.expected_reset)

    def test_incrementalencoder(self):
        encoder = codecs.getincrementalencoder(self.encoding)()
        output = b''.join(
            encoder.encode(char)
            for char in self.text)
        self.assertEqual(output, self.expected)

    def test_incrementalencoder_final(self):
        encoder = codecs.getincrementalencoder(self.encoding)()
        last_index = len(self.text) - 1
        # Only the call for the last character is marked final.
        output = b''.join(
            encoder.encode(char, index == last_index)
            for index, char in enumerate(self.text))
        self.assertEqual(output, self.expected_reset)


class TestHZStateful(TestStateful):
    """Runs the TestStateful checks against the 'hz' encoding."""

    text = u'\u804a\u804a'
    encoding = 'hz'
    expected = b'~{ADAD'
    expected_reset = b'~{ADAD~}'


def test_main():
    """Run every test case defined in this module."""
    test_support.run_unittest(__name__)


if __name__ == "__main__":
    test_main()