from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer)
from io import BytesIO
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token
import unittest


class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER is omitted for
    # brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER is omitted.
        result = []
        f = BytesIO(s.encode('utf-8'))
        for type, token, start, end, line in tokenize(f.readline):
            if type == ENDMARKER:
                break
            type = tok_name[type]
            result.append(f"    {type:10} {token!r:13} {start} {end}")
        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

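    # For example, "1 + 1" produces the NUMBER/OP/NUMBER rows checked in
    # test_basic() below; check_tokenize() prepends the ENCODING row itself,
    # so the expected tables never list it.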
    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'True'        (3, 4) (3, 8)
    OP         '='           (3, 9) (3, 10)
    NAME       'False'       (3, 11) (3, 16)
    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
    NEWLINE    '\\n'          (3, 26) (3, 27)
    DEDENT     ''            (4, 0) (4, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
        self.check_tokenize("x = 3.14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 314159.", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '314159.'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = .314159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '.314159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3e14159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3e14159'     (1, 4) (1, 11)
    """)
        self.check_tokenize("x = 3E123", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3E123'       (1, 4) (1, 9)
    """)
        self.check_tokenize("x+y = 3e-1230", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '+'           (1, 1) (1, 2)
    NAME       'y'           (1, 2) (1, 3)
    OP         '='           (1, 4) (1, 5)
    NUMBER     '3e-1230'     (1, 6) (1, 13)
    """)
        self.check_tokenize("x = 3.14e159", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '3.14e159'    (1, 4) (1, 12)
    """)

    def test_underscore_literals(self):
        def number_token(s):
            f = BytesIO(s.encode('utf-8'))
            for toktype, token, start, end, line in tokenize(f.readline):
                if toktype == NUMBER:
                    return token
            return 'invalid token'
        for lit in VALID_UNDERSCORE_LITERALS:
            if '(' in lit:
                # this won't work with compound complex inputs
                continue
            self.assertEqual(number_token(lit), lit)
        for lit in INVALID_UNDERSCORE_LITERALS:
            self.assertNotEqual(number_token(lit), lit)

    def test_string(self):
        # String literals
        self.check_tokenize("x = ''; y = \"\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "''"          (1, 4) (1, 6)
    OP         ';'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '='           (1, 10) (1, 11)
    STRING     '""'          (1, 12) (1, 14)
    """)
        self.check_tokenize("x = '\"'; y = \"'\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '\\'"\\''       (1, 4) (1, 7)
    OP         ';'           (1, 7) (1, 8)
    NAME       'y'           (1, 9) (1, 10)
    OP         '='           (1, 11) (1, 12)
    STRING     '"\\'"'        (1, 13) (1, 16)
    """)
        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"doesn\\'t "' (1, 4) (1, 14)
    NAME       'shrink'      (1, 14) (1, 20)
    STRING     '", does it"' (1, 20) (1, 31)
    """)
        self.check_tokenize("x = 'abc' + 'ABC'", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     "'abc'"       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     "'ABC'"       (1, 12) (1, 17)
    """)
        self.check_tokenize('y = "ABC" + "ABC"', """\
    NAME       'y'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    STRING     '"ABC"'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    STRING     '"ABC"'       (1, 12) (1, 17)
'"ABC"' (1, 12) (1, 17) 245 """) 246 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ 247 NAME 'x' (1, 0) (1, 1) 248 OP '=' (1, 2) (1, 3) 249 STRING "r'abc'" (1, 4) (1, 10) 250 OP '+' (1, 11) (1, 12) 251 STRING "r'ABC'" (1, 13) (1, 19) 252 OP '+' (1, 20) (1, 21) 253 STRING "R'ABC'" (1, 22) (1, 28) 254 OP '+' (1, 29) (1, 30) 255 STRING "R'ABC'" (1, 31) (1, 37) 256 """) 257 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ 258 NAME 'y' (1, 0) (1, 1) 259 OP '=' (1, 2) (1, 3) 260 STRING 'r"abc"' (1, 4) (1, 10) 261 OP '+' (1, 11) (1, 12) 262 STRING 'r"ABC"' (1, 13) (1, 19) 263 OP '+' (1, 20) (1, 21) 264 STRING 'R"ABC"' (1, 22) (1, 28) 265 OP '+' (1, 29) (1, 30) 266 STRING 'R"ABC"' (1, 31) (1, 37) 267 """) 268 269 self.check_tokenize("u'abc' + U'abc'", """\ 270 STRING "u'abc'" (1, 0) (1, 6) 271 OP '+' (1, 7) (1, 8) 272 STRING "U'abc'" (1, 9) (1, 15) 273 """) 274 self.check_tokenize('u"abc" + U"abc"', """\ 275 STRING 'u"abc"' (1, 0) (1, 6) 276 OP '+' (1, 7) (1, 8) 277 STRING 'U"abc"' (1, 9) (1, 15) 278 """) 279 280 self.check_tokenize("b'abc' + B'abc'", """\ 281 STRING "b'abc'" (1, 0) (1, 6) 282 OP '+' (1, 7) (1, 8) 283 STRING "B'abc'" (1, 9) (1, 15) 284 """) 285 self.check_tokenize('b"abc" + B"abc"', """\ 286 STRING 'b"abc"' (1, 0) (1, 6) 287 OP '+' (1, 7) (1, 8) 288 STRING 'B"abc"' (1, 9) (1, 15) 289 """) 290 self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\ 291 STRING "br'abc'" (1, 0) (1, 7) 292 OP '+' (1, 8) (1, 9) 293 STRING "bR'abc'" (1, 10) (1, 17) 294 OP '+' (1, 18) (1, 19) 295 STRING "Br'abc'" (1, 20) (1, 27) 296 OP '+' (1, 28) (1, 29) 297 STRING "BR'abc'" (1, 30) (1, 37) 298 """) 299 self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\ 300 STRING 'br"abc"' (1, 0) (1, 7) 301 OP '+' (1, 8) (1, 9) 302 STRING 'bR"abc"' (1, 10) (1, 17) 303 OP '+' (1, 18) (1, 19) 304 STRING 'Br"abc"' (1, 20) (1, 27) 305 OP '+' (1, 28) (1, 29) 306 STRING 'BR"abc"' (1, 30) (1, 37) 307 """) 308 self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\ 309 STRING "rb'abc'" (1, 0) (1, 7) 310 OP '+' (1, 8) (1, 9) 311 STRING "rB'abc'" (1, 10) (1, 17) 312 OP '+' (1, 18) (1, 19) 313 STRING "Rb'abc'" (1, 20) (1, 27) 314 OP '+' (1, 28) (1, 29) 315 STRING "RB'abc'" (1, 30) (1, 37) 316 """) 317 self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\ 318 STRING 'rb"abc"' (1, 0) (1, 7) 319 OP '+' (1, 8) (1, 9) 320 STRING 'rB"abc"' (1, 10) (1, 17) 321 OP '+' (1, 18) (1, 19) 322 STRING 'Rb"abc"' (1, 20) (1, 27) 323 OP '+' (1, 28) (1, 29) 324 STRING 'RB"abc"' (1, 30) (1, 37) 325 """) 326 # Check 0, 1, and 2 character string prefixes. 
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
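        # In this Python version, 'async' and 'await' are keywords only inside
        # an 'async def' block, so on their own they tokenize as plain NAME
        # tokens, while inside 'async def' they become ASYNC/AWAIT tokens, as
        # the tables below show.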
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    AWAIT      'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    AWAIT      'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    AWAIT      'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    ASYNC      'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    ASYNC      'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    AWAIT      'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    ASYNC      'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    ASYNC      'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    AWAIT      'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)


def decistmt(s):
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')
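
# decistmt() follows the example in the tokenize documentation: NUMBER tokens
# that contain a '.' are replaced by a Decimal('<literal>') call sequence and
# the 2-tuple stream is passed back through untokenize().  Because only
# 2-tuples are supplied, untokenize() uses its compat mode, which is why the
# expected output in test_decistmt() below reads "Decimal ('21.3e-5')" with a
# space after the name.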

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'.  The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end token
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end token
        tokens = list(_tokenize(readline, encoding=None))[:-1]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

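    # detect_encoding() reads at most the first two lines and returns the
    # encoding name plus the list of raw lines it actually consumed; the tests
    # below check both values for the BOM/coding-cookie combinations that
    # PEP 263 allows.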
    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertTrue(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consequent, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-2].string, 'OK')  # [-1] is always ENDMARKER

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 2 + num_optypes)
        self.assertEqual(token.tok_name[tokens[0].exact_type],
                         token.tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
                             token.tok_name[optypes[i]])
        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
                         token.tok_name[token.ENDMARKER])

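    # The 5-tuple API reports every operator as type OP; the named tuple's
    # exact_type attribute is what distinguishes, say, PLUS from MINUS, and
    # that is what assertExactTypeEqual() above verifies.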
    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                         'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n b\n c\n \\\n c\n')

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if the 3 pair tokenizations do not match.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces.  A proper test should test this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace after the colon. Note that we use hex escapes to make the
        # two trailing blanks apparent in the expected output.

        self.check_roundtrip("if x == 1 : \n"
                             "  print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else: print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions are
        # broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)


    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    unittest.main()