1from test import support
2from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
3                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
4                     open as tokenize_open, Untokenizer)
5from io import BytesIO
6from unittest import TestCase, mock
7from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
8                               INVALID_UNDERSCORE_LITERALS)
9import os
10import token
11
12
13class TokenizeTest(TestCase):
14    # Tests for the tokenize module.
15
16    # The tests can be really simple. Given a small fragment of source
17    # code, print out a table with tokens. The ENDMARKER is omitted for
18    # brevity.
19
20    def check_tokenize(self, s, expected):
21        # Format the tokens in s in a table format.
22        # The ENDMARKER is omitted.
23        result = []
24        f = BytesIO(s.encode('utf-8'))
25        for type, token, start, end, line in tokenize(f.readline):
26            if type == ENDMARKER:
27                break
28            type = tok_name[type]
29            result.append(f"    {type:10} {token!r:13} {start} {end}")
30        self.assertEqual(result,
31                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
32                         expected.rstrip().splitlines())
33
34    def test_basic(self):
35        self.check_tokenize("1 + 1", """\
36    NUMBER     '1'           (1, 0) (1, 1)
37    OP         '+'           (1, 2) (1, 3)
38    NUMBER     '1'           (1, 4) (1, 5)
39    """)
40        self.check_tokenize("if False:\n"
41                            "    # NL\n"
42                            "    True = False # NEWLINE\n", """\
43    NAME       'if'          (1, 0) (1, 2)
44    NAME       'False'       (1, 3) (1, 8)
45    OP         ':'           (1, 8) (1, 9)
46    NEWLINE    '\\n'          (1, 9) (1, 10)
47    COMMENT    '# NL'        (2, 4) (2, 8)
48    NL         '\\n'          (2, 8) (2, 9)
49    INDENT     '    '        (3, 0) (3, 4)
50    NAME       'True'        (3, 4) (3, 8)
51    OP         '='           (3, 9) (3, 10)
52    NAME       'False'       (3, 11) (3, 16)
53    COMMENT    '# NEWLINE'   (3, 17) (3, 26)
54    NEWLINE    '\\n'          (3, 26) (3, 27)
55    DEDENT     ''            (4, 0) (4, 0)
56    """)
57        indent_error_file = b"""\
58def k(x):
59    x += 2
60  x += 5
61"""
62        readline = BytesIO(indent_error_file).readline
63        with self.assertRaisesRegex(IndentationError,
64                                    "unindent does not match any "
65                                    "outer indentation level"):
66            for tok in tokenize(readline):
67                pass
68
69    def test_int(self):
70        # Ordinary integers and binary operators
71        self.check_tokenize("0xff <= 255", """\
72    NUMBER     '0xff'        (1, 0) (1, 4)
73    OP         '<='          (1, 5) (1, 7)
74    NUMBER     '255'         (1, 8) (1, 11)
75    """)
76        self.check_tokenize("0b10 <= 255", """\
77    NUMBER     '0b10'        (1, 0) (1, 4)
78    OP         '<='          (1, 5) (1, 7)
79    NUMBER     '255'         (1, 8) (1, 11)
80    """)
81        self.check_tokenize("0o123 <= 0O123", """\
82    NUMBER     '0o123'       (1, 0) (1, 5)
83    OP         '<='          (1, 6) (1, 8)
84    NUMBER     '0O123'       (1, 9) (1, 14)
85    """)
86        self.check_tokenize("1234567 > ~0x15", """\
87    NUMBER     '1234567'     (1, 0) (1, 7)
88    OP         '>'           (1, 8) (1, 9)
89    OP         '~'           (1, 10) (1, 11)
90    NUMBER     '0x15'        (1, 11) (1, 15)
91    """)
92        self.check_tokenize("2134568 != 1231515", """\
93    NUMBER     '2134568'     (1, 0) (1, 7)
94    OP         '!='          (1, 8) (1, 10)
95    NUMBER     '1231515'     (1, 11) (1, 18)
96    """)
97        self.check_tokenize("(-124561-1) & 200000000", """\
98    OP         '('           (1, 0) (1, 1)
99    OP         '-'           (1, 1) (1, 2)
100    NUMBER     '124561'      (1, 2) (1, 8)
101    OP         '-'           (1, 8) (1, 9)
102    NUMBER     '1'           (1, 9) (1, 10)
103    OP         ')'           (1, 10) (1, 11)
104    OP         '&'           (1, 12) (1, 13)
105    NUMBER     '200000000'   (1, 14) (1, 23)
106    """)
107        self.check_tokenize("0xdeadbeef != -1", """\
108    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
109    OP         '!='          (1, 11) (1, 13)
110    OP         '-'           (1, 14) (1, 15)
111    NUMBER     '1'           (1, 15) (1, 16)
112    """)
113        self.check_tokenize("0xdeadc0de & 12345", """\
114    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
115    OP         '&'           (1, 11) (1, 12)
116    NUMBER     '12345'       (1, 13) (1, 18)
117    """)
118        self.check_tokenize("0xFF & 0x15 | 1234", """\
119    NUMBER     '0xFF'        (1, 0) (1, 4)
120    OP         '&'           (1, 5) (1, 6)
121    NUMBER     '0x15'        (1, 7) (1, 11)
122    OP         '|'           (1, 12) (1, 13)
123    NUMBER     '1234'        (1, 14) (1, 18)
124    """)
125
126    def test_long(self):
127        # Long integers
128        self.check_tokenize("x = 0", """\
129    NAME       'x'           (1, 0) (1, 1)
130    OP         '='           (1, 2) (1, 3)
131    NUMBER     '0'           (1, 4) (1, 5)
132    """)
133        self.check_tokenize("x = 0xfffffffffff", """\
134    NAME       'x'           (1, 0) (1, 1)
135    OP         '='           (1, 2) (1, 3)
136    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
137    """)
138        self.check_tokenize("x = 123141242151251616110", """\
139    NAME       'x'           (1, 0) (1, 1)
140    OP         '='           (1, 2) (1, 3)
141    NUMBER     '123141242151251616110' (1, 4) (1, 25)
142    """)
143        self.check_tokenize("x = -15921590215012591", """\
144    NAME       'x'           (1, 0) (1, 1)
145    OP         '='           (1, 2) (1, 3)
146    OP         '-'           (1, 4) (1, 5)
147    NUMBER     '15921590215012591' (1, 5) (1, 22)
148    """)
149
150    def test_float(self):
151        # Floating point numbers
152        self.check_tokenize("x = 3.14159", """\
153    NAME       'x'           (1, 0) (1, 1)
154    OP         '='           (1, 2) (1, 3)
155    NUMBER     '3.14159'     (1, 4) (1, 11)
156    """)
157        self.check_tokenize("x = 314159.", """\
158    NAME       'x'           (1, 0) (1, 1)
159    OP         '='           (1, 2) (1, 3)
160    NUMBER     '314159.'     (1, 4) (1, 11)
161    """)
162        self.check_tokenize("x = .314159", """\
163    NAME       'x'           (1, 0) (1, 1)
164    OP         '='           (1, 2) (1, 3)
165    NUMBER     '.314159'     (1, 4) (1, 11)
166    """)
167        self.check_tokenize("x = 3e14159", """\
168    NAME       'x'           (1, 0) (1, 1)
169    OP         '='           (1, 2) (1, 3)
170    NUMBER     '3e14159'     (1, 4) (1, 11)
171    """)
172        self.check_tokenize("x = 3E123", """\
173    NAME       'x'           (1, 0) (1, 1)
174    OP         '='           (1, 2) (1, 3)
175    NUMBER     '3E123'       (1, 4) (1, 9)
176    """)
177        self.check_tokenize("x+y = 3e-1230", """\
178    NAME       'x'           (1, 0) (1, 1)
179    OP         '+'           (1, 1) (1, 2)
180    NAME       'y'           (1, 2) (1, 3)
181    OP         '='           (1, 4) (1, 5)
182    NUMBER     '3e-1230'     (1, 6) (1, 13)
183    """)
184        self.check_tokenize("x = 3.14e159", """\
185    NAME       'x'           (1, 0) (1, 1)
186    OP         '='           (1, 2) (1, 3)
187    NUMBER     '3.14e159'    (1, 4) (1, 12)
188    """)
189
190    def test_underscore_literals(self):
191        def number_token(s):
192            f = BytesIO(s.encode('utf-8'))
193            for toktype, token, start, end, line in tokenize(f.readline):
194                if toktype == NUMBER:
195                    return token
196            return 'invalid token'
197        for lit in VALID_UNDERSCORE_LITERALS:
198            if '(' in lit:
199                # this won't work with compound complex inputs
200                continue
201            self.assertEqual(number_token(lit), lit)
202        for lit in INVALID_UNDERSCORE_LITERALS:
203            self.assertNotEqual(number_token(lit), lit)
204
205    def test_string(self):
206        # String literals
207        self.check_tokenize("x = ''; y = \"\"", """\
208    NAME       'x'           (1, 0) (1, 1)
209    OP         '='           (1, 2) (1, 3)
210    STRING     "''"          (1, 4) (1, 6)
211    OP         ';'           (1, 6) (1, 7)
212    NAME       'y'           (1, 8) (1, 9)
213    OP         '='           (1, 10) (1, 11)
214    STRING     '""'          (1, 12) (1, 14)
215    """)
216        self.check_tokenize("x = '\"'; y = \"'\"", """\
217    NAME       'x'           (1, 0) (1, 1)
218    OP         '='           (1, 2) (1, 3)
219    STRING     '\\'"\\''       (1, 4) (1, 7)
220    OP         ';'           (1, 7) (1, 8)
221    NAME       'y'           (1, 9) (1, 10)
222    OP         '='           (1, 11) (1, 12)
223    STRING     '"\\'"'        (1, 13) (1, 16)
224    """)
225        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
226    NAME       'x'           (1, 0) (1, 1)
227    OP         '='           (1, 2) (1, 3)
228    STRING     '"doesn\\'t "' (1, 4) (1, 14)
229    NAME       'shrink'      (1, 14) (1, 20)
230    STRING     '", does it"' (1, 20) (1, 31)
231    """)
232        self.check_tokenize("x = 'abc' + 'ABC'", """\
233    NAME       'x'           (1, 0) (1, 1)
234    OP         '='           (1, 2) (1, 3)
235    STRING     "'abc'"       (1, 4) (1, 9)
236    OP         '+'           (1, 10) (1, 11)
237    STRING     "'ABC'"       (1, 12) (1, 17)
238    """)
239        self.check_tokenize('y = "ABC" + "ABC"', """\
240    NAME       'y'           (1, 0) (1, 1)
241    OP         '='           (1, 2) (1, 3)
242    STRING     '"ABC"'       (1, 4) (1, 9)
243    OP         '+'           (1, 10) (1, 11)
244    STRING     '"ABC"'       (1, 12) (1, 17)
245    """)
246        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
247    NAME       'x'           (1, 0) (1, 1)
248    OP         '='           (1, 2) (1, 3)
249    STRING     "r'abc'"      (1, 4) (1, 10)
250    OP         '+'           (1, 11) (1, 12)
251    STRING     "r'ABC'"      (1, 13) (1, 19)
252    OP         '+'           (1, 20) (1, 21)
253    STRING     "R'ABC'"      (1, 22) (1, 28)
254    OP         '+'           (1, 29) (1, 30)
255    STRING     "R'ABC'"      (1, 31) (1, 37)
256    """)
257        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
258    NAME       'y'           (1, 0) (1, 1)
259    OP         '='           (1, 2) (1, 3)
260    STRING     'r"abc"'      (1, 4) (1, 10)
261    OP         '+'           (1, 11) (1, 12)
262    STRING     'r"ABC"'      (1, 13) (1, 19)
263    OP         '+'           (1, 20) (1, 21)
264    STRING     'R"ABC"'      (1, 22) (1, 28)
265    OP         '+'           (1, 29) (1, 30)
266    STRING     'R"ABC"'      (1, 31) (1, 37)
267    """)
268
269        self.check_tokenize("u'abc' + U'abc'", """\
270    STRING     "u'abc'"      (1, 0) (1, 6)
271    OP         '+'           (1, 7) (1, 8)
272    STRING     "U'abc'"      (1, 9) (1, 15)
273    """)
274        self.check_tokenize('u"abc" + U"abc"', """\
275    STRING     'u"abc"'      (1, 0) (1, 6)
276    OP         '+'           (1, 7) (1, 8)
277    STRING     'U"abc"'      (1, 9) (1, 15)
278    """)
279
280        self.check_tokenize("b'abc' + B'abc'", """\
281    STRING     "b'abc'"      (1, 0) (1, 6)
282    OP         '+'           (1, 7) (1, 8)
283    STRING     "B'abc'"      (1, 9) (1, 15)
284    """)
285        self.check_tokenize('b"abc" + B"abc"', """\
286    STRING     'b"abc"'      (1, 0) (1, 6)
287    OP         '+'           (1, 7) (1, 8)
288    STRING     'B"abc"'      (1, 9) (1, 15)
289    """)
290        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
291    STRING     "br'abc'"     (1, 0) (1, 7)
292    OP         '+'           (1, 8) (1, 9)
293    STRING     "bR'abc'"     (1, 10) (1, 17)
294    OP         '+'           (1, 18) (1, 19)
295    STRING     "Br'abc'"     (1, 20) (1, 27)
296    OP         '+'           (1, 28) (1, 29)
297    STRING     "BR'abc'"     (1, 30) (1, 37)
298    """)
299        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
300    STRING     'br"abc"'     (1, 0) (1, 7)
301    OP         '+'           (1, 8) (1, 9)
302    STRING     'bR"abc"'     (1, 10) (1, 17)
303    OP         '+'           (1, 18) (1, 19)
304    STRING     'Br"abc"'     (1, 20) (1, 27)
305    OP         '+'           (1, 28) (1, 29)
306    STRING     'BR"abc"'     (1, 30) (1, 37)
307    """)
308        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
309    STRING     "rb'abc'"     (1, 0) (1, 7)
310    OP         '+'           (1, 8) (1, 9)
311    STRING     "rB'abc'"     (1, 10) (1, 17)
312    OP         '+'           (1, 18) (1, 19)
313    STRING     "Rb'abc'"     (1, 20) (1, 27)
314    OP         '+'           (1, 28) (1, 29)
315    STRING     "RB'abc'"     (1, 30) (1, 37)
316    """)
317        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
318    STRING     'rb"abc"'     (1, 0) (1, 7)
319    OP         '+'           (1, 8) (1, 9)
320    STRING     'rB"abc"'     (1, 10) (1, 17)
321    OP         '+'           (1, 18) (1, 19)
322    STRING     'Rb"abc"'     (1, 20) (1, 27)
323    OP         '+'           (1, 28) (1, 29)
324    STRING     'RB"abc"'     (1, 30) (1, 37)
325    """)
326        # Check 0, 1, and 2 character string prefixes.
327        self.check_tokenize(r'"a\
328de\
329fg"', """\
330    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
331    """)
332        self.check_tokenize(r'u"a\
333de"', """\
334    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
335    """)
336        self.check_tokenize(r'rb"a\
337d"', """\
338    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
339    """)
340        self.check_tokenize(r'"""a\
341b"""', """\
342    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
343    """)
344        self.check_tokenize(r'u"""a\
345b"""', """\
346    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
347    """)
348        self.check_tokenize(r'rb"""a\
349b\
350c"""', """\
351    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
352    """)
353        self.check_tokenize('f"abc"', """\
354    STRING     'f"abc"'      (1, 0) (1, 6)
355    """)
356        self.check_tokenize('fR"a{b}c"', """\
357    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
358    """)
359        self.check_tokenize('f"""abc"""', """\
360    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
361    """)
362        self.check_tokenize(r'f"abc\
363def"', """\
364    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
365    """)
366        self.check_tokenize(r'Rf"abc\
367def"', """\
368    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
369    """)
370
371    def test_function(self):
372        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
373    NAME       'def'         (1, 0) (1, 3)
374    NAME       'd22'         (1, 4) (1, 7)
375    OP         '('           (1, 7) (1, 8)
376    NAME       'a'           (1, 8) (1, 9)
377    OP         ','           (1, 9) (1, 10)
378    NAME       'b'           (1, 11) (1, 12)
379    OP         ','           (1, 12) (1, 13)
380    NAME       'c'           (1, 14) (1, 15)
381    OP         '='           (1, 15) (1, 16)
382    NUMBER     '2'           (1, 16) (1, 17)
383    OP         ','           (1, 17) (1, 18)
384    NAME       'd'           (1, 19) (1, 20)
385    OP         '='           (1, 20) (1, 21)
386    NUMBER     '2'           (1, 21) (1, 22)
387    OP         ','           (1, 22) (1, 23)
388    OP         '*'           (1, 24) (1, 25)
389    NAME       'k'           (1, 25) (1, 26)
390    OP         ')'           (1, 26) (1, 27)
391    OP         ':'           (1, 27) (1, 28)
392    NAME       'pass'        (1, 29) (1, 33)
393    """)
394        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
395    NAME       'def'         (1, 0) (1, 3)
396    NAME       'd01v_'       (1, 4) (1, 9)
397    OP         '('           (1, 9) (1, 10)
398    NAME       'a'           (1, 10) (1, 11)
399    OP         '='           (1, 11) (1, 12)
400    NUMBER     '1'           (1, 12) (1, 13)
401    OP         ','           (1, 13) (1, 14)
402    OP         '*'           (1, 15) (1, 16)
403    NAME       'k'           (1, 16) (1, 17)
404    OP         ','           (1, 17) (1, 18)
405    OP         '**'          (1, 19) (1, 21)
406    NAME       'w'           (1, 21) (1, 22)
407    OP         ')'           (1, 22) (1, 23)
408    OP         ':'           (1, 23) (1, 24)
409    NAME       'pass'        (1, 25) (1, 29)
410    """)
411
412    def test_comparison(self):
413        # Comparison
414        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
415                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
416    NAME       'if'          (1, 0) (1, 2)
417    NUMBER     '1'           (1, 3) (1, 4)
418    OP         '<'           (1, 5) (1, 6)
419    NUMBER     '1'           (1, 7) (1, 8)
420    OP         '>'           (1, 9) (1, 10)
421    NUMBER     '1'           (1, 11) (1, 12)
422    OP         '=='          (1, 13) (1, 15)
423    NUMBER     '1'           (1, 16) (1, 17)
424    OP         '>='          (1, 18) (1, 20)
425    NUMBER     '5'           (1, 21) (1, 22)
426    OP         '<='          (1, 23) (1, 25)
427    NUMBER     '0x15'        (1, 26) (1, 30)
428    OP         '<='          (1, 31) (1, 33)
429    NUMBER     '0x12'        (1, 34) (1, 38)
430    OP         '!='          (1, 39) (1, 41)
431    NUMBER     '1'           (1, 42) (1, 43)
432    NAME       'and'         (1, 44) (1, 47)
433    NUMBER     '5'           (1, 48) (1, 49)
434    NAME       'in'          (1, 50) (1, 52)
435    NUMBER     '1'           (1, 53) (1, 54)
436    NAME       'not'         (1, 55) (1, 58)
437    NAME       'in'          (1, 59) (1, 61)
438    NUMBER     '1'           (1, 62) (1, 63)
439    NAME       'is'          (1, 64) (1, 66)
440    NUMBER     '1'           (1, 67) (1, 68)
441    NAME       'or'          (1, 69) (1, 71)
442    NUMBER     '5'           (1, 72) (1, 73)
443    NAME       'is'          (1, 74) (1, 76)
444    NAME       'not'         (1, 77) (1, 80)
445    NUMBER     '1'           (1, 81) (1, 82)
446    OP         ':'           (1, 82) (1, 83)
447    NAME       'pass'        (1, 84) (1, 88)
448    """)
449
450    def test_shift(self):
451        # Shift
452        self.check_tokenize("x = 1 << 1 >> 5", """\
453    NAME       'x'           (1, 0) (1, 1)
454    OP         '='           (1, 2) (1, 3)
455    NUMBER     '1'           (1, 4) (1, 5)
456    OP         '<<'          (1, 6) (1, 8)
457    NUMBER     '1'           (1, 9) (1, 10)
458    OP         '>>'          (1, 11) (1, 13)
459    NUMBER     '5'           (1, 14) (1, 15)
460    """)
461
462    def test_additive(self):
463        # Additive
464        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
465    NAME       'x'           (1, 0) (1, 1)
466    OP         '='           (1, 2) (1, 3)
467    NUMBER     '1'           (1, 4) (1, 5)
468    OP         '-'           (1, 6) (1, 7)
469    NAME       'y'           (1, 8) (1, 9)
470    OP         '+'           (1, 10) (1, 11)
471    NUMBER     '15'          (1, 12) (1, 14)
472    OP         '-'           (1, 15) (1, 16)
473    NUMBER     '1'           (1, 17) (1, 18)
474    OP         '+'           (1, 19) (1, 20)
475    NUMBER     '0x124'       (1, 21) (1, 26)
476    OP         '+'           (1, 27) (1, 28)
477    NAME       'z'           (1, 29) (1, 30)
478    OP         '+'           (1, 31) (1, 32)
479    NAME       'a'           (1, 33) (1, 34)
480    OP         '['           (1, 34) (1, 35)
481    NUMBER     '5'           (1, 35) (1, 36)
482    OP         ']'           (1, 36) (1, 37)
483    """)
484
485    def test_multiplicative(self):
486        # Multiplicative
487        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
488    NAME       'x'           (1, 0) (1, 1)
489    OP         '='           (1, 2) (1, 3)
490    NUMBER     '1'           (1, 4) (1, 5)
491    OP         '//'          (1, 5) (1, 7)
492    NUMBER     '1'           (1, 7) (1, 8)
493    OP         '*'           (1, 8) (1, 9)
494    NUMBER     '1'           (1, 9) (1, 10)
495    OP         '/'           (1, 10) (1, 11)
496    NUMBER     '5'           (1, 11) (1, 12)
497    OP         '*'           (1, 12) (1, 13)
498    NUMBER     '12'          (1, 13) (1, 15)
499    OP         '%'           (1, 15) (1, 16)
500    NUMBER     '0x12'        (1, 16) (1, 20)
501    OP         '@'           (1, 20) (1, 21)
502    NUMBER     '42'          (1, 21) (1, 23)
503    """)
504
505    def test_unary(self):
506        # Unary
507        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
508    OP         '~'           (1, 0) (1, 1)
509    NUMBER     '1'           (1, 1) (1, 2)
510    OP         '^'           (1, 3) (1, 4)
511    NUMBER     '1'           (1, 5) (1, 6)
512    OP         '&'           (1, 7) (1, 8)
513    NUMBER     '1'           (1, 9) (1, 10)
514    OP         '|'           (1, 11) (1, 12)
515    NUMBER     '1'           (1, 12) (1, 13)
516    OP         '^'           (1, 14) (1, 15)
517    OP         '-'           (1, 16) (1, 17)
518    NUMBER     '1'           (1, 17) (1, 18)
519    """)
520        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
521    OP         '-'           (1, 0) (1, 1)
522    NUMBER     '1'           (1, 1) (1, 2)
523    OP         '*'           (1, 2) (1, 3)
524    NUMBER     '1'           (1, 3) (1, 4)
525    OP         '/'           (1, 4) (1, 5)
526    NUMBER     '1'           (1, 5) (1, 6)
527    OP         '+'           (1, 6) (1, 7)
528    NUMBER     '1'           (1, 7) (1, 8)
529    OP         '*'           (1, 8) (1, 9)
530    NUMBER     '1'           (1, 9) (1, 10)
531    OP         '//'          (1, 10) (1, 12)
532    NUMBER     '1'           (1, 12) (1, 13)
533    OP         '-'           (1, 14) (1, 15)
534    OP         '-'           (1, 16) (1, 17)
535    OP         '-'           (1, 17) (1, 18)
536    OP         '-'           (1, 18) (1, 19)
537    NUMBER     '1'           (1, 19) (1, 20)
538    OP         '**'          (1, 20) (1, 22)
539    NUMBER     '1'           (1, 22) (1, 23)
540    """)
541
542    def test_selector(self):
543        # Selector
544        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
545    NAME       'import'      (1, 0) (1, 6)
546    NAME       'sys'         (1, 7) (1, 10)
547    OP         ','           (1, 10) (1, 11)
548    NAME       'time'        (1, 12) (1, 16)
549    NEWLINE    '\\n'          (1, 16) (1, 17)
550    NAME       'x'           (2, 0) (2, 1)
551    OP         '='           (2, 2) (2, 3)
552    NAME       'sys'         (2, 4) (2, 7)
553    OP         '.'           (2, 7) (2, 8)
554    NAME       'modules'     (2, 8) (2, 15)
555    OP         '['           (2, 15) (2, 16)
556    STRING     "'time'"      (2, 16) (2, 22)
557    OP         ']'           (2, 22) (2, 23)
558    OP         '.'           (2, 23) (2, 24)
559    NAME       'time'        (2, 24) (2, 28)
560    OP         '('           (2, 28) (2, 29)
561    OP         ')'           (2, 29) (2, 30)
562    """)
563
564    def test_method(self):
565        # Methods
566        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
567    OP         '@'           (1, 0) (1, 1)
568    NAME       'staticmethod' (1, 1) (1, 13)
569    NEWLINE    '\\n'          (1, 13) (1, 14)
570    NAME       'def'         (2, 0) (2, 3)
571    NAME       'foo'         (2, 4) (2, 7)
572    OP         '('           (2, 7) (2, 8)
573    NAME       'x'           (2, 8) (2, 9)
574    OP         ','           (2, 9) (2, 10)
575    NAME       'y'           (2, 10) (2, 11)
576    OP         ')'           (2, 11) (2, 12)
577    OP         ':'           (2, 12) (2, 13)
578    NAME       'pass'        (2, 14) (2, 18)
579    """)
580
581    def test_tabs(self):
582        # Evil tabs
583        self.check_tokenize("def f():\n"
584                            "\tif x\n"
585                            "        \tpass", """\
586    NAME       'def'         (1, 0) (1, 3)
587    NAME       'f'           (1, 4) (1, 5)
588    OP         '('           (1, 5) (1, 6)
589    OP         ')'           (1, 6) (1, 7)
590    OP         ':'           (1, 7) (1, 8)
591    NEWLINE    '\\n'          (1, 8) (1, 9)
592    INDENT     '\\t'          (2, 0) (2, 1)
593    NAME       'if'          (2, 1) (2, 3)
594    NAME       'x'           (2, 4) (2, 5)
595    NEWLINE    '\\n'          (2, 5) (2, 6)
596    INDENT     '        \\t'  (3, 0) (3, 9)
597    NAME       'pass'        (3, 9) (3, 13)
598    DEDENT     ''            (4, 0) (4, 0)
599    DEDENT     ''            (4, 0) (4, 0)
600    """)
601
602    def test_non_ascii_identifiers(self):
603        # Non-ascii identifiers
604        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
605    NAME       'Örter'       (1, 0) (1, 5)
606    OP         '='           (1, 6) (1, 7)
607    STRING     "'places'"    (1, 8) (1, 16)
608    NEWLINE    '\\n'          (1, 16) (1, 17)
609    NAME       'grün'        (2, 0) (2, 4)
610    OP         '='           (2, 5) (2, 6)
611    STRING     "'green'"     (2, 7) (2, 14)
612    """)
613
614    def test_unicode(self):
615        # Legacy unicode literals:
616        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
617    NAME       'Örter'       (1, 0) (1, 5)
618    OP         '='           (1, 6) (1, 7)
619    STRING     "u'places'"   (1, 8) (1, 17)
620    NEWLINE    '\\n'          (1, 17) (1, 18)
621    NAME       'grün'        (2, 0) (2, 4)
622    OP         '='           (2, 5) (2, 6)
623    STRING     "U'green'"    (2, 7) (2, 15)
624    """)
625
626    def test_async(self):
627        # Async/await extension:
628        self.check_tokenize("async = 1", """\
629    NAME       'async'       (1, 0) (1, 5)
630    OP         '='           (1, 6) (1, 7)
631    NUMBER     '1'           (1, 8) (1, 9)
632    """)
633
634        self.check_tokenize("a = (async = 1)", """\
635    NAME       'a'           (1, 0) (1, 1)
636    OP         '='           (1, 2) (1, 3)
637    OP         '('           (1, 4) (1, 5)
638    NAME       'async'       (1, 5) (1, 10)
639    OP         '='           (1, 11) (1, 12)
640    NUMBER     '1'           (1, 13) (1, 14)
641    OP         ')'           (1, 14) (1, 15)
642    """)
643
644        self.check_tokenize("async()", """\
645    NAME       'async'       (1, 0) (1, 5)
646    OP         '('           (1, 5) (1, 6)
647    OP         ')'           (1, 6) (1, 7)
648    """)
649
650        self.check_tokenize("class async(Bar):pass", """\
651    NAME       'class'       (1, 0) (1, 5)
652    NAME       'async'       (1, 6) (1, 11)
653    OP         '('           (1, 11) (1, 12)
654    NAME       'Bar'         (1, 12) (1, 15)
655    OP         ')'           (1, 15) (1, 16)
656    OP         ':'           (1, 16) (1, 17)
657    NAME       'pass'        (1, 17) (1, 21)
658    """)
659
660        self.check_tokenize("class async:pass", """\
661    NAME       'class'       (1, 0) (1, 5)
662    NAME       'async'       (1, 6) (1, 11)
663    OP         ':'           (1, 11) (1, 12)
664    NAME       'pass'        (1, 12) (1, 16)
665    """)
666
667        self.check_tokenize("await = 1", """\
668    NAME       'await'       (1, 0) (1, 5)
669    OP         '='           (1, 6) (1, 7)
670    NUMBER     '1'           (1, 8) (1, 9)
671    """)
672
673        self.check_tokenize("foo.async", """\
674    NAME       'foo'         (1, 0) (1, 3)
675    OP         '.'           (1, 3) (1, 4)
676    NAME       'async'       (1, 4) (1, 9)
677    """)
678
679        self.check_tokenize("async for a in b: pass", """\
680    NAME       'async'       (1, 0) (1, 5)
681    NAME       'for'         (1, 6) (1, 9)
682    NAME       'a'           (1, 10) (1, 11)
683    NAME       'in'          (1, 12) (1, 14)
684    NAME       'b'           (1, 15) (1, 16)
685    OP         ':'           (1, 16) (1, 17)
686    NAME       'pass'        (1, 18) (1, 22)
687    """)
688
689        self.check_tokenize("async with a as b: pass", """\
690    NAME       'async'       (1, 0) (1, 5)
691    NAME       'with'        (1, 6) (1, 10)
692    NAME       'a'           (1, 11) (1, 12)
693    NAME       'as'          (1, 13) (1, 15)
694    NAME       'b'           (1, 16) (1, 17)
695    OP         ':'           (1, 17) (1, 18)
696    NAME       'pass'        (1, 19) (1, 23)
697    """)
698
699        self.check_tokenize("async.foo", """\
700    NAME       'async'       (1, 0) (1, 5)
701    OP         '.'           (1, 5) (1, 6)
702    NAME       'foo'         (1, 6) (1, 9)
703    """)
704
705        self.check_tokenize("async", """\
706    NAME       'async'       (1, 0) (1, 5)
707    """)
708
709        self.check_tokenize("async\n#comment\nawait", """\
710    NAME       'async'       (1, 0) (1, 5)
711    NEWLINE    '\\n'          (1, 5) (1, 6)
712    COMMENT    '#comment'    (2, 0) (2, 8)
713    NL         '\\n'          (2, 8) (2, 9)
714    NAME       'await'       (3, 0) (3, 5)
715    """)
716
717        self.check_tokenize("async\n...\nawait", """\
718    NAME       'async'       (1, 0) (1, 5)
719    NEWLINE    '\\n'          (1, 5) (1, 6)
720    OP         '...'         (2, 0) (2, 3)
721    NEWLINE    '\\n'          (2, 3) (2, 4)
722    NAME       'await'       (3, 0) (3, 5)
723    """)
724
725        self.check_tokenize("async\nawait", """\
726    NAME       'async'       (1, 0) (1, 5)
727    NEWLINE    '\\n'          (1, 5) (1, 6)
728    NAME       'await'       (2, 0) (2, 5)
729    """)
730
731        self.check_tokenize("foo.async + 1", """\
732    NAME       'foo'         (1, 0) (1, 3)
733    OP         '.'           (1, 3) (1, 4)
734    NAME       'async'       (1, 4) (1, 9)
735    OP         '+'           (1, 10) (1, 11)
736    NUMBER     '1'           (1, 12) (1, 13)
737    """)
738
739        self.check_tokenize("async def foo(): pass", """\
740    ASYNC      'async'       (1, 0) (1, 5)
741    NAME       'def'         (1, 6) (1, 9)
742    NAME       'foo'         (1, 10) (1, 13)
743    OP         '('           (1, 13) (1, 14)
744    OP         ')'           (1, 14) (1, 15)
745    OP         ':'           (1, 15) (1, 16)
746    NAME       'pass'        (1, 17) (1, 21)
747    """)
748
749        self.check_tokenize('''\
750async def foo():
751  def foo(await):
752    await = 1
753  if 1:
754    await
755async += 1
756''', """\
757    ASYNC      'async'       (1, 0) (1, 5)
758    NAME       'def'         (1, 6) (1, 9)
759    NAME       'foo'         (1, 10) (1, 13)
760    OP         '('           (1, 13) (1, 14)
761    OP         ')'           (1, 14) (1, 15)
762    OP         ':'           (1, 15) (1, 16)
763    NEWLINE    '\\n'          (1, 16) (1, 17)
764    INDENT     '  '          (2, 0) (2, 2)
765    NAME       'def'         (2, 2) (2, 5)
766    NAME       'foo'         (2, 6) (2, 9)
767    OP         '('           (2, 9) (2, 10)
768    AWAIT      'await'       (2, 10) (2, 15)
769    OP         ')'           (2, 15) (2, 16)
770    OP         ':'           (2, 16) (2, 17)
771    NEWLINE    '\\n'          (2, 17) (2, 18)
772    INDENT     '    '        (3, 0) (3, 4)
773    AWAIT      'await'       (3, 4) (3, 9)
774    OP         '='           (3, 10) (3, 11)
775    NUMBER     '1'           (3, 12) (3, 13)
776    NEWLINE    '\\n'          (3, 13) (3, 14)
777    DEDENT     ''            (4, 2) (4, 2)
778    NAME       'if'          (4, 2) (4, 4)
779    NUMBER     '1'           (4, 5) (4, 6)
780    OP         ':'           (4, 6) (4, 7)
781    NEWLINE    '\\n'          (4, 7) (4, 8)
782    INDENT     '    '        (5, 0) (5, 4)
783    AWAIT      'await'       (5, 4) (5, 9)
784    NEWLINE    '\\n'          (5, 9) (5, 10)
785    DEDENT     ''            (6, 0) (6, 0)
786    DEDENT     ''            (6, 0) (6, 0)
787    NAME       'async'       (6, 0) (6, 5)
788    OP         '+='          (6, 6) (6, 8)
789    NUMBER     '1'           (6, 9) (6, 10)
790    NEWLINE    '\\n'          (6, 10) (6, 11)
791    """)
792
793        self.check_tokenize('''\
794async def foo():
795  async for i in 1: pass''', """\
796    ASYNC      'async'       (1, 0) (1, 5)
797    NAME       'def'         (1, 6) (1, 9)
798    NAME       'foo'         (1, 10) (1, 13)
799    OP         '('           (1, 13) (1, 14)
800    OP         ')'           (1, 14) (1, 15)
801    OP         ':'           (1, 15) (1, 16)
802    NEWLINE    '\\n'          (1, 16) (1, 17)
803    INDENT     '  '          (2, 0) (2, 2)
804    ASYNC      'async'       (2, 2) (2, 7)
805    NAME       'for'         (2, 8) (2, 11)
806    NAME       'i'           (2, 12) (2, 13)
807    NAME       'in'          (2, 14) (2, 16)
808    NUMBER     '1'           (2, 17) (2, 18)
809    OP         ':'           (2, 18) (2, 19)
810    NAME       'pass'        (2, 20) (2, 24)
811    DEDENT     ''            (3, 0) (3, 0)
812    """)
813
814        self.check_tokenize('''async def foo(async): await''', """\
815    ASYNC      'async'       (1, 0) (1, 5)
816    NAME       'def'         (1, 6) (1, 9)
817    NAME       'foo'         (1, 10) (1, 13)
818    OP         '('           (1, 13) (1, 14)
819    ASYNC      'async'       (1, 14) (1, 19)
820    OP         ')'           (1, 19) (1, 20)
821    OP         ':'           (1, 20) (1, 21)
822    AWAIT      'await'       (1, 22) (1, 27)
823    """)
824
825        self.check_tokenize('''\
826def f():
827
828  def baz(): pass
829  async def bar(): pass
830
831  await = 2''', """\
832    NAME       'def'         (1, 0) (1, 3)
833    NAME       'f'           (1, 4) (1, 5)
834    OP         '('           (1, 5) (1, 6)
835    OP         ')'           (1, 6) (1, 7)
836    OP         ':'           (1, 7) (1, 8)
837    NEWLINE    '\\n'          (1, 8) (1, 9)
838    NL         '\\n'          (2, 0) (2, 1)
839    INDENT     '  '          (3, 0) (3, 2)
840    NAME       'def'         (3, 2) (3, 5)
841    NAME       'baz'         (3, 6) (3, 9)
842    OP         '('           (3, 9) (3, 10)
843    OP         ')'           (3, 10) (3, 11)
844    OP         ':'           (3, 11) (3, 12)
845    NAME       'pass'        (3, 13) (3, 17)
846    NEWLINE    '\\n'          (3, 17) (3, 18)
847    ASYNC      'async'       (4, 2) (4, 7)
848    NAME       'def'         (4, 8) (4, 11)
849    NAME       'bar'         (4, 12) (4, 15)
850    OP         '('           (4, 15) (4, 16)
851    OP         ')'           (4, 16) (4, 17)
852    OP         ':'           (4, 17) (4, 18)
853    NAME       'pass'        (4, 19) (4, 23)
854    NEWLINE    '\\n'          (4, 23) (4, 24)
855    NL         '\\n'          (5, 0) (5, 1)
856    NAME       'await'       (6, 2) (6, 7)
857    OP         '='           (6, 8) (6, 9)
858    NUMBER     '2'           (6, 10) (6, 11)
859    DEDENT     ''            (7, 0) (7, 0)
860    """)
861
862        self.check_tokenize('''\
863async def f():
864
865  def baz(): pass
866  async def bar(): pass
867
868  await = 2''', """\
869    ASYNC      'async'       (1, 0) (1, 5)
870    NAME       'def'         (1, 6) (1, 9)
871    NAME       'f'           (1, 10) (1, 11)
872    OP         '('           (1, 11) (1, 12)
873    OP         ')'           (1, 12) (1, 13)
874    OP         ':'           (1, 13) (1, 14)
875    NEWLINE    '\\n'          (1, 14) (1, 15)
876    NL         '\\n'          (2, 0) (2, 1)
877    INDENT     '  '          (3, 0) (3, 2)
878    NAME       'def'         (3, 2) (3, 5)
879    NAME       'baz'         (3, 6) (3, 9)
880    OP         '('           (3, 9) (3, 10)
881    OP         ')'           (3, 10) (3, 11)
882    OP         ':'           (3, 11) (3, 12)
883    NAME       'pass'        (3, 13) (3, 17)
884    NEWLINE    '\\n'          (3, 17) (3, 18)
885    ASYNC      'async'       (4, 2) (4, 7)
886    NAME       'def'         (4, 8) (4, 11)
887    NAME       'bar'         (4, 12) (4, 15)
888    OP         '('           (4, 15) (4, 16)
889    OP         ')'           (4, 16) (4, 17)
890    OP         ':'           (4, 17) (4, 18)
891    NAME       'pass'        (4, 19) (4, 23)
892    NEWLINE    '\\n'          (4, 23) (4, 24)
893    NL         '\\n'          (5, 0) (5, 1)
894    AWAIT      'await'       (6, 2) (6, 7)
895    OP         '='           (6, 8) (6, 9)
896    NUMBER     '2'           (6, 10) (6, 11)
897    DEDENT     ''            (7, 0) (7, 0)
898    """)
899
900
901def decistmt(s):
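    """Replace each NUMBER token containing a '.' with a Decimal(...) call.

    A minimal sketch of the behaviour, mirrored by test_decistmt below:
    the statement string +21.3e-5*-.1234/81.7 comes back as
    +Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7'); every other
    token is passed through unchanged.
    """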
902    result = []
903    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
904    for toknum, tokval, _, _, _  in g:
905        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
906            result.extend([
907                (NAME, 'Decimal'),
908                (OP, '('),
909                (STRING, repr(tokval)),
910                (OP, ')')
911            ])
912        else:
913            result.append((toknum, tokval))
914    return untokenize(result).decode('utf-8')
915
916class TestMisc(TestCase):
917
918    def test_decistmt(self):
919        # Substitute Decimals for floats in a string of statements.
920        # This is an example from the docs.
921
922        from decimal import Decimal
923        s = '+21.3e-5*-.1234/81.7'
924        self.assertEqual(decistmt(s),
925                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
926
927        # The format of the exponent is inherited from the platform C library.
928        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
929        # we're only showing 11 digits, and the 12th isn't close to 5, the
930        # rest of the output should be platform-independent.
931        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
932
933        # Output from calculations with Decimal should be identical across all
934        # platforms.
935        self.assertEqual(eval(decistmt(s)),
936                         Decimal('-3.217160342717258261933904529E-7'))
937
938
939class TestTokenizerAdheresToPep0263(TestCase):
940    """
941    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
942    """
943
944    def _testFile(self, filename):
945        path = os.path.join(os.path.dirname(__file__), filename)
946        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
947
948    def test_utf8_coding_cookie_and_no_utf8_bom(self):
949        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
950        self._testFile(f)
951
952    def test_latin1_coding_cookie_and_utf8_bom(self):
953        """
954        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
955        allowed encoding for the comment is 'utf-8'.  The text file used in
956        this test starts with a BOM signature, but specifies latin1 as the
957        coding, so verify that a SyntaxError is raised, which matches the
958        behaviour of the interpreter when it encounters a similar condition.
959        """
960        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
961        self.assertRaises(SyntaxError, self._testFile, f)
962
963    def test_no_coding_cookie_and_utf8_bom(self):
964        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
965        self._testFile(f)
966
967    def test_utf8_coding_cookie_and_utf8_bom(self):
968        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
969        self._testFile(f)
970
971    def test_bad_coding_cookie(self):
972        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
973        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
974
975
976class Test_Tokenize(TestCase):
977
978    def test__tokenize_decodes_with_specified_encoding(self):
979        literal = '"ЉЊЈЁЂ"'
980        line = literal.encode('utf-8')
981        first = False
982        def readline():
983            nonlocal first
984            if not first:
985                first = True
986                return line
987            else:
988                return b''
989
990        # skip the initial encoding token and the end token
991        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-1]
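        # The 3 in the expected tuple below is token.STRING.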
992        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
993        self.assertEqual(tokens, expected_tokens,
994                         "bytes not decoded with encoding")
995
996    def test__tokenize_does_not_decode_with_encoding_none(self):
997        literal = '"ЉЊЈЁЂ"'
998        first = False
999        def readline():
1000            nonlocal first
1001            if not first:
1002                first = True
1003                return literal
1004            else:
1005                return b''
1006
1007        # skip the end token
1008        tokens = list(_tokenize(readline, encoding=None))[:-1]
1009        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1010        self.assertEqual(tokens, expected_tokens,
1011                         "string not tokenized when encoding is None")
1012
1013
1014class TestDetectEncoding(TestCase):
1015
1016    def get_readline(self, lines):
1017        index = 0
1018        def readline():
1019            nonlocal index
1020            if index == len(lines):
1021                raise StopIteration
1022            line = lines[index]
1023            index += 1
1024            return line
1025        return readline
1026
1027    def test_no_bom_no_encoding_cookie(self):
1028        lines = (
1029            b'# something\n',
1030            b'print(something)\n',
1031            b'do_something(else)\n'
1032        )
1033        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1034        self.assertEqual(encoding, 'utf-8')
1035        self.assertEqual(consumed_lines, list(lines[:2]))
1036
1037    def test_bom_no_cookie(self):
1038        lines = (
1039            b'\xef\xbb\xbf# something\n',
1040            b'print(something)\n',
1041            b'do_something(else)\n'
1042        )
1043        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1044        self.assertEqual(encoding, 'utf-8-sig')
1045        self.assertEqual(consumed_lines,
1046                         [b'# something\n', b'print(something)\n'])
1047
1048    def test_cookie_first_line_no_bom(self):
1049        lines = (
1050            b'# -*- coding: latin-1 -*-\n',
1051            b'print(something)\n',
1052            b'do_something(else)\n'
1053        )
1054        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1055        self.assertEqual(encoding, 'iso-8859-1')
1056        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
1057
1058    def test_matched_bom_and_cookie_first_line(self):
1059        lines = (
1060            b'\xef\xbb\xbf# coding=utf-8\n',
1061            b'print(something)\n',
1062            b'do_something(else)\n'
1063        )
1064        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1065        self.assertEqual(encoding, 'utf-8-sig')
1066        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
1067
1068    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1069        lines = (
1070            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1071            b'print(something)\n',
1072            b'do_something(else)\n'
1073        )
1074        readline = self.get_readline(lines)
1075        self.assertRaises(SyntaxError, detect_encoding, readline)
1076
1077    def test_cookie_second_line_no_bom(self):
1078        lines = (
1079            b'#! something\n',
1080            b'# vim: set fileencoding=ascii :\n',
1081            b'print(something)\n',
1082            b'do_something(else)\n'
1083        )
1084        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1085        self.assertEqual(encoding, 'ascii')
1086        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
1087        self.assertEqual(consumed_lines, expected)
1088
1089    def test_matched_bom_and_cookie_second_line(self):
1090        lines = (
1091            b'\xef\xbb\xbf#! something\n',
1092            b'f# coding=utf-8\n',
1093            b'print(something)\n',
1094            b'do_something(else)\n'
1095        )
1096        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1097        self.assertEqual(encoding, 'utf-8-sig')
1098        self.assertEqual(consumed_lines,
1099                         [b'#! something\n', b'f# coding=utf-8\n'])
1100
1101    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1102        lines = (
1103            b'\xef\xbb\xbf#! something\n',
1104            b'# vim: set fileencoding=ascii :\n',
1105            b'print(something)\n',
1106            b'do_something(else)\n'
1107        )
1108        readline = self.get_readline(lines)
1109        self.assertRaises(SyntaxError, detect_encoding, readline)
1110
1111    def test_cookie_second_line_noncommented_first_line(self):
1112        lines = (
1113            b"print('\xc2\xa3')\n",
1114            b'# vim: set fileencoding=iso8859-15 :\n',
1115            b"print('\xe2\x82\xac')\n"
1116        )
1117        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1118        self.assertEqual(encoding, 'utf-8')
1119        expected = [b"print('\xc2\xa3')\n"]
1120        self.assertEqual(consumed_lines, expected)
1121
1122    def test_cookie_second_line_commented_first_line(self):
1123        lines = (
1124            b"#print('\xc2\xa3')\n",
1125            b'# vim: set fileencoding=iso8859-15 :\n',
1126            b"print('\xe2\x82\xac')\n"
1127        )
1128        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1129        self.assertEqual(encoding, 'iso8859-15')
1130        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1131        self.assertEqual(consumed_lines, expected)
1132
1133    def test_cookie_second_line_empty_first_line(self):
1134        lines = (
1135            b'\n',
1136            b'# vim: set fileencoding=iso8859-15 :\n',
1137            b"print('\xe2\x82\xac')\n"
1138        )
1139        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1140        self.assertEqual(encoding, 'iso8859-15')
1141        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1142        self.assertEqual(consumed_lines, expected)
1143
1144    def test_latin1_normalization(self):
1145        # See get_normal_name() in tokenizer.c.
1146        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1147                     "iso-8859-1-unix", "iso-latin-1-mac")
1148        for encoding in encodings:
1149            for rep in ("-", "_"):
1150                enc = encoding.replace("-", rep)
1151                lines = (b"#!/usr/bin/python\n",
1152                         b"# coding: " + enc.encode("ascii") + b"\n",
1153                         b"print(things)\n",
1154                         b"do_something += 4\n")
1155                rl = self.get_readline(lines)
1156                found, consumed_lines = detect_encoding(rl)
1157                self.assertEqual(found, "iso-8859-1")
1158
1159    def test_syntaxerror_latin1(self):
1160        # Issue 14629: need to raise SyntaxError if the first
1161        # line(s) have non-UTF-8 characters
1162        lines = (
1163            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1164            )
1165        readline = self.get_readline(lines)
1166        self.assertRaises(SyntaxError, detect_encoding, readline)
1167
1168
1169    def test_utf8_normalization(self):
1170        # See get_normal_name() in tokenizer.c.
1171        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1172        for encoding in encodings:
1173            for rep in ("-", "_"):
1174                enc = encoding.replace("-", rep)
1175                lines = (b"#!/usr/bin/python\n",
1176                         b"# coding: " + enc.encode("ascii") + b"\n",
1177                         b"1 + 3\n")
1178                rl = self.get_readline(lines)
1179                found, consumed_lines = detect_encoding(rl)
1180                self.assertEqual(found, "utf-8")
1181
1182    def test_short_files(self):
1183        readline = self.get_readline((b'print(something)\n',))
1184        encoding, consumed_lines = detect_encoding(readline)
1185        self.assertEqual(encoding, 'utf-8')
1186        self.assertEqual(consumed_lines, [b'print(something)\n'])
1187
1188        encoding, consumed_lines = detect_encoding(self.get_readline(()))
1189        self.assertEqual(encoding, 'utf-8')
1190        self.assertEqual(consumed_lines, [])
1191
1192        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1193        encoding, consumed_lines = detect_encoding(readline)
1194        self.assertEqual(encoding, 'utf-8-sig')
1195        self.assertEqual(consumed_lines, [b'print(something)\n'])
1196
1197        readline = self.get_readline((b'\xef\xbb\xbf',))
1198        encoding, consumed_lines = detect_encoding(readline)
1199        self.assertEqual(encoding, 'utf-8-sig')
1200        self.assertEqual(consumed_lines, [])
1201
1202        readline = self.get_readline((b'# coding: bad\n',))
1203        self.assertRaises(SyntaxError, detect_encoding, readline)
1204
1205    def test_false_encoding(self):
1206        # Issue 18873: "Encoding" detected in non-comment lines
1207        readline = self.get_readline((b'print("#coding=fake")',))
1208        encoding, consumed_lines = detect_encoding(readline)
1209        self.assertEqual(encoding, 'utf-8')
1210        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1211
1212    def test_open(self):
1213        filename = support.TESTFN + '.py'
1214        self.addCleanup(support.unlink, filename)
1215
1216        # test coding cookie
1217        for encoding in ('iso-8859-15', 'utf-8'):
1218            with open(filename, 'w', encoding=encoding) as fp:
1219                print("# coding: %s" % encoding, file=fp)
1220                print("print('euro:\u20ac')", file=fp)
1221            with tokenize_open(filename) as fp:
1222                self.assertEqual(fp.encoding, encoding)
1223                self.assertEqual(fp.mode, 'r')
1224
1225        # test BOM (no coding cookie)
1226        with open(filename, 'w', encoding='utf-8-sig') as fp:
1227            print("print('euro:\u20ac')", file=fp)
1228        with tokenize_open(filename) as fp:
1229            self.assertEqual(fp.encoding, 'utf-8-sig')
1230            self.assertEqual(fp.mode, 'r')
1231
1232    def test_filename_in_exception(self):
1233        # When possible, include the file name in the exception.
1234        path = 'some_file_path'
1235        lines = (
1236            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1237            )
1238        class Bunk:
1239            def __init__(self, lines, path):
1240                self.name = path
1241                self._lines = lines
1242                self._index = 0
1243
1244            def readline(self):
1245                if self._index == len(lines):
1246                    raise StopIteration
1247                line = lines[self._index]
1248                self._index += 1
1249                return line
1250
1251        with self.assertRaises(SyntaxError):
1252            ins = Bunk(lines, path)
1253            # Make sure lacking a name isn't an issue.
1254            del ins.name
1255            detect_encoding(ins.readline)
1256        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1257            ins = Bunk(lines, path)
1258            detect_encoding(ins.readline)
1259
1260    def test_open_error(self):
1261        # Issue #23840: open() must close the binary file on error
1262        m = BytesIO(b'#coding:xxx')
1263        with mock.patch('tokenize._builtin_open', return_value=m):
1264            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1265        self.assertTrue(m.closed)
1266
1267
1268class TestTokenize(TestCase):
1269
1270    def test_tokenize(self):
1271        import tokenize as tokenize_module
1272        encoding = object()
1273        encoding_used = None
1274        def mock_detect_encoding(readline):
1275            return encoding, [b'first', b'second']
1276
1277        def mock__tokenize(readline, encoding):
1278            nonlocal encoding_used
1279            encoding_used = encoding
1280            out = []
1281            while True:
1282                next_line = readline()
1283                if next_line:
1284                    out.append(next_line)
1285                    continue
1286                return out
1287
1288        counter = 0
1289        def mock_readline():
1290            nonlocal counter
1291            counter += 1
1292            if counter == 5:
1293                return b''
1294            return str(counter).encode()
1295
1296        orig_detect_encoding = tokenize_module.detect_encoding
1297        orig__tokenize = tokenize_module._tokenize
1298        tokenize_module.detect_encoding = mock_detect_encoding
1299        tokenize_module._tokenize = mock__tokenize
1300        try:
1301            results = tokenize(mock_readline)
1302            self.assertEqual(list(results),
1303                             [b'first', b'second', b'1', b'2', b'3', b'4'])
1304        finally:
1305            tokenize_module.detect_encoding = orig_detect_encoding
1306            tokenize_module._tokenize = orig__tokenize
1307
1308        self.assertEqual(encoding_used, encoding)
1309
1310    def test_oneline_defs(self):
1311        buf = []
1312        for i in range(500):
1313            buf.append('def i{i}(): return {i}'.format(i=i))
1314        buf.append('OK')
1315        buf = '\n'.join(buf)
1316
1317        # Test that 500 consecutive one-line defs are OK
1318        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1319        self.assertEqual(toks[-2].string, 'OK') # [-1] is always ENDMARKER
1320
1321    def assertExactTypeEqual(self, opstr, *optypes):
1322        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1323        num_optypes = len(optypes)
1324        self.assertEqual(len(tokens), 2 + num_optypes)
1325        self.assertEqual(token.tok_name[tokens[0].exact_type],
1326                         token.tok_name[ENCODING])
1327        for i in range(num_optypes):
1328            self.assertEqual(token.tok_name[tokens[i + 1].exact_type],
1329                             token.tok_name[optypes[i]])
1330        self.assertEqual(token.tok_name[tokens[1 + num_optypes].exact_type],
1331                         token.tok_name[token.ENDMARKER])
1332
1333    def test_exact_type(self):
1334        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1335        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1336        self.assertExactTypeEqual(':', token.COLON)
1337        self.assertExactTypeEqual(',', token.COMMA)
1338        self.assertExactTypeEqual(';', token.SEMI)
1339        self.assertExactTypeEqual('+', token.PLUS)
1340        self.assertExactTypeEqual('-', token.MINUS)
1341        self.assertExactTypeEqual('*', token.STAR)
1342        self.assertExactTypeEqual('/', token.SLASH)
1343        self.assertExactTypeEqual('|', token.VBAR)
1344        self.assertExactTypeEqual('&', token.AMPER)
1345        self.assertExactTypeEqual('<', token.LESS)
1346        self.assertExactTypeEqual('>', token.GREATER)
1347        self.assertExactTypeEqual('=', token.EQUAL)
1348        self.assertExactTypeEqual('.', token.DOT)
1349        self.assertExactTypeEqual('%', token.PERCENT)
1350        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1351        self.assertExactTypeEqual('==', token.EQEQUAL)
1352        self.assertExactTypeEqual('!=', token.NOTEQUAL)
1353        self.assertExactTypeEqual('<=', token.LESSEQUAL)
1354        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1355        self.assertExactTypeEqual('~', token.TILDE)
1356        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1357        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1358        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1359        self.assertExactTypeEqual('**', token.DOUBLESTAR)
1360        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1361        self.assertExactTypeEqual('-=', token.MINEQUAL)
1362        self.assertExactTypeEqual('*=', token.STAREQUAL)
1363        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1364        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1365        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1366        self.assertExactTypeEqual('|=', token.VBAREQUAL)
1367        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1369        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1370        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1371        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1372        self.assertExactTypeEqual('//', token.DOUBLESLASH)
1373        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1374        self.assertExactTypeEqual('@', token.AT)
1375        self.assertExactTypeEqual('@=', token.ATEQUAL)
1376
1377        self.assertExactTypeEqual('a**2+b**2==c**2',
1378                                  NAME, token.DOUBLESTAR, NUMBER,
1379                                  token.PLUS,
1380                                  NAME, token.DOUBLESTAR, NUMBER,
1381                                  token.EQEQUAL,
1382                                  NAME, token.DOUBLESTAR, NUMBER)
1383        self.assertExactTypeEqual('{1, 2, 3}',
1384                                  token.LBRACE,
1385                                  token.NUMBER, token.COMMA,
1386                                  token.NUMBER, token.COMMA,
1387                                  token.NUMBER,
1388                                  token.RBRACE)
1389        self.assertExactTypeEqual('^(x & 0x1)',
1390                                  token.CIRCUMFLEX,
1391                                  token.LPAR,
1392                                  token.NAME, token.AMPER, token.NUMBER,
1393                                  token.RPAR)
1394
1395    def test_pathological_trailing_whitespace(self):
1396        # See http://bugs.python.org/issue16152
1397        self.assertExactTypeEqual('@          ', token.AT)
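
    def test_exact_type_of_operator_tokens(self):
        # A minimal sketch alongside assertExactTypeEqual (the method name,
        # sample source and token index here are illustrative assumptions):
        # operator tokens come back with the generic OP type, and only
        # .exact_type narrows them to the specific operator.
        tokens = list(tokenize(BytesIO(b"1 + 2\n").readline))
        plus = tokens[2]  # ENCODING, NUMBER '1', OP '+', NUMBER '2', ...
        self.assertEqual(plus.string, '+')
        self.assertEqual(plus.type, OP)
        self.assertEqual(plus.exact_type, token.PLUS)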
1398
1399
1400class UntokenizeTest(TestCase):
1401
1402    def test_bad_input_order(self):
1403        # raise if previous row
1404        u = Untokenizer()
1405        u.prev_row = 2
1406        u.prev_col = 2
1407        with self.assertRaises(ValueError) as cm:
1408            u.add_whitespace((1,3))
1409        self.assertEqual(cm.exception.args[0],
1410                'start (1,3) precedes previous end (2,2)')
1411        # raise if previous column in row
1412        self.assertRaises(ValueError, u.add_whitespace, (2,1))
1413
1414    def test_backslash_continuation(self):
1415        # The problem is that <whitespace>\<newline> leaves no token
1416        u = Untokenizer()
1417        u.prev_row = 1
        u.prev_col = 1
1419        u.tokens = []
1420        u.add_whitespace((2, 0))
1421        self.assertEqual(u.tokens, ['\\\n'])
1422        u.prev_row = 2
1423        u.add_whitespace((4, 4))
1424        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
1425        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
1426
1427    def test_iter_compat(self):
1428        u = Untokenizer()
1429        token = (NAME, 'Hello')
1430        tokens = [(ENCODING, 'utf-8'), token]
1431        u.compat(token, iter([]))
1432        self.assertEqual(u.tokens, ["Hello "])
1433        u = Untokenizer()
1434        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1435        u = Untokenizer()
1436        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1437        self.assertEqual(u.encoding, 'utf-8')
1438        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
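
    def test_compat_mode_spacing(self):
        # A minimal sketch of the compat (2-tuple) path exercised above; the
        # method name and the exact output string are assumptions based on the
        # single-space fallback seen in test_iter_compat: without start/end
        # positions, untokenize() cannot restore the original column layout
        # and simply appends a space after NAME and NUMBER tokens.
        tokens2 = [(NAME, 'a'), (OP, '='), (NUMBER, '1')]
        self.assertEqual(Untokenizer().untokenize(iter(tokens2)), 'a =1 ')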
1439
1440
1441class TestRoundtrip(TestCase):
1442
1443    def check_roundtrip(self, f):
1444        """
1445        Test roundtrip for `untokenize`. `f` is an open file or a string.
1446        The source code in f is tokenized to both 5- and 2-tuples.
1447        Both sequences are converted back to source code via
1448        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
1449        The test fails if the 3 pair tokenizations do not match.
1450
1451        When untokenize bugs are fixed, untokenize with 5-tuples should
1452        reproduce code that does not contain a backslash continuation
1453        following spaces.  A proper test should test this.
1454        """
1455        # Get source code and original tokenizations
1456        if isinstance(f, str):
1457            code = f.encode('utf-8')
1458        else:
1459            code = f.read()
1460            f.close()
1461        readline = iter(code.splitlines(keepends=True)).__next__
1462        tokens5 = list(tokenize(readline))
1463        tokens2 = [tok[:2] for tok in tokens5]
1464        # Reproduce tokens2 from pairs
1465        bytes_from2 = untokenize(tokens2)
1466        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1467        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1468        self.assertEqual(tokens2_from2, tokens2)
1469        # Reproduce tokens2 from 5-tuples
1470        bytes_from5 = untokenize(tokens5)
1471        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1472        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1473        self.assertEqual(tokens2_from5, tokens2)
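
    def test_untokenize_modes_sketch(self):
        # A minimal sketch of the difference described in the check_roundtrip
        # docstring (the method name and sample source are assumptions, not
        # part of the original suite): full 5-tuples let untokenize() restore
        # the original spacing exactly, while bare 2-tuples only guarantee an
        # equivalent (type, string) token stream.
        source = b"x  =  1\n"
        tokens5 = list(tokenize(BytesIO(source).readline))
        self.assertEqual(untokenize(tokens5), source)
        tokens2 = [tok[:2] for tok in tokens5]
        bytes_from2 = untokenize(tokens2)
        self.assertNotEqual(bytes_from2, source)  # spacing is normalized
        retokenized = [tok[:2] for tok in tokenize(BytesIO(bytes_from2).readline)]
        self.assertEqual(retokenized, tokens2)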
1474
1475    def test_roundtrip(self):
1476        # There are some standard formatting practices that are easy to get right.
1477
1478        self.check_roundtrip("if x == 1:\n"
1479                             "    print(x)\n")
1480        self.check_roundtrip("# This is a comment\n"
1481                             "# This also")
1482
        # Some people use different formatting conventions, which makes
        # untokenize a little trickier.  Note that the next test involves a
        # blank before the colon and another, easy-to-miss blank between the
        # colon and the end of the line.
1487
1488        self.check_roundtrip("if x == 1 : \n"
1489                             "  print(x)\n")
1490        fn = support.findfile("tokenize_tests.txt")
1491        with open(fn, 'rb') as f:
1492            self.check_roundtrip(f)
1493        self.check_roundtrip("if x == 1:\n"
1494                             "    # A comment by itself.\n"
1495                             "    print(x) # Comment here, too.\n"
1496                             "    # Another comment.\n"
1497                             "after_if = True\n")
1498        self.check_roundtrip("if (x # The comments need to go in the right place\n"
1499                             "    == 1):\n"
1500                             "    print('x==1')\n")
1501        self.check_roundtrip("class Test: # A comment here\n"
1502                             "  # A comment with weird indent\n"
1503                             "  after_com = 5\n"
1504                             "  def x(m): return m*5 # a one liner\n"
1505                             "  def y(m): # A whitespace after the colon\n"
1506                             "     return y*4 # 3-space indent\n")
1507
1508        # Some error-handling code
1509        self.check_roundtrip("try: import somemodule\n"
1510                             "except ImportError: # comment\n"
1511                             "    print('Can not import' # comment2\n)"
1512                             "else:   print('Loaded')\n")
1513
1514    def test_continuation(self):
1515        # Balancing continuation
1516        self.check_roundtrip("a = (3,4, \n"
1517                             "5,6)\n"
1518                             "y = [3, 4,\n"
1519                             "5]\n"
1520                             "z = {'a': 5,\n"
1521                             "'b':15, 'c':True}\n"
1522                             "x = len(y) + 5 - a[\n"
1523                             "3] - a[2]\n"
1524                             "+ len(z) - z[\n"
1525                             "'b']\n")
1526
1527    def test_backslash_continuation(self):
1528        # Backslash means line continuation, except for comments
1529        self.check_roundtrip("x=1+\\\n"
1530                             "1\n"
1531                             "# This is a comment\\\n"
1532                             "# This also\n")
1533        self.check_roundtrip("# Comment \\\n"
1534                             "x = 0")
1535
1536    def test_string_concatenation(self):
1537        # Two string literals on the same line
1538        self.check_roundtrip("'' ''")
1539
1540    def test_random_files(self):
1541        # Test roundtrip on random python modules.
1542        # pass the '-ucpu' option to process the full directory.
1543
1544        import glob, random
1545        fn = support.findfile("tokenize_tests.txt")
1546        tempdir = os.path.dirname(fn) or os.curdir
1547        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
1548
        # Tokenize is broken on test_unicode_identifiers.py because regular
        # expressions are broken on the obscure unicode identifiers in it. *sigh*
1551        # With roundtrip extended to test the 5-tuple mode of untokenize,
1552        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.
1553
1554        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
1555        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py" % f))
1557
1558        if not support.is_resource_enabled("cpu"):
1559            testfiles = random.sample(testfiles, 10)
1560
1561        for testfile in testfiles:
1562            with open(testfile, 'rb') as f:
1563                with self.subTest(file=testfile):
1564                    self.check_roundtrip(f)
1565
1566
1567    def roundtrip(self, code):
1568        if isinstance(code, str):
1569            code = code.encode('utf-8')
1570        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
1571
1572    def test_indentation_semantics_retained(self):
1573        """
1574        Ensure that although whitespace might be mutated in a roundtrip,
1575        the semantic meaning of the indentation remains consistent.
1576        """
1577        code = "if False:\n\tx=3\n\tx=3\n"
1578        codelines = self.roundtrip(code).split('\n')
1579        self.assertEqual(codelines[1], codelines[2])
1580        self.check_roundtrip(code)
1581
1582
if __name__ == "__main__":
    import unittest
    unittest.main()
1585