1 
2 /* Tokenizer implementation */
3 
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #include "unicodeobject.h"
14 #include "bytesobject.h"
15 #include "fileobject.h"
16 #include "codecs.h"
17 #include "abstract.h"
18 
/* Alternate tab spacing.
   NOTE(review): not referenced in the visible part of this file; presumably
   the tab width used with tok->altindstack to detect inconsistent use of
   tabs and spaces -- confirm against its users. */
#define ALTTABSIZE 1
21 
/* Nonzero if C may begin an identifier: an ASCII letter, '_', or any value
   >= 128 (non-ASCII values are accepted here without further checks).
   C is parenthesized in the expansion so that expression arguments keep
   their meaning, but it is still evaluated several times: do not pass an
   argument with side effects. */
#define is_potential_identifier_start(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || (c) == '_'\
               || ((c) >= 128))
27 
/* Nonzero if C may appear inside an identifier: an ASCII letter or digit,
   '_', or any value >= 128 (non-ASCII values are accepted here without
   further checks).
   C is parenthesized in the expansion so that expression arguments keep
   their meaning, but it is still evaluated several times: do not pass an
   argument with side effects. */
#define is_potential_identifier_char(c) (\
              ((c) >= 'a' && (c) <= 'z')\
               || ((c) >= 'A' && (c) <= 'Z')\
               || ((c) >= '0' && (c) <= '9')\
               || (c) == '_'\
               || ((c) >= 128))
34 
35 
/* Default tab size; tok_new() stores it in tok->tabsize.
   Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
38 
/* Forward declarations: prototypes for static tokenizer helpers so they can
   be referenced regardless of where they are defined in this file. */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);
43 
44 
/* Prefix that introduces a type comment.
   Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
48 
49 /* Create and initialize a new tok_state structure */
50 
51 static struct tok_state *
tok_new(void)52 tok_new(void)
53 {
54     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
55                                             sizeof(struct tok_state));
56     if (tok == NULL)
57         return NULL;
58     tok->buf = tok->cur = tok->inp = NULL;
59     tok->start = NULL;
60     tok->end = NULL;
61     tok->done = E_OK;
62     tok->fp = NULL;
63     tok->input = NULL;
64     tok->tabsize = TABSIZE;
65     tok->indent = 0;
66     tok->indstack[0] = 0;
67 
68     tok->atbol = 1;
69     tok->pendin = 0;
70     tok->prompt = tok->nextprompt = NULL;
71     tok->lineno = 0;
72     tok->level = 0;
73     tok->altindstack[0] = 0;
74     tok->decoding_state = STATE_INIT;
75     tok->decoding_erred = 0;
76     tok->read_coding_spec = 0;
77     tok->enc = NULL;
78     tok->encoding = NULL;
79     tok->cont_line = 0;
80     tok->filename = NULL;
81     tok->decoding_readline = NULL;
82     tok->decoding_buffer = NULL;
83     tok->type_comments = 0;
84 
85     tok->async_hacks = 0;
86     tok->async_def = 0;
87     tok->async_def_indent = 0;
88     tok->async_def_nl = 0;
89 
90     return tok;
91 }
92 
93 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)94 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
95 {
96     char* result = (char *)PyMem_MALLOC(len + 1);
97     if (!result) {
98         tok->done = E_NOMEM;
99         return NULL;
100     }
101     memcpy(result, s, len);
102     result[len] = '\0';
103     return result;
104 }
105 
106 static char *
error_ret(struct tok_state * tok)107 error_ret(struct tok_state *tok) /* XXX */
108 {
109     tok->decoding_erred = 1;
110     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
111         PyMem_FREE(tok->buf);
112     tok->buf = tok->cur = tok->inp = NULL;
113     tok->start = NULL;
114     tok->end = NULL;
115     tok->done = E_DECODE;
116     return NULL;                /* as if it were EOF */
117 }
118 
119 
/* Map the common spellings of UTF-8 and Latin-1 encoding names to a
   canonical name.

   At most the first 12 characters of S are examined, lower-cased, with '_'
   treated as '-'.  Returns the literal "utf-8" for "utf-8" and names
   starting with "utf-8-"; the literal "iso-8859-1" for "latin-1",
   "iso-8859-1", "iso-latin-1" and names starting with one of those followed
   by '-'; otherwise returns S itself (the same pointer), so callers can
   compare the result against S to see whether a mapping happened. */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: passing a negative char value to
           tolower() is undefined behaviour. */
        unsigned char c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148 
149 /* Return the coding spec in S, or NULL if none is found.  */
150 
151 static int
get_coding_spec(const char * s,char ** spec,Py_ssize_t size,struct tok_state * tok)152 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
153 {
154     Py_ssize_t i;
155     *spec = NULL;
156     /* Coding spec must be in a comment, and that comment must be
157      * the only statement on the source code line. */
158     for (i = 0; i < size - 6; i++) {
159         if (s[i] == '#')
160             break;
161         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
162             return 1;
163     }
164     for (; i < size - 6; i++) { /* XXX inefficient search */
165         const char* t = s + i;
166         if (strncmp(t, "coding", 6) == 0) {
167             const char* begin = NULL;
168             t += 6;
169             if (t[0] != ':' && t[0] != '=')
170                 continue;
171             do {
172                 t++;
173             } while (t[0] == '\x20' || t[0] == '\t');
174 
175             begin = t;
176             while (Py_ISALNUM(t[0]) ||
177                    t[0] == '-' || t[0] == '_' || t[0] == '.')
178                 t++;
179 
180             if (begin < t) {
181                 char* r = new_string(begin, t - begin, tok);
182                 const char* q;
183                 if (!r)
184                     return 0;
185                 q = get_normal_name(r);
186                 if (r != q) {
187                     PyMem_FREE(r);
188                     r = new_string(q, strlen(q), tok);
189                     if (!r)
190                         return 0;
191                 }
192                 *spec = r;
193                 break;
194             }
195         }
196     }
197     return 1;
198 }
199 
200 /* Check whether the line contains a coding spec. If it does,
201    invoke the set_readline function for the new encoding.
202    This function receives the tok_state and the new encoding.
203    Return 1 on success, 0 on failure.  */
204 
205 static int
check_coding_spec(const char * line,Py_ssize_t size,struct tok_state * tok,int set_readline (struct tok_state *,const char *))206 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
207                   int set_readline(struct tok_state *, const char *))
208 {
209     char *cs;
210     int r = 1;
211 
212     if (tok->cont_line) {
213         /* It's a continuation line, so it can't be a coding spec. */
214         tok->read_coding_spec = 1;
215         return 1;
216     }
217     if (!get_coding_spec(line, &cs, size, tok))
218         return 0;
219     if (!cs) {
220         Py_ssize_t i;
221         for (i = 0; i < size; i++) {
222             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
223                 break;
224             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
225                 /* Stop checking coding spec after a line containing
226                  * anything except a comment. */
227                 tok->read_coding_spec = 1;
228                 break;
229             }
230         }
231         return 1;
232     }
233     tok->read_coding_spec = 1;
234     if (tok->encoding == NULL) {
235         assert(tok->decoding_state == STATE_RAW);
236         if (strcmp(cs, "utf-8") == 0) {
237             tok->encoding = cs;
238         } else {
239             r = set_readline(tok, cs);
240             if (r) {
241                 tok->encoding = cs;
242                 tok->decoding_state = STATE_NORMAL;
243             }
244             else {
245                 PyErr_Format(PyExc_SyntaxError,
246                              "encoding problem: %s", cs);
247                 PyMem_FREE(cs);
248             }
249         }
250     } else {                /* then, compare cs with BOM */
251         r = (strcmp(tok->encoding, cs) == 0);
252         if (!r)
253             PyErr_Format(PyExc_SyntaxError,
254                          "encoding problem: %s with BOM", cs);
255         PyMem_FREE(cs);
256     }
257     return r;
258 }
259 
/* See whether the input starts with a UTF-8 BOM (bytes EF BB BF).

   Bytes are read with GET_CHAR and pushed back with UNGET_CHAR.  The first
   call to GET_CHAR is always made and tok->decoding_state is then set to
   STATE_RAW.  If the three BOM bytes are present they are consumed and
   tok->encoding is replaced by a new copy of "utf-8"; otherwise every byte
   that was read is pushed back (most recently read first) and the encoding
   is left untouched.

   SET_READLINE is only called from the UTF-16 code that is currently
   disabled with "#if 0"; in the active code it is unused.

   Return 1 on success, 0 on failure (allocation of the encoding name
   failed).  */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch1 == EOF) {
        /* Empty input: nothing to detect. */
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM: the next two bytes must be BB BF. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            /* Not a BOM: push back in reverse order of reading. */
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        /* Any other first byte: no BOM. */
        unget_char(ch1, tok);
        return 1;
    }
    /* Only reached when a complete BOM was consumed: record the encoding,
       replacing any previously stored name. */
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
325 
326 /* Read a line of text from TOK into S, using the stream in TOK.
327    Return NULL on failure, else S.
328 
329    On entry, tok->decoding_buffer will be one of:
330      1) NULL: need to call tok->decoding_readline to get a new line
331      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
332        stored the result in tok->decoding_buffer
333      3) PyByteArrayObject *: previous call to fp_readl did not have enough room
334        (in the s buffer) to copy entire contents of the line read
335        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
336        In this case, fp_readl is called in a loop (with an expanded buffer)
337        until the buffer ends with a '\n' (or until the end of the file is
338        reached): see tok_nextc and its calls to decoding_fgets.
339 */
340 
341 static char *
fp_readl(char * s,int size,struct tok_state * tok)342 fp_readl(char *s, int size, struct tok_state *tok)
343 {
344     PyObject* bufobj;
345     const char *buf;
346     Py_ssize_t buflen;
347 
348     /* Ask for one less byte so we can terminate it */
349     assert(size > 0);
350     size--;
351 
352     if (tok->decoding_buffer) {
353         bufobj = tok->decoding_buffer;
354         Py_INCREF(bufobj);
355     }
356     else
357     {
358         bufobj = _PyObject_CallNoArg(tok->decoding_readline);
359         if (bufobj == NULL)
360             goto error;
361     }
362     if (PyUnicode_CheckExact(bufobj))
363     {
364         buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
365         if (buf == NULL) {
366             goto error;
367         }
368     }
369     else
370     {
371         buf = PyByteArray_AsString(bufobj);
372         if (buf == NULL) {
373             goto error;
374         }
375         buflen = PyByteArray_GET_SIZE(bufobj);
376     }
377 
378     Py_XDECREF(tok->decoding_buffer);
379     if (buflen > size) {
380         /* Too many chars, the rest goes into tok->decoding_buffer */
381         tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
382                                                          buflen-size);
383         if (tok->decoding_buffer == NULL)
384             goto error;
385         buflen = size;
386     }
387     else
388         tok->decoding_buffer = NULL;
389 
390     memcpy(s, buf, buflen);
391     s[buflen] = '\0';
392     if (buflen == 0) /* EOF */
393         s = NULL;
394     Py_DECREF(bufobj);
395     return s;
396 
397 error:
398     Py_XDECREF(bufobj);
399     return error_ret(tok);
400 }
401 
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   The new reader is obtained by calling io.open() on the file descriptor
   underlying tok->fp and is stored in tok->decoding_readline, replacing
   any previous value.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* Positional arguments passed to io.open: fd, "r", -1, enc, None, None,
       False. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    /* Only the bound readline method is kept; it is the sole remaining
       reference to the stream taken here. */
    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* We stepped back one byte above: read and discard up to the end
           of that line so the next readline starts on a fresh line. */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
459 
460 /* Fetch the next byte from TOK. */
461 
fp_getc(struct tok_state * tok)462 static int fp_getc(struct tok_state *tok) {
463     return getc(tok->fp);
464 }
465 
466 /* Unfetch the last byte back into TOK.  */
467 
fp_ungetc(int c,struct tok_state * tok)468 static void fp_ungetc(int c, struct tok_state *tok) {
469     ungetc(c, tok->fp);
470 }
471 
/* Check whether the bytes at S start a valid UTF-8 sequence.
   S must be NUL-terminated.  Return the number of bytes forming the
   sequence (1 to 4) if yes, 0 if not.

   Only the lead byte range and the 10xxxxxx pattern of the continuation
   bytes are checked; overlong encodings and out-of-range code points are
   not rejected here.  */
static int valid_utf8(const unsigned char* s)
{
    int expected;
    int i;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xC0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    /* Check the continuation bytes front to back.  A NUL terminator is
       not a continuation byte, so a sequence truncated by the end of the
       string is rejected before any byte past the terminator is read. */
    for (i = 1; i <= expected; i++)
        if (s[i] < 0x80 || s[i] >= 0xC0)
            return 0;
    return expected + 1;
}
499 
500 /* Read a line of input from TOK. Determine encoding
501    if necessary.  */
502 
503 static char *
decoding_fgets(char * s,int size,struct tok_state * tok)504 decoding_fgets(char *s, int size, struct tok_state *tok)
505 {
506     char *line = NULL;
507     int badchar = 0;
508     for (;;) {
509         if (tok->decoding_state == STATE_NORMAL) {
510             /* We already have a codec associated with
511                this input. */
512             line = fp_readl(s, size, tok);
513             break;
514         } else if (tok->decoding_state == STATE_RAW) {
515             /* We want a 'raw' read. */
516             line = Py_UniversalNewlineFgets(s, size,
517                                             tok->fp, NULL);
518             break;
519         } else {
520             /* We have not yet determined the encoding.
521                If an encoding is found, use the file-pointer
522                reader functions from now on. */
523             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
524                 return error_ret(tok);
525             assert(tok->decoding_state != STATE_INIT);
526         }
527     }
528     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
529         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
530             return error_ret(tok);
531         }
532     }
533     /* The default encoding is UTF-8, so make sure we don't have any
534        non-UTF-8 sequences in it. */
535     if (line && !tok->encoding) {
536         unsigned char *c;
537         int length;
538         for (c = (unsigned char *)line; *c; c += length)
539             if (!(length = valid_utf8(c))) {
540                 badchar = *c;
541                 break;
542             }
543     }
544     if (badchar) {
545         /* Need to add 1 to the line number, since this line
546            has not been counted, yet.  */
547         PyErr_Format(PyExc_SyntaxError,
548                 "Non-UTF-8 code starting with '\\x%.2x' "
549                 "in file %U on line %i, "
550                 "but no encoding declared; "
551                 "see http://python.org/dev/peps/pep-0263/ for details",
552                 badchar, tok->filename, tok->lineno + 1);
553         return error_ret(tok);
554     }
555     return line;
556 }
557 
558 static int
decoding_feof(struct tok_state * tok)559 decoding_feof(struct tok_state *tok)
560 {
561     if (tok->decoding_state != STATE_NORMAL) {
562         return feof(tok->fp);
563     } else {
564         PyObject* buf = tok->decoding_buffer;
565         if (buf == NULL) {
566             buf = _PyObject_CallNoArg(tok->decoding_readline);
567             if (buf == NULL) {
568                 error_ret(tok);
569                 return 1;
570             } else {
571                 tok->decoding_buffer = buf;
572             }
573         }
574         return PyObject_Length(buf) == 0;
575     }
576 }
577 
578 /* Fetch a byte from TOK, using the string buffer. */
579 
580 static int
buf_getc(struct tok_state * tok)581 buf_getc(struct tok_state *tok) {
582     return Py_CHARMASK(*tok->str++);
583 }
584 
585 /* Unfetch a byte from TOK, using the string buffer. */
586 
587 static void
buf_ungetc(int c,struct tok_state * tok)588 buf_ungetc(int c, struct tok_state *tok) {
589     tok->str--;
590     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
591 }
592 
593 /* Set the readline function for TOK to ENC. For the string-based
594    tokenizer, this means to just record the encoding. */
595 
596 static int
buf_setreadl(struct tok_state * tok,const char * enc)597 buf_setreadl(struct tok_state *tok, const char* enc) {
598     tok->enc = enc;
599     return 1;
600 }
601 
602 /* Return a UTF-8 encoding Python string object from the
603    C byte string STR, which is encoded with ENC. */
604 
605 static PyObject *
translate_into_utf8(const char * str,const char * enc)606 translate_into_utf8(const char* str, const char* enc) {
607     PyObject *utf8;
608     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609     if (buf == NULL)
610         return NULL;
611     utf8 = PyUnicode_AsUTF8String(buf);
612     Py_DECREF(buf);
613     return utf8;
614 }
615 
616 
617 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)618 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
619     int skip_next_lf = 0;
620     size_t needed_length = strlen(s) + 2, final_length;
621     char *buf, *current;
622     char c = '\0';
623     buf = PyMem_MALLOC(needed_length);
624     if (buf == NULL) {
625         tok->done = E_NOMEM;
626         return NULL;
627     }
628     for (current = buf; *s; s++, current++) {
629         c = *s;
630         if (skip_next_lf) {
631             skip_next_lf = 0;
632             if (c == '\n') {
633                 c = *++s;
634                 if (!c)
635                     break;
636             }
637         }
638         if (c == '\r') {
639             skip_next_lf = 1;
640             c = '\n';
641         }
642         *current = c;
643     }
644     /* If this is exec input, add a newline to the end of the string if
645        there isn't one already. */
646     if (exec_input && c != '\n') {
647         *current = '\n';
648         current++;
649     }
650     *current = '\0';
651     final_length = current - buf + 1;
652     if (final_length < needed_length && final_length) {
653         /* should never fail */
654         char* result = PyMem_REALLOC(buf, final_length);
655         if (result == NULL) {
656             PyMem_FREE(buf);
657         }
658         buf = result;
659     }
660     return buf;
661 }
662 
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.

   INPUT is first copied with newlines translated (SINGLE is passed on as
   the exec_input flag of translate_newlines); the copy is stored in
   tok->input.  A BOM and a coding spec on the first two lines are then
   checked.  If a non-UTF-8 encoding is declared, the text is recoded to
   UTF-8 and the resulting bytes object is stored in tok->decoding_buffer.

   Returns a pointer to the UTF-8 text to tokenize -- either inside
   tok->input or inside the bytes object held by tok->decoding_buffer --
   or NULL on failure.  */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};     /* ends of line 1 and line 2 */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    /* buf_getc/buf_ungetc read through tok->str, so a BOM is skipped by
       advancing that pointer. */
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    /* tok->enc is only set through buf_setreadl, i.e. if check_bom asked
       for a different codec. */
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the newlines ending the first two lines (if present). */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        /* Line 2 is only examined if line 1 neither declared an encoding
           nor closed the search for a coding spec. */
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    /* A coding spec other than utf-8 was found: recode the text. */
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    /* When recoding happened, str points into utf8; parking the object in
       tok->decoding_buffer keeps it alive until PyTokenizer_Free. */
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}
722 
723 /* Set up tokenizer for string */
724 
725 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)726 PyTokenizer_FromString(const char *str, int exec_input)
727 {
728     struct tok_state *tok = tok_new();
729     char *decoded;
730 
731     if (tok == NULL)
732         return NULL;
733     decoded = decode_str(str, exec_input, tok);
734     if (decoded == NULL) {
735         PyTokenizer_Free(tok);
736         return NULL;
737     }
738 
739     tok->buf = tok->cur = tok->inp = decoded;
740     tok->end = decoded;
741     return tok;
742 }
743 
744 struct tok_state *
PyTokenizer_FromUTF8(const char * str,int exec_input)745 PyTokenizer_FromUTF8(const char *str, int exec_input)
746 {
747     struct tok_state *tok = tok_new();
748     char *translated;
749     if (tok == NULL)
750         return NULL;
751     tok->input = translated = translate_newlines(str, exec_input, tok);
752     if (translated == NULL) {
753         PyTokenizer_Free(tok);
754         return NULL;
755     }
756     tok->decoding_state = STATE_RAW;
757     tok->read_coding_spec = 1;
758     tok->enc = NULL;
759     tok->str = translated;
760     tok->encoding = (char *)PyMem_MALLOC(6);
761     if (!tok->encoding) {
762         PyTokenizer_Free(tok);
763         return NULL;
764     }
765     strcpy(tok->encoding, "utf-8");
766 
767     tok->buf = tok->cur = tok->inp = translated;
768     tok->end = translated;
769     return tok;
770 }
771 
772 /* Set up tokenizer for file */
773 
774 struct tok_state *
PyTokenizer_FromFile(FILE * fp,const char * enc,const char * ps1,const char * ps2)775 PyTokenizer_FromFile(FILE *fp, const char* enc,
776                      const char *ps1, const char *ps2)
777 {
778     struct tok_state *tok = tok_new();
779     if (tok == NULL)
780         return NULL;
781     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
782         PyTokenizer_Free(tok);
783         return NULL;
784     }
785     tok->cur = tok->inp = tok->buf;
786     tok->end = tok->buf + BUFSIZ;
787     tok->fp = fp;
788     tok->prompt = ps1;
789     tok->nextprompt = ps2;
790     if (enc != NULL) {
791         /* Must copy encoding declaration since it
792            gets copied into the parse tree. */
793         tok->encoding = PyMem_MALLOC(strlen(enc)+1);
794         if (!tok->encoding) {
795             PyTokenizer_Free(tok);
796             return NULL;
797         }
798         strcpy(tok->encoding, enc);
799         tok->decoding_state = STATE_NORMAL;
800     }
801     return tok;
802 }
803 
804 
805 /* Free a tok_state structure */
806 
807 void
PyTokenizer_Free(struct tok_state * tok)808 PyTokenizer_Free(struct tok_state *tok)
809 {
810     if (tok->encoding != NULL)
811         PyMem_FREE(tok->encoding);
812     Py_XDECREF(tok->decoding_readline);
813     Py_XDECREF(tok->decoding_buffer);
814     Py_XDECREF(tok->filename);
815     if (tok->fp != NULL && tok->buf != NULL)
816         PyMem_FREE(tok->buf);
817     if (tok->input)
818         PyMem_FREE(tok->input);
819     PyMem_FREE(tok);
820 }
821 
822 /* Get next char, updating state; error code goes into tok->done */
823 
824 static int
tok_nextc(struct tok_state * tok)825 tok_nextc(struct tok_state *tok)
826 {
827     for (;;) {
828         if (tok->cur != tok->inp) {
829             return Py_CHARMASK(*tok->cur++); /* Fast path */
830         }
831         if (tok->done != E_OK)
832             return EOF;
833         if (tok->fp == NULL) {
834             char *end = strchr(tok->inp, '\n');
835             if (end != NULL)
836                 end++;
837             else {
838                 end = strchr(tok->inp, '\0');
839                 if (end == tok->inp) {
840                     tok->done = E_EOF;
841                     return EOF;
842                 }
843             }
844             if (tok->start == NULL)
845                 tok->buf = tok->cur;
846             tok->line_start = tok->cur;
847             tok->lineno++;
848             tok->inp = end;
849             return Py_CHARMASK(*tok->cur++);
850         }
851         if (tok->prompt != NULL) {
852             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
853             if (newtok != NULL) {
854                 char *translated = translate_newlines(newtok, 0, tok);
855                 PyMem_FREE(newtok);
856                 if (translated == NULL)
857                     return EOF;
858                 newtok = translated;
859             }
860             if (tok->encoding && newtok && *newtok) {
861                 /* Recode to UTF-8 */
862                 Py_ssize_t buflen;
863                 const char* buf;
864                 PyObject *u = translate_into_utf8(newtok, tok->encoding);
865                 PyMem_FREE(newtok);
866                 if (!u) {
867                     tok->done = E_DECODE;
868                     return EOF;
869                 }
870                 buflen = PyBytes_GET_SIZE(u);
871                 buf = PyBytes_AS_STRING(u);
872                 newtok = PyMem_MALLOC(buflen+1);
873                 if (newtok == NULL) {
874                     Py_DECREF(u);
875                     tok->done = E_NOMEM;
876                     return EOF;
877                 }
878                 strcpy(newtok, buf);
879                 Py_DECREF(u);
880             }
881             if (tok->nextprompt != NULL)
882                 tok->prompt = tok->nextprompt;
883             if (newtok == NULL)
884                 tok->done = E_INTR;
885             else if (*newtok == '\0') {
886                 PyMem_FREE(newtok);
887                 tok->done = E_EOF;
888             }
889             else if (tok->start != NULL) {
890                 size_t start = tok->start - tok->buf;
891                 size_t oldlen = tok->cur - tok->buf;
892                 size_t newlen = oldlen + strlen(newtok);
893                 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
894                 char *buf = tok->buf;
895                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
896                 tok->lineno++;
897                 if (buf == NULL) {
898                     PyMem_FREE(tok->buf);
899                     tok->buf = NULL;
900                     PyMem_FREE(newtok);
901                     tok->done = E_NOMEM;
902                     return EOF;
903                 }
904                 tok->buf = buf;
905                 tok->cur = tok->buf + oldlen;
906                 tok->multi_line_start = tok->buf + cur_multi_line_start;
907                 tok->line_start = tok->cur;
908                 strcpy(tok->buf + oldlen, newtok);
909                 PyMem_FREE(newtok);
910                 tok->inp = tok->buf + newlen;
911                 tok->end = tok->inp + 1;
912                 tok->start = tok->buf + start;
913             }
914             else {
915                 tok->lineno++;
916                 if (tok->buf != NULL)
917                     PyMem_FREE(tok->buf);
918                 tok->buf = newtok;
919                 tok->cur = tok->buf;
920                 tok->line_start = tok->buf;
921                 tok->inp = strchr(tok->buf, '\0');
922                 tok->end = tok->inp + 1;
923             }
924         }
925         else {
926             int done = 0;
927             Py_ssize_t cur = 0;
928             char *pt;
929             if (tok->start == NULL) {
930                 if (tok->buf == NULL) {
931                     tok->buf = (char *)
932                         PyMem_MALLOC(BUFSIZ);
933                     if (tok->buf == NULL) {
934                         tok->done = E_NOMEM;
935                         return EOF;
936                     }
937                     tok->end = tok->buf + BUFSIZ;
938                 }
939                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
940                           tok) == NULL) {
941                     if (!tok->decoding_erred)
942                         tok->done = E_EOF;
943                     done = 1;
944                 }
945                 else {
946                     tok->done = E_OK;
947                     tok->inp = strchr(tok->buf, '\0');
948                     done = tok->inp == tok->buf || tok->inp[-1] == '\n';
949                 }
950             }
951             else {
952                 cur = tok->cur - tok->buf;
953                 if (decoding_feof(tok)) {
954                     tok->done = E_EOF;
955                     done = 1;
956                 }
957                 else
958                     tok->done = E_OK;
959             }
960             tok->lineno++;
961             /* Read until '\n' or EOF */
962             while (!done) {
963                 Py_ssize_t curstart = tok->start == NULL ? -1 :
964                           tok->start - tok->buf;
965                 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
966                 Py_ssize_t curvalid = tok->inp - tok->buf;
967                 Py_ssize_t newsize = curvalid + BUFSIZ;
968                 char *newbuf = tok->buf;
969                 newbuf = (char *)PyMem_REALLOC(newbuf,
970                                                newsize);
971                 if (newbuf == NULL) {
972                     tok->done = E_NOMEM;
973                     tok->cur = tok->inp;
974                     return EOF;
975                 }
976                 tok->buf = newbuf;
977                 tok->cur = tok->buf + cur;
978                 tok->multi_line_start = tok->buf + cur_multi_line_start;
979                 tok->line_start = tok->cur;
980                 tok->inp = tok->buf + curvalid;
981                 tok->end = tok->buf + newsize;
982                 tok->start = curstart < 0 ? NULL :
983                          tok->buf + curstart;
984                 if (decoding_fgets(tok->inp,
985                                (int)(tok->end - tok->inp),
986                                tok) == NULL) {
987                     /* Break out early on decoding
988                        errors, as tok->buf will be NULL
989                      */
990                     if (tok->decoding_erred)
991                         return EOF;
992                     /* Last line does not end in \n,
993                        fake one */
994                     if (tok->inp[-1] != '\n')
995                         strcpy(tok->inp, "\n");
996                 }
997                 tok->inp = strchr(tok->inp, '\0');
998                 done = tok->inp[-1] == '\n';
999             }
1000             if (tok->buf != NULL) {
1001                 tok->cur = tok->buf + cur;
1002                 tok->line_start = tok->cur;
1003                 /* replace "\r\n" with "\n" */
1004                 /* For Mac leave the \r, giving a syntax error */
1005                 pt = tok->inp - 2;
1006                 if (pt >= tok->buf && *pt == '\r') {
1007                     *pt++ = '\n';
1008                     *pt = '\0';
1009                     tok->inp = pt;
1010                 }
1011             }
1012         }
1013         if (tok->done != E_OK) {
1014             if (tok->prompt != NULL)
1015                 PySys_WriteStderr("\n");
1016             tok->cur = tok->inp;
1017             return EOF;
1018         }
1019     }
1020     /*NOTREACHED*/
1021 }
1022 
1023 
1024 /* Back-up one character */
1025 
1026 static void
tok_backup(struct tok_state * tok,int c)1027 tok_backup(struct tok_state *tok, int c)
1028 {
1029     if (c != EOF) {
1030         if (--tok->cur < tok->buf) {
1031             Py_FatalError("tokenizer beginning of buffer");
1032         }
1033         if (*tok->cur != c) {
1034             *tok->cur = c;
1035         }
1036     }
1037 }
1038 
1039 
1040 static int
syntaxerror(struct tok_state * tok,const char * format,...)1041 syntaxerror(struct tok_state *tok, const char *format, ...)
1042 {
1043     PyObject *errmsg, *errtext, *args;
1044     va_list vargs;
1045 #ifdef HAVE_STDARG_PROTOTYPES
1046     va_start(vargs, format);
1047 #else
1048     va_start(vargs);
1049 #endif
1050     errmsg = PyUnicode_FromFormatV(format, vargs);
1051     va_end(vargs);
1052     if (!errmsg) {
1053         goto error;
1054     }
1055 
1056     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1057                                    "replace");
1058     if (!errtext) {
1059         goto error;
1060     }
1061     int offset = (int)PyUnicode_GET_LENGTH(errtext);
1062     Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1063     if (line_len != tok->cur - tok->line_start) {
1064         Py_DECREF(errtext);
1065         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1066                                        "replace");
1067     }
1068     if (!errtext) {
1069         goto error;
1070     }
1071 
1072     args = Py_BuildValue("(O(OiiN))", errmsg,
1073                          tok->filename, tok->lineno, offset, errtext);
1074     if (args) {
1075         PyErr_SetObject(PyExc_SyntaxError, args);
1076         Py_DECREF(args);
1077     }
1078 
1079 error:
1080     Py_XDECREF(errmsg);
1081     tok->done = E_ERROR;
1082     return ERRORTOKEN;
1083 }
1084 
1085 static int
indenterror(struct tok_state * tok)1086 indenterror(struct tok_state *tok)
1087 {
1088     tok->done = E_TABSPACE;
1089     tok->cur = tok->inp;
1090     return ERRORTOKEN;
1091 }
1092 
1093 /* Verify that the identifier follows PEP 3131.
1094    All identifier strings are guaranteed to be "ready" unicode objects.
1095  */
1096 static int
verify_identifier(struct tok_state * tok)1097 verify_identifier(struct tok_state *tok)
1098 {
1099     PyObject *s;
1100     if (tok->decoding_erred)
1101         return 0;
1102     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1103     if (s == NULL) {
1104         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1105             tok->done = E_DECODE;
1106         }
1107         else {
1108             tok->done = E_ERROR;
1109         }
1110         return 0;
1111     }
1112     Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1113     if (invalid < 0) {
1114         Py_DECREF(s);
1115         tok->done = E_ERROR;
1116         return 0;
1117     }
1118     assert(PyUnicode_GET_LENGTH(s) > 0);
1119     if (invalid < PyUnicode_GET_LENGTH(s)) {
1120         Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1121         if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1122             /* Determine the offset in UTF-8 encoded input */
1123             Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1124             if (s != NULL) {
1125                 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1126             }
1127             if (s == NULL) {
1128                 tok->done = E_ERROR;
1129                 return 0;
1130             }
1131             tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1132         }
1133         Py_DECREF(s);
1134         // PyUnicode_FromFormatV() does not support %X
1135         char hex[9];
1136         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1137         if (Py_UNICODE_ISPRINTABLE(ch)) {
1138             syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1139         }
1140         else {
1141             syntaxerror(tok, "invalid non-printable character U+%s", hex);
1142         }
1143         return 0;
1144     }
1145     Py_DECREF(s);
1146     return 1;
1147 }
1148 
/* Consume the remainder of a run of decimal digits, allowing single
 * underscores between digits.
 *
 * Returns the first character read that is neither a digit nor an
 * underscore separator; that character is NOT pushed back.  If an
 * underscore is not followed by a digit, the offending character is pushed
 * back, a SyntaxError is reported through syntaxerror(), and 0 is returned.
 */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int ch;

    for (;;) {
        /* Swallow a run of digits. */
        ch = tok_nextc(tok);
        while (isdigit(ch)) {
            ch = tok_nextc(tok);
        }
        if (ch != '_') {
            return ch;
        }

        /* An underscore must be followed by at least one more digit. */
        ch = tok_nextc(tok);
        if (!isdigit(ch)) {
            tok_backup(tok, ch);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
}
1170 
1171 /* Get next token, after space stripping etc. */
1172 
/* Scan the next token from tok and return its token type.
 *
 * tok:     tokenizer state (input buffer, indentation stacks, bracket
 *          stack, async bookkeeping).  Updated in place.
 * p_start: out; set to the first byte of the token text.
 * p_end:   out; set to one past the last byte of the token text.
 *
 * Both out-pointers are reset to NULL on entry and stay NULL when INDENT,
 * DEDENT or ENDMARKER is returned and on the ERRORTOKEN returns made
 * directly in this function.  For NEWLINE the trailing '\n' is excluded;
 * for TYPE_COMMENT the range starts after the "# type: " prefix; for
 * TYPE_IGNORE it starts after the word "ignore".
 *
 * Returns a token type (NAME, NUMBER, STRING, NEWLINE, INDENT, DEDENT,
 * ASYNC, AWAIT, DOT, ELLIPSIS, TYPE_COMMENT, TYPE_IGNORE, ENDMARKER, or
 * whatever the PyToken_*Char(s) helpers return for operators), or
 * ERRORTOKEN on failure.
 */
1173 static int
tok_get(struct tok_state * tok,const char ** p_start,const char ** p_end)1174 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1175 {
1176     int c;
         /* blankline: the current line holds only whitespace, a comment or a
            continuation and must not produce NEWLINE/INDENT/DEDENT.
            nonascii: the identifier being scanned contains a byte >= 128. */
1177     int blankline, nonascii;
1178 
1179     *p_start = *p_end = NULL;
         /* Re-entered from the newline handling below when the line just
            finished was blank or ended inside open brackets. */
1180   nextline:
1181     tok->start = NULL;
1182     blankline = 0;
1183 
1184     /* Get indentation level */
1185     if (tok->atbol) {
             /* col measures the indentation with tabs expanded to
                tok->tabsize, altcol measures it with ALTTABSIZE.  Both are
                checked against their own stack; a disagreement is reported
                through indenterror(). */
1186         int col = 0;
1187         int altcol = 0;
1188         tok->atbol = 0;
1189         for (;;) {
1190             c = tok_nextc(tok);
1191             if (c == ' ') {
1192                 col++, altcol++;
1193             }
1194             else if (c == '\t') {
1195                 col = (col / tok->tabsize + 1) * tok->tabsize;
1196                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1197             }
1198             else if (c == '\014')  {/* Control-L (formfeed) */
1199                 col = altcol = 0; /* For Emacs users */
1200             }
1201             else {
1202                 break;
1203             }
1204         }
1205         tok_backup(tok, c);
1206         if (c == '#' || c == '\n' || c == '\\') {
1207             /* Lines with only whitespace and/or comments
1208                and/or a line continuation character
1209                shouldn't affect the indentation and are
1210                not passed to the parser as NEWLINE tokens,
1211                except *totally* empty lines in interactive
1212                mode, which signal the end of a command group. */
1213             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1214                 blankline = 0; /* Let it through */
1215             }
1216             else if (tok->prompt != NULL && tok->lineno == 1) {
1217                 /* In interactive mode, if the first line contains
1218                    only spaces and/or a comment, let it through. */
1219                 blankline = 0;
1220                 col = altcol = 0;
1221             }
1222             else {
1223                 blankline = 1; /* Ignore completely */
1224             }
1225             /* We can't jump back right here since we still
1226                may need to skip to the end of a comment */
1227         }
             /* Indentation is only significant outside brackets
                (tok->level == 0) and on non-blank lines. */
1228         if (!blankline && tok->level == 0) {
1229             if (col == tok->indstack[tok->indent]) {
1230                 /* No change */
1231                 if (altcol != tok->altindstack[tok->indent]) {
1232                     return indenterror(tok);
1233                 }
1234             }
1235             else if (col > tok->indstack[tok->indent]) {
1236                 /* Indent -- always one */
1237                 if (tok->indent+1 >= MAXINDENT) {
1238                     tok->done = E_TOODEEP;
1239                     tok->cur = tok->inp;
1240                     return ERRORTOKEN;
1241                 }
1242                 if (altcol <= tok->altindstack[tok->indent]) {
1243                     return indenterror(tok);
1244                 }
1245                 tok->pendin++;
1246                 tok->indstack[++tok->indent] = col;
1247                 tok->altindstack[tok->indent] = altcol;
1248             }
1249             else /* col < tok->indstack[tok->indent] */ {
1250                 /* Dedent -- any number, must be consistent */
1251                 while (tok->indent > 0 &&
1252                     col < tok->indstack[tok->indent]) {
1253                     tok->pendin--;
1254                     tok->indent--;
1255                 }
1256                 if (col != tok->indstack[tok->indent]) {
1257                     tok->done = E_DEDENT;
1258                     tok->cur = tok->inp;
1259                     return ERRORTOKEN;
1260                 }
1261                 if (altcol != tok->altindstack[tok->indent]) {
1262                     return indenterror(tok);
1263                 }
1264             }
1265         }
1266     }
1267 
1268     tok->start = tok->cur;
1269 
         /* tok->pendin > 0 counts INDENT tokens still owed, < 0 counts
            DEDENT tokens still owed; one is emitted per call. */
1270     /* Return pending indents/dedents */
1271     if (tok->pendin != 0) {
1272         if (tok->pendin < 0) {
1273             tok->pendin++;
1274             return DEDENT;
1275         }
1276         else {
1277             tok->pendin--;
1278             return INDENT;
1279         }
1280     }
1281 
1282     /* Peek ahead at the next character */
1283     c = tok_nextc(tok);
1284     tok_backup(tok, c);
1285     /* Check if we are closing an async function */
1286     if (tok->async_def
1287         && !blankline
1288         /* Due to some implementation artifacts of type comments,
1289          * a TYPE_COMMENT at the start of a function won't set an
1290          * indentation level and it will produce a NEWLINE after it.
1291          * To avoid spuriously ending an async function due to this,
1292          * wait until we have some non-newline char in front of us. */
1293         && c != '\n'
1294         && tok->level == 0
1295         /* There was a NEWLINE after ASYNC DEF,
1296            so we're past the signature. */
1297         && tok->async_def_nl
1298         /* Current indentation level is less than where
1299            the async function was defined */
1300         && tok->async_def_indent >= tok->indent)
1301     {
1302         tok->async_def = 0;
1303         tok->async_def_indent = 0;
1304         tok->async_def_nl = 0;
1305     }
1306 
         /* Re-entered after a backslash line continuation so that scanning
            resumes on the following line without indentation processing. */
1307  again:
1308     tok->start = NULL;
1309     /* Skip spaces */
1310     do {
1311         c = tok_nextc(tok);
1312     } while (c == ' ' || c == '\t' || c == '\014');
1313 
1314     /* Set start of current token */
1315     tok->start = tok->cur - 1;
1316 
1317     /* Skip comment, unless it's a type comment */
1318     if (c == '#') {
1319         const char *prefix, *p, *type_start;
1320 
             /* Consume the whole comment; c ends up as '\n' or EOF. */
1321         while (c != EOF && c != '\n') {
1322             c = tok_nextc(tok);
1323         }
1324 
1325         if (tok->type_comments) {
                 /* Match the comment text against type_comment_prefix; a
                    space in the prefix matches any run (possibly empty) of
                    spaces and tabs in the comment. */
1326             p = tok->start;
1327             prefix = type_comment_prefix;
1328             while (*prefix && p < tok->cur) {
1329                 if (*prefix == ' ') {
1330                     while (*p == ' ' || *p == '\t') {
1331                         p++;
1332                     }
1333                 } else if (*prefix == *p) {
1334                     p++;
1335                 } else {
1336                     break;
1337                 }
1338 
1339                 prefix++;
1340             }
1341 
1342             /* This is a type comment if we matched all of type_comment_prefix. */
1343             if (!*prefix) {
1344                 int is_type_ignore = 1;
1345                 const char *ignore_end = p + 6;
1346                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1347 
1348                 type_start = p;
1349 
1350                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1351                  * or anything ASCII and non-alphanumeric. */
1352                 is_type_ignore = (
1353                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1354                     && !(tok->cur > ignore_end
1355                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1356 
1357                 if (is_type_ignore) {
1358                     *p_start = ignore_end;
1359                     *p_end = tok->cur;
1360 
1361                     /* If this type ignore is the only thing on the line, consume the newline also. */
1362                     if (blankline) {
1363                         tok_nextc(tok);
1364                         tok->atbol = 1;
1365                     }
1366                     return TYPE_IGNORE;
1367                 } else {
1368                     *p_start = type_start;  /* after type_comment_prefix */
1369                     *p_end = tok->cur;
1370                     return TYPE_COMMENT;
1371                 }
1372             }
1373         }
1374     }
1375 
1376     /* Check for EOF and errors now */
1377     if (c == EOF) {
1378         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1379     }
1380 
1381     /* Identifier (most frequent token!) */
1382     nonascii = 0;
1383     if (is_potential_identifier_start(c)) {
1384         /* Process the various legal combinations of b"", r"", u"", and f"". */
1385         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
             /* Each pass accepts at most one more prefix letter; as soon as
                a quote follows an accepted prefix, control moves to the
                string scanner.  Anything else falls out to the identifier
                loop below with c still unconsumed by it. */
1386         while (1) {
1387             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1388                 saw_b = 1;
1389             /* Since this is a backwards compatibility support literal we don't
1390                want to support it in arbitrary order like byte literals. */
1391             else if (!(saw_b || saw_u || saw_r || saw_f)
1392                      && (c == 'u'|| c == 'U')) {
1393                 saw_u = 1;
1394             }
1395             /* ur"" and ru"" are not supported */
1396             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1397                 saw_r = 1;
1398             }
1399             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1400                 saw_f = 1;
1401             }
1402             else {
1403                 break;
1404             }
1405             c = tok_nextc(tok);
1406             if (c == '"' || c == '\'') {
1407                 goto letter_quote;
1408             }
1409         }
1410         while (is_potential_identifier_char(c)) {
1411             if (c >= 128) {
1412                 nonascii = 1;
1413             }
1414             c = tok_nextc(tok);
1415         }
1416         tok_backup(tok, c);
             /* Only identifiers containing non-ASCII bytes need the full
                PEP 3131 check. */
1417         if (nonascii && !verify_identifier(tok)) {
1418             return ERRORTOKEN;
1419         }
1420 
1421         *p_start = tok->start;
1422         *p_end = tok->cur;
1423 
1424         /* async/await parsing block. */
1425         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1426             /* May be an 'async' or 'await' token.  For Python 3.7 or
1427                later we recognize them unconditionally.  For Python
1428                3.5 or 3.6 we recognize 'async' in front of 'def', and
1429                either one inside of 'async def'.  (Technically we
1430                shouldn't recognize these at all for 3.4 or earlier,
1431                but there's no *valid* Python 3.4 code that would be
1432                rejected, and async functions will be rejected in a
1433                later phase.) */
1434             if (!tok->async_hacks || tok->async_def) {
1435                 /* Always recognize the keywords. */
1436                 if (memcmp(tok->start, "async", 5) == 0) {
1437                     return ASYNC;
1438                 }
1439                 if (memcmp(tok->start, "await", 5) == 0) {
1440                     return AWAIT;
1441                 }
1442             }
1443             else if (memcmp(tok->start, "async", 5) == 0) {
1444                 /* The current token is 'async'.
1445                    Look ahead one token to see if that is 'def'. */
1446 
1447                 struct tok_state ahead_tok;
1448                 const char *ahead_tok_start = NULL;
1449                 const char *ahead_tok_end = NULL;
1450                 int ahead_tok_kind;
1451 
                     /* Look ahead on a shallow copy of the state so that
                        tok's own fields are not advanced by the recursive
                        call. */
1452                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1453                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1454                                          &ahead_tok_end);
1455 
1456                 if (ahead_tok_kind == NAME
1457                     && ahead_tok.cur - ahead_tok.start == 3
1458                     && memcmp(ahead_tok.start, "def", 3) == 0)
1459                 {
1460                     /* The next token is going to be 'def', so instead of
1461                        returning a plain NAME token, return ASYNC. */
1462                     tok->async_def_indent = tok->indent;
1463                     tok->async_def = 1;
1464                     return ASYNC;
1465                 }
1466             }
1467         }
1468 
1469         return NAME;
1470     }
1471 
1472     /* Newline */
1473     if (c == '\n') {
1474         tok->atbol = 1;
             /* Blank lines and newlines inside brackets produce no NEWLINE
                token; start over on the next line. */
1475         if (blankline || tok->level > 0) {
1476             goto nextline;
1477         }
1478         *p_start = tok->start;
1479         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1480         tok->cont_line = 0;
1481         if (tok->async_def) {
1482             /* We're somewhere inside an 'async def' function, and
1483                we've encountered a NEWLINE after its signature. */
1484             tok->async_def_nl = 1;
1485         }
1486         return NEWLINE;
1487     }
1488 
1489     /* Period or number starting with period? */
1490     if (c == '.') {
1491         c = tok_nextc(tok);
1492         if (isdigit(c)) {
1493             goto fraction;
1494         } else if (c == '.') {
1495             c = tok_nextc(tok);
1496             if (c == '.') {
1497                 *p_start = tok->start;
1498                 *p_end = tok->cur;
1499                 return ELLIPSIS;
1500             }
1501             else {
1502                 tok_backup(tok, c);
1503             }
                 /* Only two dots: give the second one back and return a
                    single DOT. */
1504             tok_backup(tok, '.');
1505         }
1506         else {
1507             tok_backup(tok, c);
1508         }
1509         *p_start = tok->start;
1510         *p_end = tok->cur;
1511         return DOT;
1512     }
1513 
1514     /* Number */
1515     if (isdigit(c)) {
1516         if (c == '0') {
1517             /* Hex, octal or binary -- maybe. */
1518             c = tok_nextc(tok);
1519             if (c == 'x' || c == 'X') {
1520                 /* Hex */
                     /* Each outer iteration scans one group of hex digits;
                        a single '_' may precede a group, and a group must
                        contain at least one digit. */
1521                 c = tok_nextc(tok);
1522                 do {
1523                     if (c == '_') {
1524                         c = tok_nextc(tok);
1525                     }
1526                     if (!isxdigit(c)) {
1527                         tok_backup(tok, c);
1528                         return syntaxerror(tok, "invalid hexadecimal literal");
1529                     }
1530                     do {
1531                         c = tok_nextc(tok);
1532                     } while (isxdigit(c));
1533                 } while (c == '_');
1534             }
1535             else if (c == 'o' || c == 'O') {
1536                 /* Octal */
1537                 c = tok_nextc(tok);
1538                 do {
1539                     if (c == '_') {
1540                         c = tok_nextc(tok);
1541                     }
1542                     if (c < '0' || c >= '8') {
1543                         tok_backup(tok, c);
1544                         if (isdigit(c)) {
1545                             return syntaxerror(tok,
1546                                     "invalid digit '%c' in octal literal", c);
1547                         }
1548                         else {
1549                             return syntaxerror(tok, "invalid octal literal");
1550                         }
1551                     }
1552                     do {
1553                         c = tok_nextc(tok);
1554                     } while ('0' <= c && c < '8');
1555                 } while (c == '_');
1556                 if (isdigit(c)) {
1557                     return syntaxerror(tok,
1558                             "invalid digit '%c' in octal literal", c);
1559                 }
1560             }
1561             else if (c == 'b' || c == 'B') {
1562                 /* Binary */
1563                 c = tok_nextc(tok);
1564                 do {
1565                     if (c == '_') {
1566                         c = tok_nextc(tok);
1567                     }
1568                     if (c != '0' && c != '1') {
1569                         tok_backup(tok, c);
1570                         if (isdigit(c)) {
1571                             return syntaxerror(tok,
1572                                     "invalid digit '%c' in binary literal", c);
1573                         }
1574                         else {
1575                             return syntaxerror(tok, "invalid binary literal");
1576                         }
1577                     }
1578                     do {
1579                         c = tok_nextc(tok);
1580                     } while (c == '0' || c == '1');
1581                 } while (c == '_');
1582                 if (isdigit(c)) {
1583                     return syntaxerror(tok,
1584                             "invalid digit '%c' in binary literal", c);
1585                 }
1586             }
1587             else {
1588                 int nonzero = 0;
1589                 /* maybe old-style octal; c is first char of it */
1590                 /* in any case, allow '0' as a literal */
                     /* Skip any further zeros (with optional single
                        underscores before a digit). */
1591                 while (1) {
1592                     if (c == '_') {
1593                         c = tok_nextc(tok);
1594                         if (!isdigit(c)) {
1595                             tok_backup(tok, c);
1596                             return syntaxerror(tok, "invalid decimal literal");
1597                         }
1598                     }
1599                     if (c != '0') {
1600                         break;
1601                     }
1602                     c = tok_nextc(tok);
1603                 }
1604                 if (isdigit(c)) {
1605                     nonzero = 1;
1606                     c = tok_decimal_tail(tok);
1607                     if (c == 0) {
1608                         return ERRORTOKEN;
1609                     }
1610                 }
                     /* Leading zeros are acceptable when the literal turns
                        out to be a float or imaginary number; the labels
                        jumped to live inside the decimal branch below. */
1611                 if (c == '.') {
1612                     c = tok_nextc(tok);
1613                     goto fraction;
1614                 }
1615                 else if (c == 'e' || c == 'E') {
1616                     goto exponent;
1617                 }
1618                 else if (c == 'j' || c == 'J') {
1619                     goto imaginary;
1620                 }
1621                 else if (nonzero) {
1622                     /* Old-style octal: now disallowed. */
1623                     tok_backup(tok, c);
1624                     return syntaxerror(tok,
1625                                        "leading zeros in decimal integer "
1626                                        "literals are not permitted; "
1627                                        "use an 0o prefix for octal integers");
1628                 }
1629             }
1630         }
1631         else {
1632             /* Decimal */
1633             c = tok_decimal_tail(tok);
1634             if (c == 0) {
1635                 return ERRORTOKEN;
1636             }
1637             {
1638                 /* Accept floating point numbers. */
1639                 if (c == '.') {
1640                     c = tok_nextc(tok);
                         /* Also entered by goto from the '.'-prefixed and
                            leading-zero cases, with c holding the character
                            after the '.'. */
1641         fraction:
1642                     /* Fraction */
1643                     if (isdigit(c)) {
1644                         c = tok_decimal_tail(tok);
1645                         if (c == 0) {
1646                             return ERRORTOKEN;
1647                         }
1648                     }
1649                 }
1650                 if (c == 'e' || c == 'E') {
1651                     int e;
1652                   exponent:
1653                     e = c;
1654                     /* Exponent part */
1655                     c = tok_nextc(tok);
1656                     if (c == '+' || c == '-') {
1657                         c = tok_nextc(tok);
1658                         if (!isdigit(c)) {
1659                             tok_backup(tok, c);
1660                             return syntaxerror(tok, "invalid decimal literal");
1661                         }
1662                     } else if (!isdigit(c)) {
                             /* 'e' was not the start of an exponent: give
                                back both the following character and the
                                'e' itself, and end the number before it. */
1663                         tok_backup(tok, c);
1664                         tok_backup(tok, e);
1665                         *p_start = tok->start;
1666                         *p_end = tok->cur;
1667                         return NUMBER;
1668                     }
1669                     c = tok_decimal_tail(tok);
1670                     if (c == 0) {
1671                         return ERRORTOKEN;
1672                     }
1673                 }
1674                 if (c == 'j' || c == 'J') {
1675                     /* Imaginary part */
1676         imaginary:
1677                     c = tok_nextc(tok);
1678                 }
1679             }
1680         }
             /* c is the first character after the literal; return it to the
                input. */
1681         tok_backup(tok, c);
1682         *p_start = tok->start;
1683         *p_end = tok->cur;
1684         return NUMBER;
1685     }
1686 
         /* Reached by fall-through for a bare quote, or by goto from the
            identifier code after a string prefix; tok->start still points
            at the first prefix letter in the latter case. */
1687   letter_quote:
1688     /* String */
1689     if (c == '\'' || c == '"') {
1690         int quote = c;
1691         int quote_size = 1;             /* 1 or 3 */
             /* Number of consecutive closing quote characters seen so far;
                the string ends when it reaches quote_size. */
1692         int end_quote_size = 0;
1693 
1694         /* Nodes of type STRING, especially multi line strings
1695            must be handled differently in order to get both
1696            the starting line number and the column offset right.
1697            (cf. issue 16806) */
1698         tok->first_lineno = tok->lineno;
1699         tok->multi_line_start = tok->line_start;
1700 
1701         /* Find the quote size and start of string */
1702         c = tok_nextc(tok);
1703         if (c == quote) {
1704             c = tok_nextc(tok);
1705             if (c == quote) {
1706                 quote_size = 3;
1707             }
1708             else {
1709                 end_quote_size = 1;     /* empty string found */
1710             }
1711         }
1712         if (c != quote) {
1713             tok_backup(tok, c);
1714         }
1715 
1716         /* Get rest of string */
1717         while (end_quote_size != quote_size) {
1718             c = tok_nextc(tok);
1719             if (c == EOF) {
1720                 if (quote_size == 3) {
1721                     tok->done = E_EOFS;
1722                 }
1723                 else {
1724                     tok->done = E_EOLS;
1725                 }
1726                 tok->cur = tok->inp;
1727                 return ERRORTOKEN;
1728             }
1729             if (quote_size == 1 && c == '\n') {
1730                 tok->done = E_EOLS;
1731                 tok->cur = tok->inp;
1732                 return ERRORTOKEN;
1733             }
1734             if (c == quote) {
1735                 end_quote_size += 1;
1736             }
1737             else {
1738                 end_quote_size = 0;
1739                 if (c == '\\') {
1740                     tok_nextc(tok);  /* skip escaped char */
1741                 }
1742             }
1743         }
1744 
1745         *p_start = tok->start;
1746         *p_end = tok->cur;
1747         return STRING;
1748     }
1749 
1750     /* Line continuation */
1751     if (c == '\\') {
1752         c = tok_nextc(tok);
1753         if (c != '\n') {
1754             tok->done = E_LINECONT;
1755             tok->cur = tok->inp;
1756             return ERRORTOKEN;
1757         }
             /* Peek at the first character of the continued line: a
                backslash-newline directly followed by EOF is an error. */
1758         c = tok_nextc(tok);
1759         if (c == EOF) {
1760             tok->done = E_EOF;
1761             tok->cur = tok->inp;
1762             return ERRORTOKEN;
1763         } else {
1764             tok_backup(tok, c);
1765         }
1766         tok->cont_line = 1;
1767         goto again; /* Read next line */
1768     }
1769 
1770     /* Check for two-character token */
1771     {
             /* A result other than OP from PyToken_TwoChars() is taken to
                mean that c,c2 form a two-character token; in that case a
                third character is tried, and given back if it does not
                extend the token. */
1772         int c2 = tok_nextc(tok);
1773         int token = PyToken_TwoChars(c, c2);
1774         if (token != OP) {
1775             int c3 = tok_nextc(tok);
1776             int token3 = PyToken_ThreeChars(c, c2, c3);
1777             if (token3 != OP) {
1778                 token = token3;
1779             }
1780             else {
1781                 tok_backup(tok, c3);
1782             }
1783             *p_start = tok->start;
1784             *p_end = tok->cur;
1785             return token;
1786         }
1787         tok_backup(tok, c2);
1788     }
1789 
1790     /* Keep track of parentheses nesting level */
         /* parenstack / parenlinenostack remember each open bracket and the
            line it appeared on, so a mismatched closer can be reported
            together with its opener. */
1791     switch (c) {
1792     case '(':
1793     case '[':
1794     case '{':
1795         if (tok->level >= MAXLEVEL) {
1796             return syntaxerror(tok, "too many nested parentheses");
1797         }
1798         tok->parenstack[tok->level] = c;
1799         tok->parenlinenostack[tok->level] = tok->lineno;
1800         tok->level++;
1801         break;
1802     case ')':
1803     case ']':
1804     case '}':
1805         if (!tok->level) {
1806             return syntaxerror(tok, "unmatched '%c'", c);
1807         }
1808         tok->level--;
1809         int opening = tok->parenstack[tok->level];
1810         if (!((opening == '(' && c == ')') ||
1811               (opening == '[' && c == ']') ||
1812               (opening == '{' && c == '}')))
1813         {
1814             if (tok->parenlinenostack[tok->level] != tok->lineno) {
1815                 return syntaxerror(tok,
1816                         "closing parenthesis '%c' does not match "
1817                         "opening parenthesis '%c' on line %d",
1818                         c, opening, tok->parenlinenostack[tok->level]);
1819             }
1820             else {
1821                 return syntaxerror(tok,
1822                         "closing parenthesis '%c' does not match "
1823                         "opening parenthesis '%c'",
1824                         c, opening);
1825             }
1826         }
1827         break;
1828     }
1829 
1830     /* Punctuation character */
1831     *p_start = tok->start;
1832     *p_end = tok->cur;
1833     return PyToken_OneChar(c);
1834 }
1835 
1836 int
PyTokenizer_Get(struct tok_state * tok,const char ** p_start,const char ** p_end)1837 PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
1838 {
1839     int result = tok_get(tok, p_start, p_end);
1840     if (tok->decoding_erred) {
1841         result = ERRORTOKEN;
1842         tok->done = E_DECODE;
1843     }
1844     return result;
1845 }
1846 
1847 /* Get the encoding of a Python file. Check for the coding cookie and check if
1848    the file starts with a BOM.
1849 
1850    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1851    encoding in the first or second line of the file (in which case the encoding
1852    should be assumed to be UTF-8).
1853 
1854    The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1855    by the caller. */
1856 
1857 char *
PyTokenizer_FindEncodingFilename(int fd,PyObject * filename)1858 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1859 {
1860     struct tok_state *tok;
1861     FILE *fp;
1862     const char *p_start = NULL;
1863     const char *p_end = NULL;
1864     char *encoding = NULL;
1865 
1866     fd = _Py_dup(fd);
1867     if (fd < 0) {
1868         return NULL;
1869     }
1870 
1871     fp = fdopen(fd, "r");
1872     if (fp == NULL) {
1873         return NULL;
1874     }
1875     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1876     if (tok == NULL) {
1877         fclose(fp);
1878         return NULL;
1879     }
1880     if (filename != NULL) {
1881         Py_INCREF(filename);
1882         tok->filename = filename;
1883     }
1884     else {
1885         tok->filename = PyUnicode_FromString("<string>");
1886         if (tok->filename == NULL) {
1887             fclose(fp);
1888             PyTokenizer_Free(tok);
1889             return encoding;
1890         }
1891     }
1892     while (tok->lineno < 2 && tok->done == E_OK) {
1893         PyTokenizer_Get(tok, &p_start, &p_end);
1894     }
1895     fclose(fp);
1896     if (tok->encoding) {
1897         encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1898         if (encoding)
1899             strcpy(encoding, tok->encoding);
1900     }
1901     PyTokenizer_Free(tok);
1902     return encoding;
1903 }
1904 
1905 char *
PyTokenizer_FindEncoding(int fd)1906 PyTokenizer_FindEncoding(int fd)
1907 {
1908     return PyTokenizer_FindEncodingFilename(fd, NULL);
1909 }
1910 
1911 #ifdef Py_DEBUG
1912 
1913 void
tok_dump(int type,char * start,char * end)1914 tok_dump(int type, char *start, char *end)
1915 {
1916     printf("%s", _PyParser_TokenNames[type]);
1917     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1918         printf("(%.*s)", (int)(end - start), start);
1919 }
1920 
1921 #endif
1922