1 
2 /* Tokenizer implementation */
3 
4 #include "Python.h"
5 #include "pgenheaders.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #ifndef PGEN
14 #include "unicodeobject.h"
15 #include "bytesobject.h"
16 #include "fileobject.h"
17 #include "codecs.h"
18 #include "abstract.h"
19 #endif /* PGEN */
20 
21 #define is_potential_identifier_start(c) (\
22               (c >= 'a' && c <= 'z')\
23                || (c >= 'A' && c <= 'Z')\
24                || c == '_'\
25                || (c >= 128))
26 
27 #define is_potential_identifier_char(c) (\
28               (c >= 'a' && c <= 'z')\
29                || (c >= 'A' && c <= 'Z')\
30                || (c >= '0' && c <= '9')\
31                || c == '_'\
32                || (c >= 128))
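/* Note: any byte >= 128 is provisionally accepted here as an identifier
   character; full PEP 3131 validation of non-ASCII identifiers is done
   later by verify_identifier() once the whole token has been read. */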
33 
34 extern char *PyOS_Readline(FILE *, FILE *, const char *);
35 /* Return malloc'ed string including trailing \n;
36    empty malloc'ed string for EOF;
37    NULL if interrupted */
38 
39 /* Don't ever change this -- it would break the portability of Python code */
40 #define TABSIZE 8
41 
42 /* Forward */
43 static struct tok_state *tok_new(void);
44 static int tok_nextc(struct tok_state *tok);
45 static void tok_backup(struct tok_state *tok, int c);
46 
47 
48 /* Token names */
49 
50 const char *_PyParser_TokenNames[] = {
51     "ENDMARKER",
52     "NAME",
53     "NUMBER",
54     "STRING",
55     "NEWLINE",
56     "INDENT",
57     "DEDENT",
58     "LPAR",
59     "RPAR",
60     "LSQB",
61     "RSQB",
62     "COLON",
63     "COMMA",
64     "SEMI",
65     "PLUS",
66     "MINUS",
67     "STAR",
68     "SLASH",
69     "VBAR",
70     "AMPER",
71     "LESS",
72     "GREATER",
73     "EQUAL",
74     "DOT",
75     "PERCENT",
76     "LBRACE",
77     "RBRACE",
78     "EQEQUAL",
79     "NOTEQUAL",
80     "LESSEQUAL",
81     "GREATEREQUAL",
82     "TILDE",
83     "CIRCUMFLEX",
84     "LEFTSHIFT",
85     "RIGHTSHIFT",
86     "DOUBLESTAR",
87     "PLUSEQUAL",
88     "MINEQUAL",
89     "STAREQUAL",
90     "SLASHEQUAL",
91     "PERCENTEQUAL",
92     "AMPEREQUAL",
93     "VBAREQUAL",
94     "CIRCUMFLEXEQUAL",
95     "LEFTSHIFTEQUAL",
96     "RIGHTSHIFTEQUAL",
97     "DOUBLESTAREQUAL",
98     "DOUBLESLASH",
99     "DOUBLESLASHEQUAL",
100     "AT",
101     "ATEQUAL",
102     "RARROW",
103     "ELLIPSIS",
104     /* This table must match the #defines in token.h! */
105     "OP",
106     "AWAIT",
107     "ASYNC",
108     "<ERRORTOKEN>",
109     "<N_TOKENS>"
110 };
111 
112 
113 /* Create and initialize a new tok_state structure */
114 
115 static struct tok_state *
116 tok_new(void)
117 {
118     struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
119                                             sizeof(struct tok_state));
120     if (tok == NULL)
121         return NULL;
122     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
123     tok->done = E_OK;
124     tok->fp = NULL;
125     tok->input = NULL;
126     tok->tabsize = TABSIZE;
127     tok->indent = 0;
128     tok->indstack[0] = 0;
129 
130     tok->atbol = 1;
131     tok->pendin = 0;
132     tok->prompt = tok->nextprompt = NULL;
133     tok->lineno = 0;
134     tok->level = 0;
135     tok->altwarning = 1;
136     tok->alterror = 1;
137     tok->alttabsize = 1;
138     tok->altindstack[0] = 0;
139     tok->decoding_state = STATE_INIT;
140     tok->decoding_erred = 0;
141     tok->read_coding_spec = 0;
142     tok->enc = NULL;
143     tok->encoding = NULL;
144     tok->cont_line = 0;
145 #ifndef PGEN
146     tok->filename = NULL;
147     tok->decoding_readline = NULL;
148     tok->decoding_buffer = NULL;
149 #endif
150 
151     tok->async_def = 0;
152     tok->async_def_indent = 0;
153     tok->async_def_nl = 0;
154 
155     return tok;
156 }
157 
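/* Copy LEN bytes of S into a fresh, NUL-terminated PyMem buffer.
   On allocation failure, set tok->done to E_NOMEM and return NULL. */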
158 static char *
159 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
160 {
161     char* result = (char *)PyMem_MALLOC(len + 1);
162     if (!result) {
163         tok->done = E_NOMEM;
164         return NULL;
165     }
166     memcpy(result, s, len);
167     result[len] = '\0';
168     return result;
169 }
170 
171 #ifdef PGEN
172 
173 static char *
174 decoding_fgets(char *s, int size, struct tok_state *tok)
175 {
176     return fgets(s, size, tok->fp);
177 }
178 
179 static int
180 decoding_feof(struct tok_state *tok)
181 {
182     return feof(tok->fp);
183 }
184 
185 static char *
186 decode_str(const char *str, int exec_input, struct tok_state *tok)
187 {
188     return new_string(str, strlen(str), tok);
189 }
190 
191 #else /* PGEN */
192 
193 static char *
194 error_ret(struct tok_state *tok) /* XXX */
195 {
196     tok->decoding_erred = 1;
197     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
198         PyMem_FREE(tok->buf);
199     tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;
200     tok->done = E_DECODE;
201     return NULL;                /* as if it were EOF */
202 }
203 
204 
205 static const char *
206 get_normal_name(const char *s)  /* for utf-8 and latin-1 */
207 {
208     char buf[13];
209     int i;
210     for (i = 0; i < 12; i++) {
211         int c = s[i];
212         if (c == '\0')
213             break;
214         else if (c == '_')
215             buf[i] = '-';
216         else
217             buf[i] = tolower(c);
218     }
219     buf[i] = '\0';
220     if (strcmp(buf, "utf-8") == 0 ||
221         strncmp(buf, "utf-8-", 6) == 0)
222         return "utf-8";
223     else if (strcmp(buf, "latin-1") == 0 ||
224              strcmp(buf, "iso-8859-1") == 0 ||
225              strcmp(buf, "iso-latin-1") == 0 ||
226              strncmp(buf, "latin-1-", 8) == 0 ||
227              strncmp(buf, "iso-8859-1-", 11) == 0 ||
228              strncmp(buf, "iso-latin-1-", 12) == 0)
229         return "iso-8859-1";
230     else
231         return s;
232 }
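/* For example, a declared codec name of "UTF_8" normalizes to "utf-8",
   and "Latin_1" normalizes to "iso-8859-1"; unrecognized names are
   returned unchanged. */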
233 
234 /* Return the coding spec in S, or NULL if none is found.  */
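/* Typical lines that match are "# -*- coding: latin-1 -*-" and
   "# coding=utf-8" (see PEP 263). */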
235 
236 static int
237 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
238 {
239     Py_ssize_t i;
240     *spec = NULL;
241     /* Coding spec must be in a comment, and that comment must be
242      * the only statement on the source code line. */
243     for (i = 0; i < size - 6; i++) {
244         if (s[i] == '#')
245             break;
246         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
247             return 1;
248     }
249     for (; i < size - 6; i++) { /* XXX inefficient search */
250         const char* t = s + i;
251         if (strncmp(t, "coding", 6) == 0) {
252             const char* begin = NULL;
253             t += 6;
254             if (t[0] != ':' && t[0] != '=')
255                 continue;
256             do {
257                 t++;
258             } while (t[0] == '\x20' || t[0] == '\t');
259 
260             begin = t;
261             while (Py_ISALNUM(t[0]) ||
262                    t[0] == '-' || t[0] == '_' || t[0] == '.')
263                 t++;
264 
265             if (begin < t) {
266                 char* r = new_string(begin, t - begin, tok);
267                 const char* q;
268                 if (!r)
269                     return 0;
270                 q = get_normal_name(r);
271                 if (r != q) {
272                     PyMem_FREE(r);
273                     r = new_string(q, strlen(q), tok);
274                     if (!r)
275                         return 0;
276                 }
277                 *spec = r;
278                 break;
279             }
280         }
281     }
282     return 1;
283 }
284 
285 /* Check whether the line contains a coding spec. If it does,
286    invoke the set_readline function for the new encoding.
287    This function receives the tok_state and the new encoding.
288    Return 1 on success, 0 on failure.  */
289 
290 static int
291 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
292                   int set_readline(struct tok_state *, const char *))
293 {
294     char *cs;
295     int r = 1;
296 
297     if (tok->cont_line) {
298         /* It's a continuation line, so it can't be a coding spec. */
299         tok->read_coding_spec = 1;
300         return 1;
301     }
302     if (!get_coding_spec(line, &cs, size, tok))
303         return 0;
304     if (!cs) {
305         Py_ssize_t i;
306         for (i = 0; i < size; i++) {
307             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
308                 break;
309             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
310                 /* Stop checking coding spec after a line containing
311                  * anything except a comment. */
312                 tok->read_coding_spec = 1;
313                 break;
314             }
315         }
316         return 1;
317     }
318     tok->read_coding_spec = 1;
319     if (tok->encoding == NULL) {
320         assert(tok->decoding_state == STATE_RAW);
321         if (strcmp(cs, "utf-8") == 0) {
322             tok->encoding = cs;
323         } else {
324             r = set_readline(tok, cs);
325             if (r) {
326                 tok->encoding = cs;
327                 tok->decoding_state = STATE_NORMAL;
328             }
329             else {
330                 PyErr_Format(PyExc_SyntaxError,
331                              "encoding problem: %s", cs);
332                 PyMem_FREE(cs);
333             }
334         }
335     } else {                /* then, compare cs with BOM */
336         r = (strcmp(tok->encoding, cs) == 0);
337         if (!r)
338             PyErr_Format(PyExc_SyntaxError,
339                          "encoding problem: %s with BOM", cs);
340         PyMem_FREE(cs);
341     }
342     return r;
343 }
344 
345 /* See whether the file starts with a BOM. If it does,
346    invoke the set_readline function with the new encoding.
347    Return 1 on success, 0 on failure.  */
348 
349 static int
350 check_bom(int get_char(struct tok_state *),
351           void unget_char(int, struct tok_state *),
352           int set_readline(struct tok_state *, const char *),
353           struct tok_state *tok)
354 {
355     int ch1, ch2, ch3;
356     ch1 = get_char(tok);
357     tok->decoding_state = STATE_RAW;
358     if (ch1 == EOF) {
359         return 1;
360     } else if (ch1 == 0xEF) {
361         ch2 = get_char(tok);
362         if (ch2 != 0xBB) {
363             unget_char(ch2, tok);
364             unget_char(ch1, tok);
365             return 1;
366         }
367         ch3 = get_char(tok);
368         if (ch3 != 0xBF) {
369             unget_char(ch3, tok);
370             unget_char(ch2, tok);
371             unget_char(ch1, tok);
372             return 1;
373         }
374 #if 0
375     /* Disable support for UTF-16 BOMs until a decision
376        is made whether this needs to be supported.  */
377     } else if (ch1 == 0xFE) {
378         ch2 = get_char(tok);
379         if (ch2 != 0xFF) {
380             unget_char(ch2, tok);
381             unget_char(ch1, tok);
382             return 1;
383         }
384         if (!set_readline(tok, "utf-16-be"))
385             return 0;
386         tok->decoding_state = STATE_NORMAL;
387     } else if (ch1 == 0xFF) {
388         ch2 = get_char(tok);
389         if (ch2 != 0xFE) {
390             unget_char(ch2, tok);
391             unget_char(ch1, tok);
392             return 1;
393         }
394         if (!set_readline(tok, "utf-16-le"))
395             return 0;
396         tok->decoding_state = STATE_NORMAL;
397 #endif
398     } else {
399         unget_char(ch1, tok);
400         return 1;
401     }
402     if (tok->encoding != NULL)
403         PyMem_FREE(tok->encoding);
404     tok->encoding = new_string("utf-8", 5, tok);
405     if (!tok->encoding)
406         return 0;
407     /* No need to set_readline: input is already utf-8 */
408     return 1;
409 }
410 
411 /* Read a line of text from TOK into S, using the stream in TOK.
412    Return NULL on failure, else S.
413 
414    On entry, tok->decoding_buffer will be one of:
415      1) NULL: need to call tok->decoding_readline to get a new line
416      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
417        stored the result in tok->decoding_buffer
418      3) PyByteArrayObject *: previous call to fp_readl did not have enough room
419        (in the s buffer) to copy entire contents of the line read
420        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
421        In this case, fp_readl is called in a loop (with an expanded buffer)
422        until the buffer ends with a '\n' (or until the end of the file is
423        reached): see tok_nextc and its calls to decoding_fgets.
424 */
425 
426 static char *
427 fp_readl(char *s, int size, struct tok_state *tok)
428 {
429     PyObject* bufobj;
430     const char *buf;
431     Py_ssize_t buflen;
432 
433     /* Ask for one less byte so we can terminate it */
434     assert(size > 0);
435     size--;
436 
437     if (tok->decoding_buffer) {
438         bufobj = tok->decoding_buffer;
439         Py_INCREF(bufobj);
440     }
441     else
442     {
443         bufobj = PyObject_CallObject(tok->decoding_readline, NULL);
444         if (bufobj == NULL)
445             goto error;
446     }
447     if (PyUnicode_CheckExact(bufobj))
448     {
449         buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
450         if (buf == NULL) {
451             goto error;
452         }
453     }
454     else
455     {
456         buf = PyByteArray_AsString(bufobj);
457         if (buf == NULL) {
458             goto error;
459         }
460         buflen = PyByteArray_GET_SIZE(bufobj);
461     }
462 
463     Py_XDECREF(tok->decoding_buffer);
464     if (buflen > size) {
465         /* Too many chars, the rest goes into tok->decoding_buffer */
466         tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
467                                                          buflen-size);
468         if (tok->decoding_buffer == NULL)
469             goto error;
470         buflen = size;
471     }
472     else
473         tok->decoding_buffer = NULL;
474 
475     memcpy(s, buf, buflen);
476     s[buflen] = '\0';
477     if (buflen == 0) /* EOF */
478         s = NULL;
479     Py_DECREF(bufobj);
480     return s;
481 
482 error:
483     Py_XDECREF(bufobj);
484     return error_ret(tok);
485 }
486 
487 /* Set the readline function for TOK to a StreamReader's
488    readline function. The StreamReader is named ENC.
489 
490    This function is called from check_bom and check_coding_spec.
491 
492    ENC is usually identical to the future value of tok->encoding,
493    except for the (currently unsupported) case of UTF-16.
494 
495    Return 1 on success, 0 on failure. */
496 
497 static int
498 fp_setreadl(struct tok_state *tok, const char* enc)
499 {
500     PyObject *readline, *io, *stream;
501     _Py_IDENTIFIER(open);
502     _Py_IDENTIFIER(readline);
503     int fd;
504     long pos;
505 
506     fd = fileno(tok->fp);
507     /* Due to buffering the file offset for fd can be different from the file
508      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
509      * its file position counts CRLF as one char and can't be directly mapped
510      * to the file offset for fd.  Instead we step back one byte and read to
511      * the end of line.*/
512     pos = ftell(tok->fp);
513     if (pos == -1 ||
514         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
515         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
516         return 0;
517     }
518 
519     io = PyImport_ImportModuleNoBlock("io");
520     if (io == NULL)
521         return 0;
522 
523     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
524                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
525     Py_DECREF(io);
526     if (stream == NULL)
527         return 0;
528 
529     readline = _PyObject_GetAttrId(stream, &PyId_readline);
530     Py_DECREF(stream);
531     if (readline == NULL)
532         return 0;
533     Py_XSETREF(tok->decoding_readline, readline);
534 
535     if (pos > 0) {
536         PyObject *bufobj = PyObject_CallObject(readline, NULL);
537         if (bufobj == NULL)
538             return 0;
539         Py_DECREF(bufobj);
540     }
541 
542     return 1;
543 }
544 
545 /* Fetch the next byte from TOK. */
546 
547 static int fp_getc(struct tok_state *tok) {
548     return getc(tok->fp);
549 }
550 
551 /* Unfetch the last byte back into TOK.  */
552 
553 static void fp_ungetc(int c, struct tok_state *tok) {
554     ungetc(c, tok->fp);
555 }
556 
557 /* Check whether the characters at s start a valid
558    UTF-8 sequence. Return the number of characters forming
559    the sequence if yes, 0 if not.  */
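/* Lead bytes 0xC0-0xDF start a two-byte sequence, 0xE0-0xEF a three-byte
   sequence and 0xF0-0xF7 a four-byte sequence; each continuation byte must
   lie in 0x80-0xBF.  The check is cheap and permissive: it does not reject
   overlong encodings. */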
560 static int valid_utf8(const unsigned char* s)
561 {
562     int expected = 0;
563     int length;
564     if (*s < 0x80)
565         /* single-byte code */
566         return 1;
567     if (*s < 0xc0)
568         /* following byte */
569         return 0;
570     if (*s < 0xE0)
571         expected = 1;
572     else if (*s < 0xF0)
573         expected = 2;
574     else if (*s < 0xF8)
575         expected = 3;
576     else
577         return 0;
578     length = expected + 1;
579     for (; expected; expected--)
580         if (s[expected] < 0x80 || s[expected] >= 0xC0)
581             return 0;
582     return length;
583 }
584 
585 /* Read a line of input from TOK. Determine encoding
586    if necessary.  */
587 
588 static char *
589 decoding_fgets(char *s, int size, struct tok_state *tok)
590 {
591     char *line = NULL;
592     int badchar = 0;
593     for (;;) {
594         if (tok->decoding_state == STATE_NORMAL) {
595             /* We already have a codec associated with
596                this input. */
597             line = fp_readl(s, size, tok);
598             break;
599         } else if (tok->decoding_state == STATE_RAW) {
600             /* We want a 'raw' read. */
601             line = Py_UniversalNewlineFgets(s, size,
602                                             tok->fp, NULL);
603             break;
604         } else {
605             /* We have not yet determined the encoding.
606                If an encoding is found, use the file-pointer
607                reader functions from now on. */
608             if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
609                 return error_ret(tok);
610             assert(tok->decoding_state != STATE_INIT);
611         }
612     }
613     if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
614         if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
615             return error_ret(tok);
616         }
617     }
618 #ifndef PGEN
619     /* The default encoding is UTF-8, so make sure we don't have any
620        non-UTF-8 sequences in it. */
621     if (line && !tok->encoding) {
622         unsigned char *c;
623         int length;
624         for (c = (unsigned char *)line; *c; c += length)
625             if (!(length = valid_utf8(c))) {
626                 badchar = *c;
627                 break;
628             }
629     }
630     if (badchar) {
631         /* Need to add 1 to the line number, since this line
632            has not been counted yet.  */
633         PyErr_Format(PyExc_SyntaxError,
634                 "Non-UTF-8 code starting with '\\x%.2x' "
635                 "in file %U on line %i, "
636                 "but no encoding declared; "
637                 "see http://python.org/dev/peps/pep-0263/ for details",
638                 badchar, tok->filename, tok->lineno + 1);
639         return error_ret(tok);
640     }
641 #endif
642     return line;
643 }
644 
645 static int
646 decoding_feof(struct tok_state *tok)
647 {
648     if (tok->decoding_state != STATE_NORMAL) {
649         return feof(tok->fp);
650     } else {
651         PyObject* buf = tok->decoding_buffer;
652         if (buf == NULL) {
653             buf = PyObject_CallObject(tok->decoding_readline, NULL);
654             if (buf == NULL) {
655                 error_ret(tok);
656                 return 1;
657             } else {
658                 tok->decoding_buffer = buf;
659             }
660         }
661         return PyObject_Length(buf) == 0;
662     }
663 }
664 
665 /* Fetch a byte from TOK, using the string buffer. */
666 
667 static int
668 buf_getc(struct tok_state *tok) {
669     return Py_CHARMASK(*tok->str++);
670 }
671 
672 /* Unfetch a byte from TOK, using the string buffer. */
673 
674 static void
675 buf_ungetc(int c, struct tok_state *tok) {
676     tok->str--;
677     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
678 }
679 
680 /* Set the readline function for TOK to ENC. For the string-based
681    tokenizer, this means to just record the encoding. */
682 
683 static int
684 buf_setreadl(struct tok_state *tok, const char* enc) {
685     tok->enc = enc;
686     return 1;
687 }
688 
689 /* Return a UTF-8 encoded Python bytes object from the
690    C byte string STR, which is encoded with ENC. */
691 
692 static PyObject *
693 translate_into_utf8(const char* str, const char* enc) {
694     PyObject *utf8;
695     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
696     if (buf == NULL)
697         return NULL;
698     utf8 = PyUnicode_AsUTF8String(buf);
699     Py_DECREF(buf);
700     return utf8;
701 }
702 
703 
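/* Copy S into a fresh PyMem buffer, replacing "\r\n" and lone "\r" with
   "\n".  For exec input a trailing "\n" is added if missing; needed_length
   reserves room for that extra newline plus the NUL terminator.
   Returns NULL with tok->done set to E_NOMEM on allocation failure. */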
704 static char *
705 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
706     int skip_next_lf = 0;
707     size_t needed_length = strlen(s) + 2, final_length;
708     char *buf, *current;
709     char c = '\0';
710     buf = PyMem_MALLOC(needed_length);
711     if (buf == NULL) {
712         tok->done = E_NOMEM;
713         return NULL;
714     }
715     for (current = buf; *s; s++, current++) {
716         c = *s;
717         if (skip_next_lf) {
718             skip_next_lf = 0;
719             if (c == '\n') {
720                 c = *++s;
721                 if (!c)
722                     break;
723             }
724         }
725         if (c == '\r') {
726             skip_next_lf = 1;
727             c = '\n';
728         }
729         *current = c;
730     }
731     /* If this is exec input, add a newline to the end of the string if
732        there isn't one already. */
733     if (exec_input && c != '\n') {
734         *current = '\n';
735         current++;
736     }
737     *current = '\0';
738     final_length = current - buf + 1;
739     if (final_length < needed_length && final_length)
740         /* should never fail */
741         buf = PyMem_REALLOC(buf, final_length);
742     return buf;
743 }
744 
745 /* Decode a byte string STR for use as the buffer of TOK.
746    Look for encoding declarations inside STR, and record them
747    inside TOK.  */
748 
749 static const char *
750 decode_str(const char *input, int single, struct tok_state *tok)
751 {
752     PyObject* utf8 = NULL;
753     const char *str;
754     const char *s;
755     const char *newl[2] = {NULL, NULL};
756     int lineno = 0;
757     tok->input = str = translate_newlines(input, single, tok);
758     if (str == NULL)
759         return NULL;
760     tok->enc = NULL;
761     tok->str = str;
762     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
763         return error_ret(tok);
764     str = tok->str;             /* string after BOM if any */
765     assert(str);
766     if (tok->enc != NULL) {
767         utf8 = translate_into_utf8(str, tok->enc);
768         if (utf8 == NULL)
769             return error_ret(tok);
770         str = PyBytes_AsString(utf8);
771     }
772     for (s = str;; s++) {
773         if (*s == '\0') break;
774         else if (*s == '\n') {
775             assert(lineno < 2);
776             newl[lineno] = s;
777             lineno++;
778             if (lineno == 2) break;
779         }
780     }
781     tok->enc = NULL;
782     /* need to check line 1 and 2 separately since check_coding_spec
783        assumes a single line as input */
784     if (newl[0]) {
785         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
786             return error_ret(tok);
787         if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
788             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
789                                    tok, buf_setreadl))
790                 return error_ret(tok);
791         }
792     }
793     if (tok->enc != NULL) {
794         assert(utf8 == NULL);
795         utf8 = translate_into_utf8(str, tok->enc);
796         if (utf8 == NULL)
797             return error_ret(tok);
798         str = PyBytes_AS_STRING(utf8);
799     }
800     assert(tok->decoding_buffer == NULL);
801     tok->decoding_buffer = utf8; /* CAUTION */
802     return str;
803 }
804 
805 #endif /* PGEN */
806 
807 /* Set up tokenizer for string */
808 
809 struct tok_state *
810 PyTokenizer_FromString(const char *str, int exec_input)
811 {
812     struct tok_state *tok = tok_new();
813     if (tok == NULL)
814         return NULL;
815     str = decode_str(str, exec_input, tok);
816     if (str == NULL) {
817         PyTokenizer_Free(tok);
818         return NULL;
819     }
820 
821     /* XXX: constify members. */
822     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
823     return tok;
824 }
825 
826 struct tok_state *
827 PyTokenizer_FromUTF8(const char *str, int exec_input)
828 {
829     struct tok_state *tok = tok_new();
830     if (tok == NULL)
831         return NULL;
832 #ifndef PGEN
833     tok->input = str = translate_newlines(str, exec_input, tok);
834 #endif
835     if (str == NULL) {
836         PyTokenizer_Free(tok);
837         return NULL;
838     }
839     tok->decoding_state = STATE_RAW;
840     tok->read_coding_spec = 1;
841     tok->enc = NULL;
842     tok->str = str;
843     tok->encoding = (char *)PyMem_MALLOC(6);
844     if (!tok->encoding) {
845         PyTokenizer_Free(tok);
846         return NULL;
847     }
848     strcpy(tok->encoding, "utf-8");
849 
850     /* XXX: constify members. */
851     tok->buf = tok->cur = tok->end = tok->inp = (char*)str;
852     return tok;
853 }
854 
855 /* Set up tokenizer for file */
856 
857 struct tok_state *
858 PyTokenizer_FromFile(FILE *fp, const char* enc,
859                      const char *ps1, const char *ps2)
860 {
861     struct tok_state *tok = tok_new();
862     if (tok == NULL)
863         return NULL;
864     if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
865         PyTokenizer_Free(tok);
866         return NULL;
867     }
868     tok->cur = tok->inp = tok->buf;
869     tok->end = tok->buf + BUFSIZ;
870     tok->fp = fp;
871     tok->prompt = ps1;
872     tok->nextprompt = ps2;
873     if (enc != NULL) {
874         /* Must copy encoding declaration since it
875            gets copied into the parse tree. */
876         tok->encoding = PyMem_MALLOC(strlen(enc)+1);
877         if (!tok->encoding) {
878             PyTokenizer_Free(tok);
879             return NULL;
880         }
881         strcpy(tok->encoding, enc);
882         tok->decoding_state = STATE_NORMAL;
883     }
884     return tok;
885 }
886 
887 
888 /* Free a tok_state structure */
889 
890 void
891 PyTokenizer_Free(struct tok_state *tok)
892 {
893     if (tok->encoding != NULL)
894         PyMem_FREE(tok->encoding);
895 #ifndef PGEN
896     Py_XDECREF(tok->decoding_readline);
897     Py_XDECREF(tok->decoding_buffer);
898     Py_XDECREF(tok->filename);
899 #endif
900     if (tok->fp != NULL && tok->buf != NULL)
901         PyMem_FREE(tok->buf);
902     if (tok->input)
903         PyMem_FREE((char *)tok->input);
904     PyMem_FREE(tok);
905 }
906 
907 /* Get next char, updating state; error code goes into tok->done */
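/* Three input sources are handled below: an in-memory string
   (tok->fp == NULL), an interactive prompt (tok->prompt != NULL), and a
   file, whose buffer is grown until a complete line or EOF has been read. */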
908 
909 static int
910 tok_nextc(struct tok_state *tok)
911 {
912     for (;;) {
913         if (tok->cur != tok->inp) {
914             return Py_CHARMASK(*tok->cur++); /* Fast path */
915         }
916         if (tok->done != E_OK)
917             return EOF;
918         if (tok->fp == NULL) {
919             char *end = strchr(tok->inp, '\n');
920             if (end != NULL)
921                 end++;
922             else {
923                 end = strchr(tok->inp, '\0');
924                 if (end == tok->inp) {
925                     tok->done = E_EOF;
926                     return EOF;
927                 }
928             }
929             if (tok->start == NULL)
930                 tok->buf = tok->cur;
931             tok->line_start = tok->cur;
932             tok->lineno++;
933             tok->inp = end;
934             return Py_CHARMASK(*tok->cur++);
935         }
936         if (tok->prompt != NULL) {
937             char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
938 #ifndef PGEN
939             if (newtok != NULL) {
940                 char *translated = translate_newlines(newtok, 0, tok);
941                 PyMem_FREE(newtok);
942                 if (translated == NULL)
943                     return EOF;
944                 newtok = translated;
945             }
946             if (tok->encoding && newtok && *newtok) {
947                 /* Recode to UTF-8 */
948                 Py_ssize_t buflen;
949                 const char* buf;
950                 PyObject *u = translate_into_utf8(newtok, tok->encoding);
951                 PyMem_FREE(newtok);
952                 if (!u) {
953                     tok->done = E_DECODE;
954                     return EOF;
955                 }
956                 buflen = PyBytes_GET_SIZE(u);
957                 buf = PyBytes_AS_STRING(u);
958                 newtok = PyMem_MALLOC(buflen+1);
959                 strcpy(newtok, buf);
960                 Py_DECREF(u);
961             }
962 #endif
963             if (tok->nextprompt != NULL)
964                 tok->prompt = tok->nextprompt;
965             if (newtok == NULL)
966                 tok->done = E_INTR;
967             else if (*newtok == '\0') {
968                 PyMem_FREE(newtok);
969                 tok->done = E_EOF;
970             }
971             else if (tok->start != NULL) {
972                 size_t start = tok->start - tok->buf;
973                 size_t oldlen = tok->cur - tok->buf;
974                 size_t newlen = oldlen + strlen(newtok);
975                 char *buf = tok->buf;
976                 buf = (char *)PyMem_REALLOC(buf, newlen+1);
977                 tok->lineno++;
978                 if (buf == NULL) {
979                     PyMem_FREE(tok->buf);
980                     tok->buf = NULL;
981                     PyMem_FREE(newtok);
982                     tok->done = E_NOMEM;
983                     return EOF;
984                 }
985                 tok->buf = buf;
986                 tok->cur = tok->buf + oldlen;
987                 tok->line_start = tok->cur;
988                 strcpy(tok->buf + oldlen, newtok);
989                 PyMem_FREE(newtok);
990                 tok->inp = tok->buf + newlen;
991                 tok->end = tok->inp + 1;
992                 tok->start = tok->buf + start;
993             }
994             else {
995                 tok->lineno++;
996                 if (tok->buf != NULL)
997                     PyMem_FREE(tok->buf);
998                 tok->buf = newtok;
999                 tok->cur = tok->buf;
1000                 tok->line_start = tok->buf;
1001                 tok->inp = strchr(tok->buf, '\0');
1002                 tok->end = tok->inp + 1;
1003             }
1004         }
1005         else {
1006             int done = 0;
1007             Py_ssize_t cur = 0;
1008             char *pt;
1009             if (tok->start == NULL) {
1010                 if (tok->buf == NULL) {
1011                     tok->buf = (char *)
1012                         PyMem_MALLOC(BUFSIZ);
1013                     if (tok->buf == NULL) {
1014                         tok->done = E_NOMEM;
1015                         return EOF;
1016                     }
1017                     tok->end = tok->buf + BUFSIZ;
1018                 }
1019                 if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
1020                           tok) == NULL) {
1021                     if (!tok->decoding_erred)
1022                         tok->done = E_EOF;
1023                     done = 1;
1024                 }
1025                 else {
1026                     tok->done = E_OK;
1027                     tok->inp = strchr(tok->buf, '\0');
1028                     done = tok->inp == tok->buf || tok->inp[-1] == '\n';
1029                 }
1030             }
1031             else {
1032                 cur = tok->cur - tok->buf;
1033                 if (decoding_feof(tok)) {
1034                     tok->done = E_EOF;
1035                     done = 1;
1036                 }
1037                 else
1038                     tok->done = E_OK;
1039             }
1040             tok->lineno++;
1041             /* Read until '\n' or EOF */
1042             while (!done) {
1043                 Py_ssize_t curstart = tok->start == NULL ? -1 :
1044                           tok->start - tok->buf;
1045                 Py_ssize_t curvalid = tok->inp - tok->buf;
1046                 Py_ssize_t newsize = curvalid + BUFSIZ;
1047                 char *newbuf = tok->buf;
1048                 newbuf = (char *)PyMem_REALLOC(newbuf,
1049                                                newsize);
1050                 if (newbuf == NULL) {
1051                     tok->done = E_NOMEM;
1052                     tok->cur = tok->inp;
1053                     return EOF;
1054                 }
1055                 tok->buf = newbuf;
1056                 tok->cur = tok->buf + cur;
1057                 tok->line_start = tok->cur;
1058                 tok->inp = tok->buf + curvalid;
1059                 tok->end = tok->buf + newsize;
1060                 tok->start = curstart < 0 ? NULL :
1061                          tok->buf + curstart;
1062                 if (decoding_fgets(tok->inp,
1063                                (int)(tok->end - tok->inp),
1064                                tok) == NULL) {
1065                     /* Break out early on decoding
1066                        errors, as tok->buf will be NULL
1067                      */
1068                     if (tok->decoding_erred)
1069                         return EOF;
1070                     /* Last line does not end in \n,
1071                        fake one */
1072                     strcpy(tok->inp, "\n");
1073                 }
1074                 tok->inp = strchr(tok->inp, '\0');
1075                 done = tok->inp[-1] == '\n';
1076             }
1077             if (tok->buf != NULL) {
1078                 tok->cur = tok->buf + cur;
1079                 tok->line_start = tok->cur;
1080                 /* replace "\r\n" with "\n" */
1081                 /* For Mac leave the \r, giving a syntax error */
1082                 pt = tok->inp - 2;
1083                 if (pt >= tok->buf && *pt == '\r') {
1084                     *pt++ = '\n';
1085                     *pt = '\0';
1086                     tok->inp = pt;
1087                 }
1088             }
1089         }
1090         if (tok->done != E_OK) {
1091             if (tok->prompt != NULL)
1092                 PySys_WriteStderr("\n");
1093             tok->cur = tok->inp;
1094             return EOF;
1095         }
1096     }
1097     /*NOTREACHED*/
1098 }
1099 
1100 
1101 /* Back-up one character */
1102 
1103 static void
1104 tok_backup(struct tok_state *tok, int c)
1105 {
1106     if (c != EOF) {
1107         if (--tok->cur < tok->buf)
1108             Py_FatalError("tok_backup: beginning of buffer");
1109         if (*tok->cur != c)
1110             *tok->cur = c;
1111     }
1112 }
1113 
1114 
1115 /* Return the token corresponding to a single character */
1116 
1117 int
1118 PyToken_OneChar(int c)
1119 {
1120     switch (c) {
1121     case '(':           return LPAR;
1122     case ')':           return RPAR;
1123     case '[':           return LSQB;
1124     case ']':           return RSQB;
1125     case ':':           return COLON;
1126     case ',':           return COMMA;
1127     case ';':           return SEMI;
1128     case '+':           return PLUS;
1129     case '-':           return MINUS;
1130     case '*':           return STAR;
1131     case '/':           return SLASH;
1132     case '|':           return VBAR;
1133     case '&':           return AMPER;
1134     case '<':           return LESS;
1135     case '>':           return GREATER;
1136     case '=':           return EQUAL;
1137     case '.':           return DOT;
1138     case '%':           return PERCENT;
1139     case '{':           return LBRACE;
1140     case '}':           return RBRACE;
1141     case '^':           return CIRCUMFLEX;
1142     case '~':           return TILDE;
1143     case '@':           return AT;
1144     default:            return OP;
1145     }
1146 }
1147 
1148 
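/* Return the token corresponding to a two-character operator,
   or OP if c1 and c2 do not form one. */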
1149 int
1150 PyToken_TwoChars(int c1, int c2)
1151 {
1152     switch (c1) {
1153     case '=':
1154         switch (c2) {
1155         case '=':               return EQEQUAL;
1156         }
1157         break;
1158     case '!':
1159         switch (c2) {
1160         case '=':               return NOTEQUAL;
1161         }
1162         break;
1163     case '<':
1164         switch (c2) {
1165         case '>':               return NOTEQUAL;
1166         case '=':               return LESSEQUAL;
1167         case '<':               return LEFTSHIFT;
1168         }
1169         break;
1170     case '>':
1171         switch (c2) {
1172         case '=':               return GREATEREQUAL;
1173         case '>':               return RIGHTSHIFT;
1174         }
1175         break;
1176     case '+':
1177         switch (c2) {
1178         case '=':               return PLUSEQUAL;
1179         }
1180         break;
1181     case '-':
1182         switch (c2) {
1183         case '=':               return MINEQUAL;
1184         case '>':               return RARROW;
1185         }
1186         break;
1187     case '*':
1188         switch (c2) {
1189         case '*':               return DOUBLESTAR;
1190         case '=':               return STAREQUAL;
1191         }
1192         break;
1193     case '/':
1194         switch (c2) {
1195         case '/':               return DOUBLESLASH;
1196         case '=':               return SLASHEQUAL;
1197         }
1198         break;
1199     case '|':
1200         switch (c2) {
1201         case '=':               return VBAREQUAL;
1202         }
1203         break;
1204     case '%':
1205         switch (c2) {
1206         case '=':               return PERCENTEQUAL;
1207         }
1208         break;
1209     case '&':
1210         switch (c2) {
1211         case '=':               return AMPEREQUAL;
1212         }
1213         break;
1214     case '^':
1215         switch (c2) {
1216         case '=':               return CIRCUMFLEXEQUAL;
1217         }
1218         break;
1219     case '@':
1220         switch (c2) {
1221         case '=':               return ATEQUAL;
1222         }
1223         break;
1224     }
1225     return OP;
1226 }
1227 
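/* Return the token corresponding to a three-character operator,
   or OP if c1, c2 and c3 do not form one. */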
1228 int
1229 PyToken_ThreeChars(int c1, int c2, int c3)
1230 {
1231     switch (c1) {
1232     case '<':
1233         switch (c2) {
1234         case '<':
1235             switch (c3) {
1236             case '=':
1237                 return LEFTSHIFTEQUAL;
1238             }
1239             break;
1240         }
1241         break;
1242     case '>':
1243         switch (c2) {
1244         case '>':
1245             switch (c3) {
1246             case '=':
1247                 return RIGHTSHIFTEQUAL;
1248             }
1249             break;
1250         }
1251         break;
1252     case '*':
1253         switch (c2) {
1254         case '*':
1255             switch (c3) {
1256             case '=':
1257                 return DOUBLESTAREQUAL;
1258             }
1259             break;
1260         }
1261         break;
1262     case '/':
1263         switch (c2) {
1264         case '/':
1265             switch (c3) {
1266             case '=':
1267                 return DOUBLESLASHEQUAL;
1268             }
1269             break;
1270         }
1271         break;
1272     case '.':
1273         switch (c2) {
1274         case '.':
1275             switch (c3) {
1276             case '.':
1277                 return ELLIPSIS;
1278             }
1279             break;
1280         }
1281         break;
1282     }
1283     return OP;
1284 }
1285 
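/* Handle inconsistent use of tabs and spaces in indentation.  With
   tok->alterror set this is a hard E_TABSPACE error (return 1); otherwise
   a warning is printed at most once and 0 is returned. */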
1286 static int
1287 indenterror(struct tok_state *tok)
1288 {
1289     if (tok->alterror) {
1290         tok->done = E_TABSPACE;
1291         tok->cur = tok->inp;
1292         return 1;
1293     }
1294     if (tok->altwarning) {
1295 #ifdef PGEN
1296         PySys_WriteStderr("inconsistent use of tabs and spaces "
1297                           "in indentation\n");
1298 #else
1299         PySys_FormatStderr("%U: inconsistent use of tabs and spaces "
1300                           "in indentation\n", tok->filename);
1301 #endif
1302         tok->altwarning = 0;
1303     }
1304     return 0;
1305 }
1306 
1307 #ifdef PGEN
1308 #define verify_identifier(tok) 1
1309 #else
1310 /* Verify that the identifier follows PEP 3131.
1311    All identifier strings are guaranteed to be "ready" unicode objects.
1312  */
1313 static int
1314 verify_identifier(struct tok_state *tok)
1315 {
1316     PyObject *s;
1317     int result;
1318     if (tok->decoding_erred)
1319         return 0;
1320     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1321     if (s == NULL || PyUnicode_READY(s) == -1) {
1322         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1323             PyErr_Clear();
1324             tok->done = E_IDENTIFIER;
1325         } else {
1326             tok->done = E_ERROR;
1327         }
1328         return 0;
1329     }
1330     result = PyUnicode_IsIdentifier(s);
1331     Py_DECREF(s);
1332     if (result == 0)
1333         tok->done = E_IDENTIFIER;
1334     return result;
1335 }
1336 #endif
1337 
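/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digits (PEP 515).  Return the first character after
   the digits, or 0 (with tok->done set to E_TOKEN) on a misplaced
   underscore. */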
1338 static int
1339 tok_decimal_tail(struct tok_state *tok)
1340 {
1341     int c;
1342 
1343     while (1) {
1344         do {
1345             c = tok_nextc(tok);
1346         } while (isdigit(c));
1347         if (c != '_') {
1348             break;
1349         }
1350         c = tok_nextc(tok);
1351         if (!isdigit(c)) {
1352             tok->done = E_TOKEN;
1353             tok_backup(tok, c);
1354             return 0;
1355         }
1356     }
1357     return c;
1358 }
1359 
1360 /* Get next token, after space stripping etc. */
1361 
1362 static int
1363 tok_get(struct tok_state *tok, char **p_start, char **p_end)
1364 {
1365     int c;
1366     int blankline, nonascii;
1367 
1368     *p_start = *p_end = NULL;
1369   nextline:
1370     tok->start = NULL;
1371     blankline = 0;
1372 
1373     /* Get indentation level */
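    /* col counts columns using TABSIZE-wide tab stops; altcol counts them
       as if tabs were one column wide.  Comparing both stacks later detects
       indentation whose meaning depends on the tab size (see indenterror). */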
1374     if (tok->atbol) {
1375         int col = 0;
1376         int altcol = 0;
1377         tok->atbol = 0;
1378         for (;;) {
1379             c = tok_nextc(tok);
1380             if (c == ' ') {
1381                 col++, altcol++;
1382             }
1383             else if (c == '\t') {
1384                 col = (col/tok->tabsize + 1) * tok->tabsize;
1385                 altcol = (altcol/tok->alttabsize + 1)
1386                     * tok->alttabsize;
1387             }
1388             else if (c == '\014')  {/* Control-L (formfeed) */
1389                 col = altcol = 0; /* For Emacs users */
1390             }
1391             else {
1392                 break;
1393             }
1394         }
1395         tok_backup(tok, c);
1396         if (c == '#' || c == '\n') {
1397             /* Lines with only whitespace and/or comments
1398                shouldn't affect the indentation and are
1399                not passed to the parser as NEWLINE tokens,
1400                except *totally* empty lines in interactive
1401                mode, which signal the end of a command group. */
1402             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1403                 blankline = 0; /* Let it through */
1404             }
1405             else {
1406                 blankline = 1; /* Ignore completely */
1407             }
1408             /* We can't jump back right here since we still
1409                may need to skip to the end of a comment */
1410         }
1411         if (!blankline && tok->level == 0) {
1412             if (col == tok->indstack[tok->indent]) {
1413                 /* No change */
1414                 if (altcol != tok->altindstack[tok->indent]) {
1415                     if (indenterror(tok)) {
1416                         return ERRORTOKEN;
1417                     }
1418                 }
1419             }
1420             else if (col > tok->indstack[tok->indent]) {
1421                 /* Indent -- always one */
1422                 if (tok->indent+1 >= MAXINDENT) {
1423                     tok->done = E_TOODEEP;
1424                     tok->cur = tok->inp;
1425                     return ERRORTOKEN;
1426                 }
1427                 if (altcol <= tok->altindstack[tok->indent]) {
1428                     if (indenterror(tok)) {
1429                         return ERRORTOKEN;
1430                     }
1431                 }
1432                 tok->pendin++;
1433                 tok->indstack[++tok->indent] = col;
1434                 tok->altindstack[tok->indent] = altcol;
1435             }
1436             else /* col < tok->indstack[tok->indent] */ {
1437                 /* Dedent -- any number, must be consistent */
1438                 while (tok->indent > 0 &&
1439                     col < tok->indstack[tok->indent]) {
1440                     tok->pendin--;
1441                     tok->indent--;
1442                 }
1443                 if (col != tok->indstack[tok->indent]) {
1444                     tok->done = E_DEDENT;
1445                     tok->cur = tok->inp;
1446                     return ERRORTOKEN;
1447                 }
1448                 if (altcol != tok->altindstack[tok->indent]) {
1449                     if (indenterror(tok)) {
1450                         return ERRORTOKEN;
1451                     }
1452                 }
1453             }
1454         }
1455     }
1456 
1457     tok->start = tok->cur;
1458 
1459     /* Return pending indents/dedents */
1460     if (tok->pendin != 0) {
1461         if (tok->pendin < 0) {
1462             tok->pendin++;
1463             return DEDENT;
1464         }
1465         else {
1466             tok->pendin--;
1467             return INDENT;
1468         }
1469     }
1470 
1471     if (tok->async_def
1472         && !blankline
1473         && tok->level == 0
1474         /* There was a NEWLINE after ASYNC DEF,
1475            so we're past the signature. */
1476         && tok->async_def_nl
1477         /* Current indentation level is less than where
1478            the async function was defined */
1479         && tok->async_def_indent >= tok->indent)
1480     {
1481         tok->async_def = 0;
1482         tok->async_def_indent = 0;
1483         tok->async_def_nl = 0;
1484     }
1485 
1486  again:
1487     tok->start = NULL;
1488     /* Skip spaces */
1489     do {
1490         c = tok_nextc(tok);
1491     } while (c == ' ' || c == '\t' || c == '\014');
1492 
1493     /* Set start of current token */
1494     tok->start = tok->cur - 1;
1495 
1496     /* Skip comment */
1497     if (c == '#') {
1498         while (c != EOF && c != '\n') {
1499             c = tok_nextc(tok);
1500         }
1501     }
1502 
1503     /* Check for EOF and errors now */
1504     if (c == EOF) {
1505         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1506     }
1507 
1508     /* Identifier (most frequent token!) */
1509     nonascii = 0;
1510     if (is_potential_identifier_start(c)) {
1511         /* Process the various legal combinations of b"", r"", u"", and f"". */
1512         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1513         while (1) {
1514             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1515                 saw_b = 1;
1516             /* Since this is a backwards compatibility support literal we don't
1517                want to support it in arbitrary order like byte literals. */
1518             else if (!(saw_b || saw_u || saw_r || saw_f)
1519                      && (c == 'u'|| c == 'U')) {
1520                 saw_u = 1;
1521             }
1522             /* ur"" and ru"" are not supported */
1523             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1524                 saw_r = 1;
1525             }
1526             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1527                 saw_f = 1;
1528             }
1529             else {
1530                 break;
1531             }
1532             c = tok_nextc(tok);
1533             if (c == '"' || c == '\'') {
1534                 goto letter_quote;
1535             }
1536         }
1537         while (is_potential_identifier_char(c)) {
1538             if (c >= 128) {
1539                 nonascii = 1;
1540             }
1541             c = tok_nextc(tok);
1542         }
1543         tok_backup(tok, c);
1544         if (nonascii && !verify_identifier(tok)) {
1545             return ERRORTOKEN;
1546         }
1547         *p_start = tok->start;
1548         *p_end = tok->cur;
1549 
1550         /* async/await parsing block. */
1551         if (tok->cur - tok->start == 5) {
1552             /* Current token length is 5. */
1553             if (tok->async_def) {
1554                 /* We're inside an 'async def' function. */
1555                 if (memcmp(tok->start, "async", 5) == 0) {
1556                     return ASYNC;
1557                 }
1558                 if (memcmp(tok->start, "await", 5) == 0) {
1559                     return AWAIT;
1560                 }
1561             }
1562             else if (memcmp(tok->start, "async", 5) == 0) {
1563                 /* The current token is 'async'.
1564                    Look ahead one token.*/
1565 
1566                 struct tok_state ahead_tok;
1567                 char *ahead_tok_start = NULL, *ahead_tok_end = NULL;
1568                 int ahead_tok_kind;
1569 
1570                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1571                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1572                                          &ahead_tok_end);
1573 
1574                 if (ahead_tok_kind == NAME
1575                     && ahead_tok.cur - ahead_tok.start == 3
1576                     && memcmp(ahead_tok.start, "def", 3) == 0)
1577                 {
1578                     /* The next token is going to be 'def', so instead of
1579                        returning 'async' NAME token, we return ASYNC. */
1580                     tok->async_def_indent = tok->indent;
1581                     tok->async_def = 1;
1582                     return ASYNC;
1583                 }
1584             }
1585         }
1586 
1587         return NAME;
1588     }
1589 
1590     /* Newline */
1591     if (c == '\n') {
1592         tok->atbol = 1;
1593         if (blankline || tok->level > 0) {
1594             goto nextline;
1595         }
1596         *p_start = tok->start;
1597         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1598         tok->cont_line = 0;
1599         if (tok->async_def) {
1600             /* We're somewhere inside an 'async def' function, and
1601                we've encountered a NEWLINE after its signature. */
1602             tok->async_def_nl = 1;
1603         }
1604         return NEWLINE;
1605     }
1606 
1607     /* Period or number starting with period? */
1608     if (c == '.') {
1609         c = tok_nextc(tok);
1610         if (isdigit(c)) {
1611             goto fraction;
1612         } else if (c == '.') {
1613             c = tok_nextc(tok);
1614             if (c == '.') {
1615                 *p_start = tok->start;
1616                 *p_end = tok->cur;
1617                 return ELLIPSIS;
1618             }
1619             else {
1620                 tok_backup(tok, c);
1621             }
1622             tok_backup(tok, '.');
1623         }
1624         else {
1625             tok_backup(tok, c);
1626         }
1627         *p_start = tok->start;
1628         *p_end = tok->cur;
1629         return DOT;
1630     }
1631 
1632     /* Number */
1633     if (isdigit(c)) {
1634         if (c == '0') {
1635             /* Hex, octal or binary -- maybe. */
1636             c = tok_nextc(tok);
1637             if (c == 'x' || c == 'X') {
1638                 /* Hex */
1639                 c = tok_nextc(tok);
1640                 do {
1641                     if (c == '_') {
1642                         c = tok_nextc(tok);
1643                     }
1644                     if (!isxdigit(c)) {
1645                         tok->done = E_TOKEN;
1646                         tok_backup(tok, c);
1647                         return ERRORTOKEN;
1648                     }
1649                     do {
1650                         c = tok_nextc(tok);
1651                     } while (isxdigit(c));
1652                 } while (c == '_');
1653             }
1654             else if (c == 'o' || c == 'O') {
1655                 /* Octal */
1656                 c = tok_nextc(tok);
1657                 do {
1658                     if (c == '_') {
1659                         c = tok_nextc(tok);
1660                     }
1661                     if (c < '0' || c >= '8') {
1662                         tok->done = E_TOKEN;
1663                         tok_backup(tok, c);
1664                         return ERRORTOKEN;
1665                     }
1666                     do {
1667                         c = tok_nextc(tok);
1668                     } while ('0' <= c && c < '8');
1669                 } while (c == '_');
1670             }
1671             else if (c == 'b' || c == 'B') {
1672                 /* Binary */
1673                 c = tok_nextc(tok);
1674                 do {
1675                     if (c == '_') {
1676                         c = tok_nextc(tok);
1677                     }
1678                     if (c != '0' && c != '1') {
1679                         tok->done = E_TOKEN;
1680                         tok_backup(tok, c);
1681                         return ERRORTOKEN;
1682                     }
1683                     do {
1684                         c = tok_nextc(tok);
1685                     } while (c == '0' || c == '1');
1686                 } while (c == '_');
1687             }
1688             else {
1689                 int nonzero = 0;
1690                 /* maybe old-style octal; c is first char of it */
1691                 /* in any case, allow '0' as a literal */
1692                 while (1) {
1693                     if (c == '_') {
1694                         c = tok_nextc(tok);
1695                         if (!isdigit(c)) {
1696                             tok->done = E_TOKEN;
1697                             tok_backup(tok, c);
1698                             return ERRORTOKEN;
1699                         }
1700                     }
1701                     if (c != '0') {
1702                         break;
1703                     }
1704                     c = tok_nextc(tok);
1705                 }
1706                 if (isdigit(c)) {
1707                     nonzero = 1;
1708                     c = tok_decimal_tail(tok);
1709                     if (c == 0) {
1710                         return ERRORTOKEN;
1711                     }
1712                 }
1713                 if (c == '.') {
1714                     c = tok_nextc(tok);
1715                     goto fraction;
1716                 }
1717                 else if (c == 'e' || c == 'E') {
1718                     goto exponent;
1719                 }
1720                 else if (c == 'j' || c == 'J') {
1721                     goto imaginary;
1722                 }
1723                 else if (nonzero) {
1724                     /* Old-style octal: now disallowed. */
1725                     tok->done = E_TOKEN;
1726                     tok_backup(tok, c);
1727                     return ERRORTOKEN;
1728                 }
1729             }
1730         }
1731         else {
1732             /* Decimal */
1733             c = tok_decimal_tail(tok);
1734             if (c == 0) {
1735                 return ERRORTOKEN;
1736             }
1737             {
1738                 /* Accept floating point numbers. */
1739                 if (c == '.') {
1740                     c = tok_nextc(tok);
1741         fraction:
1742                     /* Fraction */
1743                     if (isdigit(c)) {
1744                         c = tok_decimal_tail(tok);
1745                         if (c == 0) {
1746                             return ERRORTOKEN;
1747                         }
1748                     }
1749                 }
1750                 if (c == 'e' || c == 'E') {
1751                     int e;
1752         exponent:
1753                     e = c;
1754                     /* Exponent part */
1755                     c = tok_nextc(tok);
1756                     if (c == '+' || c == '-') {
1757                         c = tok_nextc(tok);
1758                         if (!isdigit(c)) {
1759                             tok->done = E_TOKEN;
1760                             tok_backup(tok, c);
1761                             return ERRORTOKEN;
1762                         }
1763                     } else if (!isdigit(c)) {
1764                         tok_backup(tok, c);
1765                         tok_backup(tok, e);
1766                         *p_start = tok->start;
1767                         *p_end = tok->cur;
1768                         return NUMBER;
1769                     }
1770                     c = tok_decimal_tail(tok);
1771                     if (c == 0) {
1772                         return ERRORTOKEN;
1773                     }
1774                 }
1775                 if (c == 'j' || c == 'J') {
1776                     /* Imaginary part */
1777         imaginary:
1778                     c = tok_nextc(tok);
1779                 }
1780             }
1781         }
1782         tok_backup(tok, c);
1783         *p_start = tok->start;
1784         *p_end = tok->cur;
1785         return NUMBER;
1786     }
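    /* Illustrative literal forms for the number scanner above -- a sketch of
       its behaviour, not an exhaustive grammar: "0x_ff", "0o7_7", "0b1_01"
       (an underscore must be followed by at least one more digit of the same
       kind), "0", "000", "3.14", "1e-9", "2j".  A trailing or doubled
       underscore, a bare "0x", or a nonzero literal with a leading zero such
       as "0777" all end in the E_TOKEN error paths above.  Plain decimal
       runs are consumed by tok_decimal_tail(), which presumably applies the
       same underscore rule. */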
1787 
1788   letter_quote:
1789     /* String */
1790     if (c == '\'' || c == '"') {
1791         int quote = c;
1792         int quote_size = 1;             /* 1 or 3 */
1793         int end_quote_size = 0;
1794 
1795         /* Find the quote size and start of string */
1796         c = tok_nextc(tok);
1797         if (c == quote) {
1798             c = tok_nextc(tok);
1799             if (c == quote) {
1800                 quote_size = 3;
1801             }
1802             else {
1803                 end_quote_size = 1;     /* empty string found */
1804             }
1805         }
1806         if (c != quote) {
1807             tok_backup(tok, c);
1808         }
1809 
1810         /* Get rest of string */
1811         while (end_quote_size != quote_size) {
1812             c = tok_nextc(tok);
1813             if (c == EOF) {
1814                 if (quote_size == 3) {
1815                     tok->done = E_EOFS;
1816                 }
1817                 else {
1818                     tok->done = E_EOLS;
1819                 }
1820                 tok->cur = tok->inp;
1821                 return ERRORTOKEN;
1822             }
1823             if (quote_size == 1 && c == '\n') {
1824                 tok->done = E_EOLS;
1825                 tok->cur = tok->inp;
1826                 return ERRORTOKEN;
1827             }
1828             if (c == quote) {
1829                 end_quote_size += 1;
1830             }
1831             else {
1832                 end_quote_size = 0;
1833                 if (c == '\\') {
1834                     tok_nextc(tok);  /* skip escaped char */
1835                 }
1836             }
1837         }
1838 
1839         *p_start = tok->start;
1840         *p_end = tok->cur;
1841         return STRING;
1842     }
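    /* The loop above counts consecutive closing quotes in end_quote_size:
       "abc" needs a single quote to terminate, '''abc''' must see three in a
       row.  A backslash resets the count and the following character is
       consumed blindly, so "a\"b" does not end early; escape sequences are
       not validated at this point.  The letter_quote label is presumably the
       entry point used when an identifier-like prefix (r, b, u, f, ...)
       turns out to introduce a string literal. */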
1843 
1844     /* Line continuation */
1845     if (c == '\\') {
1846         c = tok_nextc(tok);
1847         if (c != '\n') {
1848             tok->done = E_LINECONT;
1849             tok->cur = tok->inp;
1850             return ERRORTOKEN;
1851         }
1852         tok->cont_line = 1;
1853         goto again; /* Read next line */
1854     }
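    /* A backslash is only legal as the last character of a physical line:
       a '\' immediately followed by a newline re-enters the scanner via
       'again' with cont_line set, while a '\' followed by anything else
       (including a space before the newline) is reported as E_LINECONT. */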
1855 
1856     /* Check for two-character token */
1857     {
1858         int c2 = tok_nextc(tok);
1859         int token = PyToken_TwoChars(c, c2);
1860         if (token != OP) {
1861             int c3 = tok_nextc(tok);
1862             int token3 = PyToken_ThreeChars(c, c2, c3);
1863             if (token3 != OP) {
1864                 token = token3;
1865             }
1866             else {
1867                 tok_backup(tok, c3);
1868             }
1869             *p_start = tok->start;
1870             *p_end = tok->cur;
1871             return token;
1872         }
1873         tok_backup(tok, c2);
1874     }
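    /* Longest-match lookahead for multi-character operators: for "**=",
       PyToken_TwoChars() recognises "**" and the trailing '=' upgrades it to
       the three-character token via PyToken_ThreeChars(); for a lone '*',
       PyToken_TwoChars() returns OP, the second character is pushed back,
       and control falls through to the one-character case below. */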
1875 
1876     /* Keep track of parentheses nesting level */
1877     switch (c) {
1878     case '(':
1879     case '[':
1880     case '{':
1881         tok->level++;
1882         break;
1883     case ')':
1884     case ']':
1885     case '}':
1886         tok->level--;
1887         break;
1888     }
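    /* tok->level drives implicit line joining: while it is positive, the
       newline handling earlier in tok_get presumably suppresses NEWLINE
       tokens, so expressions may span lines inside (), [] and {}. */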
1889 
1890     /* Punctuation character */
1891     *p_start = tok->start;
1892     *p_end = tok->cur;
1893     return PyToken_OneChar(c);
1894 }
1895 
1896 int
1897 PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)
1898 {
1899     int result = tok_get(tok, p_start, p_end);
1900     if (tok->decoding_erred) {
1901         result = ERRORTOKEN;
1902         tok->done = E_DECODE;
1903     }
1904     return result;
1905 }
1906 
1907 /* Get the encoding of a Python file. Check for a coding cookie and for a
1908    BOM at the start of the file.
1909 
1910    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1911    encoding in the first or second line of the file (in which case the encoding
1912    should be assumed to be UTF-8).
1913 
1914    The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1915    by the caller. */
1916 
1917 char *
1918 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1919 {
1920     struct tok_state *tok;
1921     FILE *fp;
1922     char *p_start = NULL, *p_end = NULL, *encoding = NULL;
1923 
1924 #ifndef PGEN
1925     fd = _Py_dup(fd);
1926 #else
1927     fd = dup(fd);
1928 #endif
1929     if (fd < 0) {
1930         return NULL;
1931     }
1932 
1933     fp = fdopen(fd, "r");
1934     if (fp == NULL) {
1935         return NULL;
1936     }
1937     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1938     if (tok == NULL) {
1939         fclose(fp);
1940         return NULL;
1941     }
1942 #ifndef PGEN
1943     if (filename != NULL) {
1944         Py_INCREF(filename);
1945         tok->filename = filename;
1946     }
1947     else {
1948         tok->filename = PyUnicode_FromString("<string>");
1949         if (tok->filename == NULL) {
1950             fclose(fp);
1951             PyTokenizer_Free(tok);
1952             return encoding;
1953         }
1954     }
1955 #endif
1956     while (tok->lineno < 2 && tok->done == E_OK) {
1957         PyTokenizer_Get(tok, &p_start, &p_end);
1958     }
1959     fclose(fp);
1960     if (tok->encoding) {
1961         encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1962         if (encoding)
1963             strcpy(encoding, tok->encoding);
1964     }
1965     PyTokenizer_Free(tok);
1966     return encoding;
1967 }
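
/* Illustrative caller of the encoding probe above -- a minimal sketch, not
   part of this file.  It assumes a POSIX open()/close() pair and a readable
   source file ("example.py" is a hypothetical name); the probe dup()s the
   descriptor internally, so the caller's fd stays valid and must still be
   closed, a NULL result simply means "assume UTF-8", and a non-NULL result
   must be released with PyMem_FREE(), the counterpart of the PyMem_MALLOC()
   used above.

       int fd = open("example.py", O_RDONLY);
       if (fd >= 0) {
           char *enc = PyTokenizer_FindEncodingFilename(fd, NULL);
           const char *use = enc ? enc : "utf-8";
           ... tokenize or compile the file using 'use' ...
           if (enc)
               PyMem_FREE(enc);
           close(fd);
       }
*/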
1968 
1969 char *
1970 PyTokenizer_FindEncoding(int fd)
1971 {
1972     return PyTokenizer_FindEncodingFilename(fd, NULL);
1973 }
1974 
1975 #ifdef Py_DEBUG
1976 
1977 void
1978 tok_dump(int type, char *start, char *end)
1979 {
1980     printf("%s", _PyParser_TokenNames[type]);
1981     if (type == NAME || type == NUMBER || type == STRING || type == OP)
1982         printf("(%.*s)", (int)(end - start), start);
1983 }
1984 
1985 #endif
1986