1 #include <stdbool.h>
2 
3 #include <Python.h>
4 
5 #include "../tokenizer.h"
6 #include "pegen.h"
7 #include "parse_string.h"
8 
9 //// STRING HANDLING FUNCTIONS ////
10 
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
15 
16 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)17 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
18 {
19     PyObject *msg =
20         PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
21     if (msg == NULL) {
22         return -1;
23     }
24     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25                                  t->lineno, NULL, NULL) < 0) {
26         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27             /* Replace the DeprecationWarning exception with a SyntaxError
28                to get a more accurate error report */
29             PyErr_Clear();
30 
31             /* This is needed, in order for the SyntaxError to point to the token t,
32                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33                error location, if p->known_err_token is not set. */
34             p->known_err_token = t;
35             RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
36         }
37         Py_DECREF(msg);
38         return -1;
39     }
40     Py_DECREF(msg);
41     return 0;
42 }
43 
44 static PyObject *
decode_utf8(const char ** sPtr,const char * end)45 decode_utf8(const char **sPtr, const char *end)
46 {
47     const char *s;
48     const char *t;
49     t = s = *sPtr;
50     while (s < end && (*s & 0x80)) {
51         s++;
52     }
53     *sPtr = s;
54     return PyUnicode_DecodeUTF8(t, s - t, NULL);
55 }
56 
57 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)58 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
59 {
60     PyObject *v;
61     PyObject *u;
62     char *buf;
63     char *p;
64     const char *end;
65 
66     /* check for integer overflow */
67     if (len > SIZE_MAX / 6) {
68         return NULL;
69     }
70     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
71        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
72     u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
73     if (u == NULL) {
74         return NULL;
75     }
76     p = buf = PyBytes_AsString(u);
77     if (p == NULL) {
78         return NULL;
79     }
80     end = s + len;
81     while (s < end) {
82         if (*s == '\\') {
83             *p++ = *s++;
84             if (s >= end || *s & 0x80) {
85                 strcpy(p, "u005c");
86                 p += 5;
87                 if (s >= end) {
88                     break;
89                 }
90             }
91         }
92         if (*s & 0x80) {
93             PyObject *w;
94             int kind;
95             void *data;
96             Py_ssize_t w_len;
97             Py_ssize_t i;
98             w = decode_utf8(&s, end);
99             if (w == NULL) {
100                 Py_DECREF(u);
101                 return NULL;
102             }
103             kind = PyUnicode_KIND(w);
104             data = PyUnicode_DATA(w);
105             w_len = PyUnicode_GET_LENGTH(w);
106             for (i = 0; i < w_len; i++) {
107                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
108                 sprintf(p, "\\U%08x", chr);
109                 p += 10;
110             }
111             /* Should be impossible to overflow */
112             assert(p - buf <= PyBytes_GET_SIZE(u));
113             Py_DECREF(w);
114         }
115         else {
116             *p++ = *s++;
117         }
118     }
119     len = p - buf;
120     s = buf;
121 
122     const char *first_invalid_escape;
123     v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
124 
125     if (v != NULL && first_invalid_escape != NULL) {
126         if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
127             /* We have not decref u before because first_invalid_escape points
128                inside u. */
129             Py_XDECREF(u);
130             Py_DECREF(v);
131             return NULL;
132         }
133     }
134     Py_XDECREF(u);
135     return v;
136 }
137 
138 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)139 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
140 {
141     const char *first_invalid_escape;
142     PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
143     if (result == NULL) {
144         return NULL;
145     }
146 
147     if (first_invalid_escape != NULL) {
148         if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
149             Py_DECREF(result);
150             return NULL;
151         }
152     }
153     return result;
154 }
155 
156 /* s must include the bracketing quote characters, and r, b, u,
157    &/or f prefixes (if any), and embedded escape sequences (if any).
158    _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
159    If the string is an f-string, set *fstr and *fstrlen to the unparsed
160    string object.  Return 0 if no errors occurred.  */
161 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)162 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
163                   const char **fstr, Py_ssize_t *fstrlen, Token *t)
164 {
165     const char *s = PyBytes_AsString(t->bytes);
166     if (s == NULL) {
167         return -1;
168     }
169 
170     size_t len;
171     int quote = Py_CHARMASK(*s);
172     int fmode = 0;
173     *bytesmode = 0;
174     *rawmode = 0;
175     *result = NULL;
176     *fstr = NULL;
177     if (Py_ISALPHA(quote)) {
178         while (!*bytesmode || !*rawmode) {
179             if (quote == 'b' || quote == 'B') {
180                 quote =(unsigned char)*++s;
181                 *bytesmode = 1;
182             }
183             else if (quote == 'u' || quote == 'U') {
184                 quote = (unsigned char)*++s;
185             }
186             else if (quote == 'r' || quote == 'R') {
187                 quote = (unsigned char)*++s;
188                 *rawmode = 1;
189             }
190             else if (quote == 'f' || quote == 'F') {
191                 quote = (unsigned char)*++s;
192                 fmode = 1;
193             }
194             else {
195                 break;
196             }
197         }
198     }
199 
200     /* fstrings are only allowed in Python 3.6 and greater */
201     if (fmode && p->feature_version < 6) {
202         p->error_indicator = 1;
203         RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
204         return -1;
205     }
206 
207     if (fmode && *bytesmode) {
208         PyErr_BadInternalCall();
209         return -1;
210     }
211     if (quote != '\'' && quote != '\"') {
212         PyErr_BadInternalCall();
213         return -1;
214     }
215     /* Skip the leading quote char. */
216     s++;
217     len = strlen(s);
218     if (len > INT_MAX) {
219         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
220         return -1;
221     }
222     if (s[--len] != quote) {
223         /* Last quote char must match the first. */
224         PyErr_BadInternalCall();
225         return -1;
226     }
227     if (len >= 4 && s[0] == quote && s[1] == quote) {
228         /* A triple quoted string. We've already skipped one quote at
229            the start and one at the end of the string. Now skip the
230            two at the start. */
231         s += 2;
232         len -= 2;
233         /* And check that the last two match. */
234         if (s[--len] != quote || s[--len] != quote) {
235             PyErr_BadInternalCall();
236             return -1;
237         }
238     }
239 
240     if (fmode) {
241         /* Just return the bytes. The caller will parse the resulting
242            string. */
243         *fstr = s;
244         *fstrlen = len;
245         return 0;
246     }
247 
248     /* Not an f-string. */
249     /* Avoid invoking escape decoding routines if possible. */
250     *rawmode = *rawmode || strchr(s, '\\') == NULL;
251     if (*bytesmode) {
252         /* Disallow non-ASCII characters. */
253         const char *ch;
254         for (ch = s; *ch; ch++) {
255             if (Py_CHARMASK(*ch) >= 0x80) {
256                 RAISE_SYNTAX_ERROR(
257                                    "bytes can only contain ASCII "
258                                    "literal characters.");
259                 return -1;
260             }
261         }
262         if (*rawmode) {
263             *result = PyBytes_FromStringAndSize(s, len);
264         }
265         else {
266             *result = decode_bytes_with_escapes(p, s, len, t);
267         }
268     }
269     else {
270         if (*rawmode) {
271             *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
272         }
273         else {
274             *result = decode_unicode_with_escapes(p, s, len, t);
275         }
276     }
277     return *result == NULL ? -1 : 0;
278 }
279 
280 
281 
282 // FSTRING STUFF
283 
284 /* Fix locations for the given node and its children.
285 
286    `parent` is the enclosing node.
287    `n` is the node which locations are going to be fixed relative to parent.
288    `expr_str` is the child node's string representation, including braces.
289 */
290 static bool
fstring_find_expr_location(Token * parent,char * expr_str,int * p_lines,int * p_cols)291 fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
292 {
293     *p_lines = 0;
294     *p_cols = 0;
295     if (parent && parent->bytes) {
296         char *parent_str = PyBytes_AsString(parent->bytes);
297         if (!parent_str) {
298             return false;
299         }
300         char *substr = strstr(parent_str, expr_str);
301         if (substr) {
302             // The following is needed, in order to correctly shift the column
303             // offset, in the case that (disregarding any whitespace) a newline
304             // immediately follows the opening curly brace of the fstring expression.
305             bool newline_after_brace = 1;
306             char *start = substr + 1;
307             while (start && *start != '}' && *start != '\n') {
308                 if (*start != ' ' && *start != '\t' && *start != '\f') {
309                     newline_after_brace = 0;
310                     break;
311                 }
312                 start++;
313             }
314 
315             // Account for the characters from the last newline character to our
316             // left until the beginning of substr.
317             if (!newline_after_brace) {
318                 start = substr;
319                 while (start > parent_str && *start != '\n') {
320                     start--;
321                 }
322                 *p_cols += (int)(substr - start);
323             }
324             /* adjust the start based on the number of newlines encountered
325                before the f-string expression */
326             for (char* p = parent_str; p < substr; p++) {
327                 if (*p == '\n') {
328                     (*p_lines)++;
329                 }
330             }
331         }
332     }
333     return true;
334 }
335 
336 
337 /* Compile this expression in to an expr_ty.  Add parens around the
338    expression, in order to allow leading spaces in the expression. */
339 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)340 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
341                      Token *t)
342 {
343     expr_ty expr = NULL;
344     char *str;
345     Py_ssize_t len;
346     const char *s;
347     expr_ty result = NULL;
348 
349     assert(expr_end >= expr_start);
350     assert(*(expr_start-1) == '{');
351     assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
352            *expr_end == '=');
353 
354     /* If the substring is all whitespace, it's an error.  We need to catch this
355        here, and not when we call PyParser_SimpleParseStringFlagsFilename,
356        because turning the expression '' in to '()' would go from being invalid
357        to valid. */
358     for (s = expr_start; s != expr_end; s++) {
359         char c = *s;
360         /* The Python parser ignores only the following whitespace
361            characters (\r already is converted to \n). */
362         if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
363             break;
364         }
365     }
366     if (s == expr_end) {
367         RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
368         return NULL;
369     }
370 
371     len = expr_end - expr_start;
372     /* Allocate 3 extra bytes: open paren, close paren, null byte. */
373     str = PyMem_Malloc(len + 3);
374     if (str == NULL) {
375         PyErr_NoMemory();
376         return NULL;
377     }
378 
379     // The call to fstring_find_expr_location is responsible for finding the column offset
380     // the generated AST nodes need to be shifted to the right, which is equal to the number
381     // of the f-string characters before the expression starts. In order to correctly compute
382     // this offset, strstr gets called in fstring_find_expr_location which only succeeds
383     // if curly braces appear before and after the f-string expression (exactly like they do
384     // in the f-string itself), hence the following lines.
385     str[0] = '{';
386     memcpy(str+1, expr_start, len);
387     str[len+1] = '}';
388     str[len+2] = 0;
389 
390     int lines, cols;
391     if (!fstring_find_expr_location(t, str, &lines, &cols)) {
392         PyMem_FREE(str);
393         return NULL;
394     }
395 
396     // The parentheses are needed in order to allow for leading whitespace within
397     // the f-string expression. This consequently gets parsed as a group (see the
398     // group rule in python.gram).
399     str[0] = '(';
400     str[len+1] = ')';
401 
402     struct tok_state* tok = PyTokenizer_FromString(str, 1);
403     if (tok == NULL) {
404         PyMem_Free(str);
405         return NULL;
406     }
407     Py_INCREF(p->tok->filename);
408     tok->filename = p->tok->filename;
409 
410     Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
411                                      NULL, p->arena);
412     p2->starting_lineno = t->lineno + lines - 1;
413     p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
414 
415     expr = _PyPegen_run_parser(p2);
416 
417     if (expr == NULL) {
418         goto exit;
419     }
420     result = expr;
421 
422 exit:
423     PyMem_Free(str);
424     _PyPegen_Parser_Free(p2);
425     PyTokenizer_Free(tok);
426     return result;
427 }
428 
429 /* Return -1 on error.
430 
431    Return 0 if we reached the end of the literal.
432 
433    Return 1 if we haven't reached the end of the literal, but we want
434    the caller to process the literal up to this point. Used for
435    doubled braces.
436 */
437 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)438 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
439                      PyObject **literal, int recurse_lvl, Token *t)
440 {
441     /* Get any literal string. It ends when we hit an un-doubled left
442        brace (which isn't part of a unicode name escape such as
443        "\N{EULER CONSTANT}"), or the end of the string. */
444 
445     const char *s = *str;
446     const char *literal_start = s;
447     int result = 0;
448 
449     assert(*literal == NULL);
450     while (s < end) {
451         char ch = *s++;
452         if (!raw && ch == '\\' && s < end) {
453             ch = *s++;
454             if (ch == 'N') {
455                 if (s < end && *s++ == '{') {
456                     while (s < end && *s++ != '}') {
457                     }
458                     continue;
459                 }
460                 break;
461             }
462             if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
463                 return -1;
464             }
465         }
466         if (ch == '{' || ch == '}') {
467             /* Check for doubled braces, but only at the top level. If
468                we checked at every level, then f'{0:{3}}' would fail
469                with the two closing braces. */
470             if (recurse_lvl == 0) {
471                 if (s < end && *s == ch) {
472                     /* We're going to tell the caller that the literal ends
473                        here, but that they should continue scanning. But also
474                        skip over the second brace when we resume scanning. */
475                     *str = s + 1;
476                     result = 1;
477                     goto done;
478                 }
479 
480                 /* Where a single '{' is the start of a new expression, a
481                    single '}' is not allowed. */
482                 if (ch == '}') {
483                     *str = s - 1;
484                     RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
485                     return -1;
486                 }
487             }
488             /* We're either at a '{', which means we're starting another
489                expression; or a '}', which means we're at the end of this
490                f-string (for a nested format_spec). */
491             s--;
492             break;
493         }
494     }
495     *str = s;
496     assert(s <= end);
497     assert(s == end || *s == '{' || *s == '}');
498 done:
499     if (literal_start != s) {
500         if (raw) {
501             *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
502                                                     s - literal_start,
503                                                     NULL, NULL);
504         } else {
505             *literal = decode_unicode_with_escapes(p, literal_start,
506                                                    s - literal_start, t);
507         }
508         if (!*literal) {
509             return -1;
510         }
511     }
512     return result;
513 }
514 
515 /* Forward declaration because parsing is recursive. */
516 static expr_ty
517 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
518               Token *first_token, Token* t, Token *last_token);
519 
520 /* Parse the f-string at *str, ending at end.  We know *str starts an
521    expression (so it must be a '{'). Returns the FormattedValue node, which
522    includes the expression, conversion character, format_spec expression, and
523    optionally the text of the expression (if = is used).
524 
525    Note that I don't do a perfect job here: I don't make sure that a
526    closing brace doesn't match an opening paren, for example. It
527    doesn't need to error on all invalid expressions, just correctly
528    find the end of all valid ones. Any errors inside the expression
529    will be caught when we parse it later.
530 
531    *expression is set to the expression.  For an '=' "debug" expression,
532    *expr_text is set to the debug text (the original text of the expression,
533    including the '=' and any whitespace around it, as a string object).  If
534    not a debug expression, *expr_text set to NULL. */
535 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)536 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
537                   PyObject **expr_text, expr_ty *expression, Token *first_token,
538                   Token *t, Token *last_token)
539 {
540     /* Return -1 on error, else 0. */
541 
542     const char *expr_start;
543     const char *expr_end;
544     expr_ty simple_expression;
545     expr_ty format_spec = NULL; /* Optional format specifier. */
546     int conversion = -1; /* The conversion char.  Use default if not
547                             specified, or !r if using = and no format
548                             spec. */
549 
550     /* 0 if we're not in a string, else the quote char we're trying to
551        match (single or double quote). */
552     char quote_char = 0;
553 
554     /* If we're inside a string, 1=normal, 3=triple-quoted. */
555     int string_type = 0;
556 
557     /* Keep track of nesting level for braces/parens/brackets in
558        expressions. */
559     Py_ssize_t nested_depth = 0;
560     char parenstack[MAXLEVEL];
561 
562     *expr_text = NULL;
563 
564     /* Can only nest one level deep. */
565     if (recurse_lvl >= 2) {
566         RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
567         goto error;
568     }
569 
570     /* The first char must be a left brace, or we wouldn't have gotten
571        here. Skip over it. */
572     assert(**str == '{');
573     *str += 1;
574 
575     expr_start = *str;
576     for (; *str < end; (*str)++) {
577         char ch;
578 
579         /* Loop invariants. */
580         assert(nested_depth >= 0);
581         assert(*str >= expr_start && *str < end);
582         if (quote_char) {
583             assert(string_type == 1 || string_type == 3);
584         } else {
585             assert(string_type == 0);
586         }
587 
588         ch = **str;
589         /* Nowhere inside an expression is a backslash allowed. */
590         if (ch == '\\') {
591             /* Error: can't include a backslash character, inside
592                parens or strings or not. */
593             RAISE_SYNTAX_ERROR(
594                       "f-string expression part "
595                       "cannot include a backslash");
596             goto error;
597         }
598         if (quote_char) {
599             /* We're inside a string. See if we're at the end. */
600             /* This code needs to implement the same non-error logic
601                as tok_get from tokenizer.c, at the letter_quote
602                label. To actually share that code would be a
603                nightmare. But, it's unlikely to change and is small,
604                so duplicate it here. Note we don't need to catch all
605                of the errors, since they'll be caught when parsing the
606                expression. We just need to match the non-error
607                cases. Thus we can ignore \n in single-quoted strings,
608                for example. Or non-terminated strings. */
609             if (ch == quote_char) {
610                 /* Does this match the string_type (single or triple
611                    quoted)? */
612                 if (string_type == 3) {
613                     if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
614                         /* We're at the end of a triple quoted string. */
615                         *str += 2;
616                         string_type = 0;
617                         quote_char = 0;
618                         continue;
619                     }
620                 } else {
621                     /* We're at the end of a normal string. */
622                     quote_char = 0;
623                     string_type = 0;
624                     continue;
625                 }
626             }
627         } else if (ch == '\'' || ch == '"') {
628             /* Is this a triple quoted string? */
629             if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
630                 string_type = 3;
631                 *str += 2;
632             } else {
633                 /* Start of a normal string. */
634                 string_type = 1;
635             }
636             /* Start looking for the end of the string. */
637             quote_char = ch;
638         } else if (ch == '[' || ch == '{' || ch == '(') {
639             if (nested_depth >= MAXLEVEL) {
640                 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
641                 goto error;
642             }
643             parenstack[nested_depth] = ch;
644             nested_depth++;
645         } else if (ch == '#') {
646             /* Error: can't include a comment character, inside parens
647                or not. */
648             RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
649             goto error;
650         } else if (nested_depth == 0 &&
651                    (ch == '!' || ch == ':' || ch == '}' ||
652                     ch == '=' || ch == '>' || ch == '<')) {
653             /* See if there's a next character. */
654             if (*str+1 < end) {
655                 char next = *(*str+1);
656 
657                 /* For "!=". since '=' is not an allowed conversion character,
658                    nothing is lost in this test. */
659                 if ((ch == '!' && next == '=') ||   /* != */
660                     (ch == '=' && next == '=') ||   /* == */
661                     (ch == '<' && next == '=') ||   /* <= */
662                     (ch == '>' && next == '=')      /* >= */
663                     ) {
664                     *str += 1;
665                     continue;
666                 }
667                 /* Don't get out of the loop for these, if they're single
668                    chars (not part of 2-char tokens). If by themselves, they
669                    don't end an expression (unlike say '!'). */
670                 if (ch == '>' || ch == '<') {
671                     continue;
672                 }
673             }
674 
675             /* Normal way out of this loop. */
676             break;
677         } else if (ch == ']' || ch == '}' || ch == ')') {
678             if (!nested_depth) {
679                 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
680                 goto error;
681             }
682             nested_depth--;
683             int opening = (unsigned char)parenstack[nested_depth];
684             if (!((opening == '(' && ch == ')') ||
685                   (opening == '[' && ch == ']') ||
686                   (opening == '{' && ch == '}')))
687             {
688                 RAISE_SYNTAX_ERROR(
689                           "f-string: closing parenthesis '%c' "
690                           "does not match opening parenthesis '%c'",
691                           ch, opening);
692                 goto error;
693             }
694         } else {
695             /* Just consume this char and loop around. */
696         }
697     }
698     expr_end = *str;
699     /* If we leave this loop in a string or with mismatched parens, we
700        don't care. We'll get a syntax error when compiling the
701        expression. But, we can produce a better error message, so
702        let's just do that.*/
703     if (quote_char) {
704         RAISE_SYNTAX_ERROR("f-string: unterminated string");
705         goto error;
706     }
707     if (nested_depth) {
708         int opening = (unsigned char)parenstack[nested_depth - 1];
709         RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
710         goto error;
711     }
712 
713     if (*str >= end) {
714         goto unexpected_end_of_string;
715     }
716 
717     /* Compile the expression as soon as possible, so we show errors
718        related to the expression before errors related to the
719        conversion or format_spec. */
720     simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
721     if (!simple_expression) {
722         goto error;
723     }
724 
725     /* Check for =, which puts the text value of the expression in
726        expr_text. */
727     if (**str == '=') {
728         if (p->feature_version < 8) {
729             RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
730                                "only supported in Python 3.8 and greater");
731             goto error;
732         }
733         *str += 1;
734 
735         /* Skip over ASCII whitespace.  No need to test for end of string
736            here, since we know there's at least a trailing quote somewhere
737            ahead. */
738         while (Py_ISSPACE(**str)) {
739             *str += 1;
740         }
741 
742         /* Set *expr_text to the text of the expression. */
743         *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
744         if (!*expr_text) {
745             goto error;
746         }
747     }
748 
749     /* Check for a conversion char, if present. */
750     if (**str == '!') {
751         *str += 1;
752         if (*str >= end) {
753             goto unexpected_end_of_string;
754         }
755 
756         conversion = (unsigned char)**str;
757         *str += 1;
758 
759         /* Validate the conversion. */
760         if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
761             RAISE_SYNTAX_ERROR(
762                       "f-string: invalid conversion character: "
763                       "expected 's', 'r', or 'a'");
764             goto error;
765         }
766 
767     }
768 
769     /* Check for the format spec, if present. */
770     if (*str >= end) {
771         goto unexpected_end_of_string;
772     }
773     if (**str == ':') {
774         *str += 1;
775         if (*str >= end) {
776             goto unexpected_end_of_string;
777         }
778 
779         /* Parse the format spec. */
780         format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
781                                     first_token, t, last_token);
782         if (!format_spec) {
783             goto error;
784         }
785     }
786 
787     if (*str >= end || **str != '}') {
788         goto unexpected_end_of_string;
789     }
790 
791     /* We're at a right brace. Consume it. */
792     assert(*str < end);
793     assert(**str == '}');
794     *str += 1;
795 
796     /* If we're in = mode (detected by non-NULL expr_text), and have no format
797        spec and no explicit conversion, set the conversion to 'r'. */
798     if (*expr_text && format_spec == NULL && conversion == -1) {
799         conversion = 'r';
800     }
801 
802     /* And now create the FormattedValue node that represents this
803        entire expression with the conversion and format spec. */
804     //TODO: Fix this
805     *expression = FormattedValue(simple_expression, conversion,
806                                  format_spec, first_token->lineno,
807                                  first_token->col_offset, last_token->end_lineno,
808                                  last_token->end_col_offset, p->arena);
809     if (!*expression) {
810         goto error;
811     }
812 
813     return 0;
814 
815 unexpected_end_of_string:
816     RAISE_SYNTAX_ERROR("f-string: expecting '}'");
817     /* Falls through to error. */
818 
819 error:
820     Py_XDECREF(*expr_text);
821     return -1;
822 
823 }
824 
825 /* Return -1 on error.
826 
   Return 0 if we have a literal (possibly zero length) and an
   expression (zero length if at the end of the string).
829 
830    Return 1 if we have a literal, but no expression, and we want the
831    caller to call us again. This is used to deal with doubled
832    braces.
833 
834    When called multiple times on the string 'a{{b{0}c', this function
835    will return:
836 
837    1. the literal 'a{' with no expression, and a return value
838       of 1. Despite the fact that there's no expression, the return
839       value of 1 means we're not finished yet.
840 
841    2. the literal 'b' and the expression '0', with a return value of
842       0. The fact that there's an expression means we're not finished.
843 
844    3. literal 'c' with no expression and a return value of 0. The
845       combination of the return value of 0 with no expression means
846       we're finished.
847 */
848 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)849 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
850                               int recurse_lvl, PyObject **literal,
851                               PyObject **expr_text, expr_ty *expression,
852                               Token *first_token, Token *t, Token *last_token)
853 {
854     int result;
855 
856     assert(*literal == NULL && *expression == NULL);
857 
858     /* Get any literal string. */
859     result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
860     if (result < 0) {
861         goto error;
862     }
863 
864     assert(result == 0 || result == 1);
865 
866     if (result == 1) {
867         /* We have a literal, but don't look at the expression. */
868         return 1;
869     }
870 
871     if (*str >= end || **str == '}') {
872         /* We're at the end of the string or the end of a nested
873            f-string: no expression. The top-level error case where we
874            expect to be at the end of the string but we're at a '}' is
875            handled later. */
876         return 0;
877     }
878 
879     /* We must now be the start of an expression, on a '{'. */
880     assert(**str == '{');
881 
882     if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
883                           expression, first_token, t, last_token) < 0) {
884         goto error;
885     }
886 
887     return 0;
888 
889 error:
890     Py_CLEAR(*literal);
891     return -1;
892 }
893 
894 #ifdef NDEBUG
895 #define ExprList_check_invariants(l)
896 #else
897 static void
ExprList_check_invariants(ExprList * l)898 ExprList_check_invariants(ExprList *l)
899 {
900     /* Check our invariants. Make sure this object is "live", and
901        hasn't been deallocated. */
902     assert(l->size >= 0);
903     assert(l->p != NULL);
904     if (l->size <= EXPRLIST_N_CACHED) {
905         assert(l->data == l->p);
906     }
907 }
908 #endif
909 
910 static void
ExprList_Init(ExprList * l)911 ExprList_Init(ExprList *l)
912 {
913     l->allocated = EXPRLIST_N_CACHED;
914     l->size = 0;
915 
916     /* Until we start allocating dynamically, p points to data. */
917     l->p = l->data;
918 
919     ExprList_check_invariants(l);
920 }
921 
922 static int
ExprList_Append(ExprList * l,expr_ty exp)923 ExprList_Append(ExprList *l, expr_ty exp)
924 {
925     ExprList_check_invariants(l);
926     if (l->size >= l->allocated) {
927         /* We need to alloc (or realloc) the memory. */
928         Py_ssize_t new_size = l->allocated * 2;
929 
930         /* See if we've ever allocated anything dynamically. */
931         if (l->p == l->data) {
932             Py_ssize_t i;
933             /* We're still using the cached data. Switch to
934                alloc-ing. */
935             l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
936             if (!l->p) {
937                 return -1;
938             }
939             /* Copy the cached data into the new buffer. */
940             for (i = 0; i < l->size; i++) {
941                 l->p[i] = l->data[i];
942             }
943         } else {
944             /* Just realloc. */
945             expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
946             if (!tmp) {
947                 PyMem_Free(l->p);
948                 l->p = NULL;
949                 return -1;
950             }
951             l->p = tmp;
952         }
953 
954         l->allocated = new_size;
955         assert(l->allocated == 2 * l->size);
956     }
957 
958     l->p[l->size++] = exp;
959 
960     ExprList_check_invariants(l);
961     return 0;
962 }
963 
964 static void
ExprList_Dealloc(ExprList * l)965 ExprList_Dealloc(ExprList *l)
966 {
967     ExprList_check_invariants(l);
968 
969     /* If there's been an error, or we've never dynamically allocated,
970        do nothing. */
971     if (!l->p || l->p == l->data) {
972         /* Do nothing. */
973     } else {
974         /* We have dynamically allocated. Free the memory. */
975         PyMem_Free(l->p);
976     }
977     l->p = NULL;
978     l->size = -1;
979 }
980 
981 static asdl_seq *
ExprList_Finish(ExprList * l,PyArena * arena)982 ExprList_Finish(ExprList *l, PyArena *arena)
983 {
984     asdl_seq *seq;
985 
986     ExprList_check_invariants(l);
987 
988     /* Allocate the asdl_seq and copy the expressions in to it. */
989     seq = _Py_asdl_seq_new(l->size, arena);
990     if (seq) {
991         Py_ssize_t i;
992         for (i = 0; i < l->size; i++) {
993             asdl_seq_SET(seq, i, l->p[i]);
994         }
995     }
996     ExprList_Dealloc(l);
997     return seq;
998 }
999 
1000 #ifdef NDEBUG
1001 #define FstringParser_check_invariants(state)
1002 #else
1003 static void
FstringParser_check_invariants(FstringParser * state)1004 FstringParser_check_invariants(FstringParser *state)
1005 {
1006     if (state->last_str) {
1007         assert(PyUnicode_CheckExact(state->last_str));
1008     }
1009     ExprList_check_invariants(&state->expr_list);
1010 }
1011 #endif
1012 
1013 void
_PyPegen_FstringParser_Init(FstringParser * state)1014 _PyPegen_FstringParser_Init(FstringParser *state)
1015 {
1016     state->last_str = NULL;
1017     state->fmode = 0;
1018     ExprList_Init(&state->expr_list);
1019     FstringParser_check_invariants(state);
1020 }
1021 
1022 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1023 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1024 {
1025     FstringParser_check_invariants(state);
1026 
1027     Py_XDECREF(state->last_str);
1028     ExprList_Dealloc(&state->expr_list);
1029 }
1030 
1031 /* Make a Constant node, but decref the PyUnicode object being added. */
1032 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1033 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1034 {
1035     PyObject *s = *str;
1036     PyObject *kind = NULL;
1037     *str = NULL;
1038     assert(PyUnicode_CheckExact(s));
1039     if (PyArena_AddPyObject(p->arena, s) < 0) {
1040         Py_DECREF(s);
1041         return NULL;
1042     }
1043     const char* the_str = PyBytes_AsString(first_token->bytes);
1044     if (the_str && the_str[0] == 'u') {
1045         kind = _PyPegen_new_identifier(p, "u");
1046     }
1047 
1048     if (kind == NULL && PyErr_Occurred()) {
1049         return NULL;
1050     }
1051 
1052     return Constant(s, kind, first_token->lineno, first_token->col_offset,
1053                     last_token->end_lineno, last_token->end_col_offset, p->arena);
1054 
1055 }
1056 
1057 
1058 /* Add a non-f-string (that is, a regular literal string). str is
1059    decref'd. */
1060 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1061 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1062 {
1063     FstringParser_check_invariants(state);
1064 
1065     assert(PyUnicode_CheckExact(str));
1066 
1067     if (PyUnicode_GET_LENGTH(str) == 0) {
1068         Py_DECREF(str);
1069         return 0;
1070     }
1071 
1072     if (!state->last_str) {
1073         /* We didn't have a string before, so just remember this one. */
1074         state->last_str = str;
1075     } else {
1076         /* Concatenate this with the previous string. */
1077         PyUnicode_AppendAndDel(&state->last_str, str);
1078         if (!state->last_str) {
1079             return -1;
1080         }
1081     }
1082     FstringParser_check_invariants(state);
1083     return 0;
1084 }
1085 
1086 /* Parse an f-string. The f-string is in *str to end, with no
1087    'f' or quotes. */
1088 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1089 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1090                             const char *end, int raw, int recurse_lvl,
1091                             Token *first_token, Token* t, Token *last_token)
1092 {
1093     FstringParser_check_invariants(state);
1094     state->fmode = 1;
1095 
1096     /* Parse the f-string. */
1097     while (1) {
1098         PyObject *literal = NULL;
1099         PyObject *expr_text = NULL;
1100         expr_ty expression = NULL;
1101 
1102         /* If there's a zero length literal in front of the
1103            expression, literal will be NULL. If we're at the end of
1104            the f-string, expression will be NULL (unless result == 1,
1105            see below). */
1106         int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1107                                                    &literal, &expr_text,
1108                                                    &expression, first_token, t, last_token);
1109         if (result < 0) {
1110             return -1;
1111         }
1112 
1113         /* Add the literal, if any. */
1114         if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1115             Py_XDECREF(expr_text);
1116             return -1;
1117         }
1118         /* Add the expr_text, if any. */
1119         if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1120             return -1;
1121         }
1122 
1123         /* We've dealt with the literal and expr_text, their ownership has
1124            been transferred to the state object.  Don't look at them again. */
1125 
1126         /* See if we should just loop around to get the next literal
1127            and expression, while ignoring the expression this
1128            time. This is used for un-doubling braces, as an
1129            optimization. */
1130         if (result == 1) {
1131             continue;
1132         }
1133 
1134         if (!expression) {
1135             /* We're done with this f-string. */
1136             break;
1137         }
1138 
1139         /* We know we have an expression. Convert any existing string
1140            to a Constant node. */
1141         if (!state->last_str) {
1142             /* Do nothing. No previous literal. */
1143         } else {
1144             /* Convert the existing last_str literal to a Constant node. */
1145             expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1146             if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1147                 return -1;
1148             }
1149         }
1150 
1151         if (ExprList_Append(&state->expr_list, expression) < 0) {
1152             return -1;
1153         }
1154     }
1155 
1156     /* If recurse_lvl is zero, then we must be at the end of the
1157        string. Otherwise, we must be at a right brace. */
1158 
1159     if (recurse_lvl == 0 && *str < end-1) {
1160         RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1161         return -1;
1162     }
1163     if (recurse_lvl != 0 && **str != '}') {
1164         RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1165         return -1;
1166     }
1167 
1168     FstringParser_check_invariants(state);
1169     return 0;
1170 }
1171 
1172 /* Convert the partial state reflected in last_str and expr_list to an
1173    expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1174 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1175 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1176                      Token *last_token)
1177 {
1178     asdl_seq *seq;
1179 
1180     FstringParser_check_invariants(state);
1181 
1182     /* If we're just a constant string with no expressions, return
1183        that. */
1184     if (!state->fmode) {
1185         assert(!state->expr_list.size);
1186         if (!state->last_str) {
1187             /* Create a zero length string. */
1188             state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1189             if (!state->last_str) {
1190                 goto error;
1191             }
1192         }
1193         return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1194     }
1195 
1196     /* Create a Constant node out of last_str, if needed. It will be the
1197        last node in our expression list. */
1198     if (state->last_str) {
1199         expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1200         if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1201             goto error;
1202         }
1203     }
1204     /* This has already been freed. */
1205     assert(state->last_str == NULL);
1206 
1207     seq = ExprList_Finish(&state->expr_list, p->arena);
1208     if (!seq) {
1209         goto error;
1210     }
1211 
1212     return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1213                          last_token->end_lineno, last_token->end_col_offset, p->arena);
1214 
1215 error:
1216     _PyPegen_FstringParser_Dealloc(state);
1217     return NULL;
1218 }
1219 
1220 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1221    at end, parse it into an expr_ty.  Return NULL on error.  Adjust
1222    str to point past the parsed portion. */
1223 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1224 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1225               int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1226 {
1227     FstringParser state;
1228 
1229     _PyPegen_FstringParser_Init(&state);
1230     if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1231                                     first_token, t, last_token) < 0) {
1232         _PyPegen_FstringParser_Dealloc(&state);
1233         return NULL;
1234     }
1235 
1236     return _PyPegen_FstringParser_Finish(p, &state, t, t);
1237 }
1238