1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "../tokenizer.h"
6 #include "pegen.h"
7 #include "parse_string.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
// These functions are ported directly from Python/ast.c with some modifications
// to account for the use of "Parser *p", the fact that we don't have parser nodes
// to pass around and the usage of some specialized APIs present only in this
// file (like "_PyPegen_raise_syntax_error").
15
16 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)17 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
18 {
19 PyObject *msg =
20 PyUnicode_FromFormat("invalid escape sequence \\%c", first_invalid_escape_char);
21 if (msg == NULL) {
22 return -1;
23 }
24 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25 t->lineno, NULL, NULL) < 0) {
26 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 /* Replace the DeprecationWarning exception with a SyntaxError
28 to get a more accurate error report */
29 PyErr_Clear();
30
31 /* This is needed, in order for the SyntaxError to point to the token t,
32 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 error location, if p->known_err_token is not set. */
34 p->known_err_token = t;
35 RAISE_SYNTAX_ERROR("invalid escape sequence \\%c", first_invalid_escape_char);
36 }
37 Py_DECREF(msg);
38 return -1;
39 }
40 Py_DECREF(msg);
41 return 0;
42 }
43
44 static PyObject *
decode_utf8(const char ** sPtr,const char * end)45 decode_utf8(const char **sPtr, const char *end)
46 {
47 const char *s;
48 const char *t;
49 t = s = *sPtr;
50 while (s < end && (*s & 0x80)) {
51 s++;
52 }
53 *sPtr = s;
54 return PyUnicode_DecodeUTF8(t, s - t, NULL);
55 }
56
57 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)58 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
59 {
60 PyObject *v;
61 PyObject *u;
62 char *buf;
63 char *p;
64 const char *end;
65
66 /* check for integer overflow */
67 if (len > SIZE_MAX / 6) {
68 return NULL;
69 }
70 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
71 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
72 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
73 if (u == NULL) {
74 return NULL;
75 }
76 p = buf = PyBytes_AsString(u);
77 if (p == NULL) {
78 return NULL;
79 }
80 end = s + len;
81 while (s < end) {
82 if (*s == '\\') {
83 *p++ = *s++;
84 if (s >= end || *s & 0x80) {
85 strcpy(p, "u005c");
86 p += 5;
87 if (s >= end) {
88 break;
89 }
90 }
91 }
92 if (*s & 0x80) {
93 PyObject *w;
94 int kind;
95 void *data;
96 Py_ssize_t w_len;
97 Py_ssize_t i;
98 w = decode_utf8(&s, end);
99 if (w == NULL) {
100 Py_DECREF(u);
101 return NULL;
102 }
103 kind = PyUnicode_KIND(w);
104 data = PyUnicode_DATA(w);
105 w_len = PyUnicode_GET_LENGTH(w);
106 for (i = 0; i < w_len; i++) {
107 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
108 sprintf(p, "\\U%08x", chr);
109 p += 10;
110 }
111 /* Should be impossible to overflow */
112 assert(p - buf <= PyBytes_GET_SIZE(u));
113 Py_DECREF(w);
114 }
115 else {
116 *p++ = *s++;
117 }
118 }
119 len = p - buf;
120 s = buf;
121
122 const char *first_invalid_escape;
123 v = _PyUnicode_DecodeUnicodeEscape(s, len, NULL, &first_invalid_escape);
124
125 if (v != NULL && first_invalid_escape != NULL) {
126 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
127 /* We have not decref u before because first_invalid_escape points
128 inside u. */
129 Py_XDECREF(u);
130 Py_DECREF(v);
131 return NULL;
132 }
133 }
134 Py_XDECREF(u);
135 return v;
136 }
137
138 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)139 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
140 {
141 const char *first_invalid_escape;
142 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
143 if (result == NULL) {
144 return NULL;
145 }
146
147 if (first_invalid_escape != NULL) {
148 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
149 Py_DECREF(result);
150 return NULL;
151 }
152 }
153 return result;
154 }
155
156 /* s must include the bracketing quote characters, and r, b, u,
157 &/or f prefixes (if any), and embedded escape sequences (if any).
158 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
159 If the string is an f-string, set *fstr and *fstrlen to the unparsed
160 string object. Return 0 if no errors occurred. */
161 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)162 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
163 const char **fstr, Py_ssize_t *fstrlen, Token *t)
164 {
165 const char *s = PyBytes_AsString(t->bytes);
166 if (s == NULL) {
167 return -1;
168 }
169
170 size_t len;
171 int quote = Py_CHARMASK(*s);
172 int fmode = 0;
173 *bytesmode = 0;
174 *rawmode = 0;
175 *result = NULL;
176 *fstr = NULL;
177 if (Py_ISALPHA(quote)) {
178 while (!*bytesmode || !*rawmode) {
179 if (quote == 'b' || quote == 'B') {
180 quote =(unsigned char)*++s;
181 *bytesmode = 1;
182 }
183 else if (quote == 'u' || quote == 'U') {
184 quote = (unsigned char)*++s;
185 }
186 else if (quote == 'r' || quote == 'R') {
187 quote = (unsigned char)*++s;
188 *rawmode = 1;
189 }
190 else if (quote == 'f' || quote == 'F') {
191 quote = (unsigned char)*++s;
192 fmode = 1;
193 }
194 else {
195 break;
196 }
197 }
198 }
199
200 /* fstrings are only allowed in Python 3.6 and greater */
201 if (fmode && p->feature_version < 6) {
202 p->error_indicator = 1;
203 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
204 return -1;
205 }
206
207 if (fmode && *bytesmode) {
208 PyErr_BadInternalCall();
209 return -1;
210 }
211 if (quote != '\'' && quote != '\"') {
212 PyErr_BadInternalCall();
213 return -1;
214 }
215 /* Skip the leading quote char. */
216 s++;
217 len = strlen(s);
218 if (len > INT_MAX) {
219 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
220 return -1;
221 }
222 if (s[--len] != quote) {
223 /* Last quote char must match the first. */
224 PyErr_BadInternalCall();
225 return -1;
226 }
227 if (len >= 4 && s[0] == quote && s[1] == quote) {
228 /* A triple quoted string. We've already skipped one quote at
229 the start and one at the end of the string. Now skip the
230 two at the start. */
231 s += 2;
232 len -= 2;
233 /* And check that the last two match. */
234 if (s[--len] != quote || s[--len] != quote) {
235 PyErr_BadInternalCall();
236 return -1;
237 }
238 }
239
240 if (fmode) {
241 /* Just return the bytes. The caller will parse the resulting
242 string. */
243 *fstr = s;
244 *fstrlen = len;
245 return 0;
246 }
247
248 /* Not an f-string. */
249 /* Avoid invoking escape decoding routines if possible. */
250 *rawmode = *rawmode || strchr(s, '\\') == NULL;
251 if (*bytesmode) {
252 /* Disallow non-ASCII characters. */
253 const char *ch;
254 for (ch = s; *ch; ch++) {
255 if (Py_CHARMASK(*ch) >= 0x80) {
256 RAISE_SYNTAX_ERROR(
257 "bytes can only contain ASCII "
258 "literal characters.");
259 return -1;
260 }
261 }
262 if (*rawmode) {
263 *result = PyBytes_FromStringAndSize(s, len);
264 }
265 else {
266 *result = decode_bytes_with_escapes(p, s, len, t);
267 }
268 }
269 else {
270 if (*rawmode) {
271 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
272 }
273 else {
274 *result = decode_unicode_with_escapes(p, s, len, t);
275 }
276 }
277 return *result == NULL ? -1 : 0;
278 }
279
280
281
282 // FSTRING STUFF
283
284 /* Fix locations for the given node and its children.
285
286 `parent` is the enclosing node.
287 `n` is the node which locations are going to be fixed relative to parent.
288 `expr_str` is the child node's string representation, including braces.
289 */
290 static bool
fstring_find_expr_location(Token * parent,char * expr_str,int * p_lines,int * p_cols)291 fstring_find_expr_location(Token *parent, char *expr_str, int *p_lines, int *p_cols)
292 {
293 *p_lines = 0;
294 *p_cols = 0;
295 if (parent && parent->bytes) {
296 char *parent_str = PyBytes_AsString(parent->bytes);
297 if (!parent_str) {
298 return false;
299 }
300 char *substr = strstr(parent_str, expr_str);
301 if (substr) {
302 // The following is needed, in order to correctly shift the column
303 // offset, in the case that (disregarding any whitespace) a newline
304 // immediately follows the opening curly brace of the fstring expression.
305 bool newline_after_brace = 1;
306 char *start = substr + 1;
307 while (start && *start != '}' && *start != '\n') {
308 if (*start != ' ' && *start != '\t' && *start != '\f') {
309 newline_after_brace = 0;
310 break;
311 }
312 start++;
313 }
314
315 // Account for the characters from the last newline character to our
316 // left until the beginning of substr.
317 if (!newline_after_brace) {
318 start = substr;
319 while (start > parent_str && *start != '\n') {
320 start--;
321 }
322 *p_cols += (int)(substr - start);
323 }
324 /* adjust the start based on the number of newlines encountered
325 before the f-string expression */
326 for (char* p = parent_str; p < substr; p++) {
327 if (*p == '\n') {
328 (*p_lines)++;
329 }
330 }
331 }
332 }
333 return true;
334 }
335
336
337 /* Compile this expression in to an expr_ty. Add parens around the
338 expression, in order to allow leading spaces in the expression. */
339 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)340 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
341 Token *t)
342 {
343 expr_ty expr = NULL;
344 char *str;
345 Py_ssize_t len;
346 const char *s;
347 expr_ty result = NULL;
348
349 assert(expr_end >= expr_start);
350 assert(*(expr_start-1) == '{');
351 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
352 *expr_end == '=');
353
354 /* If the substring is all whitespace, it's an error. We need to catch this
355 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
356 because turning the expression '' in to '()' would go from being invalid
357 to valid. */
358 for (s = expr_start; s != expr_end; s++) {
359 char c = *s;
360 /* The Python parser ignores only the following whitespace
361 characters (\r already is converted to \n). */
362 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
363 break;
364 }
365 }
366 if (s == expr_end) {
367 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
368 return NULL;
369 }
370
371 len = expr_end - expr_start;
372 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
373 str = PyMem_Malloc(len + 3);
374 if (str == NULL) {
375 PyErr_NoMemory();
376 return NULL;
377 }
378
379 // The call to fstring_find_expr_location is responsible for finding the column offset
380 // the generated AST nodes need to be shifted to the right, which is equal to the number
381 // of the f-string characters before the expression starts. In order to correctly compute
382 // this offset, strstr gets called in fstring_find_expr_location which only succeeds
383 // if curly braces appear before and after the f-string expression (exactly like they do
384 // in the f-string itself), hence the following lines.
385 str[0] = '{';
386 memcpy(str+1, expr_start, len);
387 str[len+1] = '}';
388 str[len+2] = 0;
389
390 int lines, cols;
391 if (!fstring_find_expr_location(t, str, &lines, &cols)) {
392 PyMem_FREE(str);
393 return NULL;
394 }
395
396 // The parentheses are needed in order to allow for leading whitespace within
397 // the f-string expression. This consequently gets parsed as a group (see the
398 // group rule in python.gram).
399 str[0] = '(';
400 str[len+1] = ')';
401
402 struct tok_state* tok = PyTokenizer_FromString(str, 1);
403 if (tok == NULL) {
404 PyMem_Free(str);
405 return NULL;
406 }
407 Py_INCREF(p->tok->filename);
408 tok->filename = p->tok->filename;
409
410 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
411 NULL, p->arena);
412 p2->starting_lineno = t->lineno + lines - 1;
413 p2->starting_col_offset = p->tok->first_lineno == p->tok->lineno ? t->col_offset + cols : cols;
414
415 expr = _PyPegen_run_parser(p2);
416
417 if (expr == NULL) {
418 goto exit;
419 }
420 result = expr;
421
422 exit:
423 PyMem_Free(str);
424 _PyPegen_Parser_Free(p2);
425 PyTokenizer_Free(tok);
426 return result;
427 }
428
429 /* Return -1 on error.
430
431 Return 0 if we reached the end of the literal.
432
433 Return 1 if we haven't reached the end of the literal, but we want
434 the caller to process the literal up to this point. Used for
435 doubled braces.
436 */
437 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)438 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
439 PyObject **literal, int recurse_lvl, Token *t)
440 {
441 /* Get any literal string. It ends when we hit an un-doubled left
442 brace (which isn't part of a unicode name escape such as
443 "\N{EULER CONSTANT}"), or the end of the string. */
444
445 const char *s = *str;
446 const char *literal_start = s;
447 int result = 0;
448
449 assert(*literal == NULL);
450 while (s < end) {
451 char ch = *s++;
452 if (!raw && ch == '\\' && s < end) {
453 ch = *s++;
454 if (ch == 'N') {
455 if (s < end && *s++ == '{') {
456 while (s < end && *s++ != '}') {
457 }
458 continue;
459 }
460 break;
461 }
462 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
463 return -1;
464 }
465 }
466 if (ch == '{' || ch == '}') {
467 /* Check for doubled braces, but only at the top level. If
468 we checked at every level, then f'{0:{3}}' would fail
469 with the two closing braces. */
470 if (recurse_lvl == 0) {
471 if (s < end && *s == ch) {
472 /* We're going to tell the caller that the literal ends
473 here, but that they should continue scanning. But also
474 skip over the second brace when we resume scanning. */
475 *str = s + 1;
476 result = 1;
477 goto done;
478 }
479
480 /* Where a single '{' is the start of a new expression, a
481 single '}' is not allowed. */
482 if (ch == '}') {
483 *str = s - 1;
484 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
485 return -1;
486 }
487 }
488 /* We're either at a '{', which means we're starting another
489 expression; or a '}', which means we're at the end of this
490 f-string (for a nested format_spec). */
491 s--;
492 break;
493 }
494 }
495 *str = s;
496 assert(s <= end);
497 assert(s == end || *s == '{' || *s == '}');
498 done:
499 if (literal_start != s) {
500 if (raw) {
501 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
502 s - literal_start,
503 NULL, NULL);
504 } else {
505 *literal = decode_unicode_with_escapes(p, literal_start,
506 s - literal_start, t);
507 }
508 if (!*literal) {
509 return -1;
510 }
511 }
512 return result;
513 }
514
515 /* Forward declaration because parsing is recursive. */
516 static expr_ty
517 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
518 Token *first_token, Token* t, Token *last_token);
519
/* Parse the f-string at *str, ending at end.  We know *str starts an
   expression (so it must be a '{'). Returns the FormattedValue node, which
   includes the expression, conversion character, format_spec expression, and
   optionally the text of the expression (if = is used).

   Note that I don't do a perfect job here: I don't make sure that a
   closing brace doesn't match an opening paren, for example. It
   doesn't need to error on all invalid expressions, just correctly
   find the end of all valid ones. Any errors inside the expression
   will be caught when we parse it later.

   *expression is set to the expression.  For an '=' "debug" expression,
   *expr_text is set to the debug text (the original text of the expression,
   including the '=' and any whitespace around it, as a string object).  If
   not a debug expression, *expr_text set to NULL.

   On success *str is left just past the closing '}'.  `t` is the token that
   is handed to fstring_compile_expr(); first_token and last_token supply the
   location recorded on the FormattedValue node. */
static int
fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
                  PyObject **expr_text, expr_ty *expression, Token *first_token,
                  Token *t, Token *last_token)
{
    /* Return -1 on error, else 0. */

    const char *expr_start;
    const char *expr_end;
    expr_ty simple_expression;
    expr_ty format_spec = NULL; /* Optional format specifier. */
    int conversion = -1; /* The conversion char.  Use default if not
                            specified, or !r if using = and no format
                            spec. */

    /* 0 if we're not in a string, else the quote char we're trying to
       match (single or double quote). */
    char quote_char = 0;

    /* If we're inside a string, 1=normal, 3=triple-quoted. */
    int string_type = 0;

    /* Keep track of nesting level for braces/parens/brackets in
       expressions. */
    Py_ssize_t nested_depth = 0;
    /* Opening bracket characters seen so far, innermost last. */
    char parenstack[MAXLEVEL];

    /* Set early so that the error path can always release *expr_text. */
    *expr_text = NULL;

    /* Can only nest one level deep. */
    if (recurse_lvl >= 2) {
        RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
        goto error;
    }

    /* The first char must be a left brace, or we wouldn't have gotten
       here. Skip over it. */
    assert(**str == '{');
    *str += 1;

    /* Scan forward to the character that ends the expression text. */
    expr_start = *str;
    for (; *str < end; (*str)++) {
        char ch;

        /* Loop invariants. */
        assert(nested_depth >= 0);
        assert(*str >= expr_start && *str < end);
        if (quote_char) {
            assert(string_type == 1 || string_type == 3);
        } else {
            assert(string_type == 0);
        }

        ch = **str;
        /* Nowhere inside an expression is a backslash allowed. */
        if (ch == '\\') {
            /* Error: can't include a backslash character, inside
               parens or strings or not. */
            RAISE_SYNTAX_ERROR(
                      "f-string expression part "
                      "cannot include a backslash");
            goto error;
        }
        if (quote_char) {
            /* We're inside a string. See if we're at the end. */
            /* This code needs to implement the same non-error logic
               as tok_get from tokenizer.c, at the letter_quote
               label. To actually share that code would be a
               nightmare. But, it's unlikely to change and is small,
               so duplicate it here. Note we don't need to catch all
               of the errors, since they'll be caught when parsing the
               expression. We just need to match the non-error
               cases. Thus we can ignore \n in single-quoted strings,
               for example. Or non-terminated strings. */
            if (ch == quote_char) {
                /* Does this match the string_type (single or triple
                   quoted)? */
                if (string_type == 3) {
                    if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
                        /* We're at the end of a triple quoted string. */
                        *str += 2;
                        string_type = 0;
                        quote_char = 0;
                        continue;
                    }
                } else {
                    /* We're at the end of a normal string. */
                    quote_char = 0;
                    string_type = 0;
                    continue;
                }
            }
        } else if (ch == '\'' || ch == '"') {
            /* Is this a triple quoted string? */
            if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
                string_type = 3;
                *str += 2;
            } else {
                /* Start of a normal string. */
                string_type = 1;
            }
            /* Start looking for the end of the string. */
            quote_char = ch;
        } else if (ch == '[' || ch == '{' || ch == '(') {
            /* Remember the opening bracket so that the matching closer can
               be verified below. */
            if (nested_depth >= MAXLEVEL) {
                RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
                goto error;
            }
            parenstack[nested_depth] = ch;
            nested_depth++;
        } else if (ch == '#') {
            /* Error: can't include a comment character, inside parens
               or not. */
            RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
            goto error;
        } else if (nested_depth == 0 &&
                   (ch == '!' || ch == ':' || ch == '}' ||
                    ch == '=' || ch == '>' || ch == '<')) {
            /* See if there's a next character. */
            if (*str+1 < end) {
                char next = *(*str+1);

                /* For "!=". since '=' is not an allowed conversion character,
                   nothing is lost in this test. */
                if ((ch == '!' && next == '=') ||   /* != */
                    (ch == '=' && next == '=') ||   /* == */
                    (ch == '<' && next == '=') ||   /* <= */
                    (ch == '>' && next == '=')      /* >= */
                    ) {
                    *str += 1;
                    continue;
                }
                /* Don't get out of the loop for these, if they're single
                   chars (not part of 2-char tokens). If by themselves, they
                   don't end an expression (unlike say '!'). */
                if (ch == '>' || ch == '<') {
                    continue;
                }
            }

            /* Normal way out of this loop. */
            break;
        } else if (ch == ']' || ch == '}' || ch == ')') {
            /* A closing bracket: it has to pair up with the most recently
               opened one. */
            if (!nested_depth) {
                RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
                goto error;
            }
            nested_depth--;
            int opening = (unsigned char)parenstack[nested_depth];
            if (!((opening == '(' && ch == ')') ||
                  (opening == '[' && ch == ']') ||
                  (opening == '{' && ch == '}')))
            {
                RAISE_SYNTAX_ERROR(
                          "f-string: closing parenthesis '%c' "
                          "does not match opening parenthesis '%c'",
                          ch, opening);
                goto error;
            }
        } else {
            /* Just consume this char and loop around. */
        }
    }
    expr_end = *str;
    /* If we leave this loop in a string or with mismatched parens, we
       don't care. We'll get a syntax error when compiling the
       expression. But, we can produce a better error message, so
       let's just do that. */
    if (quote_char) {
        RAISE_SYNTAX_ERROR("f-string: unterminated string");
        goto error;
    }
    if (nested_depth) {
        int opening = (unsigned char)parenstack[nested_depth - 1];
        RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
        goto error;
    }

    /* The loop ran off the end without finding a terminating character. */
    if (*str >= end) {
        goto unexpected_end_of_string;
    }

    /* Compile the expression as soon as possible, so we show errors
       related to the expression before errors related to the
       conversion or format_spec. */
    simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
    if (!simple_expression) {
        goto error;
    }

    /* Check for =, which puts the text value of the expression in
       expr_text. */
    if (**str == '=') {
        if (p->feature_version < 8) {
            RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
                               "only supported in Python 3.8 and greater");
            goto error;
        }
        *str += 1;

        /* Skip over ASCII whitespace.  No need to test for end of string
           here, since we know there's at least a trailing quote somewhere
           ahead. */
        while (Py_ISSPACE(**str)) {
            *str += 1;
        }

        /* Set *expr_text to the text of the expression. */
        *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
        if (!*expr_text) {
            goto error;
        }
    }

    /* Check for a conversion char, if present. */
    if (**str == '!') {
        *str += 1;
        if (*str >= end) {
            goto unexpected_end_of_string;
        }

        conversion = (unsigned char)**str;
        *str += 1;

        /* Validate the conversion. */
        if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
            RAISE_SYNTAX_ERROR(
                      "f-string: invalid conversion character: "
                      "expected 's', 'r', or 'a'");
            goto error;
        }

    }

    /* Check for the format spec, if present. */
    if (*str >= end) {
        goto unexpected_end_of_string;
    }
    if (**str == ':') {
        *str += 1;
        if (*str >= end) {
            goto unexpected_end_of_string;
        }

        /* Parse the format spec.  It is handled as an f-string one nesting
           level deeper. */
        format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
                                    first_token, t, last_token);
        if (!format_spec) {
            goto error;
        }
    }

    if (*str >= end || **str != '}') {
        goto unexpected_end_of_string;
    }

    /* We're at a right brace. Consume it. */
    assert(*str < end);
    assert(**str == '}');
    *str += 1;

    /* If we're in = mode (detected by non-NULL expr_text), and have no format
       spec and no explicit conversion, set the conversion to 'r'. */
    if (*expr_text && format_spec == NULL && conversion == -1) {
        conversion = 'r';
    }

    /* And now create the FormattedValue node that represents this
       entire expression with the conversion and format spec. */
    /* TODO: Fix this.  NOTE(review): the TODO gives no detail; the node below
       is given the location spanning first_token..last_token rather than a
       location specific to this expression -- presumably that is what needs
       fixing. */
    *expression = FormattedValue(simple_expression, conversion,
                                 format_spec, first_token->lineno,
                                 first_token->col_offset, last_token->end_lineno,
                                 last_token->end_col_offset, p->arena);
    if (!*expression) {
        goto error;
    }

    return 0;

unexpected_end_of_string:
    RAISE_SYNTAX_ERROR("f-string: expecting '}'");
    /* Falls through to error. */

error:
    /* Drop the debug text, if one was created before the failure. */
    Py_XDECREF(*expr_text);
    return -1;

}
824
825 /* Return -1 on error.
826
827 Return 0 if we have a literal (possible zero length) and an
828 expression (zero length if at the end of the string.
829
830 Return 1 if we have a literal, but no expression, and we want the
831 caller to call us again. This is used to deal with doubled
832 braces.
833
834 When called multiple times on the string 'a{{b{0}c', this function
835 will return:
836
837 1. the literal 'a{' with no expression, and a return value
838 of 1. Despite the fact that there's no expression, the return
839 value of 1 means we're not finished yet.
840
841 2. the literal 'b' and the expression '0', with a return value of
842 0. The fact that there's an expression means we're not finished.
843
844 3. literal 'c' with no expression and a return value of 0. The
845 combination of the return value of 0 with no expression means
846 we're finished.
847 */
848 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)849 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
850 int recurse_lvl, PyObject **literal,
851 PyObject **expr_text, expr_ty *expression,
852 Token *first_token, Token *t, Token *last_token)
853 {
854 int result;
855
856 assert(*literal == NULL && *expression == NULL);
857
858 /* Get any literal string. */
859 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
860 if (result < 0) {
861 goto error;
862 }
863
864 assert(result == 0 || result == 1);
865
866 if (result == 1) {
867 /* We have a literal, but don't look at the expression. */
868 return 1;
869 }
870
871 if (*str >= end || **str == '}') {
872 /* We're at the end of the string or the end of a nested
873 f-string: no expression. The top-level error case where we
874 expect to be at the end of the string but we're at a '}' is
875 handled later. */
876 return 0;
877 }
878
879 /* We must now be the start of an expression, on a '{'. */
880 assert(**str == '{');
881
882 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
883 expression, first_token, t, last_token) < 0) {
884 goto error;
885 }
886
887 return 0;
888
889 error:
890 Py_CLEAR(*literal);
891 return -1;
892 }
893
894 #ifdef NDEBUG
895 #define ExprList_check_invariants(l)
896 #else
897 static void
ExprList_check_invariants(ExprList * l)898 ExprList_check_invariants(ExprList *l)
899 {
900 /* Check our invariants. Make sure this object is "live", and
901 hasn't been deallocated. */
902 assert(l->size >= 0);
903 assert(l->p != NULL);
904 if (l->size <= EXPRLIST_N_CACHED) {
905 assert(l->data == l->p);
906 }
907 }
908 #endif
909
910 static void
ExprList_Init(ExprList * l)911 ExprList_Init(ExprList *l)
912 {
913 l->allocated = EXPRLIST_N_CACHED;
914 l->size = 0;
915
916 /* Until we start allocating dynamically, p points to data. */
917 l->p = l->data;
918
919 ExprList_check_invariants(l);
920 }
921
922 static int
ExprList_Append(ExprList * l,expr_ty exp)923 ExprList_Append(ExprList *l, expr_ty exp)
924 {
925 ExprList_check_invariants(l);
926 if (l->size >= l->allocated) {
927 /* We need to alloc (or realloc) the memory. */
928 Py_ssize_t new_size = l->allocated * 2;
929
930 /* See if we've ever allocated anything dynamically. */
931 if (l->p == l->data) {
932 Py_ssize_t i;
933 /* We're still using the cached data. Switch to
934 alloc-ing. */
935 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
936 if (!l->p) {
937 return -1;
938 }
939 /* Copy the cached data into the new buffer. */
940 for (i = 0; i < l->size; i++) {
941 l->p[i] = l->data[i];
942 }
943 } else {
944 /* Just realloc. */
945 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
946 if (!tmp) {
947 PyMem_Free(l->p);
948 l->p = NULL;
949 return -1;
950 }
951 l->p = tmp;
952 }
953
954 l->allocated = new_size;
955 assert(l->allocated == 2 * l->size);
956 }
957
958 l->p[l->size++] = exp;
959
960 ExprList_check_invariants(l);
961 return 0;
962 }
963
964 static void
ExprList_Dealloc(ExprList * l)965 ExprList_Dealloc(ExprList *l)
966 {
967 ExprList_check_invariants(l);
968
969 /* If there's been an error, or we've never dynamically allocated,
970 do nothing. */
971 if (!l->p || l->p == l->data) {
972 /* Do nothing. */
973 } else {
974 /* We have dynamically allocated. Free the memory. */
975 PyMem_Free(l->p);
976 }
977 l->p = NULL;
978 l->size = -1;
979 }
980
981 static asdl_seq *
ExprList_Finish(ExprList * l,PyArena * arena)982 ExprList_Finish(ExprList *l, PyArena *arena)
983 {
984 asdl_seq *seq;
985
986 ExprList_check_invariants(l);
987
988 /* Allocate the asdl_seq and copy the expressions in to it. */
989 seq = _Py_asdl_seq_new(l->size, arena);
990 if (seq) {
991 Py_ssize_t i;
992 for (i = 0; i < l->size; i++) {
993 asdl_seq_SET(seq, i, l->p[i]);
994 }
995 }
996 ExprList_Dealloc(l);
997 return seq;
998 }
999
1000 #ifdef NDEBUG
1001 #define FstringParser_check_invariants(state)
1002 #else
1003 static void
FstringParser_check_invariants(FstringParser * state)1004 FstringParser_check_invariants(FstringParser *state)
1005 {
1006 if (state->last_str) {
1007 assert(PyUnicode_CheckExact(state->last_str));
1008 }
1009 ExprList_check_invariants(&state->expr_list);
1010 }
1011 #endif
1012
1013 void
_PyPegen_FstringParser_Init(FstringParser * state)1014 _PyPegen_FstringParser_Init(FstringParser *state)
1015 {
1016 state->last_str = NULL;
1017 state->fmode = 0;
1018 ExprList_Init(&state->expr_list);
1019 FstringParser_check_invariants(state);
1020 }
1021
1022 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1023 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1024 {
1025 FstringParser_check_invariants(state);
1026
1027 Py_XDECREF(state->last_str);
1028 ExprList_Dealloc(&state->expr_list);
1029 }
1030
1031 /* Make a Constant node, but decref the PyUnicode object being added. */
1032 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1033 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1034 {
1035 PyObject *s = *str;
1036 PyObject *kind = NULL;
1037 *str = NULL;
1038 assert(PyUnicode_CheckExact(s));
1039 if (PyArena_AddPyObject(p->arena, s) < 0) {
1040 Py_DECREF(s);
1041 return NULL;
1042 }
1043 const char* the_str = PyBytes_AsString(first_token->bytes);
1044 if (the_str && the_str[0] == 'u') {
1045 kind = _PyPegen_new_identifier(p, "u");
1046 }
1047
1048 if (kind == NULL && PyErr_Occurred()) {
1049 return NULL;
1050 }
1051
1052 return Constant(s, kind, first_token->lineno, first_token->col_offset,
1053 last_token->end_lineno, last_token->end_col_offset, p->arena);
1054
1055 }
1056
1057
1058 /* Add a non-f-string (that is, a regular literal string). str is
1059 decref'd. */
1060 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1061 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1062 {
1063 FstringParser_check_invariants(state);
1064
1065 assert(PyUnicode_CheckExact(str));
1066
1067 if (PyUnicode_GET_LENGTH(str) == 0) {
1068 Py_DECREF(str);
1069 return 0;
1070 }
1071
1072 if (!state->last_str) {
1073 /* We didn't have a string before, so just remember this one. */
1074 state->last_str = str;
1075 } else {
1076 /* Concatenate this with the previous string. */
1077 PyUnicode_AppendAndDel(&state->last_str, str);
1078 if (!state->last_str) {
1079 return -1;
1080 }
1081 }
1082 FstringParser_check_invariants(state);
1083 return 0;
1084 }
1085
1086 /* Parse an f-string. The f-string is in *str to end, with no
1087 'f' or quotes. */
1088 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1089 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1090 const char *end, int raw, int recurse_lvl,
1091 Token *first_token, Token* t, Token *last_token)
1092 {
1093 FstringParser_check_invariants(state);
1094 state->fmode = 1;
1095
1096 /* Parse the f-string. */
1097 while (1) {
1098 PyObject *literal = NULL;
1099 PyObject *expr_text = NULL;
1100 expr_ty expression = NULL;
1101
1102 /* If there's a zero length literal in front of the
1103 expression, literal will be NULL. If we're at the end of
1104 the f-string, expression will be NULL (unless result == 1,
1105 see below). */
1106 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1107 &literal, &expr_text,
1108 &expression, first_token, t, last_token);
1109 if (result < 0) {
1110 return -1;
1111 }
1112
1113 /* Add the literal, if any. */
1114 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1115 Py_XDECREF(expr_text);
1116 return -1;
1117 }
1118 /* Add the expr_text, if any. */
1119 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1120 return -1;
1121 }
1122
1123 /* We've dealt with the literal and expr_text, their ownership has
1124 been transferred to the state object. Don't look at them again. */
1125
1126 /* See if we should just loop around to get the next literal
1127 and expression, while ignoring the expression this
1128 time. This is used for un-doubling braces, as an
1129 optimization. */
1130 if (result == 1) {
1131 continue;
1132 }
1133
1134 if (!expression) {
1135 /* We're done with this f-string. */
1136 break;
1137 }
1138
1139 /* We know we have an expression. Convert any existing string
1140 to a Constant node. */
1141 if (!state->last_str) {
1142 /* Do nothing. No previous literal. */
1143 } else {
1144 /* Convert the existing last_str literal to a Constant node. */
1145 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1146 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1147 return -1;
1148 }
1149 }
1150
1151 if (ExprList_Append(&state->expr_list, expression) < 0) {
1152 return -1;
1153 }
1154 }
1155
1156 /* If recurse_lvl is zero, then we must be at the end of the
1157 string. Otherwise, we must be at a right brace. */
1158
1159 if (recurse_lvl == 0 && *str < end-1) {
1160 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1161 return -1;
1162 }
1163 if (recurse_lvl != 0 && **str != '}') {
1164 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1165 return -1;
1166 }
1167
1168 FstringParser_check_invariants(state);
1169 return 0;
1170 }
1171
1172 /* Convert the partial state reflected in last_str and expr_list to an
1173 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1174 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1175 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1176 Token *last_token)
1177 {
1178 asdl_seq *seq;
1179
1180 FstringParser_check_invariants(state);
1181
1182 /* If we're just a constant string with no expressions, return
1183 that. */
1184 if (!state->fmode) {
1185 assert(!state->expr_list.size);
1186 if (!state->last_str) {
1187 /* Create a zero length string. */
1188 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1189 if (!state->last_str) {
1190 goto error;
1191 }
1192 }
1193 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1194 }
1195
1196 /* Create a Constant node out of last_str, if needed. It will be the
1197 last node in our expression list. */
1198 if (state->last_str) {
1199 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1200 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1201 goto error;
1202 }
1203 }
1204 /* This has already been freed. */
1205 assert(state->last_str == NULL);
1206
1207 seq = ExprList_Finish(&state->expr_list, p->arena);
1208 if (!seq) {
1209 goto error;
1210 }
1211
1212 return _Py_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1213 last_token->end_lineno, last_token->end_col_offset, p->arena);
1214
1215 error:
1216 _PyPegen_FstringParser_Dealloc(state);
1217 return NULL;
1218 }
1219
1220 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1221 at end, parse it into an expr_ty. Return NULL on error. Adjust
1222 str to point past the parsed portion. */
1223 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1224 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1225 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1226 {
1227 FstringParser state;
1228
1229 _PyPegen_FstringParser_Init(&state);
1230 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1231 first_token, t, last_token) < 0) {
1232 _PyPegen_FstringParser_Dealloc(&state);
1233 return NULL;
1234 }
1235
1236 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1237 }
1238