1 
2 /* Parser implementation */
3 
4 /* For a description, see the comments at end of this file */
5 
6 /* XXX To do: error recovery */
7 
8 #include "Python.h"
9 #include "pgenheaders.h"
10 #include "token.h"
11 #include "grammar.h"
12 #include "node.h"
13 #include "parser.h"
14 #include "errcode.h"
15 
16 
17 #ifdef Py_DEBUG
18 extern int Py_DebugFlag;
19 #define D(x) if (!Py_DebugFlag); else x
20 #else
21 #define D(x)
22 #endif
23 
24 
25 /* STACK DATA TYPE */
26 
27 static void s_reset(stack *);
28 
29 static void
s_reset(stack * s)30 s_reset(stack *s)
31 {
32     s->s_top = &s->s_base[MAXSTACK];
33 }
34 
35 #define s_empty(s) ((s)->s_top == &(s)->s_base[MAXSTACK])
36 
37 static int
s_push(register stack * s,dfa * d,node * parent)38 s_push(register stack *s, dfa *d, node *parent)
39 {
40     register stackentry *top;
41     if (s->s_top == s->s_base) {
42         fprintf(stderr, "s_push: parser stack overflow\n");
43         return E_NOMEM;
44     }
45     top = --s->s_top;
46     top->s_dfa = d;
47     top->s_parent = parent;
48     top->s_state = 0;
49     return 0;
50 }
51 
52 #ifdef Py_DEBUG
53 
54 static void
s_pop(register stack * s)55 s_pop(register stack *s)
56 {
57     if (s_empty(s))
58         Py_FatalError("s_pop: parser stack underflow -- FATAL");
59     s->s_top++;
60 }
61 
62 #else /* !Py_DEBUG */
63 
64 #define s_pop(s) (s)->s_top++
65 
66 #endif
67 
68 
69 /* PARSER CREATION */
70 
71 parser_state *
PyParser_New(grammar * g,int start)72 PyParser_New(grammar *g, int start)
73 {
74     parser_state *ps;
75 
76     if (!g->g_accel)
77         PyGrammar_AddAccelerators(g);
78     ps = (parser_state *)PyMem_MALLOC(sizeof(parser_state));
79     if (ps == NULL)
80         return NULL;
81     ps->p_grammar = g;
82 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
83     ps->p_flags = 0;
84 #endif
85     ps->p_tree = PyNode_New(start);
86     if (ps->p_tree == NULL) {
87         PyMem_FREE(ps);
88         return NULL;
89     }
90     s_reset(&ps->p_stack);
91     (void) s_push(&ps->p_stack, PyGrammar_FindDFA(g, start), ps->p_tree);
92     return ps;
93 }
94 
95 void
PyParser_Delete(parser_state * ps)96 PyParser_Delete(parser_state *ps)
97 {
98     /* NB If you want to save the parse tree,
99        you must set p_tree to NULL before calling delparser! */
100     PyNode_Free(ps->p_tree);
101     PyMem_FREE(ps);
102 }
103 
104 
105 /* PARSER STACK OPERATIONS */
106 
107 static int
shift(register stack * s,int type,char * str,int newstate,int lineno,int col_offset)108 shift(register stack *s, int type, char *str, int newstate, int lineno, int col_offset)
109 {
110     int err;
111     assert(!s_empty(s));
112     err = PyNode_AddChild(s->s_top->s_parent, type, str, lineno, col_offset);
113     if (err)
114         return err;
115     s->s_top->s_state = newstate;
116     return 0;
117 }
118 
119 static int
push(register stack * s,int type,dfa * d,int newstate,int lineno,int col_offset)120 push(register stack *s, int type, dfa *d, int newstate, int lineno, int col_offset)
121 {
122     int err;
123     register node *n;
124     n = s->s_top->s_parent;
125     assert(!s_empty(s));
126     err = PyNode_AddChild(n, type, (char *)NULL, lineno, col_offset);
127     if (err)
128         return err;
129     s->s_top->s_state = newstate;
130     return s_push(s, d, CHILD(n, NCH(n)-1));
131 }
132 
133 
134 /* PARSER PROPER */
135 
136 static int
classify(parser_state * ps,int type,char * str)137 classify(parser_state *ps, int type, char *str)
138 {
139     grammar *g = ps->p_grammar;
140     register int n = g->g_ll.ll_nlabels;
141 
142     if (type == NAME) {
143         register char *s = str;
144         register label *l = g->g_ll.ll_label;
145         register int i;
146         for (i = n; i > 0; i--, l++) {
147             if (l->lb_type != NAME || l->lb_str == NULL ||
148                 l->lb_str[0] != s[0] ||
149                 strcmp(l->lb_str, s) != 0)
150                 continue;
151 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
152             if (ps->p_flags & CO_FUTURE_PRINT_FUNCTION &&
153                 s[0] == 'p' && strcmp(s, "print") == 0) {
154                 break; /* no longer a keyword */
155             }
156 #endif
157             D(printf("It's a keyword\n"));
158             return n - i;
159         }
160     }
161 
162     {
163         register label *l = g->g_ll.ll_label;
164         register int i;
165         for (i = n; i > 0; i--, l++) {
166             if (l->lb_type == type && l->lb_str == NULL) {
167                 D(printf("It's a token we know\n"));
168                 return n - i;
169             }
170         }
171     }
172 
173     D(printf("Illegal token\n"));
174     return -1;
175 }
176 
177 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
178 static void
future_hack(parser_state * ps)179 future_hack(parser_state *ps)
180 {
181     node *n = ps->p_stack.s_top->s_parent;
182     node *ch, *cch;
183     int i;
184 
185     /* from __future__ import ..., must have at least 4 children */
186     n = CHILD(n, 0);
187     if (NCH(n) < 4)
188         return;
189     ch = CHILD(n, 0);
190     if (STR(ch) == NULL || strcmp(STR(ch), "from") != 0)
191         return;
192     ch = CHILD(n, 1);
193     if (NCH(ch) == 1 && STR(CHILD(ch, 0)) &&
194         strcmp(STR(CHILD(ch, 0)), "__future__") != 0)
195         return;
196     ch = CHILD(n, 3);
197     /* ch can be a star, a parenthesis or import_as_names */
198     if (TYPE(ch) == STAR)
199         return;
200     if (TYPE(ch) == LPAR)
201         ch = CHILD(n, 4);
202 
203     for (i = 0; i < NCH(ch); i += 2) {
204         cch = CHILD(ch, i);
205         if (NCH(cch) >= 1 && TYPE(CHILD(cch, 0)) == NAME) {
206             char *str_ch = STR(CHILD(cch, 0));
207             if (strcmp(str_ch, FUTURE_WITH_STATEMENT) == 0) {
208                 ps->p_flags |= CO_FUTURE_WITH_STATEMENT;
209             } else if (strcmp(str_ch, FUTURE_PRINT_FUNCTION) == 0) {
210                 ps->p_flags |= CO_FUTURE_PRINT_FUNCTION;
211             } else if (strcmp(str_ch, FUTURE_UNICODE_LITERALS) == 0) {
212                 ps->p_flags |= CO_FUTURE_UNICODE_LITERALS;
213             }
214         }
215     }
216 }
217 #endif /* future keyword */
218 
219 int
PyParser_AddToken(register parser_state * ps,register int type,char * str,int lineno,int col_offset,int * expected_ret)220 PyParser_AddToken(register parser_state *ps, register int type, char *str,
221                   int lineno, int col_offset, int *expected_ret)
222 {
223     register int ilabel;
224     int err;
225 
226     D(printf("Token %s/'%s' ... ", _PyParser_TokenNames[type], str));
227 
228     /* Find out which label this token is */
229     ilabel = classify(ps, type, str);
230     if (ilabel < 0)
231         return E_SYNTAX;
232 
233     /* Loop until the token is shifted or an error occurred */
234     for (;;) {
235         /* Fetch the current dfa and state */
236         register dfa *d = ps->p_stack.s_top->s_dfa;
237         register state *s = &d->d_state[ps->p_stack.s_top->s_state];
238 
239         D(printf(" DFA '%s', state %d:",
240             d->d_name, ps->p_stack.s_top->s_state));
241 
242         /* Check accelerator */
243         if (s->s_lower <= ilabel && ilabel < s->s_upper) {
244             register int x = s->s_accel[ilabel - s->s_lower];
245             if (x != -1) {
246                 if (x & (1<<7)) {
247                     /* Push non-terminal */
248                     int nt = (x >> 8) + NT_OFFSET;
249                     int arrow = x & ((1<<7)-1);
250                     dfa *d1 = PyGrammar_FindDFA(
251                         ps->p_grammar, nt);
252                     if ((err = push(&ps->p_stack, nt, d1,
253                         arrow, lineno, col_offset)) > 0) {
254                         D(printf(" MemError: push\n"));
255                         return err;
256                     }
257                     D(printf(" Push ...\n"));
258                     continue;
259                 }
260 
261                 /* Shift the token */
262                 if ((err = shift(&ps->p_stack, type, str,
263                                 x, lineno, col_offset)) > 0) {
264                     D(printf(" MemError: shift.\n"));
265                     return err;
266                 }
267                 D(printf(" Shift.\n"));
268                 /* Pop while we are in an accept-only state */
269                 while (s = &d->d_state
270                                 [ps->p_stack.s_top->s_state],
271                     s->s_accept && s->s_narcs == 1) {
272                     D(printf("  DFA '%s', state %d: "
273                              "Direct pop.\n",
274                              d->d_name,
275                              ps->p_stack.s_top->s_state));
276 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
277                     if (d->d_name[0] == 'i' &&
278                         strcmp(d->d_name,
279                            "import_stmt") == 0)
280                         future_hack(ps);
281 #endif
282                     s_pop(&ps->p_stack);
283                     if (s_empty(&ps->p_stack)) {
284                         D(printf("  ACCEPT.\n"));
285                         return E_DONE;
286                     }
287                     d = ps->p_stack.s_top->s_dfa;
288                 }
289                 return E_OK;
290             }
291         }
292 
293         if (s->s_accept) {
294 #ifdef PY_PARSER_REQUIRES_FUTURE_KEYWORD
295             if (d->d_name[0] == 'i' &&
296                 strcmp(d->d_name, "import_stmt") == 0)
297                 future_hack(ps);
298 #endif
299             /* Pop this dfa and try again */
300             s_pop(&ps->p_stack);
301             D(printf(" Pop ...\n"));
302             if (s_empty(&ps->p_stack)) {
303                 D(printf(" Error: bottom of stack.\n"));
304                 return E_SYNTAX;
305             }
306             continue;
307         }
308 
309         /* Stuck, report syntax error */
310         D(printf(" Error.\n"));
311         if (expected_ret) {
312             if (s->s_lower == s->s_upper - 1) {
313                 /* Only one possible expected token */
314                 *expected_ret = ps->p_grammar->
315                     g_ll.ll_label[s->s_lower].lb_type;
316             }
317             else
318                 *expected_ret = -1;
319         }
320         return E_SYNTAX;
321     }
322 }
323 
324 
325 #ifdef Py_DEBUG
326 
327 /* DEBUG OUTPUT */
328 
329 void
dumptree(grammar * g,node * n)330 dumptree(grammar *g, node *n)
331 {
332     int i;
333 
334     if (n == NULL)
335         printf("NIL");
336     else {
337         label l;
338         l.lb_type = TYPE(n);
339         l.lb_str = STR(n);
340         printf("%s", PyGrammar_LabelRepr(&l));
341         if (ISNONTERMINAL(TYPE(n))) {
342             printf("(");
343             for (i = 0; i < NCH(n); i++) {
344                 if (i > 0)
345                     printf(",");
346                 dumptree(g, CHILD(n, i));
347             }
348             printf(")");
349         }
350     }
351 }
352 
353 void
showtree(grammar * g,node * n)354 showtree(grammar *g, node *n)
355 {
356     int i;
357 
358     if (n == NULL)
359         return;
360     if (ISNONTERMINAL(TYPE(n))) {
361         for (i = 0; i < NCH(n); i++)
362             showtree(g, CHILD(n, i));
363     }
364     else if (ISTERMINAL(TYPE(n))) {
365         printf("%s", _PyParser_TokenNames[TYPE(n)]);
366         if (TYPE(n) == NUMBER || TYPE(n) == NAME)
367             printf("(%s)", STR(n));
368         printf(" ");
369     }
370     else
371         printf("? ");
372 }
373 
374 void
printtree(parser_state * ps)375 printtree(parser_state *ps)
376 {
377     if (Py_DebugFlag) {
378         printf("Parse tree:\n");
379         dumptree(ps->p_grammar, ps->p_tree);
380         printf("\n");
381         printf("Tokens:\n");
382         showtree(ps->p_grammar, ps->p_tree);
383         printf("\n");
384     }
385     printf("Listing:\n");
386     PyNode_ListTree(ps->p_tree);
387     printf("\n");
388 }
389 
390 #endif /* Py_DEBUG */
391 
392 /*
393 
394 Description
395 -----------
396 
397 The parser's interface is different than usual: the function addtoken()
398 must be called for each token in the input.  This makes it possible to
399 turn it into an incremental parsing system later.  The parsing system
400 constructs a parse tree as it goes.
401 
402 A parsing rule is represented as a Deterministic Finite-state Automaton
403 (DFA).  A node in a DFA represents a state of the parser; an arc represents
404 a transition.  Transitions are either labeled with terminal symbols or
405 with non-terminals.  When the parser decides to follow an arc labeled
406 with a non-terminal, it is invoked recursively with the DFA representing
407 the parsing rule for that as its initial state; when that DFA accepts,
408 the parser that invoked it continues.  The parse tree constructed by the
409 recursively called parser is inserted as a child in the current parse tree.
410 
411 The DFA's can be constructed automatically from a more conventional
412 language description.  An extended LL(1) grammar (ELL(1)) is suitable.
413 Certain restrictions make the parser's life easier: rules that can produce
414 the empty string should be outlawed (there are other ways to put loops
415 or optional parts in the language).  To avoid the need to construct
416 FIRST sets, we can require that all but the last alternative of a rule
417 (really: arc going out of a DFA's state) must begin with a terminal
418 symbol.
419 
420 As an example, consider this grammar:
421 
422 expr:   term (OP term)*
423 term:   CONSTANT | '(' expr ')'
424 
425 The DFA corresponding to the rule for expr is:
426 
427 ------->.---term-->.------->
428     ^          |
429     |          |
430     \----OP----/
431 
432 The parse tree generated for the input a+b is:
433 
434 (expr: (term: (NAME: a)), (OP: +), (term: (NAME: b)))
435 
436 */
437