# GardenSnake - a parser generator demonstration program
#
# This implements a modified version of a subset of Python:
#  - only 'def', 'return' and 'if' statements
#  - 'if' only has a 'then' clause (no elif, no else)
#  - single-quoted strings only, content in raw format
#  - numbers are decimal.Decimal instances (not integers or floats)
#  - no print statement; use the built-in 'print' function
#  - only < > == + - / * implemented (and unary + -)
#  - assignment and tuple assignment work
#  - no generators of any sort
#  - no ... well, quite a lot is missing

# Why?  I'm thinking about a new indentation-based configuration
# language for a project and wanted to figure out how to do it.  Once
# I got that working I needed a way to test it out.  My original AST
# was dumb so I decided to target Python's AST and compile it into
# Python code.  Plus, it's pretty cool that it only took a day or so
# from sitting down with Ply to having working code.

# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

# This work is hereby released into the Public Domain. To view a copy of
# the public domain dedication, visit
# http://creativecommons.org/licenses/publicdomain/ or send a letter to
# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
# California, 94105, USA.
#
# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
#          Andrew Dalke / Dalke Scientific Software, LLC
#             30 August 2006 / Cape Town, South Africa

# Changelog:
#  30 August - added link to CC license; removed the "swapcase" encoding

# Modifications for inclusion in PLY distribution
import sys
sys.path.insert(0, "../..")
from ply import *

##### Lexer ######
#import lex
import decimal

tokens = (
    'DEF',
    'IF',
    'NAME',
    'NUMBER',  # Python decimals
    'STRING',  # single quoted strings only; syntax of raw strings
    'LPAR',
    'RPAR',
    'COLON',
    'EQ',
    'ASSIGN',
    'LT',
    'GT',
    'PLUS',
    'MINUS',
    'MULT',
    'DIV',
    'RETURN',
    'WS',
    'NEWLINE',
    'COMMA',
    'SEMICOLON',
    'INDENT',
    'DEDENT',
    'ENDMARKER',
)

#t_NUMBER = r'\d+'
# taken from decimal.py but without the leading sign
# (the stray space in the exponent part below is harmless: PLY compiles
# token regexes in re.VERBOSE mode, which ignores unescaped whitespace)


def t_NUMBER(t):
    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    t.value = decimal.Decimal(t.value)
    return t
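
# Illustrative sketch (not executed): inputs such as '3', '3.14', '.5',
# '1e10' and '2E-3' all match t_NUMBER and become decimal.Decimal
# values.  A leading sign is not part of the token; '-3' lexes as
# MINUS NUMBER and is handled by the unary-minus grammar rule below.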


def t_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    t.value = t.value[1:-1].decode("string-escape")  # .swapcase() # for fun
    return t

t_COLON = r':'
t_EQ = r'=='
t_ASSIGN = r'='
t_LT = r'<'
t_GT = r'>'
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_DIV = r'/'
t_COMMA = r','
t_SEMICOLON = r';'

# Ply nicely documented how to do this.

RESERVED = {
    "def": "DEF",
    "if": "IF",
    "return": "RETURN",
}


def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    t.type = RESERVED.get(t.value, "NAME")
    return t
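
# For example (illustrative): 'def' comes out of t_NAME with its type
# rewritten to DEF by the RESERVED lookup, while an identifier such as
# 'define' misses the lookup and keeps the default type NAME.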

# Putting this before t_WS lets it consume lines containing only a
# comment, so the later code never sees the WS part.  It does not
# consume the newline; that is needed for "if 1: #comment"


def t_comment(t):
    r"[ ]*\043[^\n]*"  # \043 is '#'
    pass


# Whitespace
def t_WS(t):
    r' [ ]+ '
    if t.lexer.at_line_start and t.lexer.paren_count == 0:
        return t

# Don't generate newline tokens when inside parentheses, e.g.
#   a = (1,
#        2, 3)


def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if t.lexer.paren_count == 0:
        return t


def t_LPAR(t):
    r'\('
    t.lexer.paren_count += 1
    return t


def t_RPAR(t):
    r'\)'
    # check for underflow?  should be the job of the parser
    t.lexer.paren_count -= 1
    return t


def t_error(t):
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))
    # Unreachable alternative, kept for reference: skip the offending
    # character instead of raising.
    # print "Skipping", repr(t.value[0])
    # t.lexer.skip(1)

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE tokens.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other attribute is "at_line_start", which is
# True for WS and for the first non-WS/non-NEWLINE token on a line.
# It flags the check to see whether the new line has changed the
# indentation level.

# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# only care about whitespace at the start of a line


def track_tokens_filter(lexer, tokens):
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        yield token
        lexer.at_line_start = at_line_start
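
# Hand-traced sketch (illustrative, not executed): for the input
# "if 1:\n    go()\n" this filter tags tokens roughly as
#   IF(at_line_start=True) NUMBER COLON NEWLINE NAME'go'(must_indent=True) ...
# because the COLON arms MAY_INDENT and the following NEWLINE promotes
# it to MUST_INDENT before the next real token arrives.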


def _new_token(type, lineno):
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    return tok

# Synthesize a DEDENT token


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT token


def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # Debugging trace (disabled):
        # print "Process", token,
        # if token.at_line_start: print "at_line_start",
        # if token.must_indent: print "must_indent",
        # print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError(
                    "indentation increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i + 1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)
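
# Hand-traced sketch (illustrative, not executed): "if 1:\n    go()\n"
# comes out of this filter as
#   IF NUMBER COLON NEWLINE INDENT NAME LPAR RPAR NEWLINE DEDENT
# The WS token sets depth=4 and is swallowed; INDENT is emitted in
# front of 'go' because that token was tagged must_indent, and the
# trailing DEDENT comes from the end-of-input cleanup above.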


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    token = None
    tokens = iter(lexer.token, None)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

# Combine Ply and my filters into a new lexer


class IndentLexer(object):

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.lexer = lex.lex(debug=debug, optimize=optimize,
                             lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        self.lexer.paren_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        try:
            return self.token_stream.next()
        except StopIteration:
            return None
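
# Minimal standalone usage sketch (illustrative; the token rules above
# are already in scope in this module):
#
#   lexer = IndentLexer()
#   lexer.input("if 1:\n    x = 2\n")
#   tok = lexer.token()
#   while tok is not None:
#       print tok
#       tok = lexer.token()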

##########   Parser (tokens -> AST) ######

# also part of Ply
#import yacc

# I use the Python AST
from compiler import ast

# Helper function


def Assign(left, right):
    if isinstance(left, ast.Name):
        # Single assignment on left
        return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
    elif isinstance(left, ast.Tuple):
        # List of things - make sure they are Name nodes
        names = []
        for child in left.getChildren():
            if not isinstance(child, ast.Name):
                raise SyntaxError("that assignment not supported")
            names.append(child.name)
        ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
        return ast.Assign([ast.AssTuple(ass_list)], right)
    else:
        raise SyntaxError("Can't do that yet")
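
# For instance (illustrative): parsing "a, b = f()" hands this helper
# an ast.Tuple of two ast.Name nodes, so it builds
#   ast.Assign([ast.AssTuple([AssName('a', 'OP_ASSIGN'),
#                             AssName('b', 'OP_ASSIGN')])], <call node>)
# Anything fancier on the left-hand side (attributes, subscripts)
# raises SyntaxError.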


# The grammar comments come from Python's Grammar/Grammar file

# NB: compound_stmt in single_input is followed by extra NEWLINE!
# file_input: (NEWLINE | stmt)* ENDMARKER
def p_file_input_end(p):
    """file_input_end : file_input ENDMARKER"""
    p[0] = ast.Stmt(p[1])


def p_file_input(p):
    """file_input : file_input NEWLINE
                  | file_input stmt
                  | NEWLINE
                  | stmt"""
    if isinstance(p[len(p) - 1], basestring):
        if len(p) == 3:
            p[0] = p[1]
        else:
            p[0] = []  # len(p) == 2 --> only a blank line
    else:
        if len(p) == 3:
            p[0] = p[1] + p[2]
        else:
            p[0] = p[1]


# funcdef: [decorators] 'def' NAME parameters ':' suite
# ignoring decorators
def p_funcdef(p):
    "funcdef : DEF NAME parameters COLON suite"
    p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5])

# parameters: '(' [varargslist] ')'


def p_parameters(p):
    """parameters : LPAR RPAR
                  | LPAR varargslist RPAR"""
    if len(p) == 3:
        p[0] = []
    else:
        p[0] = p[2]


# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) |
# highly simplified
def p_varargslist(p):
    """varargslist : varargslist COMMA NAME
                   | NAME"""
    if len(p) == 4:
        # NAME is a bare string; wrap it in a list before concatenating
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]

# stmt: simple_stmt | compound_stmt


def p_stmt_simple(p):
    """stmt : simple_stmt"""
    # simple_stmt is a list
    p[0] = p[1]


def p_stmt_compound(p):
    """stmt : compound_stmt"""
    p[0] = [p[1]]

# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE


def p_simple_stmt(p):
    """simple_stmt : small_stmts NEWLINE
                   | small_stmts SEMICOLON NEWLINE"""
    p[0] = p[1]


def p_small_stmts(p):
    """small_stmts : small_stmts SEMICOLON small_stmt
                   | small_stmt"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]

# small_stmt: expr_stmt | print_stmt  | del_stmt | pass_stmt | flow_stmt |
#    import_stmt | global_stmt | exec_stmt | assert_stmt


def p_small_stmt(p):
    """small_stmt : flow_stmt
                  | expr_stmt"""
    p[0] = p[1]

# expr_stmt: testlist (augassign (yield_expr|testlist) |
#                      ('=' (yield_expr|testlist))*)
# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' |
#             '<<=' | '>>=' | '**=' | '//=')


def p_expr_stmt(p):
    """expr_stmt : testlist ASSIGN testlist
                 | testlist """
    if len(p) == 2:
        # a list of expressions
        p[0] = ast.Discard(p[1])
    else:
        p[0] = Assign(p[1], p[3])


def p_flow_stmt(p):
    "flow_stmt : return_stmt"
    p[0] = p[1]

# return_stmt: 'return' [testlist]


def p_return_stmt(p):
    "return_stmt : RETURN testlist"
    p[0] = ast.Return(p[2])


def p_compound_stmt(p):
    """compound_stmt : if_stmt
                     | funcdef"""
    p[0] = p[1]


def p_if_stmt(p):
    'if_stmt : IF test COLON suite'
    p[0] = ast.If([(p[2], p[4])], None)


def p_suite(p):
    """suite : simple_stmt
             | NEWLINE INDENT stmts DEDENT"""
    if len(p) == 2:
        p[0] = ast.Stmt(p[1])
    else:
        p[0] = ast.Stmt(p[3])


def p_stmts(p):
    """stmts : stmts stmt
             | stmt"""
    if len(p) == 3:
        p[0] = p[1] + p[2]
    else:
        p[0] = p[1]

# Not using Python's layered-grammar approach here because Ply supports
# operator precedence directly

# comparison: expr (comp_op expr)*
# arith_expr: term (('+'|'-') term)*
# term: factor (('*'|'/'|'%'|'//') factor)*
# factor: ('+'|'-'|'~') factor | power
# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not'


def make_lt_compare((left, right)):
    return ast.Compare(left, [('<', right), ])


def make_gt_compare((left, right)):
    return ast.Compare(left, [('>', right), ])


def make_eq_compare((left, right)):
    return ast.Compare(left, [('==', right), ])


binary_ops = {
    "+": ast.Add,
    "-": ast.Sub,
    "*": ast.Mul,
    "/": ast.Div,
    "<": make_lt_compare,
    ">": make_gt_compare,
    "==": make_eq_compare,
}
unary_ops = {
    "+": ast.UnaryAdd,
    "-": ast.UnarySub,
}
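
# Note on the call convention (used in p_comparison below): the
# compiler.ast binary nodes and the make_*_compare helpers each take a
# single (left, right) tuple, e.g.
#   binary_ops["+"]((left_node, right_node))  ->  ast.Add((left, right))
# which is why the helpers above use Python 2 tuple-unpacking
# parameters.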
precedence = (
    ("left", "EQ", "GT", "LT"),
    ("left", "PLUS", "MINUS"),
    ("left", "MULT", "DIV"),
)


def p_comparison(p):
    """comparison : comparison PLUS comparison
                  | comparison MINUS comparison
                  | comparison MULT comparison
                  | comparison DIV comparison
                  | comparison LT comparison
                  | comparison EQ comparison
                  | comparison GT comparison
                  | PLUS comparison
                  | MINUS comparison
                  | power"""
    if len(p) == 4:
        p[0] = binary_ops[p[2]]((p[1], p[3]))
    elif len(p) == 3:
        p[0] = unary_ops[p[1]](p[2])
    else:
        p[0] = p[1]
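
# Sketch of how the precedence table resolves the deliberately
# ambiguous grammar (hand-traced, not executed): '1 - 2 * 3' reduces to
#   Sub((Const(1), Mul((Const(2), Const(3)))))
# because MULT binds tighter than MINUS, while '1 - 2 - 3' groups
# left-associatively as Sub((Sub((Const(1), Const(2))), Const(3))).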

# power: atom trailer* ['**' factor]
# trailers enable function calls.  I only allow one level of calls
# so this is 'trailer' rather than 'trailer*'


def p_power(p):
    """power : atom
             | atom trailer"""
    if len(p) == 2:
        p[0] = p[1]
    else:
        if p[2][0] == "CALL":
            p[0] = ast.CallFunc(p[1], p[2][1], None, None)
        else:
            raise AssertionError("not implemented")


def p_atom_name(p):
    """atom : NAME"""
    p[0] = ast.Name(p[1])


def p_atom_number(p):
    """atom : NUMBER
            | STRING"""
    p[0] = ast.Const(p[1])


def p_atom_tuple(p):
    """atom : LPAR testlist RPAR"""
    p[0] = p[2]

# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME


def p_trailer(p):
    "trailer : LPAR arglist RPAR"
    p[0] = ("CALL", p[2])

# testlist: test (',' test)* [',']
# Contains a shift/reduce conflict


def p_testlist(p):
    """testlist : testlist_multi COMMA
                | testlist_multi """
    if len(p) == 2:
        p[0] = p[1]
    else:
        # May need to promote singleton to tuple
        if isinstance(p[1], list):
            p[0] = p[1]
        else:
            p[0] = [p[1]]
    # Convert into a tuple?
    if isinstance(p[0], list):
        p[0] = ast.Tuple(p[0])
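
# Example (illustrative): '1, 2' comes back from testlist_multi as a
# list and is wrapped in ast.Tuple([Const(1), Const(2)]); a bare '1'
# stays a single node; and a trailing comma, as in '1,', promotes the
# singleton so it too becomes a one-element Tuple.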


def p_testlist_multi(p):
    """testlist_multi : testlist_multi COMMA test
                      | test"""
    if len(p) == 2:
        # singleton
        p[0] = p[1]
    else:
        if isinstance(p[1], list):
            p[0] = p[1] + [p[3]]
        else:
            # singleton -> tuple
            p[0] = [p[1], p[3]]


# test: or_test ['if' or_test 'else' test] | lambdef
#  since I don't support 'and', 'or', or 'not', this bottoms out at
#  'comparison'
def p_test(p):
    "test : comparison"
    p[0] = p[1]


# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test)
# XXX INCOMPLETE: this doesn't allow the trailing comma
def p_arglist(p):
    """arglist : arglist COMMA argument
               | argument"""
    if len(p) == 4:
        p[0] = p[1] + [p[3]]
    else:
        p[0] = [p[1]]

# argument: test [gen_for] | test '=' test  # Really [keyword '='] test


def p_argument(p):
    "argument : test"
    p[0] = p[1]


def p_error(p):
    # print "Error!", repr(p)
    raise SyntaxError(p)


class GardenSnakeParser(object):

    def __init__(self, lexer=None):
        if lexer is None:
            lexer = IndentLexer()
        self.lexer = lexer
        self.parser = yacc.yacc(start="file_input_end")

    def parse(self, code):
        self.lexer.input(code)
        result = self.parser.parse(lexer=self.lexer)
        return ast.Module(None, result)
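
# Usage sketch (illustrative):
#   parser = GardenSnakeParser()
#   tree = parser.parse("x = 1\n")   # -> compiler.ast.Module
# The returned Module wraps the Stmt node built by p_file_input_end.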


###### Code generation ######

from compiler import misc, syntax, pycodegen


class GardenSnakeCompiler(object):

    def __init__(self):
        self.parser = GardenSnakeParser()

    def compile(self, code, filename="<string>"):
        tree = self.parser.parse(code)
        # print tree
        misc.set_filename(filename, tree)
        syntax.check(tree)
        gen = pycodegen.ModuleCodeGenerator(tree)
        code = gen.getCode()
        return code

####### Test code #######

compile = GardenSnakeCompiler().compile

code = r"""

print('LET\'S TRY THIS \\OUT')

#Comment here
def x(a):
    print('called with',a)
    if a == 1:
        return 2
    if a*2 > 10: return 999 / 4
        # Another comment here

    return a+2*3

ints = (1, 2,
   3, 4,
5)
print('multiline-expression', ints)

t = 4+1/3*2+6*(9-5+1)
print('precedence test; should be 34+2/3:', t, t==(34+2/3))

print('numbers', 1,2,3,4,5)
if 1:
 8
 a=9
 print(x(a))

print(x(1))
print(x(2))
print(x(8),'3')
print('this is decimal', 1/5)
print('BIG DECIMAL', 1.234567891234567e12345)

"""

# Set up the GardenSnake run-time environment


def print_(*args):
    print "-->", " ".join(map(str, args))

globals()["print"] = print_

compiled_code = compile(code)

exec compiled_code in globals()
print "Done"