# GardenSnake - a parser generator demonstration program
#
# This implements a modified version of a subset of Python:
#  - only 'def', 'return' and 'if' statements
#  - 'if' only has 'then' clause (no elif nor else)
#  - single-quoted strings only, content in raw format
#  - numbers are decimal.Decimal instances (not integers or floats)
#  - no print statement; use the built-in 'print' function
#  - only < > == + - / * implemented (and unary + -)
#  - assignment and tuple assignment work
#  - no generators of any sort
#  - no ... well, no quite a lot

# Why?  I'm thinking about a new indentation-based configuration
# language for a project and wanted to figure out how to do it.  Once
# I got that working I needed a way to test it out.  My original AST
# was dumb so I decided to target Python's AST and compile it into
# Python code.  Plus, it's pretty cool that it only took a day or so
# from sitting down with Ply to having working code.

# This uses David Beazley's Ply from http://www.dabeaz.com/ply/

# This work is hereby released into the Public Domain.  To view a copy of
# the public domain dedication, visit
# http://creativecommons.org/licenses/publicdomain/ or send a letter to
# Creative Commons, 543 Howard Street, 5th Floor, San Francisco,
# California, 94105, USA.
#
# Portions of this work are derived from Python's Grammar definition
# and may be covered under the Python copyright and license
#
# Andrew Dalke / Dalke Scientific Software, LLC
# 30 August 2006 / Cape Town, South Africa

# Changelog:
#  30 August - added link to CC license; removed the "swapcase" encoding

# Modifications for inclusion in PLY distribution
import sys
sys.path.insert(0, "../..")
from ply import *

##### Lexer ######
#import lex
import decimal

# Token names.  WS/NEWLINE come out of the raw lexer; INDENT, DEDENT and
# ENDMARKER are synthesized later by the post-processing filters.
tokens = (
    'DEF',
    'IF',
    'NAME',
    'NUMBER',  # Python decimals
    'STRING',  # single quoted strings only; syntax of raw strings
    'LPAR',
    'RPAR',
    'COLON',
    'EQ',
    'ASSIGN',
    'LT',
    'GT',
    'PLUS',
    'MINUS',
    'MULT',
    'DIV',
    'RETURN',
    'WS',
    'NEWLINE',
    'COMMA',
    'SEMICOLON',
    'INDENT',
    'DEDENT',
    'ENDMARKER',
)

#t_NUMBER = r'\d+'
# taken from decimal.py but without the leading sign


def t_NUMBER(t):
    r"""(\d+(\.\d*)?|\.\d+)([eE][-+]? \d+)?"""
    # PLY compiles token rules with re.VERBOSE, so the space inside the
    # exponent part of the pattern is ignored, not matched literally.
    t.value = decimal.Decimal(t.value)
    return t


def t_STRING(t):
    r"'([^\\']+|\\'|\\\\)*'"  # I think this is right ...
    # Strip the surrounding quotes and interpret backslash escapes
    # (str.decode("string-escape") is Python 2 only).
    t.value = t.value[1:-1].decode("string-escape")  # .swapcase() # for fun
    return t

t_COLON = r':'
t_EQ = r'=='
t_ASSIGN = r'='
t_LT = r'<'
t_GT = r'>'
t_PLUS = r'\+'
t_MINUS = r'-'
t_MULT = r'\*'
t_DIV = r'/'
t_COMMA = r','
t_SEMICOLON = r';'

# Ply nicely documented how to do this.

RESERVED = {
    "def": "DEF",
    "if": "IF",
    "return": "RETURN",
}


def t_NAME(t):
    r'[a-zA-Z_][a-zA-Z0-9_]*'
    # Keywords match the NAME pattern first and are reclassified here.
    t.type = RESERVED.get(t.value, "NAME")
    return t

# Putting this before t_WS let it consume lines with only comments in
# them so the latter code never sees the WS part.  Not consuming the
# newline.  Needed for "if 1: #comment"


def t_comment(t):
    r"[ ]*\043[^\n]*"  # \043 is '#' (a literal '#' would start a comment
    #                    inside a re.VERBOSE pattern)
    pass


# Whitespace
def t_WS(t):
    r' [ ]+ '
    # Only leading whitespace outside parentheses is significant: it is
    # what the indentation filter below measures.
    if t.lexer.at_line_start and t.lexer.paren_count == 0:
        return t

# Don't generate newline tokens when inside of parenthesis, eg
#   a = (1,
#        2, 3)


def t_newline(t):
    r'\n+'
    t.lexer.lineno += len(t.value)
    t.type = "NEWLINE"
    if t.lexer.paren_count == 0:
        return t


def t_LPAR(t):
    r'\('
    t.lexer.paren_count += 1
    return t


def t_RPAR(t):
    r'\)'
    # check for underflow?  should be the job of the parser
    t.lexer.paren_count -= 1
    return t


def t_error(t):
    # Abort on the first unknown character.  The skip-and-continue
    # statements that used to follow this raise were unreachable dead
    # code and have been removed.
    raise SyntaxError("Unknown symbol %r" % (t.value[0],))

# I implemented INDENT / DEDENT generation as a post-processing filter

# The original lex token stream contains WS and NEWLINE characters.
# WS will only occur before any other tokens on a line.

# I have three filters.  One tags tokens by adding two attributes.
# "must_indent" is True if the token must be indented from the
# previous code.  The other is "at_line_start" which is True for WS
# and the first non-WS/non-NEWLINE on a line.  It flags the check to
# see if the new line has changed indentation level.
# Python's syntax has three INDENT states
#  0) no colon hence no need to indent
#  1) "if 1: go()" - simple statements have a COLON but no need for an indent
#  2) "if 1:\n  go()" - complex statements have a COLON NEWLINE and must indent
NO_INDENT = 0
MAY_INDENT = 1
MUST_INDENT = 2

# only care about whitespace at the start of a line


def track_tokens_filter(lexer, tokens):
    """Tag every token with 'at_line_start' and 'must_indent' attributes.

    at_line_start is True for the first token on a physical line (and for
    WS, which only occurs there).  must_indent is True for the first real
    token after a COLON NEWLINE pair, i.e. the start of an indented block.
    The lexer's own at_line_start flag is kept in sync because t_WS reads
    it while lexing.  (The unused 'saw_colon' local from the original has
    been dropped.)
    """
    lexer.at_line_start = at_line_start = True
    indent = NO_INDENT
    for token in tokens:
        token.at_line_start = at_line_start

        if token.type == "COLON":
            at_line_start = False
            # A NEWLINE right after a colon would force an indented block.
            indent = MAY_INDENT
            token.must_indent = False

        elif token.type == "NEWLINE":
            at_line_start = True
            if indent == MAY_INDENT:
                # COLON NEWLINE seen: the next real token must indent.
                indent = MUST_INDENT
            token.must_indent = False

        elif token.type == "WS":
            assert token.at_line_start
            at_line_start = True
            token.must_indent = False

        else:
            # A real token; only indent after COLON NEWLINE
            if indent == MUST_INDENT:
                token.must_indent = True
            else:
                token.must_indent = False
            at_line_start = False
            indent = NO_INDENT

        yield token
    lexer.at_line_start = at_line_start


def _new_token(type, lineno):
    # Build a synthetic LexToken; PLY never produces these from input text.
    tok = lex.LexToken()
    tok.type = type
    tok.value = None
    tok.lineno = lineno
    return tok

# Synthesize a DEDENT tag


def DEDENT(lineno):
    return _new_token("DEDENT", lineno)

# Synthesize an INDENT tag


def INDENT(lineno):
    return _new_token("INDENT", lineno)


# Track the indentation level and emit the right INDENT / DEDENT events.
def indentation_filter(tokens):
    """Insert INDENT/DEDENT tokens into the tagged token stream.

    Consumes the output of track_tokens_filter.  WS tokens are absorbed
    (only their width matters), blank lines are dropped, and a change of
    depth at the start of a line becomes INDENT or DEDENT tokens - one
    DEDENT per indentation level closed, Python-style.
    """
    # A stack of indentation levels; will never pop item 0
    levels = [0]
    token = None
    depth = 0
    prev_was_ws = False
    for token in tokens:
        # if 1:
        #     print "Process", token,
        #     if token.at_line_start:
        #         print "at_line_start",
        #     if token.must_indent:
        #         print "must_indent",
        #     print

        # WS only occurs at the start of the line
        # There may be WS followed by NEWLINE so
        # only track the depth here.  Don't indent/dedent
        # until there's something real.
        if token.type == "WS":
            assert depth == 0
            depth = len(token.value)
            prev_was_ws = True
            # WS tokens are never passed to the parser
            continue

        if token.type == "NEWLINE":
            depth = 0
            if prev_was_ws or token.at_line_start:
                # ignore blank lines
                continue
            # pass the other cases on through
            yield token
            continue

        # then it must be a real token (not WS, not NEWLINE)
        # which can affect the indentation level

        prev_was_ws = False
        if token.must_indent:
            # The current depth must be larger than the previous level
            if not (depth > levels[-1]):
                raise IndentationError("expected an indented block")

            levels.append(depth)
            yield INDENT(token.lineno)

        elif token.at_line_start:
            # Must be on the same level or one of the previous levels
            if depth == levels[-1]:
                # At the same level
                pass
            elif depth > levels[-1]:
                raise IndentationError(
                    "indentation increase but not in new block")
            else:
                # Back up; but only if it matches a previous level
                try:
                    i = levels.index(depth)
                except ValueError:
                    raise IndentationError("inconsistent indentation")
                for _ in range(i + 1, len(levels)):
                    yield DEDENT(token.lineno)
                    levels.pop()

        yield token

    ### Finished processing ###

    # Must dedent any remaining levels
    if len(levels) > 1:
        assert token is not None
        for _ in range(1, len(levels)):
            yield DEDENT(token.lineno)


# The top-level filter adds an ENDMARKER, if requested.
# Python's grammar uses it.
def filter(lexer, add_endmarker=True):
    # NOTE(review): shadows the builtin 'filter'; kept - IndentLexer and
    # possibly external callers use this name.
    token = None
    tokens = iter(lexer.token, None)
    tokens = track_tokens_filter(lexer, tokens)
    for token in indentation_filter(tokens):
        yield token

    if add_endmarker:
        lineno = 1
        if token is not None:
            lineno = token.lineno
        yield _new_token("ENDMARKER", lineno)

# Combine Ply and my filters into a new lexer


class IndentLexer(object):
    """A lex-compatible lexer: token() yields the filtered stream with WS
    removed and INDENT/DEDENT/ENDMARKER synthesized."""

    def __init__(self, debug=0, optimize=0, lextab='lextab', reflags=0):
        self.lexer = lex.lex(debug=debug, optimize=optimize,
                             lextab=lextab, reflags=reflags)
        self.token_stream = None

    def input(self, s, add_endmarker=True):
        # paren_count must exist before lexing starts; t_LPAR/t_RPAR use it.
        self.lexer.paren_count = 0
        self.lexer.input(s)
        self.token_stream = filter(self.lexer, add_endmarker)

    def token(self):
        # Returns None at end of input, as PLY's yacc expects.
        try:
            return self.token_stream.next()
        except StopIteration:
            return None

########## Parser (tokens -> AST) ######

# also part of Ply
#import yacc

# I use the Python AST
from compiler import ast

# Helper function


def Assign(left, right):
    """Build an ast.Assign for a NAME or tuple-of-NAMEs target.

    Raises SyntaxError for any other assignment target.  (A dead
    'names = []' initialization from the original was removed; the list
    is only needed - and created - in the tuple branch.)
    """
    if isinstance(left, ast.Name):
        # Single assignment on left
        return ast.Assign([ast.AssName(left.name, 'OP_ASSIGN')], right)
    elif isinstance(left, ast.Tuple):
        # List of things - make sure they are Name nodes
        names = []
        for child in left.getChildren():
            if not isinstance(child, ast.Name):
                raise SyntaxError("that assignment not supported")
            names.append(child.name)
        ass_list = [ast.AssName(name, 'OP_ASSIGN') for name in names]
        return ast.Assign([ast.AssTuple(ass_list)], right)
    else:
        raise SyntaxError("Can't do that yet")


# The grammar comments come from Python's Grammar/Grammar file

# NB: compound_stmt in single_input is followed by extra NEWLINE!
385# file_input: (NEWLINE | stmt)* ENDMARKER 386def p_file_input_end(p): 387 """file_input_end : file_input ENDMARKER""" 388 p[0] = ast.Stmt(p[1]) 389 390 391def p_file_input(p): 392 """file_input : file_input NEWLINE 393 | file_input stmt 394 | NEWLINE 395 | stmt""" 396 if isinstance(p[len(p) - 1], basestring): 397 if len(p) == 3: 398 p[0] = p[1] 399 else: 400 p[0] = [] # p == 2 --> only a blank line 401 else: 402 if len(p) == 3: 403 p[0] = p[1] + p[2] 404 else: 405 p[0] = p[1] 406 407 408# funcdef: [decorators] 'def' NAME parameters ':' suite 409# ignoring decorators 410def p_funcdef(p): 411 "funcdef : DEF NAME parameters COLON suite" 412 p[0] = ast.Function(None, p[2], tuple(p[3]), (), 0, None, p[5]) 413 414# parameters: '(' [varargslist] ')' 415 416 417def p_parameters(p): 418 """parameters : LPAR RPAR 419 | LPAR varargslist RPAR""" 420 if len(p) == 3: 421 p[0] = [] 422 else: 423 p[0] = p[2] 424 425 426# varargslist: (fpdef ['=' test] ',')* ('*' NAME [',' '**' NAME] | '**' NAME) | 427# highly simplified 428def p_varargslist(p): 429 """varargslist : varargslist COMMA NAME 430 | NAME""" 431 if len(p) == 4: 432 p[0] = p[1] + p[3] 433 else: 434 p[0] = [p[1]] 435 436# stmt: simple_stmt | compound_stmt 437 438 439def p_stmt_simple(p): 440 """stmt : simple_stmt""" 441 # simple_stmt is a list 442 p[0] = p[1] 443 444 445def p_stmt_compound(p): 446 """stmt : compound_stmt""" 447 p[0] = [p[1]] 448 449# simple_stmt: small_stmt (';' small_stmt)* [';'] NEWLINE 450 451 452def p_simple_stmt(p): 453 """simple_stmt : small_stmts NEWLINE 454 | small_stmts SEMICOLON NEWLINE""" 455 p[0] = p[1] 456 457 458def p_small_stmts(p): 459 """small_stmts : small_stmts SEMICOLON small_stmt 460 | small_stmt""" 461 if len(p) == 4: 462 p[0] = p[1] + [p[3]] 463 else: 464 p[0] = [p[1]] 465 466# small_stmt: expr_stmt | print_stmt | del_stmt | pass_stmt | flow_stmt | 467# import_stmt | global_stmt | exec_stmt | assert_stmt 468 469 470def p_small_stmt(p): 471 """small_stmt : flow_stmt 472 | 
expr_stmt""" 473 p[0] = p[1] 474 475# expr_stmt: testlist (augassign (yield_expr|testlist) | 476# ('=' (yield_expr|testlist))*) 477# augassign: ('+=' | '-=' | '*=' | '/=' | '%=' | '&=' | '|=' | '^=' | 478# '<<=' | '>>=' | '**=' | '//=') 479 480 481def p_expr_stmt(p): 482 """expr_stmt : testlist ASSIGN testlist 483 | testlist """ 484 if len(p) == 2: 485 # a list of expressions 486 p[0] = ast.Discard(p[1]) 487 else: 488 p[0] = Assign(p[1], p[3]) 489 490 491def p_flow_stmt(p): 492 "flow_stmt : return_stmt" 493 p[0] = p[1] 494 495# return_stmt: 'return' [testlist] 496 497 498def p_return_stmt(p): 499 "return_stmt : RETURN testlist" 500 p[0] = ast.Return(p[2]) 501 502 503def p_compound_stmt(p): 504 """compound_stmt : if_stmt 505 | funcdef""" 506 p[0] = p[1] 507 508 509def p_if_stmt(p): 510 'if_stmt : IF test COLON suite' 511 p[0] = ast.If([(p[2], p[4])], None) 512 513 514def p_suite(p): 515 """suite : simple_stmt 516 | NEWLINE INDENT stmts DEDENT""" 517 if len(p) == 2: 518 p[0] = ast.Stmt(p[1]) 519 else: 520 p[0] = ast.Stmt(p[3]) 521 522 523def p_stmts(p): 524 """stmts : stmts stmt 525 | stmt""" 526 if len(p) == 3: 527 p[0] = p[1] + p[2] 528 else: 529 p[0] = p[1] 530 531# No using Python's approach because Ply supports precedence 532 533# comparison: expr (comp_op expr)* 534# arith_expr: term (('+'|'-') term)* 535# term: factor (('*'|'/'|'%'|'//') factor)* 536# factor: ('+'|'-'|'~') factor | power 537# comp_op: '<'|'>'|'=='|'>='|'<='|'<>'|'!='|'in'|'not' 'in'|'is'|'is' 'not' 538 539 540def make_lt_compare((left, right)): 541 return ast.Compare(left, [('<', right), ]) 542 543 544def make_gt_compare((left, right)): 545 return ast.Compare(left, [('>', right), ]) 546 547 548def make_eq_compare((left, right)): 549 return ast.Compare(left, [('==', right), ]) 550 551 552binary_ops = { 553 "+": ast.Add, 554 "-": ast.Sub, 555 "*": ast.Mul, 556 "/": ast.Div, 557 "<": make_lt_compare, 558 ">": make_gt_compare, 559 "==": make_eq_compare, 560} 561unary_ops = { 562 "+": ast.UnaryAdd, 
563 "-": ast.UnarySub, 564} 565precedence = ( 566 ("left", "EQ", "GT", "LT"), 567 ("left", "PLUS", "MINUS"), 568 ("left", "MULT", "DIV"), 569) 570 571 572def p_comparison(p): 573 """comparison : comparison PLUS comparison 574 | comparison MINUS comparison 575 | comparison MULT comparison 576 | comparison DIV comparison 577 | comparison LT comparison 578 | comparison EQ comparison 579 | comparison GT comparison 580 | PLUS comparison 581 | MINUS comparison 582 | power""" 583 if len(p) == 4: 584 p[0] = binary_ops[p[2]]((p[1], p[3])) 585 elif len(p) == 3: 586 p[0] = unary_ops[p[1]](p[2]) 587 else: 588 p[0] = p[1] 589 590# power: atom trailer* ['**' factor] 591# trailers enables function calls. I only allow one level of calls 592# so this is 'trailer' 593 594 595def p_power(p): 596 """power : atom 597 | atom trailer""" 598 if len(p) == 2: 599 p[0] = p[1] 600 else: 601 if p[2][0] == "CALL": 602 p[0] = ast.CallFunc(p[1], p[2][1], None, None) 603 else: 604 raise AssertionError("not implemented") 605 606 607def p_atom_name(p): 608 """atom : NAME""" 609 p[0] = ast.Name(p[1]) 610 611 612def p_atom_number(p): 613 """atom : NUMBER 614 | STRING""" 615 p[0] = ast.Const(p[1]) 616 617 618def p_atom_tuple(p): 619 """atom : LPAR testlist RPAR""" 620 p[0] = p[2] 621 622# trailer: '(' [arglist] ')' | '[' subscriptlist ']' | '.' NAME 623 624 625def p_trailer(p): 626 "trailer : LPAR arglist RPAR" 627 p[0] = ("CALL", p[2]) 628 629# testlist: test (',' test)* [','] 630# Contains shift/reduce error 631 632 633def p_testlist(p): 634 """testlist : testlist_multi COMMA 635 | testlist_multi """ 636 if len(p) == 2: 637 p[0] = p[1] 638 else: 639 # May need to promote singleton to tuple 640 if isinstance(p[1], list): 641 p[0] = p[1] 642 else: 643 p[0] = [p[1]] 644 # Convert into a tuple? 
645 if isinstance(p[0], list): 646 p[0] = ast.Tuple(p[0]) 647 648 649def p_testlist_multi(p): 650 """testlist_multi : testlist_multi COMMA test 651 | test""" 652 if len(p) == 2: 653 # singleton 654 p[0] = p[1] 655 else: 656 if isinstance(p[1], list): 657 p[0] = p[1] + [p[3]] 658 else: 659 # singleton -> tuple 660 p[0] = [p[1], p[3]] 661 662 663# test: or_test ['if' or_test 'else' test] | lambdef 664# as I don't support 'and', 'or', and 'not' this works down to 'comparison' 665def p_test(p): 666 "test : comparison" 667 p[0] = p[1] 668 669 670# arglist: (argument ',')* (argument [',']| '*' test [',' '**' test] | '**' test) 671# XXX INCOMPLETE: this doesn't allow the trailing comma 672def p_arglist(p): 673 """arglist : arglist COMMA argument 674 | argument""" 675 if len(p) == 4: 676 p[0] = p[1] + [p[3]] 677 else: 678 p[0] = [p[1]] 679 680# argument: test [gen_for] | test '=' test # Really [keyword '='] test 681 682 683def p_argument(p): 684 "argument : test" 685 p[0] = p[1] 686 687 688def p_error(p): 689 # print "Error!", repr(p) 690 raise SyntaxError(p) 691 692 693class GardenSnakeParser(object): 694 695 def __init__(self, lexer=None): 696 if lexer is None: 697 lexer = IndentLexer() 698 self.lexer = lexer 699 self.parser = yacc.yacc(start="file_input_end") 700 701 def parse(self, code): 702 self.lexer.input(code) 703 result = self.parser.parse(lexer=self.lexer) 704 return ast.Module(None, result) 705 706 707###### Code generation ###### 708 709from compiler import misc, syntax, pycodegen 710 711 712class GardenSnakeCompiler(object): 713 714 def __init__(self): 715 self.parser = GardenSnakeParser() 716 717 def compile(self, code, filename="<string>"): 718 tree = self.parser.parse(code) 719 # print tree 720 misc.set_filename(filename, tree) 721 syntax.check(tree) 722 gen = pycodegen.ModuleCodeGenerator(tree) 723 code = gen.getCode() 724 return code 725 726####### Test code ####### 727 728compile = GardenSnakeCompiler().compile 729 730code = r""" 731 732print('LET\'S 
TRY THIS \\OUT') 733 734#Comment here 735def x(a): 736 print('called with',a) 737 if a == 1: 738 return 2 739 if a*2 > 10: return 999 / 4 740 # Another comment here 741 742 return a+2*3 743 744ints = (1, 2, 745 3, 4, 7465) 747print('mutiline-expression', ints) 748 749t = 4+1/3*2+6*(9-5+1) 750print('predence test; should be 34+2/3:', t, t==(34+2/3)) 751 752print('numbers', 1,2,3,4,5) 753if 1: 754 8 755 a=9 756 print(x(a)) 757 758print(x(1)) 759print(x(2)) 760print(x(8),'3') 761print('this is decimal', 1/5) 762print('BIG DECIMAL', 1.234567891234567e12345) 763 764""" 765 766# Set up the GardenSnake run-time environment 767 768 769def print_(*args): 770 print "-->", " ".join(map(str, args)) 771 772globals()["print"] = print_ 773 774compiled_code = compile(code) 775 776exec compiled_code in globals() 777print "Done" 778