"""ANTLR3 runtime package"""

# begin[licence]
#
# [The "BSD licence"]
# Copyright (c) 2005-2012 Terence Parr
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# end[licence]

import sys
import inspect

from .constants import compatible_api_versions, DEFAULT_CHANNEL, \
    HIDDEN_CHANNEL, EOF, EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
from .exceptions import RecognitionException, MismatchedTokenException, \
    MismatchedRangeException, MismatchedTreeNodeException, \
    NoViableAltException, EarlyExitException, MismatchedSetException, \
    MismatchedNotSetException, FailedPredicateException, \
    BacktrackingFailed, UnwantedTokenException, MissingTokenException
from .tokens import CommonToken, SKIP_TOKEN


class RecognizerSharedState(object):
    """
    The set of fields needed by an abstract recognizer to recognize input
    and recover from errors etc...  As a separate state object, it can be
    shared among multiple grammars; e.g., when one grammar imports another.

    These fields are publicly visible, but the actual state pointer per
    parser is protected.
    """

    def __init__(self):
        # Track the set of token types that can follow any rule invocation.
        # Stack grows upwards.
        self.following = []

        # This is true when we see an error and before having successfully
        # matched a token.  Prevents generation of more than one error
        # message per error.
        self.errorRecovery = False

        # The index into the input stream where the last error occurred.
        # This is used to prevent infinite loops where an error is found
        # but no token is consumed during recovery...another error is found,
        # ad nauseam.  This is a failsafe mechanism to guarantee that at
        # least one token/tree node is consumed for two errors.
        self.lastErrorIndex = -1

        # If 0, no backtracking is going on.  Safe to exec actions etc...
        # If >0 then it's the level of backtracking.
        self.backtracking = 0

        # An array[size num rules] of (int -> int) dicts that tracks
        # the stop token index for each rule.  ruleMemo[ruleIndex] is
        # the memoization table for ruleIndex.
        # For key ruleStartIndex, you get back the stop token for the
        # associated rule or MEMO_RULE_FAILED.
        #
        # This is only used if rule memoization is on (which it is by default).
        self.ruleMemo = None

        ## Did the recognizer encounter a syntax error?  Track how many.
        self.syntaxErrors = 0


        # LEXER FIELDS (must be in same state object to avoid casting
        # constantly in generated code and Lexer object) :(


        ## The goal of all lexer rules/methods is to create a token object.
        # This is an instance variable as multiple rules may collaborate to
        # create a single token.  nextToken will return this object after
        # matching lexer rule(s).  If you subclass to allow multiple token
        # emissions, then set this to the last token to be matched or
        # something non-None so that the auto token emit mechanism will not
        # emit another token.
        self.token = None

        ## What character index in the stream did the current token start at?
        # Needed, for example, to get the text for the current token.  Set at
        # the start of nextToken.
        self.tokenStartCharIndex = -1

        ## The line on which the first character of the token resides
        self.tokenStartLine = None

        ## The character position of the first character within the line
        self.tokenStartCharPositionInLine = None

        ## The channel number for the current token
        self.channel = None

        ## The token type for the current token
        self.type = None

        ## You can set the text for the current token to override what is in
        # the input char buffer.  Use setText() or set this instance var.
        self.text = None


class BaseRecognizer(object):
    """
    @brief Common recognizer functionality.

    A generic recognizer that can handle recognizers generated from
    lexer, parser, and tree grammars.  This is all the parsing
    support code essentially; most of it is error recovery stuff and
    backtracking.
    """

    MEMO_RULE_FAILED = -2
    MEMO_RULE_UNKNOWN = -1

    # copies from Token object for convenience in actions
    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL

    # for convenience in actions
    HIDDEN = HIDDEN_CHANNEL

    # overridden by generated subclasses
    grammarFileName = None
    tokenNames = None

    # The api_version attribute was introduced in 3.3.  If it is not
    # overwritten in the generated recognizer, we assume a default of v0.
    api_version = 0

    def __init__(self, state=None):
        # Input stream of the recognizer.  Must be initialized by a subclass.
        self.input = None

        ## The state of a lexer, parser, or tree parser is collected into a
        # state object so the state can be shared.  This sharing is needed
        # to have one grammar import others and share the same error
        # variables and other state variables.  It's a kind of explicit
        # multiple inheritance via delegation of methods and shared state.
        if state is None:
            state = RecognizerSharedState()
        self._state = state

        if self.api_version not in compatible_api_versions:
            raise RuntimeError(
                "ANTLR version mismatch: "
                "The recognizer has been generated with API V{}, "
                "but this runtime does not support this."
                .format(self.api_version))

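    # Illustrative sketch (hypothetical names, not part of this module):
    # ANTLR-generated recognizers subclass BaseRecognizer (via Lexer/Parser
    # below) and override the class attributes checked above, roughly like:
    #
    #   class MyParser(Parser):
    #       grammarFileName = "My.g"
    #       api_version = 1   # whatever the matching code generator emits
    #       tokenNames = ["<invalid>", "<EOR>", "<DOWN>", "<UP>", "ID", "INT"]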

    # this one only exists to shut up pylint :(
    def setInput(self, input):
        self.input = input


    def reset(self):
        """
        Reset the parser's state; subclasses must rewind the input stream.
        """

        # whack everything related to error recovery
        if self._state is None:
            # no shared state work to do
            return

        self._state.following = []
        self._state.errorRecovery = False
        self._state.lastErrorIndex = -1
        self._state.syntaxErrors = 0
        # whack everything related to backtracking and memoization
        self._state.backtracking = 0
        if self._state.ruleMemo is not None:
            self._state.ruleMemo = {}


    def match(self, input, ttype, follow):
        """
        Match current input symbol against ttype.  Attempt
        single token insertion or deletion error recovery.  If
        that fails, throw MismatchedTokenException.

        To turn off single token insertion or deletion error
        recovery, override recoverFromMismatchedToken() and have it
        throw an exception.  See TreeParser.recoverFromMismatchedToken().
        This way any error in a rule will cause an exception and
        immediate exit from the rule.  The rule would recover by
        resynchronizing to the set of symbols that can follow the rule ref.
        """

        matchedSymbol = self.getCurrentInputSymbol(input)
        if self.input.LA(1) == ttype:
            self.input.consume()
            self._state.errorRecovery = False
            return matchedSymbol

        if self._state.backtracking > 0:
            # FIXME: need to return matchedSymbol here as well. damn!!
            raise BacktrackingFailed

        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
        return matchedSymbol


    def matchAny(self):
        """Match the wildcard: in a symbol"""

        self._state.errorRecovery = False
        self.input.consume()


    def mismatchIsUnwantedToken(self, input, ttype):
        return input.LA(2) == ttype


    def mismatchIsMissingToken(self, input, follow):
        if follow is None:
            # we have no information about the follow; we can only consume
            # a single token and hope for the best
            return False

        # compute what can follow this grammar element reference
        if EOR_TOKEN_TYPE in follow:
            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
            follow |= viableTokensFollowingThisRule

            if len(self._state.following) > 0:
                # remove EOR if we're not the start symbol
                follow -= {EOR_TOKEN_TYPE}

        # if the current token is consistent with what could come after set
        # then we know we're missing a token; error recovery is free to
        # "insert" the missing token
        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
            return True

        return False


    def reportError(self, e):
        """Report a recognition problem.

        This method sets errorRecovery to indicate the parser is recovering,
        not parsing.  Once in recovery mode, no errors are generated.
        To get out of recovery mode, the parser must successfully match
        a token (after a resync).  So it will go:

        1. error occurs
        2. enter recovery mode, report error
        3. consume until token found in resync set
        4. try to resume parsing
        5. next match() will reset errorRecovery mode

        If you override, make sure to update syntaxErrors if you care about
        that.

        """

        # if we've already reported an error and have not matched a token
        # yet successfully, don't report any errors.
        if self._state.errorRecovery:
            return

        self._state.syntaxErrors += 1  # don't count spurious
        self._state.errorRecovery = True

        self.displayRecognitionError(e)


    def displayRecognitionError(self, e):
        hdr = self.getErrorHeader(e)
        msg = self.getErrorMessage(e)
        self.emitErrorMessage(hdr + " " + msg)


    def getErrorMessage(self, e):
        """
        What error message should be generated for the various
        exception types?

        Not very object-oriented code, but I like having all error message
        generation within one method rather than spread among all of the
        exception classes.  This also makes it much easier for the exception
        handling because the exception classes do not have to have pointers
        back to this object to access utility routines and so on.  Also,
        changing the message for an exception type would be difficult because
        you would have to subclass the exception, but then somehow get ANTLR
        to make those kinds of exception objects instead of the default.
        This looks weird, but trust me--it makes the most sense in terms
        of flexibility.

        For grammar debugging, you will want to override this to add
        more information such as the stack frame with
        getRuleInvocationStack(e, this.getClass().getName()) and,
        for no viable alts, the decision description and state etc...

        Override this to change the message generated for one or more
        exception types.
        """

        if isinstance(e, UnwantedTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "extraneous input {} expecting {}".format(
                self.getTokenErrorDisplay(e.getUnexpectedToken()),
                tokenName
                )

        elif isinstance(e, MissingTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "missing {} at {}".format(
                tokenName, self.getTokenErrorDisplay(e.token)
                )

        elif isinstance(e, MismatchedTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched input {} expecting {}".format(
                self.getTokenErrorDisplay(e.token),
                tokenName
                )

        elif isinstance(e, MismatchedTreeNodeException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched tree node: {} expecting {}".format(
                e.node, tokenName)

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at input {}".format(
                self.getTokenErrorDisplay(e.token))

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at input {}".format(
                self.getTokenErrorDisplay(e.token))

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched input {} expecting set {!r}".format(
                self.getTokenErrorDisplay(e.token),
                e.expecting
                )

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched input {} expecting set {!r}".format(
                self.getTokenErrorDisplay(e.token),
                e.expecting
                )

        elif isinstance(e, FailedPredicateException):
            msg = "rule {} failed predicate: {{{}}}?".format(
                e.ruleName,
                e.predicateText
                )

        else:
            msg = str(e)

        return msg

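    # Illustrative sketch: to collect error messages instead of printing them
    # to stderr, a subclass only needs to override emitErrorMessage(), since
    # reportError() -> displayRecognitionError() -> emitErrorMessage() is the
    # whole reporting chain defined above.  'MyParser' stands for a
    # hypothetical generated parser class.
    #
    #   class CollectingParser(MyParser):
    #       def __init__(self, tokens):
    #           super().__init__(tokens)
    #           self.error_messages = []
    #
    #       def emitErrorMessage(self, msg):
    #           self.error_messages.append(msg)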


    def getNumberOfSyntaxErrors(self):
        """
        Get the number of recognition errors (lexer, parser, tree parser).
        Each recognizer tracks its own number, so parser and lexer each have
        a separate count.  Does not count the spurious errors found between
        an error and the next valid token match.

        See also reportError().
        """
        return self._state.syntaxErrors


    def getErrorHeader(self, e):
        """
        What is the error header, normally line/character position information?
        """

        source_name = self.getSourceName()
        if source_name is not None:
            return "{} line {}:{}".format(source_name, e.line, e.charPositionInLine)
        return "line {}:{}".format(e.line, e.charPositionInLine)


    def getTokenErrorDisplay(self, t):
        """
        How should a token be displayed in an error message?  The default
        is to display just the text, but during development you might
        want to have a lot of information spit out.  Override in that case
        to use t.toString() (which, for CommonToken, dumps everything about
        the token).  This is better than forcing you to override a method in
        your token objects because you don't have to go modify your lexer
        so that it creates a new token class.
        """

        s = t.text
        if s is None:
            if t.type == EOF:
                s = "<EOF>"
            else:
                s = "<{}>".format(t.typeName)

        return repr(s)


    def emitErrorMessage(self, msg):
        """Override this method to change where error messages go"""
        sys.stderr.write(msg + '\n')


    def recover(self, input, re):
        """
        Recover from an error found on the input stream.  This is
        for NoViableAlt and mismatched symbol exceptions.  If you enable
        single token insertion and deletion, this will usually not
        handle mismatched symbol exceptions but there could be a mismatched
        token that the match() routine could not recover from.
        """

        # PROBLEM? what if input stream is not the same as last time
        # perhaps make lastErrorIndex a member of input
        if self._state.lastErrorIndex == input.index():
            # uh oh, another error at the same token index; must be a case
            # where LT(1) is in the recovery token set so nothing is
            # consumed; consume a single token, at least, to prevent
            # an infinite loop; this is a failsafe.
            input.consume()

        self._state.lastErrorIndex = input.index()
        followSet = self.computeErrorRecoverySet()

        self.beginResync()
        self.consumeUntil(input, followSet)
        self.endResync()


    def beginResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def endResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def computeErrorRecoverySet(self):
        """
        Compute the error recovery set for the current rule.  During
        rule invocation, the parser pushes the set of tokens that can
        follow that rule reference on the stack; this amounts to
        computing FIRST of what follows the rule reference in the
        enclosing rule.  This local follow set only includes tokens
        from within the rule; i.e., the FIRST computation done by
        ANTLR stops at the end of a rule.

        EXAMPLE

        When you find a "no viable alt exception", the input is not
        consistent with any of the alternatives for rule r.
        The best thing to do is to consume tokens until you see something
        that can legally follow a call to r *or* any rule that called r.
        You don't want the exact set of viable next tokens because the
        input might just be missing a token--you might consume the
        rest of the input looking for one of the missing tokens.

        Consider grammar:

          a : '[' b ']'
            | '(' b ')'
            ;
          b : c '^' INT ;
          c : ID
            | INT
            ;

        At each rule invocation, the set of tokens that could follow
        that rule is pushed on a stack.  Here are the various "local"
        follow sets:

          FOLLOW(b1_in_a) = FIRST(']') = ']'
          FOLLOW(b2_in_a) = FIRST(')') = ')'
          FOLLOW(c_in_b)  = FIRST('^') = '^'

        Upon erroneous input "[]", the call chain is

          a -> b -> c

        and, hence, the follow context stack is:

          depth   local follow set   after call to rule
            0          <EOF>         a (from main())
            1          ']'           b
            2          '^'           c

        Notice that ')' is not included, because b would have to have
        been called from a different context in rule a for ')' to be
        included.

        For error recovery, we cannot consider FOLLOW(c)
        (context-sensitive or otherwise).  We need the combined set of
        all context-sensitive FOLLOW sets--the set of all tokens that
        could follow any reference in the call chain.  We need to
        resync to one of those tokens.  Note that FOLLOW(c)='^' and if
        we resync'd to that token, we'd consume until EOF.  We need to
        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
        In this case, for input "[]", LA(1) is in this set so we would
        not consume anything, and after printing an error, rule c would
        return normally.  It would not find the required '^' though.
        At this point, it gets a mismatched token error and throws an
        exception (since LA(1) is not in the viable following token
        set).  The rule exception handler tries to recover, but finds
        the same recovery set and doesn't consume anything.  Rule b
        exits normally returning to rule a.  Now it finds the ']' (and
        with the successful match exits errorRecovery mode).

        So, you can see that the parser walks up the call chain looking
        for the token that was a member of the recovery set.

        Errors are not generated in errorRecovery mode.

        ANTLR's error recovery mechanism is based upon original ideas:

        "Algorithms + Data Structures = Programs" by Niklaus Wirth

        and

        "A note on error recovery in recursive descent parsers":
        http://portal.acm.org/citation.cfm?id=947902.947905

        Later, Josef Grosch had some good ideas:

        "Efficient and Comfortable Error Recovery in Recursive Descent
        Parsers":
        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip

        Like Grosch I implemented local FOLLOW sets that are combined
        at run-time upon error to avoid overhead during parsing.
        """

        return self.combineFollows(False)


    def computeContextSensitiveRuleFOLLOW(self):
        """
        Compute the context-sensitive FOLLOW set for the current rule.
        This is the set of token types that can follow a specific rule
        reference given a specific call chain.  You get the set of
        viable tokens that can possibly come next (lookahead depth 1)
        given the current call chain.  Contrast this with the
        definition of plain FOLLOW for rule r:

          FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}

        where x in T* and alpha, beta in V*; T is the set of terminals and
        V is the set of terminals and nonterminals.
        In other words, FOLLOW(r) is the set of all tokens that can
        possibly follow references to r in *any* sentential form
        (context).  At runtime, however, we know precisely which
        context applies as we have the call chain.  We may compute
        the exact (rather than a covering superset) set of following
        tokens.

        For example, consider grammar:

          stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
               | "return" expr '.'
               ;
          expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
          atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
               | '(' expr ')'
               ;

        The FOLLOW sets are all inclusive, whereas context-sensitive
        FOLLOW sets are precisely what could follow a rule reference.
        For input "i=(3);", here is the derivation:

          stat => ID '=' expr ';'
               => ID '=' atom ('+' atom)* ';'
               => ID '=' '(' expr ')' ('+' atom)* ';'
               => ID '=' '(' atom ')' ('+' atom)* ';'
               => ID '=' '(' INT ')' ('+' atom)* ';'
               => ID '=' '(' INT ')' ';'

        At the "3" token, you'd have a call chain of

          stat -> expr -> atom -> expr -> atom

        What can follow that specific nested ref to atom?  Exactly ')'
        as you can see by looking at the derivation of this specific
        input.  Contrast this with FOLLOW(atom)={'+',')',';','.'}.

        You want the exact viable token set when recovering from a
        token mismatch.  Upon token mismatch, if LA(1) is a member of
        the viable next token set, then you know there is most likely
        a missing token in the input stream.  "Insert" one by just not
        throwing an exception.
        """

        return self.combineFollows(True)


    def combineFollows(self, exact):
        followSet = set()
        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
            followSet |= localFollowSet
            if exact:
                # can we see the end of the rule?
                if EOR_TOKEN_TYPE in localFollowSet:
                    # Only leave EOR in the set if at top (start rule); this
                    # lets us know if we have to include follow(start rule),
                    # i.e. EOF
                    if idx > 0:
                        followSet.remove(EOR_TOKEN_TYPE)

                else:
                    # can't see the end of the rule, quit
                    break

        return followSet

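    # Tiny illustration of combineFollows() above, assuming the follow stack
    # in self._state.following looked like this (RBRACK and CARET are made-up
    # token types):
    #
    #   self._state.following = [{EOR_TOKEN_TYPE}, {RBRACK}, {CARET}]
    #
    # combineFollows(exact=False) unions the whole stack and returns
    # {EOR_TOKEN_TYPE, RBRACK, CARET}, while combineFollows(exact=True) stops
    # at the first local set that cannot see the end of the rule, returning
    # just {CARET}.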


    def recoverFromMismatchedToken(self, input, ttype, follow):
        """Attempt to recover from a single missing or extra token.

        EXTRA TOKEN

        LA(1) is not what we are looking for.  If LA(2) has the right token,
        however, then assume LA(1) is some extra spurious token.  Delete it
        and LA(2) as if we were doing a normal match(), which advances the
        input.

        MISSING TOKEN

        If the current token is consistent with what could come after
        ttype then it is ok to 'insert' the missing token, else throw an
        exception.  For example, input 'i=(3;' is clearly missing the
        ')'.  When the parser returns from the nested call to expr, it
        will have a call chain:

          stat -> expr -> atom

        and it will be trying to match the ')' at this point in the
        derivation:

          => ID '=' '(' INT ')' ('+' atom)* ';'
                             ^
        match() will see that ';' doesn't match ')' and report a
        mismatched token error.  To recover, it sees that LA(1)==';'
        is in the set of tokens that can follow the ')' token
        reference in rule atom.  It can assume that you forgot the ')'.
        """

        e = None

        # if the next token is what we are looking for then "delete" this token
        if self.mismatchIsUnwantedToken(input, ttype):
            e = UnwantedTokenException(ttype, input)

            self.beginResync()
            input.consume()  # simply delete extra token
            self.endResync()

            # report after consuming so AW sees the token in the exception
            self.reportError(e)

            # we want to return the token we're actually matching
            matchedSymbol = self.getCurrentInputSymbol(input)

            # move past ttype token as if all were ok
            input.consume()
            return matchedSymbol

        # can't recover with single token deletion, try insertion
        if self.mismatchIsMissingToken(input, follow):
            inserted = self.getMissingSymbol(input, e, ttype, follow)
            e = MissingTokenException(ttype, input, inserted)

            # report after inserting so AW sees the token in the exception
            self.reportError(e)
            return inserted

        # even that didn't work; must throw the exception
        e = MismatchedTokenException(ttype, input)
        raise e


    def recoverFromMismatchedSet(self, input, e, follow):
        """Not currently used"""

        if self.mismatchIsMissingToken(input, follow):
            self.reportError(e)
            # we don't know how to conjure up a token for sets yet
            return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)

        # TODO do single token deletion like above for Token mismatch
        raise e


    def getCurrentInputSymbol(self, input):
        """
        Match needs to return the current input symbol, which gets put
        into the label for the associated token ref; e.g., x=ID.  Token
        and tree parsers need to return different objects.  Rather than test
        for input stream type or change the IntStream interface, I use
        a simple method to ask the recognizer to tell me what the current
        input symbol is.

        This is ignored for lexers.
        """

        return None


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a missing token during error recovery.

        The recognizer attempts to recover from single missing
        symbols.  But, actions might refer to that missing symbol.
        For example, x=ID {f($x);}.  The action clearly assumes
        that there has been an identifier matched previously and that
        $x points at that token.  If that token is missing, but
        the next token in the stream is what we want, we assume that
        this token is missing and we keep going.  Because we
        have to return some token to replace the missing token,
        we have to conjure one up.  This method gives the user control
        over the tokens returned for missing tokens.  Mostly,
        you will want to create something special for identifier
        tokens.  For literals such as '{' and ',', the default
        action in the parser or tree parser works.  It simply creates
        a CommonToken of the appropriate type.  The text will be the token.
        If you change what tokens must be created by the lexer,
        override this method to create the appropriate tokens.
        """

        return None


    def consumeUntil(self, input, tokenTypes):
        """
        Consume tokens until one matches the given token or token set.

        tokenTypes can be a single token type or a set of token types.

        """

        if not isinstance(tokenTypes, (set, frozenset)):
            tokenTypes = frozenset([tokenTypes])

        ttype = input.LA(1)
        while ttype != EOF and ttype not in tokenTypes:
            input.consume()
            ttype = input.LA(1)


    def getRuleInvocationStack(self):
        """
        Return a List<String> of the rules in your parser instance
        leading up to a call to this method.  You could override if
        you want more details, such as the file/line info of where
        in the parser code a rule is invoked.

        This is very useful for error messages and for context-sensitive
        error recovery.

        You must be careful if you subclass a generated recognizer.
        The default implementation will only search the module of self
        for rules, but the subclass will not contain any rules.
        You probably want to override this method to look like

          def getRuleInvocationStack(self):
              return self._getRuleInvocationStack(<class>.__module__)

        where <class> is the class of the generated recognizer, e.g.
        the superclass of self.
        """

        return self._getRuleInvocationStack(self.__module__)


    @classmethod
    def _getRuleInvocationStack(cls, module):
        """
        A more general version of getRuleInvocationStack where you can
        pass in, for example, a RecognitionException to get its rule
        stack trace.  This routine is shared with all recognizers, hence,
        static.

        TODO: move to a utility class or something; weird having lexer call
        this
        """

        # mmmhhh,... perhaps look at the first argument
        # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
        # the requested recognizer...

        rules = []
        for frame in reversed(inspect.stack()):
            code = frame[0].f_code
            codeMod = inspect.getmodule(code)
            if codeMod is None:
                continue

            # skip frames not in requested module
            if codeMod.__name__ != module:
                continue

            # skip some unwanted names
            if code.co_name in ('nextToken', '<module>'):
                continue

            rules.append(code.co_name)

        return rules


    def getBacktrackingLevel(self):
        return self._state.backtracking

    def setBacktrackingLevel(self, n):
        self._state.backtracking = n


    def getGrammarFileName(self):
        """For debugging and other purposes, might want the grammar name.

        Have ANTLR generate an implementation for this method.
        """

        return self.grammarFileName


    def getSourceName(self):
        raise NotImplementedError


    def toStrings(self, tokens):
        """A convenience method for use most often with template rewrites.

        Convert a Token list to a str list.
        """

        if tokens is None:
            return None

        return [token.text for token in tokens]

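    # Illustrative sketch of how the memoization hooks below are meant to be
    # used by generated rule methods (heavily simplified; rule index 7 and
    # the rule body are placeholders):
    #
    #   def expr(self):
    #       expr_StartIndex = self.input.index()
    #       success = False
    #       try:
    #           if self._state.backtracking > 0 and self.alreadyParsedRule(self.input, 7):
    #               return          # already tried (and memoized) at this position
    #           # ... match the rule body here ...
    #           success = True
    #       finally:
    #           if self._state.backtracking > 0:
    #               self.memoize(self.input, 7, expr_StartIndex, success)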


    def getRuleMemoization(self, ruleIndex, ruleStartIndex):
        """
        Given a rule number and a start token index number, return
        MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
        the start index.  If this rule has parsed input starting from the
        start index before, then return where the rule stopped parsing.
        It returns the index of the last token matched by the rule.
        """

        if ruleIndex not in self._state.ruleMemo:
            self._state.ruleMemo[ruleIndex] = {}

        return self._state.ruleMemo[ruleIndex].get(
            ruleStartIndex, self.MEMO_RULE_UNKNOWN
            )


    def alreadyParsedRule(self, input, ruleIndex):
        """
        Has this rule already parsed input at the current index in the
        input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
        If we attempted but failed to parse properly before, return
        MEMO_RULE_FAILED.

        This method has a side-effect: if we have seen this input for
        this rule and successfully parsed before, then seek ahead to
        1 past the stop token matched for this rule last time.
        """

        stopIndex = self.getRuleMemoization(ruleIndex, input.index())
        if stopIndex == self.MEMO_RULE_UNKNOWN:
            return False

        if stopIndex == self.MEMO_RULE_FAILED:
            raise BacktrackingFailed

        else:
            input.seek(stopIndex + 1)

        return True


    def memoize(self, input, ruleIndex, ruleStartIndex, success):
        """
        Record whether or not this rule parsed the input at this position
        successfully.
        """

        if success:
            stopTokenIndex = input.index() - 1
        else:
            stopTokenIndex = self.MEMO_RULE_FAILED

        if ruleIndex in self._state.ruleMemo:
            self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex


    def traceIn(self, ruleName, ruleIndex, inputSymbol):
        sys.stdout.write("enter {} {}".format(ruleName, inputSymbol))

        if self._state.backtracking > 0:
            sys.stdout.write(" backtracking={}".format(self._state.backtracking))

        sys.stdout.write('\n')


    def traceOut(self, ruleName, ruleIndex, inputSymbol):
        sys.stdout.write("exit {} {}".format(ruleName, inputSymbol))

        if self._state.backtracking > 0:
            sys.stdout.write(" backtracking={}".format(self._state.backtracking))

        # mmmm... we use BacktrackingFailed exceptions now.  So how could we
        # get that information here?
        #if self._state.failed:
        #    sys.stdout.write(" failed")
        #else:
        #    sys.stdout.write(" succeeded")

        sys.stdout.write('\n')


class TokenSource(object):
    """
    @brief Abstract baseclass for token producers.

    A source of tokens must provide a sequence of tokens via nextToken()
    and also must reveal its source of characters; CommonToken's text is
    computed from a CharStream; it only stores indices into the char stream.

    Errors from the lexer are never passed to the parser.  Either you want
    to keep going or you do not upon token recognition error.  If you do not
    want to continue lexing then you do not want to continue parsing.  Just
    throw an exception not derived from RecognitionException and Python will
    naturally toss you all the way out of the recognizers.  If you want to
    continue lexing then you should not throw an exception to the parser--it
    has already requested a token.  Keep lexing until you get a valid one.
    Just report errors and keep going, looking for a valid token.
    """

    def nextToken(self):
        """Return a Token object from your input stream (usually a CharStream).

        Do not fail/return upon lexing error; keep chewing on the characters
        until you get a good one; errors are not passed through to the parser.
        """

        raise NotImplementedError


    def __iter__(self):
        """The TokenSource is an iterator.

        The iteration will not include the final EOF token, see also the note
        for the __next__() method.

        """

        return self


    def __next__(self):
        """Return the next token or raise StopIteration.

        Note that this will raise StopIteration when hitting the EOF token,
        so EOF will not be part of the iteration.

        """

        token = self.nextToken()
        if token is None or token.type == EOF:
            raise StopIteration
        return token

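# Illustrative sketch (not part of the runtime API): a minimal hand-written
# TokenSource that produces tokens from a list and is consumed through the
# iterator protocol defined above.  The token types used here are made up.
#
#   class ListTokenSource(TokenSource):
#       def __init__(self, tokens):
#           self._tokens = iter(tokens)
#
#       def nextToken(self):
#           # fall back to an EOF token once the list is exhausted
#           return next(self._tokens, CommonToken(type=EOF))
#
#   source = ListTokenSource([CommonToken(type=4, text='a'),
#                             CommonToken(type=5, text='b')])
#   for token in source:   # iteration stops before the EOF token
#       print(token.text)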


class Lexer(BaseRecognizer, TokenSource):
    """
    @brief Baseclass for generated lexer classes.

    A lexer is a recognizer that draws input symbols from a character stream.
    Lexer grammars result in a subclass of this object.  A Lexer object
    uses simplified match() and error recovery mechanisms in the interest
    of speed.
    """

    def __init__(self, input, state=None):
        BaseRecognizer.__init__(self, state)
        TokenSource.__init__(self)

        # Where is the lexer drawing characters from?
        self.input = input


    def reset(self):
        super().reset()  # reset all recognizer state variables

        if self.input is not None:
            # rewind the input
            self.input.seek(0)

        if self._state is None:
            # no shared state work to do
            return

        # whack Lexer state variables
        self._state.token = None
        self._state.type = INVALID_TOKEN_TYPE
        self._state.channel = DEFAULT_CHANNEL
        self._state.tokenStartCharIndex = -1
        self._state.tokenStartLine = -1
        self._state.tokenStartCharPositionInLine = -1
        self._state.text = None


    def makeEOFToken(self):
        eof = CommonToken(
            type=EOF, channel=DEFAULT_CHANNEL,
            input=self.input,
            start=self.input.index(), stop=self.input.index())
        eof.line = self.input.line
        eof.charPositionInLine = self.input.charPositionInLine
        return eof

    def nextToken(self):
        """
        Return a token from this source; i.e., match a token on the char
        stream.
        """

        while 1:
            self._state.token = None
            self._state.channel = DEFAULT_CHANNEL
            self._state.tokenStartCharIndex = self.input.index()
            self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
            self._state.tokenStartLine = self.input.line
            self._state.text = None
            if self.input.LA(1) == EOF:
                return self.makeEOFToken()

            try:
                self.mTokens()

                if self._state.token is None:
                    self.emit()

                elif self._state.token == SKIP_TOKEN:
                    continue

                return self._state.token

            except NoViableAltException as re:
                self.reportError(re)
                self.recover(re)  # throw out current char and try again

            except RecognitionException as re:
                self.reportError(re)
                # match() routine has already called recover()


    def skip(self):
        """
        Instruct the lexer to skip creating a token for the current lexer
        rule and look for another token.  nextToken() knows to keep looking
        when a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
        if token is None at the end of any token rule, it creates one for you
        and emits it.
        """

        self._state.token = SKIP_TOKEN


    def mTokens(self):
        """This is the lexer entry point that sets instance var 'token'"""

        # abstract method
        raise NotImplementedError


    def setCharStream(self, input):
        """Set the char stream and reset the lexer"""
        self.input = None
        self.reset()
        self.input = input


    def getSourceName(self):
        return self.input.getSourceName()


    def emit(self, token=None):
        """
        The standard method called to automatically emit a token at the
        outermost lexical rule.  The token object should point into the
        char buffer start..stop.  If there is a text override in 'text',
        use that to set the token's text.  Override this method to emit
        custom Token objects.

        If you are building trees, then you should also override
        Parser or TreeParser.getMissingSymbol().
        """

        if token is None:
            token = CommonToken(
                input=self.input,
                type=self._state.type,
                channel=self._state.channel,
                start=self._state.tokenStartCharIndex,
                stop=self.getCharIndex() - 1
                )
            token.line = self._state.tokenStartLine
            token.text = self._state.text
            token.charPositionInLine = self._state.tokenStartCharPositionInLine

        self._state.token = token

        return token


    def match(self, s):
        if isinstance(s, str):
            for c in s:
                if self.input.LA(1) != ord(c):
                    if self._state.backtracking > 0:
                        raise BacktrackingFailed

                    mte = MismatchedTokenException(c, self.input)
                    self.recover(mte)
                    raise mte

                self.input.consume()

        else:
            if self.input.LA(1) != s:
                if self._state.backtracking > 0:
                    raise BacktrackingFailed

                mte = MismatchedTokenException(chr(s), self.input)
                self.recover(mte)  # don't really recover; just consume in lexer
                raise mte

            self.input.consume()


    def matchAny(self):
        self.input.consume()


    def matchRange(self, a, b):
        if self.input.LA(1) < a or self.input.LA(1) > b:
            if self._state.backtracking > 0:
                raise BacktrackingFailed

            mre = MismatchedRangeException(chr(a), chr(b), self.input)
            self.recover(mre)
            raise mre

        self.input.consume()


    def getLine(self):
        return self.input.line


    def getCharPositionInLine(self):
        return self.input.charPositionInLine


    def getCharIndex(self):
        """What is the index of the current character of lookahead?"""

        return self.input.index()


    def getText(self):
        """
        Return the text matched so far for the current token or any
        text override.
        """
        if self._state.text is not None:
            return self._state.text

        return self.input.substring(
            self._state.tokenStartCharIndex,
            self.getCharIndex() - 1
            )


    def setText(self, text):
        """
        Set the complete text of this token; it wipes any previous
        changes to the text.
        """
        self._state.text = text


    text = property(getText, setText)


    def reportError(self, e):
        ## TODO: not thought about recovery in lexer yet.

        ## # if we've already reported an error and have not matched a token
        ## # yet successfully, don't report any errors.
        ## if self.errorRecovery:
        ##     return
        ##
        ## self.errorRecovery = True

        self.displayRecognitionError(e)


    def getErrorMessage(self, e):
        msg = None

        if isinstance(e, MismatchedTokenException):
            msg = "mismatched character {} expecting {}".format(
                self.getCharErrorDisplay(e.c),
                self.getCharErrorDisplay(e.expecting))

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at character {}".format(
                self.getCharErrorDisplay(e.c))

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at character {}".format(
                self.getCharErrorDisplay(e.c))

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched character {} expecting set {!r}".format(
                self.getCharErrorDisplay(e.c),
                e.expecting)

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched character {} expecting set {!r}".format(
                self.getCharErrorDisplay(e.c),
                e.expecting)

        elif isinstance(e, MismatchedRangeException):
            msg = "mismatched character {} expecting set {}..{}".format(
                self.getCharErrorDisplay(e.c),
                self.getCharErrorDisplay(e.a),
                self.getCharErrorDisplay(e.b))

        else:
            msg = super().getErrorMessage(e)

        return msg


    def getCharErrorDisplay(self, c):
        if c == EOF:
            c = '<EOF>'
        return repr(c)


    def recover(self, re):
        """
        Lexers can normally match any char in their vocabulary after matching
        a token, so do the easy thing and just kill a character and hope
        it all works out.  You can instead use the rule invocation stack
        to do sophisticated error recovery if you are in a fragment rule.
        """

        self.input.consume()


    def traceIn(self, ruleName, ruleIndex):
        inputSymbol = "{} line={}:{}".format(self.input.LT(1),
                                             self.getLine(),
                                             self.getCharPositionInLine()
                                             )

        super().traceIn(ruleName, ruleIndex, inputSymbol)


    def traceOut(self, ruleName, ruleIndex):
        inputSymbol = "{} line={}:{}".format(self.input.LT(1),
                                             self.getLine(),
                                             self.getCharPositionInLine()
                                             )

        super().traceOut(ruleName, ruleIndex, inputSymbol)

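# Illustrative sketch (hypothetical, not generated by ANTLR): a tiny
# hand-written Lexer subclass whose mTokens() matches single lowercase
# letters as token type 4 and silently skips blanks, showing how mTokens(),
# skip() and the automatic emit() in nextToken() fit together.
#
#   class LetterLexer(Lexer):
#       def mTokens(self):
#           if self.input.LA(1) == ord(' '):
#               self.input.consume()
#               self.skip()            # no token for this rule
#           else:
#               self.matchRange(ord('a'), ord('z'))
#               self._state.type = 4   # made-up token type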


class Parser(BaseRecognizer):
    """
    @brief Baseclass for generated parser classes.
    """

    def __init__(self, lexer, state=None):
        super().__init__(state)

        self.input = lexer


    def reset(self):
        super().reset()  # reset all recognizer state variables
        if self.input is not None:
            self.input.seek(0)  # rewind the input


    def getCurrentInputSymbol(self, input):
        return input.LT(1)


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        if expectedTokenType == EOF:
            tokenText = "<missing EOF>"
        else:
            tokenText = "<missing {}>".format(self.tokenNames[expectedTokenType])
        t = CommonToken(type=expectedTokenType, text=tokenText)
        current = input.LT(1)
        if current.type == EOF:
            current = input.LT(-1)

        if current is not None:
            t.line = current.line
            t.charPositionInLine = current.charPositionInLine
        t.channel = DEFAULT_CHANNEL
        return t


    def setTokenStream(self, input):
        """Set the token stream and reset the parser"""

        self.input = None
        self.reset()
        self.input = input


    def getTokenStream(self):
        return self.input


    def getSourceName(self):
        return self.input.getSourceName()


    def traceIn(self, ruleName, ruleIndex):
        super().traceIn(ruleName, ruleIndex, self.input.LT(1))


    def traceOut(self, ruleName, ruleIndex):
        super().traceOut(ruleName, ruleIndex, self.input.LT(1))


class RuleReturnScope(object):
    """
    Rules can return start/stop info as well as possible trees and templates.
    """

    def getStart(self):
        """Return the start token or tree."""
        return None


    def getStop(self):
        """Return the stop token or tree."""
        return None


    def getTree(self):
        """Has a value potentially if output=AST."""
        return None


    def getTemplate(self):
        """Has a value potentially if output=template."""
        return None


class ParserRuleReturnScope(RuleReturnScope):
    """
    Rules that return more than a single value must return an object
    containing all the values.  Besides the properties defined in
    RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
    return values.  This class simply defines the minimum properties that
    are always defined and methods to access the others that might be
    available depending on output option such as template and tree.

    Note text is not an actual property of the return value, it is computed
    from start and stop using the input stream's toString() method.  I
    could add a ctor to this so that we can pass in and store the input
    stream, but I'm not sure we want to do that.  It would seem to be
    undefined to get the .text property anyway if the rule matches tokens
    from multiple input streams.

    I do not use getters for fields of objects that are used simply to
    group values such as this aggregate.  The getters/setters are there to
    satisfy the superclass interface.
    """

    def __init__(self):
        super().__init__()
        self.start = None
        self.stop = None
        self.tree = None  # only used when output=AST


    def getStart(self):
        return self.start


    def getStop(self):
        return self.stop


    def getTree(self):
        return self.tree
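

# Illustrative sketch: generated parser rule methods typically build their
# return value on a ParserRuleReturnScope subclass, roughly like this
# (simplified; the rule body is a placeholder):
#
#   def rule(self):
#       retval = ParserRuleReturnScope()
#       retval.start = self.input.LT(1)
#       # ... match the rule body, build retval.tree when output=AST ...
#       retval.stop = self.input.LT(-1)
#       return retval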