"""ANTLR3 runtime package"""

# begin[licence]
#
# [The "BSD licence"]
# Copyright (c) 2005-2012 Terence Parr
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. The name of the author may not be used to endorse or promote products
#    derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
# NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# end[licence]

import sys
import inspect

from .constants import compatible_api_versions, DEFAULT_CHANNEL, \
     HIDDEN_CHANNEL, EOF, EOR_TOKEN_TYPE, INVALID_TOKEN_TYPE
from .exceptions import RecognitionException, MismatchedTokenException, \
     MismatchedRangeException, MismatchedTreeNodeException, \
     NoViableAltException, EarlyExitException, MismatchedSetException, \
     MismatchedNotSetException, FailedPredicateException, \
     BacktrackingFailed, UnwantedTokenException, MissingTokenException
from .tokens import CommonToken, SKIP_TOKEN


class RecognizerSharedState(object):
    """
    The set of fields needed by an abstract recognizer to recognize input
    and recover from errors, etc.  As a separate state object, it can be
    shared among multiple grammars; e.g., when one grammar imports another.

    These fields are publicly visible, but the actual state pointer per
    parser is protected.
    """

    def __init__(self):
        # Track the set of token types that can follow any rule invocation.
        # Stack grows upwards.
        self.following = []

        # This is true when we see an error and before having successfully
        # matched a token.  Prevents generation of more than one error message
        # per error.
        self.errorRecovery = False

        # The index into the input stream where the last error occurred.
        # This is used to prevent infinite loops where an error is found
        # but no token is consumed during recovery...another error is found,
        # ad nauseam.  This is a failsafe mechanism to guarantee that at least
        # one token/tree node is consumed for two errors.
        self.lastErrorIndex = -1

        # If 0, no backtracking is going on.  Safe to exec actions etc...
        # If >0 then it's the level of backtracking.
        self.backtracking = 0

        # A dict mapping rule index to an (int -> int) dict that tracks
        # the stop token index for that rule.  ruleMemo[ruleIndex] is
        # the memoization table for ruleIndex.  For key ruleStartIndex, you
        # get back the stop token index for the associated rule or
        # MEMO_RULE_FAILED.
        #
        # This is only used if rule memoization is on (which it is by default).
        self.ruleMemo = None

        ## Did the recognizer encounter a syntax error?  Track how many.
        self.syntaxErrors = 0


        # LEXER FIELDS (must be in same state object to avoid casting
        # constantly in generated code and Lexer object) :(


        ## The goal of all lexer rules/methods is to create a token object.
        # This is an instance variable as multiple rules may collaborate to
        # create a single token.  nextToken will return this object after
        # matching lexer rule(s).  If you subclass to allow multiple token
        # emissions, then set this to the last token to be matched or
        # something non-None so that the auto token emit mechanism will not
        # emit another token.
        self.token = None

        ## What character index in the stream did the current token start at?
        # Needed, for example, to get the text for the current token.  Set at
        # the start of nextToken.
        self.tokenStartCharIndex = -1

        ## The line on which the first character of the token resides
        self.tokenStartLine = None

        ## The character position of the first character within the line
        self.tokenStartCharPositionInLine = None

        ## The channel number for the current token
        self.channel = None

        ## The token type for the current token
        self.type = None

        ## You can set the text for the current token to override what is in
        # the input char buffer.  Use setText() or set this instance variable
        # directly.
        self.text = None

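# A minimal usage sketch (not part of the runtime API): ``MyParser`` stands
# for a hypothetical recognizer generated by ANTLR.  Passing an explicit
# RecognizerSharedState lets composed recognizers (e.g. an importing grammar
# and its delegates) share error-recovery and backtracking state:
#
#   shared = RecognizerSharedState()
#   parser = MyParser(token_stream, state=shared)
#   # a delegate recognizer constructed by the generated code would receive
#   # the same ``shared`` object and therefore the same error counters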

class BaseRecognizer(object):
    """
    @brief Common recognizer functionality.

    A generic recognizer that can handle recognizers generated from
    lexer, parser, and tree grammars.  This is essentially all of the
    parsing support code; most of it is error recovery and backtracking
    machinery.
    """

    MEMO_RULE_FAILED = -2
    MEMO_RULE_UNKNOWN = -1

    # copies from Token object for convenience in actions
    DEFAULT_TOKEN_CHANNEL = DEFAULT_CHANNEL

    # for convenience in actions
    HIDDEN = HIDDEN_CHANNEL

    # overridden by generated subclasses
    grammarFileName = None
    tokenNames = None

    # The api_version attribute was introduced in 3.3. If it is not
    # overridden in the generated recognizer, we assume a default of v0.
    api_version = 0

    def __init__(self, state=None):
        # Input stream of the recognizer. Must be initialized by a subclass.
        self.input = None

        ## The state of a lexer, parser, or tree parser is collected into a
        # state object so the state can be shared.  This sharing is needed to
        # have one grammar import others and share the same error variables
        # and other state variables.  It's a kind of explicit multiple
        # inheritance via delegation of methods and shared state.
        if state is None:
            state = RecognizerSharedState()
        self._state = state

        if self.api_version not in compatible_api_versions:
            raise RuntimeError(
                "ANTLR version mismatch: "
                "The recognizer has been generated with API V{}, "
                "but this runtime does not support that version."
                .format(self.api_version))

    # this one only exists to shut up pylint :(
    def setInput(self, input):
        self.input = input


    def reset(self):
        """
        Reset the parser's state; subclasses must rewind the input stream.
        """

        # whack everything related to error recovery
        if self._state is None:
            # no shared state work to do
            return

        self._state.following = []
        self._state.errorRecovery = False
        self._state.lastErrorIndex = -1
        self._state.syntaxErrors = 0
        # whack everything related to backtracking and memoization
        self._state.backtracking = 0
        if self._state.ruleMemo is not None:
            self._state.ruleMemo = {}


    def match(self, input, ttype, follow):
        """
        Match the current input symbol against ttype.  Attempt
        single token insertion or deletion error recovery.  If
        that fails, throw MismatchedTokenException.

        To turn off single token insertion or deletion error
        recovery, override recoverFromMismatchedToken() and have it
        throw an exception. See TreeParser.recoverFromMismatchedToken().
        This way any error in a rule will cause an exception and
        immediate exit from the rule.  The rule would recover by
        resynchronizing to the set of symbols that can follow the rule
        reference.
        """

        matchedSymbol = self.getCurrentInputSymbol(input)
        if self.input.LA(1) == ttype:
            self.input.consume()
            self._state.errorRecovery = False
            return matchedSymbol

        if self._state.backtracking > 0:
            # FIXME: need to return matchedSymbol here as well. damn!!
            raise BacktrackingFailed

        matchedSymbol = self.recoverFromMismatchedToken(input, ttype, follow)
        return matchedSymbol

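    # Sketch of how generated rule code typically drives match() (``ID`` and
    # ``FOLLOW_ID_in_decl`` stand in for a generated token-type constant and a
    # generated follow set; the names are illustrative only):
    #
    #   def decl(self):
    #       ...
    #       ID1 = self.match(self.input, ID, FOLLOW_ID_in_decl)
    #       # on a mismatch, match() first tries single-token
    #       # insertion/deletion and only raises MismatchedTokenException
    #       # if that recovery fails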

    def matchAny(self):
        """Match the wildcard (match any symbol)."""

        self._state.errorRecovery = False
        self.input.consume()


    def mismatchIsUnwantedToken(self, input, ttype):
        return input.LA(2) == ttype


    def mismatchIsMissingToken(self, input, follow):
        if follow is None:
            # we have no information about the follow; we can only consume
            # a single token and hope for the best
            return False

        # compute what can follow this grammar element reference
        if EOR_TOKEN_TYPE in follow:
            viableTokensFollowingThisRule = self.computeContextSensitiveRuleFOLLOW()
            follow |= viableTokensFollowingThisRule

            if len(self._state.following) > 0:
                # remove EOR if we're not the start symbol
                follow -= {EOR_TOKEN_TYPE}

        # if the current token is consistent with what could come after the
        # set, then we know we're missing a token; error recovery is free to
        # "insert" the missing token
        if input.LA(1) in follow or EOR_TOKEN_TYPE in follow:
            return True

        return False


    def reportError(self, e):
        """Report a recognition problem.

        This method sets errorRecovery to indicate the parser is recovering,
        not parsing.  Once in recovery mode, no errors are generated.
        To get out of recovery mode, the parser must successfully match
        a token (after a resync).  So it will go:

        1. error occurs
        2. enter recovery mode, report error
        3. consume until token found in resync set
        4. try to resume parsing
        5. next match() will reset errorRecovery mode

        If you override, make sure to update syntaxErrors if you care about
        that.

        """

        # if we've already reported an error and have not matched a token
        # yet successfully, don't report any errors.
        if self._state.errorRecovery:
            return

        self._state.syntaxErrors += 1 # don't count spurious
        self._state.errorRecovery = True

        self.displayRecognitionError(e)


    def displayRecognitionError(self, e):
        hdr = self.getErrorHeader(e)
        msg = self.getErrorMessage(e)
        self.emitErrorMessage(hdr + " " + msg)


    def getErrorMessage(self, e):
        """
        What error message should be generated for the various
        exception types?

        Not very object-oriented code, but I like having all error message
        generation within one method rather than spread among all of the
        exception classes. This also makes it much easier for the exception
        handling because the exception classes do not have to have pointers
        back to this object to access utility routines and so on. Also,
        changing the message for an exception type would be difficult because
        you would have to subclass the exception and then somehow get ANTLR
        to create those kinds of exception objects instead of the default
        ones.  This looks weird, but trust me--it makes the most sense in
        terms of flexibility.

        For grammar debugging, you will want to override this to add
        more information such as the stack frame with
        getRuleInvocationStack(e, this.getClass().getName()) and,
        for no viable alts, the decision description and state etc...

        Override this to change the message generated for one or more
        exception types.
        """

        if isinstance(e, UnwantedTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "extraneous input {} expecting {}".format(
                self.getTokenErrorDisplay(e.getUnexpectedToken()),
                tokenName
                )

        elif isinstance(e, MissingTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "missing {} at {}".format(
                tokenName, self.getTokenErrorDisplay(e.token)
                )

        elif isinstance(e, MismatchedTokenException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched input {} expecting {}".format(
                self.getTokenErrorDisplay(e.token),
                tokenName
                )

        elif isinstance(e, MismatchedTreeNodeException):
            if e.expecting == EOF:
                tokenName = "EOF"
            else:
                tokenName = self.tokenNames[e.expecting]

            msg = "mismatched tree node: {} expecting {}".format(
                e.node, tokenName)

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at input {}".format(
                self.getTokenErrorDisplay(e.token))

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at input {}".format(
                self.getTokenErrorDisplay(e.token))

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched input {} expecting set {!r}".format(
                self.getTokenErrorDisplay(e.token),
                e.expecting
                )

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched input {} expecting set {!r}".format(
                self.getTokenErrorDisplay(e.token),
                e.expecting
                )

        elif isinstance(e, FailedPredicateException):
            msg = "rule {} failed predicate: {{{}}}?".format(
                e.ruleName,
                e.predicateText
                )

        else:
            msg = str(e)

        return msg


    def getNumberOfSyntaxErrors(self):
        """
        Get the number of recognition errors (lexer, parser, tree parser).
        Each recognizer tracks its own count, so parser and lexer each have
        a separate number.  Does not count the spurious errors found between
        an error and the next valid token match.

        See also reportError().
        """
        return self._state.syntaxErrors

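    # Typical post-parse check (a sketch; ``prog`` is a hypothetical start
    # rule of a generated parser):
    #
    #   parser.prog()
    #   if parser.getNumberOfSyntaxErrors() > 0:
    #       raise SystemExit("input did not parse cleanly")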

    def getErrorHeader(self, e):
        """
        What is the error header, normally line/character position information?
        """

        source_name = self.getSourceName()
        if source_name is not None:
            return "{} line {}:{}".format(source_name, e.line, e.charPositionInLine)
        return "line {}:{}".format(e.line, e.charPositionInLine)


    def getTokenErrorDisplay(self, t):
        """
        How should a token be displayed in an error message? The default
        is to display just the text, but during development you might
        want to have a lot of information spit out.  Override in that case
        to use t.toString() (which, for CommonToken, dumps everything about
        the token). This is better than forcing you to override a method in
        your token objects because you don't have to go modify your lexer
        so that it creates a new token class.
        """

        s = t.text
        if s is None:
            if t.type == EOF:
                s = "<EOF>"
            else:
                s = "<{}>".format(t.typeName)

        return repr(s)


    def emitErrorMessage(self, msg):
        """Override this method to change where error messages go."""
        sys.stderr.write(msg + '\n')

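    # Example override (a sketch): route error messages somewhere other than
    # stderr, e.g. collect them on the recognizer.  ``MyParser`` is a
    # hypothetical generated subclass:
    #
    #   class CollectingParser(MyParser):
    #       def __init__(self, *args, **kwargs):
    #           super().__init__(*args, **kwargs)
    #           self.error_messages = []
    #
    #       def emitErrorMessage(self, msg):
    #           self.error_messages.append(msg)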

    def recover(self, input, re):
        """
        Recover from an error found on the input stream.  This is
        for NoViableAlt and mismatched symbol exceptions.  If you enable
        single token insertion and deletion, this will usually not
        handle mismatched symbol exceptions but there could be a mismatched
        token that the match() routine could not recover from.
        """

        # PROBLEM? what if input stream is not the same as last time
        # perhaps make lastErrorIndex a member of input
        if self._state.lastErrorIndex == input.index():
            # uh oh, another error at same token index; must be a case
            # where LT(1) is in the recovery token set so nothing is
            # consumed; consume a single token at least to prevent
            # an infinite loop; this is a failsafe.
            input.consume()

        self._state.lastErrorIndex = input.index()
        followSet = self.computeErrorRecoverySet()

        self.beginResync()
        self.consumeUntil(input, followSet)
        self.endResync()


    def beginResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def endResync(self):
        """
        A hook to listen in on the token consumption during error recovery.
        The DebugParser subclasses this to fire events to the listener.
        """

        pass


    def computeErrorRecoverySet(self):
        """
        Compute the error recovery set for the current rule.  During
        rule invocation, the parser pushes the set of tokens that can
        follow that rule reference on the stack; this amounts to
        computing FIRST of what follows the rule reference in the
        enclosing rule. This local follow set only includes tokens
        from within the rule; i.e., the FIRST computation done by
        ANTLR stops at the end of a rule.

        EXAMPLE

        When you find a "no viable alt exception", the input is not
        consistent with any of the alternatives for rule r.  The best
        thing to do is to consume tokens until you see something that
        can legally follow a call to r *or* any rule that called r.
        You don't want the exact set of viable next tokens because the
        input might just be missing a token--you might consume the
        rest of the input looking for one of the missing tokens.

        Consider grammar:

        a : '[' b ']'
          | '(' b ')'
          ;
        b : c '^' INT ;
        c : ID
          | INT
          ;

        At each rule invocation, the set of tokens that could follow
        that rule is pushed on a stack.  Here are the various "local"
        follow sets:

        FOLLOW(b1_in_a) = FIRST(']') = ']'
        FOLLOW(b2_in_a) = FIRST(')') = ')'
        FOLLOW(c_in_b) = FIRST('^') = '^'

        Upon erroneous input "[]", the call chain is

        a -> b -> c

        and, hence, the follow context stack is:

        depth  local follow set     after call to rule
          0         <EOF>                   a (from main())
          1          ']'                    b
          2          '^'                    c

        Notice that ')' is not included, because b would have to have
        been called from a different context in rule a for ')' to be
        included.

        For error recovery, we cannot consider FOLLOW(c)
        (context-sensitive or otherwise).  We need the combined set of
        all context-sensitive FOLLOW sets--the set of all tokens that
        could follow any reference in the call chain.  We need to
        resync to one of those tokens.  Note that FOLLOW(c)='^' and if
        we resync'd to that token, we'd consume until EOF.  We need to
        sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
        In this case, for input "[]", LA(1) is in this set so we would
        not consume anything, and after printing an error, rule c would
        return normally.  It would not find the required '^' though.
        At this point, it gets a mismatched token error and throws an
        exception (since LA(1) is not in the viable following token
        set).  The rule exception handler tries to recover, but finds
        the same recovery set and doesn't consume anything.  Rule b
        exits normally returning to rule a.  Now it finds the ']' (and
        with the successful match exits errorRecovery mode).

        So, you can see that the parser walks up the call chain looking
        for a token that is a member of the recovery set.

        Errors are not generated in errorRecovery mode.

        ANTLR's error recovery mechanism is based upon original ideas:

        "Algorithms + Data Structures = Programs" by Niklaus Wirth

        and

        "A note on error recovery in recursive descent parsers":
        http://portal.acm.org/citation.cfm?id=947902.947905

        Later, Josef Grosch had some good ideas:

        "Efficient and Comfortable Error Recovery in Recursive Descent
        Parsers":
        ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip

        Like Grosch I implemented local FOLLOW sets that are combined
        at run-time upon error to avoid overhead during parsing.
        """

        return self.combineFollows(False)


    def computeContextSensitiveRuleFOLLOW(self):
        """
        Compute the context-sensitive FOLLOW set for the current rule.
        This is the set of token types that can follow a specific rule
        reference given a specific call chain.  You get the set of
        viable tokens that can possibly come next (lookahead depth 1)
        given the current call chain.  Contrast this with the
        definition of plain FOLLOW for rule r:

         FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}

        where x is in T*, and alpha and beta are in V*; T is the set of
        terminals and V is the set of terminals and nonterminals.  In
        other words, FOLLOW(r) is the set of all tokens that can possibly
        follow references to r in *any* sentential form (context).  At
        runtime, however, we know precisely which context applies as
        we have the call chain.  We may compute the exact (rather
        than covering superset) set of following tokens.

        For example, consider grammar:

        stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
             | "return" expr '.'
             ;
        expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
        atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
             | '(' expr ')'
             ;

        The FOLLOW sets are all inclusive, whereas context-sensitive
        FOLLOW sets are precisely what could follow a rule reference.
        For input "i=(3);", here is the derivation:

        stat => ID '=' expr ';'
             => ID '=' atom ('+' atom)* ';'
             => ID '=' '(' expr ')' ('+' atom)* ';'
             => ID '=' '(' atom ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ('+' atom)* ';'
             => ID '=' '(' INT ')' ';'

        At the "3" token, you'd have a call chain of

          stat -> expr -> atom -> expr -> atom

        What can follow that specific nested ref to atom?  Exactly ')',
        as you can see by looking at the derivation of this specific
        input.  Contrast this with FOLLOW(atom)={'+',')',';','.'}.

        You want the exact viable token set when recovering from a
        token mismatch.  Upon token mismatch, if LA(1) is a member of
        the viable next token set, then you know there is most likely
        a missing token in the input stream.  "Insert" one by just not
        throwing an exception.
        """

        return self.combineFollows(True)


    def combineFollows(self, exact):
        followSet = set()
        for idx, localFollowSet in reversed(list(enumerate(self._state.following))):
            followSet |= localFollowSet
            if exact:
                # can we see the end of the rule?
                if EOR_TOKEN_TYPE in localFollowSet:
                    # Only leave EOR in the set if at top (start rule); this
                    # lets us know if we have to include FOLLOW(start rule),
                    # i.e. EOF
                    if idx > 0:
                        followSet.remove(EOR_TOKEN_TYPE)

                else:
                    # can't see the end of the rule, quit
                    break

        return followSet

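    # Worked illustration of combineFollows() for the a -> b -> c example in
    # the computeErrorRecoverySet() docstring (token-type names are
    # illustrative):
    #
    #   # following stack: [ {EOF}, {RBRACK}, {CARET} ]
    #   # combineFollows(False) -> {EOF, RBRACK, CARET}   (error recovery set)
    #   # combineFollows(True)  -> {CARET}                (exact follow; the
    #   #   loop stops at the first local set without EOR_TOKEN_TYPE)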

    def recoverFromMismatchedToken(self, input, ttype, follow):
        """Attempt to recover from a single missing or extra token.

        EXTRA TOKEN

        LA(1) is not what we are looking for.  If LA(2) has the right token,
        however, then assume LA(1) is some extra spurious token.  Delete it
        and consume LA(2) as if we were doing a normal match(), which
        advances the input.

        MISSING TOKEN

        If the current token is consistent with what could come after
        ttype, then it is ok to 'insert' the missing token, else throw an
        exception.  For example, input 'i=(3;' is clearly missing the
        ')'.  When the parser returns from the nested call to expr, it
        will have the call chain:

          stat -> expr -> atom

        and it will be trying to match the ')' at this point in the
        derivation:

             => ID '=' '(' INT ')' ('+' atom)* ';'
                                ^
        match() will see that ';' doesn't match ')' and report a
        mismatched token error.  To recover, it sees that LA(1)==';'
        is in the set of tokens that can follow the ')' token
        reference in rule atom.  It can assume that you forgot the ')'.
        """

        e = None

        # if the next token is what we are looking for then "delete" this token
        if self.mismatchIsUnwantedToken(input, ttype):
            e = UnwantedTokenException(ttype, input)

            self.beginResync()
            input.consume() # simply delete extra token
            self.endResync()

            # report after consuming so AW (ANTLRWorks) sees the token in the
            # exception
            self.reportError(e)

            # we want to return the token we're actually matching
            matchedSymbol = self.getCurrentInputSymbol(input)

            # move past ttype token as if all were ok
            input.consume()
            return matchedSymbol

        # can't recover with single token deletion, try insertion
        if self.mismatchIsMissingToken(input, follow):
            inserted = self.getMissingSymbol(input, e, ttype, follow)
            e = MissingTokenException(ttype, input, inserted)

            # report after inserting so AW sees the token in the exception
            self.reportError(e)
            return inserted

        # even that didn't work; must throw the exception
        e = MismatchedTokenException(ttype, input)
        raise e


    def recoverFromMismatchedSet(self, input, e, follow):
        """Not currently used"""

        if self.mismatchIsMissingToken(input, follow):
            self.reportError(e)
            # we don't know how to conjure up a token for sets yet
            return self.getMissingSymbol(input, e, INVALID_TOKEN_TYPE, follow)

        # TODO do single token deletion like above for Token mismatch
        raise e


    def getCurrentInputSymbol(self, input):
        """
        match() needs to return the current input symbol, which gets put
        into the label for the associated token ref; e.g., x=ID.  Token
        and tree parsers need to return different objects. Rather than test
        for input stream type or change the IntStream interface, I use
        a simple method to ask the recognizer to tell me what the current
        input symbol is.

        This is ignored for lexers.
        """

        return None


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        """Conjure up a missing token during error recovery.

        The recognizer attempts to recover from single missing
        symbols. But, actions might refer to that missing symbol.
        For example, x=ID {f($x);}. The action clearly assumes
        that there has been an identifier matched previously and that
        $x points at that token. If that token is missing, but
        the next token in the stream is what we want, we assume that
        this token is missing and we keep going. Because we
        have to return some token to replace the missing token,
        we have to conjure one up. This method gives the user control
        over the tokens returned for missing tokens. Mostly,
        you will want to create something special for identifier
        tokens. For literals such as '{' and ',', the default
        action in the parser or tree parser works. It simply creates
        a CommonToken of the appropriate type, whose text is derived
        from the expected token's name. If you change what tokens must
        be created by the lexer, override this method to create the
        appropriate tokens.
        """

        return None


    def consumeUntil(self, input, tokenTypes):
        """
        Consume tokens until one matches the given token or token set.

        tokenTypes can be a single token type or a set of token types.

        """

        if not isinstance(tokenTypes, (set, frozenset)):
            tokenTypes = frozenset([tokenTypes])

        ttype = input.LA(1)
        while ttype != EOF and ttype not in tokenTypes:
            input.consume()
            ttype = input.LA(1)

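    # consumeUntil() accepts either a single token type or a set of types,
    # e.g. as part of a hand-written recovery strategy (``SEMI`` and
    # ``RBRACE`` are illustrative generated token-type constants):
    #
    #   self.consumeUntil(self.input, SEMI)            # skip to the next ';'
    #   self.consumeUntil(self.input, {SEMI, RBRACE})  # ... or the next '}'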

    def getRuleInvocationStack(self):
        """
        Return a list of the rule names in your parser instance
        leading up to a call to this method.  You could override if
        you want more details, such as the file/line info of where
        in the generated parser code a rule is invoked.

        This is very useful for error messages and for context-sensitive
        error recovery.

        You must be careful if you subclass a generated recognizer.
        The default implementation will only search the module of self
        for rules, but the subclass will not contain any rules.
        You probably want to override this method to look like

        def getRuleInvocationStack(self):
            return self._getRuleInvocationStack(<class>.__module__)

        where <class> is the class of the generated recognizer, e.g.
        the superclass of self.
        """

        return self._getRuleInvocationStack(self.__module__)

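    # Concrete version of the override suggested in the docstring above
    # (``MyParser`` is a hypothetical generated recognizer):
    #
    #   class MyTweakedParser(MyParser):
    #       def getRuleInvocationStack(self):
    #           return self._getRuleInvocationStack(MyParser.__module__)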

    @classmethod
    def _getRuleInvocationStack(cls, module):
        """
        A more general version of getRuleInvocationStack where you can
        pass in the name of the module that contains the rule methods.
        This routine is shared by all recognizers; hence it is a
        classmethod.

        TODO: move to a utility class or something; weird having lexer call
        this
        """

        # mmmhhh,... perhaps look at the first argument
        # (f_locals[co_varnames[0]]?) and test if it's a (sub)class of
        # the requested recognizer...

        rules = []
        for frame in reversed(inspect.stack()):
            code = frame[0].f_code
            codeMod = inspect.getmodule(code)
            if codeMod is None:
                continue

            # skip frames not in requested module
            if codeMod.__name__ != module:
                continue

            # skip some unwanted names
            if code.co_name in ('nextToken', '<module>'):
                continue

            rules.append(code.co_name)

        return rules


    def getBacktrackingLevel(self):
        return self._state.backtracking

    def setBacktrackingLevel(self, n):
        self._state.backtracking = n


    def getGrammarFileName(self):
        """For debugging and other purposes, you might want the grammar name.

        Have ANTLR generate an implementation for this method.
        """

        return self.grammarFileName


    def getSourceName(self):
        raise NotImplementedError


    def toStrings(self, tokens):
        """A convenience method for use most often with template rewrites.

        Convert a Token list to a list of str.
        """

        if tokens is None:
            return None

        return [token.text for token in tokens]


    def getRuleMemoization(self, ruleIndex, ruleStartIndex):
        """
        Given a rule number and a start token index number, return
        MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
        the start index.  If this rule has parsed input starting from the
        start index before, then return where the rule stopped parsing.
        It returns the index of the last token matched by the rule.
        """

        if ruleIndex not in self._state.ruleMemo:
            self._state.ruleMemo[ruleIndex] = {}

        return self._state.ruleMemo[ruleIndex].get(
            ruleStartIndex, self.MEMO_RULE_UNKNOWN
            )


    def alreadyParsedRule(self, input, ruleIndex):
        """
        Has this rule already parsed input at the current index in the
        input stream?  Return True if so, False if the result is unknown.
        If we attempted but failed to parse properly before, raise
        BacktrackingFailed.

        This method has a side-effect: if we have seen this input for
        this rule and successfully parsed it before, then seek ahead to
        1 past the stop token matched for this rule last time.
        """

        stopIndex = self.getRuleMemoization(ruleIndex, input.index())
        if stopIndex == self.MEMO_RULE_UNKNOWN:
            return False

        if stopIndex == self.MEMO_RULE_FAILED:
            raise BacktrackingFailed

        else:
            input.seek(stopIndex + 1)

        return True


    def memoize(self, input, ruleIndex, ruleStartIndex, success):
        """
        Record whether or not this rule parsed the input at this position
        successfully.
        """

        if success:
            stopTokenIndex = input.index() - 1
        else:
            stopTokenIndex = self.MEMO_RULE_FAILED

        if ruleIndex in self._state.ruleMemo:
            self._state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex

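    # Sketch of how a generated rule method might use the memoization hooks
    # when the ``memoize`` option is on (rule index 42 and the rule body are
    # illustrative, not taken from real generated code):
    #
    #   def rule(self):
    #       if self._state.backtracking > 0 and self.alreadyParsedRule(self.input, 42):
    #           return
    #       rule_start = self.input.index()
    #       success = False
    #       try:
    #           ...  # match the rule body
    #           success = True
    #       finally:
    #           if self._state.backtracking > 0:
    #               self.memoize(self.input, 42, rule_start, success)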

    def traceIn(self, ruleName, ruleIndex, inputSymbol):
        sys.stdout.write("enter {} {}".format(ruleName, inputSymbol))

        if self._state.backtracking > 0:
            sys.stdout.write(" backtracking={}".format(self._state.backtracking))

        sys.stdout.write('\n')


    def traceOut(self, ruleName, ruleIndex, inputSymbol):
        sys.stdout.write("exit {} {}".format(ruleName, inputSymbol))

        if self._state.backtracking > 0:
            sys.stdout.write(" backtracking={}".format(self._state.backtracking))

        # mmmm... we use BacktrackingFailed exceptions now. So how could we
        # get that information here?
        #if self._state.failed:
        #    sys.stdout.write(" failed")
        #else:
        #    sys.stdout.write(" succeeded")

        sys.stdout.write('\n')


class TokenSource(object):
    """
    @brief Abstract baseclass for token producers.

    A source of tokens must provide a sequence of tokens via nextToken()
    and also must reveal its source of characters; CommonToken's text is
    computed from a CharStream; it only stores indices into the char stream.

    Errors from the lexer are never passed to the parser.  Either you want
    to keep going or you do not upon a token recognition error.  If you do
    not want to continue lexing, then you do not want to continue parsing.
    Just raise an exception that is not derived from RecognitionException
    and it will propagate all the way out of the recognizers.  If you want
    to continue lexing, then you should not throw an exception to the
    parser--it has already requested a token.  Keep lexing until you get a
    valid one.  Just report errors and keep going, looking for a valid
    token.
    """

    def nextToken(self):
        """Return a Token object from your input stream (usually a CharStream).

        Do not fail/return upon lexing error; keep chewing on the characters
        until you get a good one; errors are not passed through to the parser.
        """

        raise NotImplementedError


    def __iter__(self):
        """The TokenSource is an iterator.

        The iteration will not include the final EOF token, see also the note
        for the __next__() method.

        """

        return self


    def __next__(self):
        """Return the next token or raise StopIteration.

        Note that this will raise StopIteration when hitting the EOF token,
        so EOF will not be part of the iteration.

        """

        token = self.nextToken()
        if token is None or token.type == EOF:
            raise StopIteration
        return token

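# Because a TokenSource is iterable, a generated lexer (``MyLexer`` below is
# hypothetical) can be drained without calling nextToken() in a loop:
#
#   lexer = MyLexer(char_stream)
#   tokens = list(lexer)    # iteration stops before the EOF token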

class Lexer(BaseRecognizer, TokenSource):
    """
    @brief Baseclass for generated lexer classes.

    A lexer is a recognizer that draws input symbols from a character
    stream.  Lexer grammars result in a subclass of this object. A Lexer
    object uses simplified match() and error recovery mechanisms in the
    interest of speed.
    """

    def __init__(self, input, state=None):
        BaseRecognizer.__init__(self, state)
        TokenSource.__init__(self)

        # Where is the lexer drawing characters from?
        self.input = input


    def reset(self):
        super().reset() # reset all recognizer state variables

        if self.input is not None:
            # rewind the input
            self.input.seek(0)

        if self._state is None:
            # no shared state work to do
            return

        # whack Lexer state variables
        self._state.token = None
        self._state.type = INVALID_TOKEN_TYPE
        self._state.channel = DEFAULT_CHANNEL
        self._state.tokenStartCharIndex = -1
        self._state.tokenStartLine = -1
        self._state.tokenStartCharPositionInLine = -1
        self._state.text = None


    def makeEOFToken(self):
        eof = CommonToken(
            type=EOF, channel=DEFAULT_CHANNEL,
            input=self.input,
            start=self.input.index(), stop=self.input.index())
        eof.line = self.input.line
        eof.charPositionInLine = self.input.charPositionInLine
        return eof

    def nextToken(self):
        """
        Return a token from this source; i.e., match a token on the char
        stream.
        """

        while 1:
            self._state.token = None
            self._state.channel = DEFAULT_CHANNEL
            self._state.tokenStartCharIndex = self.input.index()
            self._state.tokenStartCharPositionInLine = self.input.charPositionInLine
            self._state.tokenStartLine = self.input.line
            self._state.text = None
            if self.input.LA(1) == EOF:
                return self.makeEOFToken()

            try:
                self.mTokens()

                if self._state.token is None:
                    self.emit()

                elif self._state.token == SKIP_TOKEN:
                    continue

                return self._state.token

            except NoViableAltException as re:
                self.reportError(re)
                self.recover(re) # throw out current char and try again

            except RecognitionException as re:
                self.reportError(re)
                # match() routine has already called recover()


    def skip(self):
        """
        Instruct the lexer to skip creating a token for the current lexer
        rule and look for another token.  nextToken() knows to keep looking
        when a lexer rule finishes with the token set to SKIP_TOKEN.  Recall
        that if token is None at the end of any token rule, one is created
        for you and emitted.
        """

        self._state.token = SKIP_TOKEN


    def mTokens(self):
        """This is the lexer entry point that sets instance var 'token'"""

        # abstract method
        raise NotImplementedError


    def setCharStream(self, input):
        """Set the char stream and reset the lexer"""
        self.input = None
        self.reset()
        self.input = input


    def getSourceName(self):
        return self.input.getSourceName()


    def emit(self, token=None):
        """
        The standard method called to automatically emit a token at the
        outermost lexical rule.  The token object should point into the
        char buffer start..stop.  If there is a text override in 'text',
        use that to set the token's text.  Override this method to emit
        custom Token objects.

        If you are building trees, then you should also override
        Parser or TreeParser.getMissingSymbol().
        """

        if token is None:
            token = CommonToken(
                input=self.input,
                type=self._state.type,
                channel=self._state.channel,
                start=self._state.tokenStartCharIndex,
                stop=self.getCharIndex()-1
                )
            token.line = self._state.tokenStartLine
            token.text = self._state.text
            token.charPositionInLine = self._state.tokenStartCharPositionInLine

        self._state.token = token

        return token

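    # Example override (a sketch): emit instances of a custom token class
    # instead of CommonToken.  ``GeneratedLexer`` and ``MyToken`` are
    # hypothetical:
    #
    #   class MyLexer(GeneratedLexer):
    #       def emit(self, token=None):
    #           if token is None:
    #               token = MyToken(
    #                   input=self.input,
    #                   type=self._state.type,
    #                   channel=self._state.channel,
    #                   start=self._state.tokenStartCharIndex,
    #                   stop=self.getCharIndex() - 1)
    #               token.line = self._state.tokenStartLine
    #               token.text = self._state.text
    #               token.charPositionInLine = self._state.tokenStartCharPositionInLine
    #           self._state.token = token
    #           return token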

    def match(self, s):
        if isinstance(s, str):
            for c in s:
                if self.input.LA(1) != ord(c):
                    if self._state.backtracking > 0:
                        raise BacktrackingFailed

                    mte = MismatchedTokenException(c, self.input)
                    self.recover(mte)
                    raise mte

                self.input.consume()

        else:
            if self.input.LA(1) != s:
                if self._state.backtracking > 0:
                    raise BacktrackingFailed

                mte = MismatchedTokenException(chr(s), self.input)
                self.recover(mte) # don't really recover; just consume in lexer
                raise mte

            self.input.consume()


    def matchAny(self):
        self.input.consume()


    def matchRange(self, a, b):
        if self.input.LA(1) < a or self.input.LA(1) > b:
            if self._state.backtracking > 0:
                raise BacktrackingFailed

            mre = MismatchedRangeException(chr(a), chr(b), self.input)
            self.recover(mre)
            raise mre

        self.input.consume()


    def getLine(self):
        return self.input.line


    def getCharPositionInLine(self):
        return self.input.charPositionInLine


    def getCharIndex(self):
        """What is the index of the current character of lookahead?"""

        return self.input.index()


    def getText(self):
        """
        Return the text matched so far for the current token or any
        text override.
        """
        if self._state.text is not None:
            return self._state.text

        return self.input.substring(
            self._state.tokenStartCharIndex,
            self.getCharIndex()-1
            )


    def setText(self, text):
        """
        Set the complete text of this token; it wipes any previous
        changes to the text.
        """
        self._state.text = text


    text = property(getText, setText)


    def reportError(self, e):
        ## TODO: not thought about recovery in lexer yet.

        ## # if we've already reported an error and have not matched a token
        ## # yet successfully, don't report any errors.
        ## if self.errorRecovery:
        ##     return
        ##
        ## self.errorRecovery = True

        self.displayRecognitionError(e)


    def getErrorMessage(self, e):
        msg = None

        if isinstance(e, MismatchedTokenException):
            msg = "mismatched character {} expecting {}".format(
                self.getCharErrorDisplay(e.c),
                self.getCharErrorDisplay(e.expecting))

        elif isinstance(e, NoViableAltException):
            msg = "no viable alternative at character {}".format(
                self.getCharErrorDisplay(e.c))

        elif isinstance(e, EarlyExitException):
            msg = "required (...)+ loop did not match anything at character {}".format(
                self.getCharErrorDisplay(e.c))

        elif isinstance(e, MismatchedNotSetException):
            msg = "mismatched character {} expecting set {!r}".format(
                self.getCharErrorDisplay(e.c),
                e.expecting)

        elif isinstance(e, MismatchedSetException):
            msg = "mismatched character {} expecting set {!r}".format(
                self.getCharErrorDisplay(e.c),
                e.expecting)

        elif isinstance(e, MismatchedRangeException):
            msg = "mismatched character {} expecting set {}..{}".format(
                self.getCharErrorDisplay(e.c),
                self.getCharErrorDisplay(e.a),
                self.getCharErrorDisplay(e.b))

        else:
            msg = super().getErrorMessage(e)

        return msg


    def getCharErrorDisplay(self, c):
        if c == EOF:
            c = '<EOF>'
        return repr(c)


    def recover(self, re):
        """
        Lexers can normally match any char in their vocabulary after matching
        a token, so do the easy thing and just kill a character and hope
        it all works out.  You can instead use the rule invocation stack
        to do sophisticated error recovery if you are in a fragment rule.
        """

        self.input.consume()


    def traceIn(self, ruleName, ruleIndex):
        inputSymbol = "{} line={}:{}".format(self.input.LT(1),
                                             self.getLine(),
                                             self.getCharPositionInLine()
                                             )

        super().traceIn(ruleName, ruleIndex, inputSymbol)


    def traceOut(self, ruleName, ruleIndex):
        inputSymbol = "{} line={}:{}".format(self.input.LT(1),
                                             self.getLine(),
                                             self.getCharPositionInLine()
                                             )

        super().traceOut(ruleName, ruleIndex, inputSymbol)


class Parser(BaseRecognizer):
    """
    @brief Baseclass for generated parser classes.
    """

    def __init__(self, lexer, state=None):
        super().__init__(state)

        self.input = lexer


    def reset(self):
        super().reset() # reset all recognizer state variables
        if self.input is not None:
            self.input.seek(0) # rewind the input


    def getCurrentInputSymbol(self, input):
        return input.LT(1)


    def getMissingSymbol(self, input, e, expectedTokenType, follow):
        if expectedTokenType == EOF:
            tokenText = "<missing EOF>"
        else:
            tokenText = "<missing {}>".format(self.tokenNames[expectedTokenType])
        t = CommonToken(type=expectedTokenType, text=tokenText)
        current = input.LT(1)
        if current.type == EOF:
            current = input.LT(-1)

        if current is not None:
            t.line = current.line
            t.charPositionInLine = current.charPositionInLine
        t.channel = DEFAULT_CHANNEL
        return t


    def setTokenStream(self, input):
        """Set the token stream and reset the parser"""

        self.input = None
        self.reset()
        self.input = input


    def getTokenStream(self):
        return self.input


    def getSourceName(self):
        return self.input.getSourceName()


    def traceIn(self, ruleName, ruleIndex):
        super().traceIn(ruleName, ruleIndex, self.input.LT(1))


    def traceOut(self, ruleName, ruleIndex):
        super().traceOut(ruleName, ruleIndex, self.input.LT(1))

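# End-to-end wiring sketch (``MyLexer``/``MyParser`` are hypothetical
# generated classes; ANTLRStringStream and CommonTokenStream are assumed to
# be provided by the stream/token modules of this runtime):
#
#   char_stream = ANTLRStringStream("i=(3);")
#   lexer = MyLexer(char_stream)
#   tokens = CommonTokenStream(lexer)
#   parser = MyParser(tokens)
#   result = parser.stat()    # invoke a start rule of the grammar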

class RuleReturnScope(object):
    """
    Rules can return start/stop info as well as possible trees and templates.
    """

    def getStart(self):
        """Return the start token or tree."""
        return None


    def getStop(self):
        """Return the stop token or tree."""
        return None


    def getTree(self):
        """Has a value potentially if output=AST."""
        return None


    def getTemplate(self):
        """Has a value potentially if output=template."""
        return None


class ParserRuleReturnScope(RuleReturnScope):
    """
    Rules that return more than a single value must return an object
    containing all the values.  Besides the properties defined in
    RuleLabelScope.predefinedRulePropertiesScope there may be user-defined
    return values.  This class simply defines the minimum properties that
    are always defined and methods to access the others that might be
    available depending on output options such as template and tree.

    Note that text is not an actual property of the return value; it is
    computed from start and stop using the input stream's toString() method.
    I could add a ctor to this so that we can pass in and store the input
    stream, but I'm not sure we want to do that.  It would seem to be
    undefined to get the .text property anyway if the rule matches tokens
    from multiple input streams.

    I do not use getters for fields of objects that are used simply to
    group values such as this aggregate.  The getters/setters are there to
    satisfy the superclass interface.
    """

    def __init__(self):
        super().__init__()
        self.start = None
        self.stop = None
        self.tree = None  # only used when output=AST


    def getStart(self):
        return self.start


    def getStop(self):
        return self.stop


    def getTree(self):
        return self.tree

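# Sketch: a generated parser rule returns a ParserRuleReturnScope-like
# object, so callers can inspect what was matched (``parser.stat()`` is a
# hypothetical rule invocation):
#
#   retval = parser.stat()
#   first_token = retval.start    # first token matched by the rule
#   last_token = retval.stop      # last token matched by the rule
#   ast = retval.tree             # only populated when output=AST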