1 /*
2  [The "BSD license"]
3  Copyright (c) 2005-2009 Terence Parr
4  All rights reserved.
5 
6  Redistribution and use in source and binary forms, with or without
7  modification, are permitted provided that the following conditions
8  are met:
9  1. Redistributions of source code must retain the above copyright
10      notice, this list of conditions and the following disclaimer.
11  2. Redistributions in binary form must reproduce the above copyright
12      notice, this list of conditions and the following disclaimer in the
13      documentation and/or other materials provided with the distribution.
14  3. The name of the author may not be used to endorse or promote products
15      derived from this software without specific prior written permission.
16 
17  THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 package org.antlr.runtime;
29 
30 import java.util.ArrayList;
31 import java.util.HashMap;
32 import java.util.List;
33 import java.util.Map;
34 
35 /** A generic recognizer that can handle recognizers generated from
36  *  lexer, parser, and tree grammars.  This is all the parsing
37  *  support code essentially; most of it is error recovery stuff and
38  *  backtracking.
39  */
40 public abstract class BaseRecognizer {
41 	public static final int MEMO_RULE_FAILED = -2;
42 	public static final int MEMO_RULE_UNKNOWN = -1;
43 	public static final int INITIAL_FOLLOW_STACK_SIZE = 100;
44 
45 	// copies from Token object for convenience in actions
46 	public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL;
47 	public static final int HIDDEN = Token.HIDDEN_CHANNEL;
48 
49 	public static final String NEXT_TOKEN_RULE_NAME = "nextToken";
50 
51 	/** State of a lexer, parser, or tree parser are collected into a state
52 	 *  object so the state can be shared.  This sharing is needed to
53 	 *  have one grammar import others and share same error variables
54 	 *  and other state variables.  It's a kind of explicit multiple
55 	 *  inheritance via delegation of methods and shared state.
56 	 */
57 	protected RecognizerSharedState state;
58 
BaseRecognizer()59 	public BaseRecognizer() {
60 		state = new RecognizerSharedState();
61 	}
62 
BaseRecognizer(RecognizerSharedState state)63 	public BaseRecognizer(RecognizerSharedState state) {
64 		if ( state==null ) {
65 			state = new RecognizerSharedState();
66 		}
67 		this.state = state;
68 	}
69 
70 	/** reset the parser's state; subclasses must rewinds the input stream */
reset()71 	public void reset() {
72 		// wack everything related to error recovery
73 		if ( state==null ) {
74 			return; // no shared state work to do
75 		}
76 		state._fsp = -1;
77 		state.errorRecovery = false;
78 		state.lastErrorIndex = -1;
79 		state.failed = false;
80 		state.syntaxErrors = 0;
81 		// wack everything related to backtracking and memoization
82 		state.backtracking = 0;
83 		for (int i = 0; state.ruleMemo!=null && i < state.ruleMemo.length; i++) { // wipe cache
84 			state.ruleMemo[i] = null;
85 		}
86 	}
87 
88 
89 	/** Match current input symbol against ttype.  Attempt
90 	 *  single token insertion or deletion error recovery.  If
91 	 *  that fails, throw MismatchedTokenException.
92 	 *
93 	 *  To turn off single token insertion or deletion error
94 	 *  recovery, override recoverFromMismatchedToken() and have it
95      *  throw an exception. See TreeParser.recoverFromMismatchedToken().
96      *  This way any error in a rule will cause an exception and
97      *  immediate exit from rule.  Rule would recover by resynchronizing
98      *  to the set of symbols that can follow rule ref.
99 	 */
match(IntStream input, int ttype, BitSet follow)100 	public Object match(IntStream input, int ttype, BitSet follow)
101 		throws RecognitionException
102 	{
103 		//System.out.println("match "+((TokenStream)input).LT(1));
104 		Object matchedSymbol = getCurrentInputSymbol(input);
105 		if ( input.LA(1)==ttype ) {
106 			input.consume();
107 			state.errorRecovery = false;
108 			state.failed = false;
109 			return matchedSymbol;
110 		}
111 		if ( state.backtracking>0 ) {
112 			state.failed = true;
113 			return matchedSymbol;
114 		}
115 		matchedSymbol = recoverFromMismatchedToken(input, ttype, follow);
116 		return matchedSymbol;
117 	}
118 
119 	/** Match the wildcard: in a symbol */
matchAny(IntStream input)120 	public void matchAny(IntStream input) {
121 		state.errorRecovery = false;
122 		state.failed = false;
123 		input.consume();
124 	}
125 
mismatchIsUnwantedToken(IntStream input, int ttype)126 	public boolean mismatchIsUnwantedToken(IntStream input, int ttype) {
127 		return input.LA(2)==ttype;
128 	}
129 
mismatchIsMissingToken(IntStream input, BitSet follow)130 	public boolean mismatchIsMissingToken(IntStream input, BitSet follow) {
131 		if ( follow==null ) {
132 			// we have no information about the follow; we can only consume
133 			// a single token and hope for the best
134 			return false;
135 		}
136 		// compute what can follow this grammar element reference
137 		if ( follow.member(Token.EOR_TOKEN_TYPE) ) {
138 			BitSet viableTokensFollowingThisRule = computeContextSensitiveRuleFOLLOW();
139 			follow = follow.or(viableTokensFollowingThisRule);
140             if ( state._fsp>=0 ) { // remove EOR if we're not the start symbol
141                 follow.remove(Token.EOR_TOKEN_TYPE);
142             }
143 		}
144 		// if current token is consistent with what could come after set
145 		// then we know we're missing a token; error recovery is free to
146 		// "insert" the missing token
147 
148 		//System.out.println("viable tokens="+follow.toString(getTokenNames()));
149 		//System.out.println("LT(1)="+((TokenStream)input).LT(1));
150 
151 		// BitSet cannot handle negative numbers like -1 (EOF) so I leave EOR
152 		// in follow set to indicate that the fall of the start symbol is
153 		// in the set (EOF can follow).
154 		if ( follow.member(input.LA(1)) || follow.member(Token.EOR_TOKEN_TYPE) ) {
155 			//System.out.println("LT(1)=="+((TokenStream)input).LT(1)+" is consistent with what follows; inserting...");
156 			return true;
157 		}
158 		return false;
159 	}
160 
161 	/** Report a recognition problem.
162 	 *
163 	 *  This method sets errorRecovery to indicate the parser is recovering
164 	 *  not parsing.  Once in recovery mode, no errors are generated.
165 	 *  To get out of recovery mode, the parser must successfully match
166 	 *  a token (after a resync).  So it will go:
167 	 *
168 	 * 		1. error occurs
169 	 * 		2. enter recovery mode, report error
170 	 * 		3. consume until token found in resynch set
171 	 * 		4. try to resume parsing
172 	 * 		5. next match() will reset errorRecovery mode
173 	 *
174 	 *  If you override, make sure to update syntaxErrors if you care about that.
175 	 */
reportError(RecognitionException e)176 	public void reportError(RecognitionException e) {
177 		// if we've already reported an error and have not matched a token
178 		// yet successfully, don't report any errors.
179 		if ( state.errorRecovery ) {
180 			//System.err.print("[SPURIOUS] ");
181 			return;
182 		}
183 		state.syntaxErrors++; // don't count spurious
184 		state.errorRecovery = true;
185 
186 		displayRecognitionError(this.getTokenNames(), e);
187 	}
188 
displayRecognitionError(String[] tokenNames, RecognitionException e)189 	public void displayRecognitionError(String[] tokenNames,
190 										RecognitionException e)
191 	{
192 		String hdr = getErrorHeader(e);
193 		String msg = getErrorMessage(e, tokenNames);
194 		emitErrorMessage(hdr+" "+msg);
195 	}
196 
197 	/** What error message should be generated for the various
198 	 *  exception types?
199 	 *
200 	 *  Not very object-oriented code, but I like having all error message
201 	 *  generation within one method rather than spread among all of the
202 	 *  exception classes. This also makes it much easier for the exception
203 	 *  handling because the exception classes do not have to have pointers back
204 	 *  to this object to access utility routines and so on. Also, changing
205 	 *  the message for an exception type would be difficult because you
206 	 *  would have to subclassing exception, but then somehow get ANTLR
207 	 *  to make those kinds of exception objects instead of the default.
208 	 *  This looks weird, but trust me--it makes the most sense in terms
209 	 *  of flexibility.
210 	 *
211 	 *  For grammar debugging, you will want to override this to add
212 	 *  more information such as the stack frame with
213 	 *  getRuleInvocationStack(e, this.getClass().getName()) and,
214 	 *  for no viable alts, the decision description and state etc...
215 	 *
216 	 *  Override this to change the message generated for one or more
217 	 *  exception types.
218 	 */
getErrorMessage(RecognitionException e, String[] tokenNames)219 	public String getErrorMessage(RecognitionException e, String[] tokenNames) {
220 		String msg = e.getMessage();
221 		if ( e instanceof UnwantedTokenException ) {
222 			UnwantedTokenException ute = (UnwantedTokenException)e;
223 			String tokenName="<unknown>";
224 			if ( ute.expecting== Token.EOF ) {
225 				tokenName = "EOF";
226 			}
227 			else {
228 				tokenName = tokenNames[ute.expecting];
229 			}
230 			msg = "extraneous input "+getTokenErrorDisplay(ute.getUnexpectedToken())+
231 				" expecting "+tokenName;
232 		}
233 		else if ( e instanceof MissingTokenException ) {
234 			MissingTokenException mte = (MissingTokenException)e;
235 			String tokenName="<unknown>";
236 			if ( mte.expecting== Token.EOF ) {
237 				tokenName = "EOF";
238 			}
239 			else {
240 				tokenName = tokenNames[mte.expecting];
241 			}
242 			msg = "missing "+tokenName+" at "+getTokenErrorDisplay(e.token);
243 		}
244 		else if ( e instanceof MismatchedTokenException ) {
245 			MismatchedTokenException mte = (MismatchedTokenException)e;
246 			String tokenName="<unknown>";
247 			if ( mte.expecting== Token.EOF ) {
248 				tokenName = "EOF";
249 			}
250 			else {
251 				tokenName = tokenNames[mte.expecting];
252 			}
253 			msg = "mismatched input "+getTokenErrorDisplay(e.token)+
254 				" expecting "+tokenName;
255 		}
256 		else if ( e instanceof MismatchedTreeNodeException ) {
257 			MismatchedTreeNodeException mtne = (MismatchedTreeNodeException)e;
258 			String tokenName="<unknown>";
259 			if ( mtne.expecting==Token.EOF ) {
260 				tokenName = "EOF";
261 			}
262 			else {
263 				tokenName = tokenNames[mtne.expecting];
264 			}
265 			msg = "mismatched tree node: "+mtne.node+
266 				" expecting "+tokenName;
267 		}
268 		else if ( e instanceof NoViableAltException ) {
269 			//NoViableAltException nvae = (NoViableAltException)e;
270 			// for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
271 			// and "(decision="+nvae.decisionNumber+") and
272 			// "state "+nvae.stateNumber
273 			msg = "no viable alternative at input "+getTokenErrorDisplay(e.token);
274 		}
275 		else if ( e instanceof EarlyExitException ) {
276 			//EarlyExitException eee = (EarlyExitException)e;
277 			// for development, can add "(decision="+eee.decisionNumber+")"
278 			msg = "required (...)+ loop did not match anything at input "+
279 				getTokenErrorDisplay(e.token);
280 		}
281 		else if ( e instanceof MismatchedSetException ) {
282 			MismatchedSetException mse = (MismatchedSetException)e;
283 			msg = "mismatched input "+getTokenErrorDisplay(e.token)+
284 				" expecting set "+mse.expecting;
285 		}
286 		else if ( e instanceof MismatchedNotSetException ) {
287 			MismatchedNotSetException mse = (MismatchedNotSetException)e;
288 			msg = "mismatched input "+getTokenErrorDisplay(e.token)+
289 				" expecting set "+mse.expecting;
290 		}
291 		else if ( e instanceof FailedPredicateException ) {
292 			FailedPredicateException fpe = (FailedPredicateException)e;
293 			msg = "rule "+fpe.ruleName+" failed predicate: {"+
294 				fpe.predicateText+"}?";
295 		}
296 		return msg;
297 	}
298 
299 	/** Get number of recognition errors (lexer, parser, tree parser).  Each
300 	 *  recognizer tracks its own number.  So parser and lexer each have
301 	 *  separate count.  Does not count the spurious errors found between
302 	 *  an error and next valid token match
303 	 *
304 	 *  See also reportError()
305 	 */
getNumberOfSyntaxErrors()306 	public int getNumberOfSyntaxErrors() {
307 		return state.syntaxErrors;
308 	}
309 
310 	/** What is the error header, normally line/character position information? */
getErrorHeader(RecognitionException e)311 	public String getErrorHeader(RecognitionException e) {
312 		if ( getSourceName()!=null )
313 			return getSourceName()+" line "+e.line+":"+e.charPositionInLine;
314 
315 		return "line "+e.line+":"+e.charPositionInLine;
316 	}
317 
318 	/** How should a token be displayed in an error message? The default
319 	 *  is to display just the text, but during development you might
320 	 *  want to have a lot of information spit out.  Override in that case
321 	 *  to use t.toString() (which, for CommonToken, dumps everything about
322 	 *  the token). This is better than forcing you to override a method in
323 	 *  your token objects because you don't have to go modify your lexer
324 	 *  so that it creates a new Java type.
325 	 */
getTokenErrorDisplay(Token t)326 	public String getTokenErrorDisplay(Token t) {
327 		String s = t.getText();
328 		if ( s==null ) {
329 			if ( t.getType()==Token.EOF ) {
330 				s = "<EOF>";
331 			}
332 			else {
333 				s = "<"+t.getType()+">";
334 			}
335 		}
336 		s = s.replaceAll("\n","\\\\n");
337 		s = s.replaceAll("\r","\\\\r");
338 		s = s.replaceAll("\t","\\\\t");
339 		return "'"+s+"'";
340 	}
341 
342 	/** Override this method to change where error messages go */
emitErrorMessage(String msg)343 	public void emitErrorMessage(String msg) {
344 		System.err.println(msg);
345 	}
346 
347 	/** Recover from an error found on the input stream.  This is
348 	 *  for NoViableAlt and mismatched symbol exceptions.  If you enable
349 	 *  single token insertion and deletion, this will usually not
350 	 *  handle mismatched symbol exceptions but there could be a mismatched
351 	 *  token that the match() routine could not recover from.
352 	 */
recover(IntStream input, RecognitionException re)353 	public void recover(IntStream input, RecognitionException re) {
354 		if ( state.lastErrorIndex==input.index() ) {
355 			// uh oh, another error at same token index; must be a case
356 			// where LT(1) is in the recovery token set so nothing is
357 			// consumed; consume a single token so at least to prevent
358 			// an infinite loop; this is a failsafe.
359 			input.consume();
360 		}
361 		state.lastErrorIndex = input.index();
362 		BitSet followSet = computeErrorRecoverySet();
363 		beginResync();
364 		consumeUntil(input, followSet);
365 		endResync();
366 	}
367 
368 	/** A hook to listen in on the token consumption during error recovery.
369 	 *  The DebugParser subclasses this to fire events to the listenter.
370 	 */
beginResync()371 	public void beginResync() {
372 	}
373 
endResync()374 	public void endResync() {
375 	}
376 
377 	/*  Compute the error recovery set for the current rule.  During
378 	 *  rule invocation, the parser pushes the set of tokens that can
379 	 *  follow that rule reference on the stack; this amounts to
380 	 *  computing FIRST of what follows the rule reference in the
381 	 *  enclosing rule. This local follow set only includes tokens
382 	 *  from within the rule; i.e., the FIRST computation done by
383 	 *  ANTLR stops at the end of a rule.
384 	 *
385 	 *  EXAMPLE
386 	 *
387 	 *  When you find a "no viable alt exception", the input is not
388 	 *  consistent with any of the alternatives for rule r.  The best
389 	 *  thing to do is to consume tokens until you see something that
390 	 *  can legally follow a call to r *or* any rule that called r.
391 	 *  You don't want the exact set of viable next tokens because the
392 	 *  input might just be missing a token--you might consume the
393 	 *  rest of the input looking for one of the missing tokens.
394 	 *
395 	 *  Consider grammar:
396 	 *
397 	 *  a : '[' b ']'
398 	 *    | '(' b ')'
399 	 *    ;
400 	 *  b : c '^' INT ;
401 	 *  c : ID
402 	 *    | INT
403 	 *    ;
404 	 *
405 	 *  At each rule invocation, the set of tokens that could follow
406 	 *  that rule is pushed on a stack.  Here are the various "local"
407 	 *  follow sets:
408 	 *
409 	 *  FOLLOW(b1_in_a) = FIRST(']') = ']'
410 	 *  FOLLOW(b2_in_a) = FIRST(')') = ')'
411 	 *  FOLLOW(c_in_b) = FIRST('^') = '^'
412 	 *
413 	 *  Upon erroneous input "[]", the call chain is
414 	 *
415 	 *  a -> b -> c
416 	 *
417 	 *  and, hence, the follow context stack is:
418 	 *
419 	 *  depth  local follow set     after call to rule
420 	 *    0         <EOF>                    a (from main())
421 	 *    1          ']'                     b
422 	 *    3          '^'                     c
423 	 *
424 	 *  Notice that ')' is not included, because b would have to have
425 	 *  been called from a different context in rule a for ')' to be
426 	 *  included.
427 	 *
428 	 *  For error recovery, we cannot consider FOLLOW(c)
429 	 *  (context-sensitive or otherwise).  We need the combined set of
430 	 *  all context-sensitive FOLLOW sets--the set of all tokens that
431 	 *  could follow any reference in the call chain.  We need to
432 	 *  resync to one of those tokens.  Note that FOLLOW(c)='^' and if
433 	 *  we resync'd to that token, we'd consume until EOF.  We need to
434 	 *  sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
435 	 *  In this case, for input "[]", LA(1) is in this set so we would
436 	 *  not consume anything and after printing an error rule c would
437 	 *  return normally.  It would not find the required '^' though.
438 	 *  At this point, it gets a mismatched token error and throws an
439 	 *  exception (since LA(1) is not in the viable following token
440 	 *  set).  The rule exception handler tries to recover, but finds
441 	 *  the same recovery set and doesn't consume anything.  Rule b
442 	 *  exits normally returning to rule a.  Now it finds the ']' (and
443 	 *  with the successful match exits errorRecovery mode).
444 	 *
445 	 *  So, you cna see that the parser walks up call chain looking
446 	 *  for the token that was a member of the recovery set.
447 	 *
448 	 *  Errors are not generated in errorRecovery mode.
449 	 *
450 	 *  ANTLR's error recovery mechanism is based upon original ideas:
451 	 *
452 	 *  "Algorithms + Data Structures = Programs" by Niklaus Wirth
453 	 *
454 	 *  and
455 	 *
456 	 *  "A note on error recovery in recursive descent parsers":
457 	 *  http://portal.acm.org/citation.cfm?id=947902.947905
458 	 *
459 	 *  Later, Josef Grosch had some good ideas:
460 	 *
461 	 *  "Efficient and Comfortable Error Recovery in Recursive Descent
462 	 *  Parsers":
463 	 *  ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
464 	 *
465 	 *  Like Grosch I implemented local FOLLOW sets that are combined
466 	 *  at run-time upon error to avoid overhead during parsing.
467 	 */
computeErrorRecoverySet()468 	protected BitSet computeErrorRecoverySet() {
469 		return combineFollows(false);
470 	}
471 
472 	/** Compute the context-sensitive FOLLOW set for current rule.
473 	 *  This is set of token types that can follow a specific rule
474 	 *  reference given a specific call chain.  You get the set of
475 	 *  viable tokens that can possibly come next (lookahead depth 1)
476 	 *  given the current call chain.  Contrast this with the
477 	 *  definition of plain FOLLOW for rule r:
478 	 *
479 	 *   FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
480 	 *
481 	 *  where x in T* and alpha, beta in V*; T is set of terminals and
482 	 *  V is the set of terminals and nonterminals.  In other words,
483 	 *  FOLLOW(r) is the set of all tokens that can possibly follow
484 	 *  references to r in *any* sentential form (context).  At
485 	 *  runtime, however, we know precisely which context applies as
486 	 *  we have the call chain.  We may compute the exact (rather
487 	 *  than covering superset) set of following tokens.
488 	 *
489 	 *  For example, consider grammar:
490 	 *
491 	 *  stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
492 	 *       | "return" expr '.'
493 	 *       ;
494 	 *  expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
495 	 *  atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
496 	 *       | '(' expr ')'
497 	 *       ;
498 	 *
499 	 *  The FOLLOW sets are all inclusive whereas context-sensitive
500 	 *  FOLLOW sets are precisely what could follow a rule reference.
501 	 *  For input input "i=(3);", here is the derivation:
502 	 *
503 	 *  stat => ID '=' expr ';'
504 	 *       => ID '=' atom ('+' atom)* ';'
505 	 *       => ID '=' '(' expr ')' ('+' atom)* ';'
506 	 *       => ID '=' '(' atom ')' ('+' atom)* ';'
507 	 *       => ID '=' '(' INT ')' ('+' atom)* ';'
508 	 *       => ID '=' '(' INT ')' ';'
509 	 *
510 	 *  At the "3" token, you'd have a call chain of
511 	 *
512 	 *    stat -> expr -> atom -> expr -> atom
513 	 *
514 	 *  What can follow that specific nested ref to atom?  Exactly ')'
515 	 *  as you can see by looking at the derivation of this specific
516 	 *  input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
517 	 *
518 	 *  You want the exact viable token set when recovering from a
519 	 *  token mismatch.  Upon token mismatch, if LA(1) is member of
520 	 *  the viable next token set, then you know there is most likely
521 	 *  a missing token in the input stream.  "Insert" one by just not
522 	 *  throwing an exception.
523 	 */
computeContextSensitiveRuleFOLLOW()524 	protected BitSet computeContextSensitiveRuleFOLLOW() {
525 		return combineFollows(true);
526 	}
527 
528 	// what is exact? it seems to only add sets from above on stack
529 	// if EOR is in set i.  When it sees a set w/o EOR, it stops adding.
530 	// Why would we ever want them all?  Maybe no viable alt instead of
531 	// mismatched token?
combineFollows(boolean exact)532 	protected BitSet combineFollows(boolean exact) {
533 		int top = state._fsp;
534 		BitSet followSet = new BitSet();
535 		for (int i=top; i>=0; i--) {
536 			BitSet localFollowSet = (BitSet)state.following[i];
537 			/*
538 			System.out.println("local follow depth "+i+"="+
539 							   localFollowSet.toString(getTokenNames())+")");
540 			 */
541 			followSet.orInPlace(localFollowSet);
542 			if ( exact ) {
543 				// can we see end of rule?
544 				if ( localFollowSet.member(Token.EOR_TOKEN_TYPE) ) {
545 					// Only leave EOR in set if at top (start rule); this lets
546 					// us know if have to include follow(start rule); i.e., EOF
547 					if ( i>0 ) {
548 						followSet.remove(Token.EOR_TOKEN_TYPE);
549 					}
550 				}
551 				else { // can't see end of rule, quit
552 					break;
553 				}
554 			}
555 		}
556 		return followSet;
557 	}
558 
559 	/** Attempt to recover from a single missing or extra token.
560 	 *
561 	 *  EXTRA TOKEN
562 	 *
563 	 *  LA(1) is not what we are looking for.  If LA(2) has the right token,
564 	 *  however, then assume LA(1) is some extra spurious token.  Delete it
565 	 *  and LA(2) as if we were doing a normal match(), which advances the
566 	 *  input.
567 	 *
568 	 *  MISSING TOKEN
569 	 *
570 	 *  If current token is consistent with what could come after
571 	 *  ttype then it is ok to "insert" the missing token, else throw
572 	 *  exception For example, Input "i=(3;" is clearly missing the
573 	 *  ')'.  When the parser returns from the nested call to expr, it
574 	 *  will have call chain:
575 	 *
576 	 *    stat -> expr -> atom
577 	 *
578 	 *  and it will be trying to match the ')' at this point in the
579 	 *  derivation:
580 	 *
581 	 *       => ID '=' '(' INT ')' ('+' atom)* ';'
582 	 *                          ^
583 	 *  match() will see that ';' doesn't match ')' and report a
584 	 *  mismatched token error.  To recover, it sees that LA(1)==';'
585 	 *  is in the set of tokens that can follow the ')' token
586 	 *  reference in rule atom.  It can assume that you forgot the ')'.
587 	 */
recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)588 	protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
589 		throws RecognitionException
590 	{
591 		RecognitionException e = null;
592 		// if next token is what we are looking for then "delete" this token
593 		if ( mismatchIsUnwantedToken(input, ttype) ) {
594 			e = new UnwantedTokenException(ttype, input);
595 			/*
596 			System.err.println("recoverFromMismatchedToken deleting "+
597 							   ((TokenStream)input).LT(1)+
598 							   " since "+((TokenStream)input).LT(2)+" is what we want");
599 			 */
600 			beginResync();
601 			input.consume(); // simply delete extra token
602 			endResync();
603 			reportError(e);  // report after consuming so AW sees the token in the exception
604 			// we want to return the token we're actually matching
605 			Object matchedSymbol = getCurrentInputSymbol(input);
606 			input.consume(); // move past ttype token as if all were ok
607 			return matchedSymbol;
608 		}
609 		// can't recover with single token deletion, try insertion
610 		if ( mismatchIsMissingToken(input, follow) ) {
611 			Object inserted = getMissingSymbol(input, e, ttype, follow);
612 			e = new MissingTokenException(ttype, input, inserted);
613 			reportError(e);  // report after inserting so AW sees the token in the exception
614 			return inserted;
615 		}
616 		// even that didn't work; must throw the exception
617 		e = new MismatchedTokenException(ttype, input);
618 		throw e;
619 	}
620 
621 	/** Not currently used */
recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)622 	public Object recoverFromMismatchedSet(IntStream input,
623 										   RecognitionException e,
624 										   BitSet follow)
625 		throws RecognitionException
626 	{
627 		if ( mismatchIsMissingToken(input, follow) ) {
628 			// System.out.println("missing token");
629 			reportError(e);
630 			// we don't know how to conjure up a token for sets yet
631 			return getMissingSymbol(input, e, Token.INVALID_TOKEN_TYPE, follow);
632 		}
633 		// TODO do single token deletion like above for Token mismatch
634 		throw e;
635 	}
636 
637 	/** Match needs to return the current input symbol, which gets put
638 	 *  into the label for the associated token ref; e.g., x=ID.  Token
639 	 *  and tree parsers need to return different objects. Rather than test
640 	 *  for input stream type or change the IntStream interface, I use
641 	 *  a simple method to ask the recognizer to tell me what the current
642 	 *  input symbol is.
643 	 *
644 	 *  This is ignored for lexers.
645 	 */
getCurrentInputSymbol(IntStream input)646 	protected Object getCurrentInputSymbol(IntStream input) { return null; }
647 
648 	/** Conjure up a missing token during error recovery.
649 	 *
650 	 *  The recognizer attempts to recover from single missing
651 	 *  symbols. But, actions might refer to that missing symbol.
652 	 *  For example, x=ID {f($x);}. The action clearly assumes
653 	 *  that there has been an identifier matched previously and that
654 	 *  $x points at that token. If that token is missing, but
655 	 *  the next token in the stream is what we want we assume that
656 	 *  this token is missing and we keep going. Because we
657 	 *  have to return some token to replace the missing token,
658 	 *  we have to conjure one up. This method gives the user control
659 	 *  over the tokens returned for missing tokens. Mostly,
660 	 *  you will want to create something special for identifier
661 	 *  tokens. For literals such as '{' and ',', the default
662 	 *  action in the parser or tree parser works. It simply creates
663 	 *  a CommonToken of the appropriate type. The text will be the token.
664 	 *  If you change what tokens must be created by the lexer,
665 	 *  override this method to create the appropriate tokens.
666 	 */
getMissingSymbol(IntStream input, RecognitionException e, int expectedTokenType, BitSet follow)667 	protected Object getMissingSymbol(IntStream input,
668 									  RecognitionException e,
669 									  int expectedTokenType,
670 									  BitSet follow)
671 	{
672 		return null;
673 	}
674 
consumeUntil(IntStream input, int tokenType)675 	public void consumeUntil(IntStream input, int tokenType) {
676 		//System.out.println("consumeUntil "+tokenType);
677 		int ttype = input.LA(1);
678 		while (ttype != Token.EOF && ttype != tokenType) {
679 			input.consume();
680 			ttype = input.LA(1);
681 		}
682 	}
683 
684 	/** Consume tokens until one matches the given token set */
consumeUntil(IntStream input, BitSet set)685 	public void consumeUntil(IntStream input, BitSet set) {
686 		//System.out.println("consumeUntil("+set.toString(getTokenNames())+")");
687 		int ttype = input.LA(1);
688 		while (ttype != Token.EOF && !set.member(ttype) ) {
689 			//System.out.println("consume during recover LA(1)="+getTokenNames()[input.LA(1)]);
690 			input.consume();
691 			ttype = input.LA(1);
692 		}
693 	}
694 
695 	/** Push a rule's follow set using our own hardcoded stack */
pushFollow(BitSet fset)696 	protected void pushFollow(BitSet fset) {
697 		if ( (state._fsp +1)>=state.following.length ) {
698 			BitSet[] f = new BitSet[state.following.length*2];
699 			System.arraycopy(state.following, 0, f, 0, state.following.length);
700 			state.following = f;
701 		}
702 		state.following[++state._fsp] = fset;
703 	}
704 
705 	/** Return List<String> of the rules in your parser instance
706 	 *  leading up to a call to this method.  You could override if
707 	 *  you want more details such as the file/line info of where
708 	 *  in the parser java code a rule is invoked.
709 	 *
710 	 *  This is very useful for error messages and for context-sensitive
711 	 *  error recovery.
712 	 */
getRuleInvocationStack()713 	public List getRuleInvocationStack() {
714 		String parserClassName = getClass().getName();
715 		return getRuleInvocationStack(new Throwable(), parserClassName);
716 	}
717 
718 	/** A more general version of getRuleInvocationStack where you can
719 	 *  pass in, for example, a RecognitionException to get it's rule
720 	 *  stack trace.  This routine is shared with all recognizers, hence,
721 	 *  static.
722 	 *
723 	 *  TODO: move to a utility class or something; weird having lexer call this
724 	 */
getRuleInvocationStack(Throwable e, String recognizerClassName)725 	public static List getRuleInvocationStack(Throwable e,
726 											  String recognizerClassName)
727 	{
728 		List rules = new ArrayList();
729 		StackTraceElement[] stack = e.getStackTrace();
730 		int i = 0;
731 		for (i=stack.length-1; i>=0; i--) {
732 			StackTraceElement t = stack[i];
733 			if ( t.getClassName().startsWith("org.antlr.runtime.") ) {
734 				continue; // skip support code such as this method
735 			}
736 			if ( t.getMethodName().equals(NEXT_TOKEN_RULE_NAME) ) {
737 				continue;
738 			}
739 			if ( !t.getClassName().equals(recognizerClassName) ) {
740 				continue; // must not be part of this parser
741 			}
742             rules.add(t.getMethodName());
743 		}
744 		return rules;
745 	}
746 
getBacktrackingLevel()747     public int getBacktrackingLevel() { return state.backtracking; }
748 
setBacktrackingLevel(int n)749     public void setBacktrackingLevel(int n) { state.backtracking = n; }
750 
751     /** Return whether or not a backtracking attempt failed. */
failed()752     public boolean failed() { return state.failed; }
753 
754 	/** Used to print out token names like ID during debugging and
755 	 *  error reporting.  The generated parsers implement a method
756 	 *  that overrides this to point to their String[] tokenNames.
757 	 */
getTokenNames()758 	public String[] getTokenNames() {
759 		return null;
760 	}
761 
762 	/** For debugging and other purposes, might want the grammar name.
763 	 *  Have ANTLR generate an implementation for this method.
764 	 */
getGrammarFileName()765 	public String getGrammarFileName() {
766 		return null;
767 	}
768 
getSourceName()769 	public abstract String getSourceName();
770 
771 	/** A convenience method for use most often with template rewrites.
772 	 *  Convert a List<Token> to List<String>
773 	 */
toStrings(List tokens)774 	public List toStrings(List tokens) {
775 		if ( tokens==null ) return null;
776 		List strings = new ArrayList(tokens.size());
777 		for (int i=0; i<tokens.size(); i++) {
778 			strings.add(((Token)tokens.get(i)).getText());
779 		}
780 		return strings;
781 	}
782 
783 	/** Given a rule number and a start token index number, return
784 	 *  MEMO_RULE_UNKNOWN if the rule has not parsed input starting from
785 	 *  start index.  If this rule has parsed input starting from the
786 	 *  start index before, then return where the rule stopped parsing.
787 	 *  It returns the index of the last token matched by the rule.
788 	 *
789 	 *  For now we use a hashtable and just the slow Object-based one.
790 	 *  Later, we can make a special one for ints and also one that
791 	 *  tosses out data after we commit past input position i.
792 	 */
getRuleMemoization(int ruleIndex, int ruleStartIndex)793 	public int getRuleMemoization(int ruleIndex, int ruleStartIndex) {
794 		if ( state.ruleMemo[ruleIndex]==null ) {
795 			state.ruleMemo[ruleIndex] = new HashMap();
796 		}
797 		Integer stopIndexI =
798 			(Integer)state.ruleMemo[ruleIndex].get(new Integer(ruleStartIndex));
799 		if ( stopIndexI==null ) {
800 			return MEMO_RULE_UNKNOWN;
801 		}
802 		return stopIndexI.intValue();
803 	}
804 
805 	/** Has this rule already parsed input at the current index in the
806 	 *  input stream?  Return the stop token index or MEMO_RULE_UNKNOWN.
807 	 *  If we attempted but failed to parse properly before, return
808 	 *  MEMO_RULE_FAILED.
809 	 *
810 	 *  This method has a side-effect: if we have seen this input for
811 	 *  this rule and successfully parsed before, then seek ahead to
812 	 *  1 past the stop token matched for this rule last time.
813 	 */
alreadyParsedRule(IntStream input, int ruleIndex)814 	public boolean alreadyParsedRule(IntStream input, int ruleIndex) {
815 		int stopIndex = getRuleMemoization(ruleIndex, input.index());
816 		if ( stopIndex==MEMO_RULE_UNKNOWN ) {
817 			return false;
818 		}
819 		if ( stopIndex==MEMO_RULE_FAILED ) {
820 			//System.out.println("rule "+ruleIndex+" will never succeed");
821 			state.failed=true;
822 		}
823 		else {
824 			//System.out.println("seen rule "+ruleIndex+" before; skipping ahead to @"+(stopIndex+1)+" failed="+state.failed);
825 			input.seek(stopIndex+1); // jump to one past stop token
826 		}
827 		return true;
828 	}
829 
830 	/** Record whether or not this rule parsed the input at this position
831 	 *  successfully.  Use a standard java hashtable for now.
832 	 */
memoize(IntStream input, int ruleIndex, int ruleStartIndex)833 	public void memoize(IntStream input,
834 						int ruleIndex,
835 						int ruleStartIndex)
836 	{
837 		int stopTokenIndex = state.failed?MEMO_RULE_FAILED:input.index()-1;
838 		if ( state.ruleMemo==null ) {
839 			System.err.println("!!!!!!!!! memo array is null for "+ getGrammarFileName());
840 		}
841 		if ( ruleIndex >= state.ruleMemo.length ) {
842 			System.err.println("!!!!!!!!! memo size is "+state.ruleMemo.length+", but rule index is "+ruleIndex);
843 		}
844 		if ( state.ruleMemo[ruleIndex]!=null ) {
845 			state.ruleMemo[ruleIndex].put(
846 				new Integer(ruleStartIndex), new Integer(stopTokenIndex)
847 			);
848 		}
849 	}
850 
851 	/** return how many rule/input-index pairs there are in total.
852 	 *  TODO: this includes synpreds. :(
853 	 */
getRuleMemoizationCacheSize()854 	public int getRuleMemoizationCacheSize() {
855 		int n = 0;
856 		for (int i = 0; state.ruleMemo!=null && i < state.ruleMemo.length; i++) {
857 			Map ruleMap = state.ruleMemo[i];
858 			if ( ruleMap!=null ) {
859 				n += ruleMap.size(); // how many input indexes are recorded?
860 			}
861 		}
862 		return n;
863 	}
864 
traceIn(String ruleName, int ruleIndex, Object inputSymbol)865 	public void traceIn(String ruleName, int ruleIndex, Object inputSymbol)  {
866 		System.out.print("enter "+ruleName+" "+inputSymbol);
867 		if ( state.backtracking>0 ) {
868 			System.out.print(" backtracking="+state.backtracking);
869 		}
870 		System.out.println();
871 	}
872 
traceOut(String ruleName, int ruleIndex, Object inputSymbol)873 	public void traceOut(String ruleName,
874 						 int ruleIndex,
875 						 Object inputSymbol)
876 	{
877 		System.out.print("exit "+ruleName+" "+inputSymbol);
878 		if ( state.backtracking>0 ) {
879             System.out.print(" backtracking="+state.backtracking);
880             if ( state.failed ) System.out.print(" failed");
881             else System.out.print(" succeeded");
882         }
883 		System.out.println();
884 	}
885 
886 }
887