1 /** \file
2  * Defines the basic structure to support recognizing by either a lexer,
3  * parser, or tree parser.
4  * \addtogroup BaseRecognizer
5  * @{
6  */
7 #ifndef	_ANTLR3_BASERECOGNIZER_HPP
8 #define	_ANTLR3_BASERECOGNIZER_HPP
9 
10 // [The "BSD licence"]
11 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
12 
13 //
14 // All rights reserved.
15 //
16 // Redistribution and use in source and binary forms, with or without
17 // modification, are permitted provided that the following conditions
18 // are met:
19 // 1. Redistributions of source code must retain the above copyright
20 //    notice, this list of conditions and the following disclaimer.
21 // 2. Redistributions in binary form must reproduce the above copyright
22 //    notice, this list of conditions and the following disclaimer in the
23 //    documentation and/or other materials provided with the distribution.
24 // 3. The name of the author may not be used to endorse or promote products
25 //    derived from this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
28 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
29 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
30 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
31 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
32 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
36 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 
38 #include    "antlr3defs.hpp"
39 #include    "antlr3collections.hpp"
40 
41 ANTLR_BEGIN_NAMESPACE()
42 
43 /** \brief Base tracking context structure for all types of
44  * recognizers.
45  */
46 template< class ImplTraits, class StreamType >
47 class BaseRecognizer : public ImplTraits::AllocPolicyType
48 {
49 public:
50 	typedef typename ImplTraits::AllocPolicyType	AllocPolicyType;
51 	typedef typename StreamType::IntStreamType	IntStreamType;
52 	typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType  SuperType;
53 	typedef typename StreamType::UnitType		UnitType;
54 	typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType;
55 	typedef typename ImplTraits::BitsetType BitsetType;
56 	typedef typename ImplTraits::BitsetListType		BitsetListType;
57 	typedef typename ImplTraits::StringType	StringType;
58 	typedef typename ImplTraits::template RecognizerSharedStateType<StreamType>  RecognizerSharedStateType;
59 	typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
60 	typedef typename ImplTraits::LexerType LexerType;
61 	typedef typename ImplTraits::ParserType ParserType;
62 	typedef typename ImplTraits::TreeParserType TreeParserType;
63 
64 	typedef typename AllocPolicyType::template StackType<StringType>  StringStackType;
65 	typedef typename AllocPolicyType::template ListType<StringType>  StringListType;
66 
67 private:
68 	/// A pointer to the shared recognizer state, such that multiple
69 	/// recognizers can use the same inputs streams and so on (in
70 	/// the case of grammar inheritance for instance.
71 	///
72 	RecognizerSharedStateType*		m_state;
73 
74 	/// If set to something other than NULL, then this structure is
75 	/// points to an instance of the debugger interface. In general, the
76 	/// debugger is only referenced internally in recovery/error operations
77 	/// so that it does not cause overhead by having to check this pointer
78 	/// in every function/method
79 	///
80 	DebugEventListenerType*		m_debugger;
81 
82 
83 public:
84 	BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
85 
86 	SuperType* get_super();
87 	RecognizerSharedStateType* get_state() const;
88 	DebugEventListenerType* get_debugger() const;
89 	void  set_state( RecognizerSharedStateType* state );
90 	void  set_debugger( DebugEventListenerType* debugger );
91 
92     /// Match current input symbol against ttype.  Upon error, do one token
93 	/// insertion or deletion if possible.
94 	/// To turn off single token insertion or deletion error
95 	/// recovery, override mismatchRecover() and have it call
96 	/// plain mismatch(), which does not recover.  Then any error
97 	/// in a rule will cause an exception and immediate exit from
98 	/// rule.  Rule would recover by resynchronizing to the set of
99 	/// symbols that can follow rule ref.
100 	///
101     const UnitType*	match(ANTLR_UINT32 ttype, BitsetListType* follow);
102 
103 	/// Consumes the next token, whatever it is, and resets the recognizer state
104 	/// so that it is not in error.
105 	///
106 	/// \param recognizer
107 	/// Recognizer context pointer
108 	///
109     void	matchAny();
110 
111 	/// function that decides if the token ahead of the current one is the
112 	/// one we were loking for, in which case the curernt one is very likely extraneous
113 	/// and can be reported that way.
114 	///
115 	bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype);
116 
117 	/// function that decides if the current token is one that can logically
118 	/// follow the one we were looking for, in which case the one we were looking for is
119 	/// probably missing from the input.
120 	///
121 	bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow);
122 
123     /// Factor out what to do upon token mismatch so tree parsers can behave
124 	/// differently.  Override and call mismatchRecover(input, ttype, follow)
125 	/// to get single token insertion and deletion.  Use this to turn off
126 	/// single token insertion and deletion. Override mismatchRecover
127 	/// to call this instead.
128 	///
129 	/// \remark mismatch only works for parsers and must be overridden for anything else.
130 	///
131     void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow);
132 
133     /// Report a recognition problem.
134 	///
135 	/// This method sets errorRecovery to indicate the parser is recovering
136 	/// not parsing.  Once in recovery mode, no errors are generated.
137 	/// To get out of recovery mode, the parser must successfully match
138 	/// a token (after a resync).  So it will go:
139 	///
140 	///		1. error occurs
141 	///		2. enter recovery mode, report error
142 	///		3. consume until token found in resynch set
143 	///		4. try to resume parsing
144 	///		5. next match() will reset errorRecovery mode
145 	///
146 	/// If you override, make sure to update errorCount if you care about that.
147 	///
148     void	reportError();
149 	void	reportError( ClassForwarder<LexerType> );
150 	template<typename CompType>
151 	void	reportError( ClassForwarder<CompType> );
152 
153     /** Function that is called to display a recognition error message. You may
154      *  override this function independently of (*reportError)() above as that function calls
155      *  this one to do the actual exception printing.
156      */
157     void	displayRecognitionError(ANTLR_UINT8** tokenNames);
158 
159 	/// Get number of recognition errors (lexer, parser, tree parser).  Each
160 	/// recognizer tracks its own number.  So parser and lexer each have
161 	/// separate count.  Does not count the spurious errors found between
162 	/// an error and next valid token match
163 	///
164 	/// \see reportError()
165 	///
166 	ANTLR_UINT32 getNumberOfSyntaxErrors();
167 
168     /** Function that recovers from an error found in the input stream.
169      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
170      *  be from a mismatched token that the (*match)() could not recover from.
171      */
172     void	recover();
173 
174     /** function that is a hook to listen to token consumption during error recovery.
175      *  This is mainly used by the debug parser to send events to the listener.
176      */
177     void	beginResync();
178 
179     /** function that is a hook to listen to token consumption during error recovery.
180      *  This is mainly used by the debug parser to send events to the listener.
181      */
182     void	endResync();
183 
184 	/** function that is a hook to listen to token consumption during error recovery.
185      *  This is mainly used by the debug parser to send events to the listener.
186      */
187     void	beginBacktrack(ANTLR_UINT32 level);
188 
189     /** function that is a hook to listen to token consumption during error recovery.
190      *  This is mainly used by the debug parser to send events to the listener.
191      */
192     void	endBacktrack(ANTLR_UINT32 level, bool successful);
193 
194     /// Compute the error recovery set for the current rule.
195 	/// Documentation below is from the Java implementation.
196 	///
197 	/// During rule invocation, the parser pushes the set of tokens that can
198 	/// follow that rule reference on the stack; this amounts to
199 	/// computing FIRST of what follows the rule reference in the
200 	/// enclosing rule. This local follow set only includes tokens
201 	/// from within the rule; i.e., the FIRST computation done by
202 	/// ANTLR stops at the end of a rule.
203 	//
204 	/// EXAMPLE
205 	//
206 	/// When you find a "no viable alt exception", the input is not
207 	/// consistent with any of the alternatives for rule r.  The best
208 	/// thing to do is to consume tokens until you see something that
209 	/// can legally follow a call to r *or* any rule that called r.
210 	/// You don't want the exact set of viable next tokens because the
211 	/// input might just be missing a token--you might consume the
212 	/// rest of the input looking for one of the missing tokens.
213 	///
214 	/// Consider grammar:
215 	///
216 	/// a : '[' b ']'
217 	///   | '(' b ')'
218 	///   ;
219 	/// b : c '^' INT ;
220 	/// c : ID
221 	///   | INT
222 	///   ;
223 	///
224 	/// At each rule invocation, the set of tokens that could follow
225 	/// that rule is pushed on a stack.  Here are the various "local"
226 	/// follow sets:
227 	///
228 	/// FOLLOW(b1_in_a) = FIRST(']') = ']'
229 	/// FOLLOW(b2_in_a) = FIRST(')') = ')'
230 	/// FOLLOW(c_in_b) = FIRST('^') = '^'
231 	///
232 	/// Upon erroneous input "[]", the call chain is
233 	///
234 	/// a -> b -> c
235 	///
236 	/// and, hence, the follow context stack is:
237 	///
238 	/// depth  local follow set     after call to rule
239 	///   0         <EOF>                    a (from main())
240 	///   1          ']'                     b
241 	///   3          '^'                     c
242 	///
243 	/// Notice that ')' is not included, because b would have to have
244 	/// been called from a different context in rule a for ')' to be
245 	/// included.
246 	///
247 	/// For error recovery, we cannot consider FOLLOW(c)
248 	/// (context-sensitive or otherwise).  We need the combined set of
249 	/// all context-sensitive FOLLOW sets--the set of all tokens that
250 	/// could follow any reference in the call chain.  We need to
251 	/// resync to one of those tokens.  Note that FOLLOW(c)='^' and if
252 	/// we resync'd to that token, we'd consume until EOF.  We need to
253 	/// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
254 	/// In this case, for input "[]", LA(1) is in this set so we would
255 	/// not consume anything and after printing an error rule c would
256 	/// return normally.  It would not find the required '^' though.
257 	/// At this point, it gets a mismatched token error and throws an
258 	/// exception (since LA(1) is not in the viable following token
259 	/// set).  The rule exception handler tries to recover, but finds
260 	/// the same recovery set and doesn't consume anything.  Rule b
261 	/// exits normally returning to rule a.  Now it finds the ']' (and
262 	/// with the successful match exits errorRecovery mode).
263 	///
264 	/// So, you can see that the parser walks up call chain looking
265 	/// for the token that was a member of the recovery set.
266 	///
267 	/// Errors are not generated in errorRecovery mode.
268 	///
269 	/// ANTLR's error recovery mechanism is based upon original ideas:
270 	///
271 	/// "Algorithms + Data Structures = Programs" by Niklaus Wirth
272 	///
273 	/// and
274 	///
275 	/// "A note on error recovery in recursive descent parsers":
276 	/// http://portal.acm.org/citation.cfm?id=947902.947905
277 	///
278 	/// Later, Josef Grosch had some good ideas:
279 	///
280 	/// "Efficient and Comfortable Error Recovery in Recursive Descent
281 	/// Parsers":
282 	/// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
283 	///
284 	/// Like Grosch I implemented local FOLLOW sets that are combined
285 	/// at run-time upon error to avoid overhead during parsing.
286 	///
287     BitsetType*	computeErrorRecoverySet();
288 
289     /// Compute the context-sensitive FOLLOW set for current rule.
290 	/// Documentation below is from the Java runtime.
291 	///
292 	/// This is the set of token types that can follow a specific rule
293 	/// reference given a specific call chain.  You get the set of
294 	/// viable tokens that can possibly come next (look ahead depth 1)
295 	/// given the current call chain.  Contrast this with the
296 	/// definition of plain FOLLOW for rule r:
297 	///
298 	///  FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
299 	///
300 	/// where x in T* and alpha, beta in V*; T is set of terminals and
301 	/// V is the set of terminals and non terminals.  In other words,
302 	/// FOLLOW(r) is the set of all tokens that can possibly follow
303 	/// references to r in///any* sentential form (context).  At
304 	/// runtime, however, we know precisely which context applies as
305 	/// we have the call chain.  We may compute the exact (rather
306 	/// than covering superset) set of following tokens.
307 	///
308 	/// For example, consider grammar:
309 	///
310 	/// stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
311 	///      | "return" expr '.'
312 	///      ;
313 	/// expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
314 	/// atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
315 	///      | '(' expr ')'
316 	///      ;
317 	///
318 	/// The FOLLOW sets are all inclusive whereas context-sensitive
319 	/// FOLLOW sets are precisely what could follow a rule reference.
320 	/// For input input "i=(3);", here is the derivation:
321 	///
322 	/// stat => ID '=' expr ';'
323 	///      => ID '=' atom ('+' atom)* ';'
324 	///      => ID '=' '(' expr ')' ('+' atom)* ';'
325 	///      => ID '=' '(' atom ')' ('+' atom)* ';'
326 	///      => ID '=' '(' INT ')' ('+' atom)* ';'
327 	///      => ID '=' '(' INT ')' ';'
328 	///
329 	/// At the "3" token, you'd have a call chain of
330 	///
331 	///   stat -> expr -> atom -> expr -> atom
332 	///
333 	/// What can follow that specific nested ref to atom?  Exactly ')'
334 	/// as you can see by looking at the derivation of this specific
335 	/// input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
336 	///
337 	/// You want the exact viable token set when recovering from a
338 	/// token mismatch.  Upon token mismatch, if LA(1) is member of
339 	/// the viable next token set, then you know there is most likely
340 	/// a missing token in the input stream.  "Insert" one by just not
341 	/// throwing an exception.
342 	///
343     BitsetType*	computeCSRuleFollow();
344 
345     /// Compute the current followset for the input stream.
346 	///
347     BitsetType*	combineFollows(bool exact);
348 
349     /// Attempt to recover from a single missing or extra token.
350 	///
351 	/// EXTRA TOKEN
352 	///
353 	/// LA(1) is not what we are looking for.  If LA(2) has the right token,
354 	/// however, then assume LA(1) is some extra spurious token.  Delete it
355 	/// and LA(2) as if we were doing a normal match(), which advances the
356 	/// input.
357 	///
358 	/// MISSING TOKEN
359 	///
360 	/// If current token is consistent with what could come after
361 	/// ttype then it is ok to "insert" the missing token, else throw
362 	/// exception For example, Input "i=(3;" is clearly missing the
363 	/// ')'.  When the parser returns from the nested call to expr, it
364 	/// will have call chain:
365 	///
366 	///    stat -> expr -> atom
367 	///
368 	/// and it will be trying to match the ')' at this point in the
369 	/// derivation:
370 	///
371 	///       => ID '=' '(' INT ')' ('+' atom)* ';'
372 	///                          ^
373 	/// match() will see that ';' doesn't match ')' and report a
374 	/// mismatched token error.  To recover, it sees that LA(1)==';'
375 	/// is in the set of tokens that can follow the ')' token
376 	/// reference in rule atom.  It can assume that you forgot the ')'.
377 	///
378 	/// The exception that was passed in, in the java implementation is
379 	/// sorted in the recognizer exception stack in the C version. To 'throw' it we set the
380 	/// error flag and rules cascade back when this is set.
381 	///
382     const UnitType* recoverFromMismatchedToken( ANTLR_UINT32	ttype, BitsetListType*	follow);
383 
384     /** Function that recovers from a mismatched set in the token stream, in a similar manner
385      *  to (*recoverFromMismatchedToken)
386      */
387     const UnitType* recoverFromMismatchedSet(BitsetListType*	follow);
388 
389     /** common routine to handle single token insertion for recovery functions.
390      */
391 	/// This code is factored out from mismatched token and mismatched set
392 	///  recovery.  It handles "single token insertion" error recovery for
393 	/// both.  No tokens are consumed to recover from insertions.  Return
394 	/// true if recovery was possible else return false.
395 	///
396     bool	recoverFromMismatchedElement(BitsetListType*	follow);
397 
398     /** function that consumes input until the next token matches
399      *  the given token.
400      */
401     void	consumeUntil(ANTLR_UINT32   tokenType);
402 
403     /** function that consumes input until the next token matches
404      *  one in the given set.
405      */
406     void	consumeUntilSet(BitsetType*	set);
407 
408     /** function that returns an ANTLR3_LIST of the strings that identify
409      *  the rules in the parser that got you to this point. Can be overridden by installing your
410      *	own function set.
411      *
412      * \todo Document how to override invocation stack functions.
413      */
414 	StringStackType	getRuleInvocationStack();
415 	StringStackType	getRuleInvocationStackNamed(ANTLR_UINT8*    name);
416 
417     /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of
418      *  string token names. As this is mostly used in string template processing it may not be useful
419      *  in the C runtime.
420      */
421     StringListType	toStrings( const StringListType& );
422 
423     /** function to return whether the rule has parsed input starting at the supplied
424      *  start index before. If the rule has not parsed input starting from the supplied start index,
425      *  then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point
426      *  then it will return the point where it last stopped parsing after that start point.
427      */
428     ANTLR_MARKER	getRuleMemoization( ANTLR_INTKEY	ruleIndex,
429 												ANTLR_MARKER	ruleParseStart);
430 
431     /** function that determines whether the rule has parsed input at the current index
432      *  in the input stream
433      */
434     bool	alreadyParsedRule(ANTLR_MARKER	ruleIndex);
435 
436     /** Function that records whether the rule has parsed the input at a
437      *  current position successfully or not.
438      */
439     void	memoize(ANTLR_MARKER	ruleIndex,
440 								ANTLR_MARKER	ruleParseStart);
441 
442 	/// Function that returns the current input symbol.
443     /// The is placed into any label for the associated token ref; e.g., x=ID.  Token
444 	/// and tree parsers need to return different objects. Rather than test
445 	/// for input stream type or change the IntStream interface, I use
446 	/// a simple method to ask the recognizer to tell me what the current
447 	/// input symbol is.
448 	///
449 	/// This is ignored for lexers and the lexer implementation of this
450 	/// function should return NULL.
451 	///
452 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream);
453 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>);
454 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>);
455 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>);
456 
457 	/// Conjure up a missing token during error recovery.
458 	///
459 	/// The recognizer attempts to recover from single missing
460 	/// symbols. But, actions might refer to that missing symbol.
461 	/// For example, x=ID {f($x);}. The action clearly assumes
462 	/// that there has been an identifier matched previously and that
463 	/// $x points at that token. If that token is missing, but
464 	/// the next token in the stream is what we want we assume that
465 	/// this token is missing and we keep going. Because we
466 	/// have to return some token to replace the missing token,
467 	/// we have to conjure one up. This method gives the user control
468 	/// over the tokens returned for missing tokens. Mostly,
469 	/// you will want to create something special for identifier
470 	/// tokens. For literals such as '{' and ',', the default
471 	/// action in the parser or tree parser works. It simply creates
472 	/// a CommonToken of the appropriate type. The text will be the token.
473 	/// If you change what tokens must be created by the lexer,
474 	/// override this method to create the appropriate tokens.
475 	///
476 	UnitType*	getMissingSymbol( IntStreamType*		istream, ExceptionBaseType*		e,
477 												ANTLR_UINT32			expectedTokenType,
478 												BitsetListType*		follow);
479 
480     /** Function that returns whether the supplied grammar function
481      *  will parse the current input stream or not. This is the way that syntactic
482      *  predicates are evaluated. Unlike java, C is perfectly happy to invoke code
483      *  via a pointer to a function (hence that's what all the ANTLR3 C interfaces
484      *  do.
485      */
486 	template<typename Predicate>
487     bool  synpred( ClassForwarder<Predicate> );
488 
489 	//In place of exConstruct, just directly instantiate the Exception Object
490 
491     /** Reset the recognizer
492      */
493     void  reset();
494 	void  reset( ClassForwarder<LexerType> );
495 	template<typename CompType>
496 	void  reset( ClassForwarder<CompType> );
497 
498 	void exConstruct();
499 
500     ~BaseRecognizer();
501 
502 };
503 
504 ANTLR_END_NAMESPACE()
505 
506 #include "antlr3baserecognizer.inl"
507 
508 /// @}
509 ///
510 
511 #endif	    /* _ANTLR3_BASERECOGNIZER_H	*/
512 
513