1 /** \file 2 * Defines the basic structure to support recognizing by either a lexer, 3 * parser, or tree parser. 4 * \addtogroup BaseRecognizer 5 * @{ 6 */ 7 #ifndef _ANTLR3_BASERECOGNIZER_HPP 8 #define _ANTLR3_BASERECOGNIZER_HPP 9 10 // [The "BSD licence"] 11 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 12 13 // 14 // All rights reserved. 15 // 16 // Redistribution and use in source and binary forms, with or without 17 // modification, are permitted provided that the following conditions 18 // are met: 19 // 1. Redistributions of source code must retain the above copyright 20 // notice, this list of conditions and the following disclaimer. 21 // 2. Redistributions in binary form must reproduce the above copyright 22 // notice, this list of conditions and the following disclaimer in the 23 // documentation and/or other materials provided with the distribution. 24 // 3. The name of the author may not be used to endorse or promote products 25 // derived from this software without specific prior written permission. 26 // 27 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 28 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 29 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 30 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 31 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 32 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 36 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37 38 #include "antlr3defs.hpp" 39 #include "antlr3collections.hpp" 40 41 ANTLR_BEGIN_NAMESPACE() 42 43 /** \brief Base tracking context structure for all types of 44 * recognizers. 45 */ 46 template< class ImplTraits, class StreamType > 47 class BaseRecognizer : public ImplTraits::AllocPolicyType 48 { 49 public: 50 typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 51 typedef typename StreamType::IntStreamType IntStreamType; 52 typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType; 53 typedef typename StreamType::UnitType UnitType; 54 typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType; 55 typedef typename ImplTraits::BitsetType BitsetType; 56 typedef typename ImplTraits::BitsetListType BitsetListType; 57 typedef typename ImplTraits::StringType StringType; 58 typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType; 59 typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType; 60 typedef typename ImplTraits::LexerType LexerType; 61 typedef typename ImplTraits::ParserType ParserType; 62 typedef typename ImplTraits::TreeParserType TreeParserType; 63 64 typedef typename AllocPolicyType::template StackType<StringType> StringStackType; 65 typedef typename AllocPolicyType::template ListType<StringType> StringListType; 66 67 private: 68 /// A pointer to the shared recognizer state, such that multiple 69 /// recognizers can use the same inputs streams and so on (in 70 /// the case of grammar inheritance for instance. 71 /// 72 RecognizerSharedStateType* m_state; 73 74 /// If set to something other than NULL, then this structure is 75 /// points to an instance of the debugger interface. In general, the 76 /// debugger is only referenced internally in recovery/error operations 77 /// so that it does not cause overhead by having to check this pointer 78 /// in every function/method 79 /// 80 DebugEventListenerType* m_debugger; 81 82 83 public: 84 BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); 85 86 SuperType* get_super(); 87 RecognizerSharedStateType* get_state() const; 88 DebugEventListenerType* get_debugger() const; 89 void set_state( RecognizerSharedStateType* state ); 90 void set_debugger( DebugEventListenerType* debugger ); 91 92 /// Match current input symbol against ttype. Upon error, do one token 93 /// insertion or deletion if possible. 94 /// To turn off single token insertion or deletion error 95 /// recovery, override mismatchRecover() and have it call 96 /// plain mismatch(), which does not recover. Then any error 97 /// in a rule will cause an exception and immediate exit from 98 /// rule. Rule would recover by resynchronizing to the set of 99 /// symbols that can follow rule ref. 100 /// 101 const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow); 102 103 /// Consumes the next token, whatever it is, and resets the recognizer state 104 /// so that it is not in error. 105 /// 106 /// \param recognizer 107 /// Recognizer context pointer 108 /// 109 void matchAny(); 110 111 /// function that decides if the token ahead of the current one is the 112 /// one we were loking for, in which case the curernt one is very likely extraneous 113 /// and can be reported that way. 114 /// 115 bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype); 116 117 /// function that decides if the current token is one that can logically 118 /// follow the one we were looking for, in which case the one we were looking for is 119 /// probably missing from the input. 120 /// 121 bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow); 122 123 /// Factor out what to do upon token mismatch so tree parsers can behave 124 /// differently. Override and call mismatchRecover(input, ttype, follow) 125 /// to get single token insertion and deletion. Use this to turn off 126 /// single token insertion and deletion. Override mismatchRecover 127 /// to call this instead. 128 /// 129 /// \remark mismatch only works for parsers and must be overridden for anything else. 130 /// 131 void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow); 132 133 /// Report a recognition problem. 134 /// 135 /// This method sets errorRecovery to indicate the parser is recovering 136 /// not parsing. Once in recovery mode, no errors are generated. 137 /// To get out of recovery mode, the parser must successfully match 138 /// a token (after a resync). So it will go: 139 /// 140 /// 1. error occurs 141 /// 2. enter recovery mode, report error 142 /// 3. consume until token found in resynch set 143 /// 4. try to resume parsing 144 /// 5. next match() will reset errorRecovery mode 145 /// 146 /// If you override, make sure to update errorCount if you care about that. 147 /// 148 void reportError(); 149 void reportError( ClassForwarder<LexerType> ); 150 template<typename CompType> 151 void reportError( ClassForwarder<CompType> ); 152 153 /** Function that is called to display a recognition error message. You may 154 * override this function independently of (*reportError)() above as that function calls 155 * this one to do the actual exception printing. 156 */ 157 void displayRecognitionError(ANTLR_UINT8** tokenNames); 158 159 /// Get number of recognition errors (lexer, parser, tree parser). Each 160 /// recognizer tracks its own number. So parser and lexer each have 161 /// separate count. Does not count the spurious errors found between 162 /// an error and next valid token match 163 /// 164 /// \see reportError() 165 /// 166 ANTLR_UINT32 getNumberOfSyntaxErrors(); 167 168 /** Function that recovers from an error found in the input stream. 169 * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 170 * be from a mismatched token that the (*match)() could not recover from. 171 */ 172 void recover(); 173 174 /** function that is a hook to listen to token consumption during error recovery. 175 * This is mainly used by the debug parser to send events to the listener. 176 */ 177 void beginResync(); 178 179 /** function that is a hook to listen to token consumption during error recovery. 180 * This is mainly used by the debug parser to send events to the listener. 181 */ 182 void endResync(); 183 184 /** function that is a hook to listen to token consumption during error recovery. 185 * This is mainly used by the debug parser to send events to the listener. 186 */ 187 void beginBacktrack(ANTLR_UINT32 level); 188 189 /** function that is a hook to listen to token consumption during error recovery. 190 * This is mainly used by the debug parser to send events to the listener. 191 */ 192 void endBacktrack(ANTLR_UINT32 level, bool successful); 193 194 /// Compute the error recovery set for the current rule. 195 /// Documentation below is from the Java implementation. 196 /// 197 /// During rule invocation, the parser pushes the set of tokens that can 198 /// follow that rule reference on the stack; this amounts to 199 /// computing FIRST of what follows the rule reference in the 200 /// enclosing rule. This local follow set only includes tokens 201 /// from within the rule; i.e., the FIRST computation done by 202 /// ANTLR stops at the end of a rule. 203 // 204 /// EXAMPLE 205 // 206 /// When you find a "no viable alt exception", the input is not 207 /// consistent with any of the alternatives for rule r. The best 208 /// thing to do is to consume tokens until you see something that 209 /// can legally follow a call to r *or* any rule that called r. 210 /// You don't want the exact set of viable next tokens because the 211 /// input might just be missing a token--you might consume the 212 /// rest of the input looking for one of the missing tokens. 213 /// 214 /// Consider grammar: 215 /// 216 /// a : '[' b ']' 217 /// | '(' b ')' 218 /// ; 219 /// b : c '^' INT ; 220 /// c : ID 221 /// | INT 222 /// ; 223 /// 224 /// At each rule invocation, the set of tokens that could follow 225 /// that rule is pushed on a stack. Here are the various "local" 226 /// follow sets: 227 /// 228 /// FOLLOW(b1_in_a) = FIRST(']') = ']' 229 /// FOLLOW(b2_in_a) = FIRST(')') = ')' 230 /// FOLLOW(c_in_b) = FIRST('^') = '^' 231 /// 232 /// Upon erroneous input "[]", the call chain is 233 /// 234 /// a -> b -> c 235 /// 236 /// and, hence, the follow context stack is: 237 /// 238 /// depth local follow set after call to rule 239 /// 0 <EOF> a (from main()) 240 /// 1 ']' b 241 /// 3 '^' c 242 /// 243 /// Notice that ')' is not included, because b would have to have 244 /// been called from a different context in rule a for ')' to be 245 /// included. 246 /// 247 /// For error recovery, we cannot consider FOLLOW(c) 248 /// (context-sensitive or otherwise). We need the combined set of 249 /// all context-sensitive FOLLOW sets--the set of all tokens that 250 /// could follow any reference in the call chain. We need to 251 /// resync to one of those tokens. Note that FOLLOW(c)='^' and if 252 /// we resync'd to that token, we'd consume until EOF. We need to 253 /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 254 /// In this case, for input "[]", LA(1) is in this set so we would 255 /// not consume anything and after printing an error rule c would 256 /// return normally. It would not find the required '^' though. 257 /// At this point, it gets a mismatched token error and throws an 258 /// exception (since LA(1) is not in the viable following token 259 /// set). The rule exception handler tries to recover, but finds 260 /// the same recovery set and doesn't consume anything. Rule b 261 /// exits normally returning to rule a. Now it finds the ']' (and 262 /// with the successful match exits errorRecovery mode). 263 /// 264 /// So, you can see that the parser walks up call chain looking 265 /// for the token that was a member of the recovery set. 266 /// 267 /// Errors are not generated in errorRecovery mode. 268 /// 269 /// ANTLR's error recovery mechanism is based upon original ideas: 270 /// 271 /// "Algorithms + Data Structures = Programs" by Niklaus Wirth 272 /// 273 /// and 274 /// 275 /// "A note on error recovery in recursive descent parsers": 276 /// http://portal.acm.org/citation.cfm?id=947902.947905 277 /// 278 /// Later, Josef Grosch had some good ideas: 279 /// 280 /// "Efficient and Comfortable Error Recovery in Recursive Descent 281 /// Parsers": 282 /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 283 /// 284 /// Like Grosch I implemented local FOLLOW sets that are combined 285 /// at run-time upon error to avoid overhead during parsing. 286 /// 287 BitsetType* computeErrorRecoverySet(); 288 289 /// Compute the context-sensitive FOLLOW set for current rule. 290 /// Documentation below is from the Java runtime. 291 /// 292 /// This is the set of token types that can follow a specific rule 293 /// reference given a specific call chain. You get the set of 294 /// viable tokens that can possibly come next (look ahead depth 1) 295 /// given the current call chain. Contrast this with the 296 /// definition of plain FOLLOW for rule r: 297 /// 298 /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 299 /// 300 /// where x in T* and alpha, beta in V*; T is set of terminals and 301 /// V is the set of terminals and non terminals. In other words, 302 /// FOLLOW(r) is the set of all tokens that can possibly follow 303 /// references to r in///any* sentential form (context). At 304 /// runtime, however, we know precisely which context applies as 305 /// we have the call chain. We may compute the exact (rather 306 /// than covering superset) set of following tokens. 307 /// 308 /// For example, consider grammar: 309 /// 310 /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 311 /// | "return" expr '.' 312 /// ; 313 /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 314 /// atom : INT // FOLLOW(atom)=={'+',')',';','.'} 315 /// | '(' expr ')' 316 /// ; 317 /// 318 /// The FOLLOW sets are all inclusive whereas context-sensitive 319 /// FOLLOW sets are precisely what could follow a rule reference. 320 /// For input input "i=(3);", here is the derivation: 321 /// 322 /// stat => ID '=' expr ';' 323 /// => ID '=' atom ('+' atom)* ';' 324 /// => ID '=' '(' expr ')' ('+' atom)* ';' 325 /// => ID '=' '(' atom ')' ('+' atom)* ';' 326 /// => ID '=' '(' INT ')' ('+' atom)* ';' 327 /// => ID '=' '(' INT ')' ';' 328 /// 329 /// At the "3" token, you'd have a call chain of 330 /// 331 /// stat -> expr -> atom -> expr -> atom 332 /// 333 /// What can follow that specific nested ref to atom? Exactly ')' 334 /// as you can see by looking at the derivation of this specific 335 /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 336 /// 337 /// You want the exact viable token set when recovering from a 338 /// token mismatch. Upon token mismatch, if LA(1) is member of 339 /// the viable next token set, then you know there is most likely 340 /// a missing token in the input stream. "Insert" one by just not 341 /// throwing an exception. 342 /// 343 BitsetType* computeCSRuleFollow(); 344 345 /// Compute the current followset for the input stream. 346 /// 347 BitsetType* combineFollows(bool exact); 348 349 /// Attempt to recover from a single missing or extra token. 350 /// 351 /// EXTRA TOKEN 352 /// 353 /// LA(1) is not what we are looking for. If LA(2) has the right token, 354 /// however, then assume LA(1) is some extra spurious token. Delete it 355 /// and LA(2) as if we were doing a normal match(), which advances the 356 /// input. 357 /// 358 /// MISSING TOKEN 359 /// 360 /// If current token is consistent with what could come after 361 /// ttype then it is ok to "insert" the missing token, else throw 362 /// exception For example, Input "i=(3;" is clearly missing the 363 /// ')'. When the parser returns from the nested call to expr, it 364 /// will have call chain: 365 /// 366 /// stat -> expr -> atom 367 /// 368 /// and it will be trying to match the ')' at this point in the 369 /// derivation: 370 /// 371 /// => ID '=' '(' INT ')' ('+' atom)* ';' 372 /// ^ 373 /// match() will see that ';' doesn't match ')' and report a 374 /// mismatched token error. To recover, it sees that LA(1)==';' 375 /// is in the set of tokens that can follow the ')' token 376 /// reference in rule atom. It can assume that you forgot the ')'. 377 /// 378 /// The exception that was passed in, in the java implementation is 379 /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the 380 /// error flag and rules cascade back when this is set. 381 /// 382 const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow); 383 384 /** Function that recovers from a mismatched set in the token stream, in a similar manner 385 * to (*recoverFromMismatchedToken) 386 */ 387 const UnitType* recoverFromMismatchedSet(BitsetListType* follow); 388 389 /** common routine to handle single token insertion for recovery functions. 390 */ 391 /// This code is factored out from mismatched token and mismatched set 392 /// recovery. It handles "single token insertion" error recovery for 393 /// both. No tokens are consumed to recover from insertions. Return 394 /// true if recovery was possible else return false. 395 /// 396 bool recoverFromMismatchedElement(BitsetListType* follow); 397 398 /** function that consumes input until the next token matches 399 * the given token. 400 */ 401 void consumeUntil(ANTLR_UINT32 tokenType); 402 403 /** function that consumes input until the next token matches 404 * one in the given set. 405 */ 406 void consumeUntilSet(BitsetType* set); 407 408 /** function that returns an ANTLR3_LIST of the strings that identify 409 * the rules in the parser that got you to this point. Can be overridden by installing your 410 * own function set. 411 * 412 * \todo Document how to override invocation stack functions. 413 */ 414 StringStackType getRuleInvocationStack(); 415 StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name); 416 417 /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of 418 * string token names. As this is mostly used in string template processing it may not be useful 419 * in the C runtime. 420 */ 421 StringListType toStrings( const StringListType& ); 422 423 /** function to return whether the rule has parsed input starting at the supplied 424 * start index before. If the rule has not parsed input starting from the supplied start index, 425 * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point 426 * then it will return the point where it last stopped parsing after that start point. 427 */ 428 ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex, 429 ANTLR_MARKER ruleParseStart); 430 431 /** function that determines whether the rule has parsed input at the current index 432 * in the input stream 433 */ 434 bool alreadyParsedRule(ANTLR_MARKER ruleIndex); 435 436 /** Function that records whether the rule has parsed the input at a 437 * current position successfully or not. 438 */ 439 void memoize(ANTLR_MARKER ruleIndex, 440 ANTLR_MARKER ruleParseStart); 441 442 /// Function that returns the current input symbol. 443 /// The is placed into any label for the associated token ref; e.g., x=ID. Token 444 /// and tree parsers need to return different objects. Rather than test 445 /// for input stream type or change the IntStream interface, I use 446 /// a simple method to ask the recognizer to tell me what the current 447 /// input symbol is. 448 /// 449 /// This is ignored for lexers and the lexer implementation of this 450 /// function should return NULL. 451 /// 452 const UnitType* getCurrentInputSymbol(IntStreamType* istream); 453 const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>); 454 const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>); 455 const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>); 456 457 /// Conjure up a missing token during error recovery. 458 /// 459 /// The recognizer attempts to recover from single missing 460 /// symbols. But, actions might refer to that missing symbol. 461 /// For example, x=ID {f($x);}. The action clearly assumes 462 /// that there has been an identifier matched previously and that 463 /// $x points at that token. If that token is missing, but 464 /// the next token in the stream is what we want we assume that 465 /// this token is missing and we keep going. Because we 466 /// have to return some token to replace the missing token, 467 /// we have to conjure one up. This method gives the user control 468 /// over the tokens returned for missing tokens. Mostly, 469 /// you will want to create something special for identifier 470 /// tokens. For literals such as '{' and ',', the default 471 /// action in the parser or tree parser works. It simply creates 472 /// a CommonToken of the appropriate type. The text will be the token. 473 /// If you change what tokens must be created by the lexer, 474 /// override this method to create the appropriate tokens. 475 /// 476 UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, 477 ANTLR_UINT32 expectedTokenType, 478 BitsetListType* follow); 479 480 /** Function that returns whether the supplied grammar function 481 * will parse the current input stream or not. This is the way that syntactic 482 * predicates are evaluated. Unlike java, C is perfectly happy to invoke code 483 * via a pointer to a function (hence that's what all the ANTLR3 C interfaces 484 * do. 485 */ 486 template<typename Predicate> 487 bool synpred( ClassForwarder<Predicate> ); 488 489 //In place of exConstruct, just directly instantiate the Exception Object 490 491 /** Reset the recognizer 492 */ 493 void reset(); 494 void reset( ClassForwarder<LexerType> ); 495 template<typename CompType> 496 void reset( ClassForwarder<CompType> ); 497 498 void exConstruct(); 499 500 ~BaseRecognizer(); 501 502 }; 503 504 ANTLR_END_NAMESPACE() 505 506 #include "antlr3baserecognizer.inl" 507 508 /// @} 509 /// 510 511 #endif /* _ANTLR3_BASERECOGNIZER_H */ 512 513