/** \file
 * Base interface for any ANTLR3 lexer.
 *
 * An ANTLR3 lexer builds from two sets of components:
 *
 *  - The runtime components that provide common functionality such as
 *    traversing character streams, building tokens for output and so on.
 *  - The generated rules and structure of the actual lexer, which call upon the
 *    runtime components.
 *
 * A lexer class contains a character input stream, a base recognizer interface
 * (which it will normally implement) and a token source interface (which it also
 * implements). The token source interface is called by a token consumer (such as
 * a parser, but in theory it can be anything that wants a set of abstract
 * tokens in place of a raw character stream).
 *
 * So then, we set up a lexer in a sequence akin to:
 *
 *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
 *    and initialize it.
 *  - Create a lexer interface and tell it where its input stream is.
 *    This will cause the creation of a base recognizer class, which it will
 *    override with its own implementations of some methods. The lexer creator
 *    can also then in turn override anything it likes.
 *  - The lexer token source interface is then passed to some interface that
 *    knows how to use it, by calling for a next token.
 *  - When a next token is called, let ze lexing begin.
 *
 */
#ifndef _ANTLR3_LEXER_HPP
#define _ANTLR3_LEXER_HPP

// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60 61 /* Definitions 62 */ 63 #include "antlr3defs.hpp" 64 65 ANTLR_BEGIN_NAMESPACE() 66 67 static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF; 68 69 template<class ImplTraits> 70 class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >, 71 public ImplTraits::TokenSourceType 72 { 73 public: 74 typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 75 typedef typename ImplTraits::InputStreamType InputStreamType; 76 typedef InputStreamType StreamType; 77 typedef typename InputStreamType::IntStreamType IntStreamType; 78 typedef typename ImplTraits::CommonTokenType CommonTokenType; 79 typedef typename ImplTraits::StreamDataType TokenType; 80 typedef typename ImplTraits::StringType StringType; 81 typedef typename ImplTraits::StringStreamType StringStreamType; 82 typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType; 83 typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType; 84 typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType; 85 typedef typename ImplTraits::BitsetListType BitsetListType; 86 typedef typename ImplTraits::TokenSourceType TokenSourceType; 87 88 typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType; 89 typedef typename RecognizerType::DebugEventListenerType DebuggerType; 90 91 private: 92 /** A pointer to the character stream whence this lexer is receiving 93 * characters. 94 * TODO: I may come back to this and implement charstream outside 95 * the input stream as per the java implementation. 
96 */ 97 InputStreamType* m_input; 98 99 public: 100 Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); 101 Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state); 102 103 InputStreamType* get_input() const; 104 IntStreamType* get_istream() const; 105 RecognizerType* get_rec(); 106 const RecognizerType* get_rec() const; 107 TokenSourceType* get_tokSource(); 108 109 //functions used in .stg file 110 const RecognizerType* get_recognizer() const; 111 RecognizerSharedStateType* get_lexstate() const; 112 void set_lexstate( RecognizerSharedStateType* lexstate ); 113 const TokenSourceType* get_tokSource() const; 114 CommonTokenType* get_ltoken() const; 115 void set_ltoken( const CommonTokenType* ltoken ); 116 bool hasFailed() const; 117 ANTLR_INT32 get_backtracking() const; 118 void inc_backtracking(); 119 void dec_backtracking(); 120 bool get_failedflag() const; 121 void set_failedflag( bool failed ); 122 InputStreamType* get_strstream() const; 123 ANTLR_MARKER index() const; 124 void seek(ANTLR_MARKER index); 125 const CommonTokenType* EOF_Token() const; 126 bool hasException() const; 127 ExceptionBaseType* get_exception() const; 128 void constructEx(); 129 void lrecover(); 130 ANTLR_MARKER mark(); 131 void rewind(ANTLR_MARKER marker); 132 void rewindLast(); 133 void setText( const StringType& text ); 134 void skip(); 135 RuleMemoType* getRuleMemo() const; 136 DebuggerType* get_debugger() const; 137 void setRuleMemo(RuleMemoType* rulememo); 138 ANTLR_UINT32 LA(ANTLR_INT32 i); 139 void consume(); 140 void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart); 141 bool haveParsedRule(ANTLR_MARKER ruleIndex); 142 143 /** Pointer to a function that sets the charstream source for the lexer and 144 * causes it to be reset. 145 */ 146 void setCharStream(InputStreamType* input); 147 148 /*! 149 * \brief 150 * Change to a new input stream, remembering the old one. 
151 * 152 * \param lexer 153 * Pointer to the lexer instance to switch input streams for. 154 * 155 * \param input 156 * New input stream to install as the current one. 157 * 158 * Switches the current character input stream to 159 * a new one, saving the old one, which we will revert to at the end of this 160 * new one. 161 */ 162 void pushCharStream(InputStreamType* input); 163 164 /*! 165 * \brief 166 * Stops using the current input stream and reverts to any prior 167 * input stream on the stack. 168 * 169 * \param lexer 170 * Description of parameter lexer. 171 * 172 * Pointer to a function that abandons the current input stream, whether it 173 * is empty or not and reverts to the previous stacked input stream. 174 * 175 * \remark 176 * The function fails silently if there are no prior input streams. 177 */ 178 void popCharStream(); 179 180 /** Function that emits (a copy of ) the supplied token as the next token in 181 * the stream. 182 */ 183 void emit(const CommonTokenType* token); 184 185 /** Pointer to a function that constructs a new token from the lexer stored information 186 */ 187 CommonTokenType* emit(); 188 189 /** Pointer to a function that attempts to match and consume the specified string from the input 190 * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated 191 * with 0xFFFFFFFF, which is an invalid UTF32 character 192 */ 193 bool matchs(ANTLR_UCHAR* string); 194 195 /** Pointer to a function that matches and consumes the specified character from the input stream. 196 * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer 197 * implementation is source encoding agnostic and so input streams do not generally need to 198 * override the default implmentation. 
199 */ 200 bool matchc(ANTLR_UCHAR c); 201 202 /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too 203 * but this would only be useful if the tokens were in tsome guaranteed order which is 204 * only going to happen with a hand crafted token set). 205 */ 206 bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high); 207 208 /** Pointer to a function that matches the next token/char in the input stream 209 * regardless of what it actaully is. 210 */ 211 void matchAny(); 212 213 /** Pointer to a function that recovers from an error found in the input stream. 214 * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 215 * be from a mismatched token that the (*match)() could not recover from. 216 */ 217 void recover(); 218 219 /** Function to return the current line number in the input stream 220 */ 221 ANTLR_UINT32 getLine(); 222 ANTLR_MARKER getCharIndex(); 223 ANTLR_UINT32 getCharPositionInLine(); 224 225 /** Function to return the text so far for the current token being generated 226 */ 227 StringType getText(); 228 229 //Other utility functions 230 void fillExceptionData( ExceptionBaseType* ex ); 231 232 /** Default lexer error handler (works for 8 bit streams only!!!) 233 */ 234 void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex); 235 void exConstruct(); 236 TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, 237 ANTLR_UINT32 expectedTokenType, BitsetListType* follow); 238 239 /** Pointer to a function that knows how to free the resources of a lexer 240 */ 241 ~Lexer(); 242 }; 243 244 ANTLR_END_NAMESPACE() 245 246 #include "antlr3lexer.inl" 247 248 #endif 249