1 /** \file
2  * Base interface for any ANTLR3 lexer.
3  *
4  * An ANTLR3 lexer builds from two sets of components:
5  *
6  *  - The runtime components that provide common functionality such as
7  *    traversing character streams, building tokens for output and so on.
8  *  - The generated rules and structure of the actual lexer, which call upon the
9  *    runtime components.
10  *
11  * A lexer class contains a character input stream, a base recognizer interface
12  * (which it will normally implement) and a token source interface (which it also
13  * implements). The token source interface is called by a token consumer (such as
14  * a parser, but in theory it can be anything that wants a set of abstract
15  * tokens in place of a raw character stream).
16  *
17  * So then, we set up a lexer in a sequence akin to:
18  *
19  *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
20  *    and initialize it.
21  *  - Create a lexer interface and tell it where its input stream is.
22  *    This will cause the creation of a base recognizer class, which it will
23  *    override with its own implementations of some methods. The lexer creator
24  *    can also then in turn override anything it likes.
25  *  - The lexer token source interface is then passed to some interface that
26  *    knows how to use it, by calling for a next token.
27  *  - When a next token is called, let ze lexing begin.
28  *
29  */
30 #ifndef	_ANTLR3_LEXER_HPP
31 #define	_ANTLR3_LEXER_HPP
32 
33 // [The "BSD licence"]
34 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
35 
36 //
37 // All rights reserved.
38 //
39 // Redistribution and use in source and binary forms, with or without
40 // modification, are permitted provided that the following conditions
41 // are met:
42 // 1. Redistributions of source code must retain the above copyright
43 //    notice, this list of conditions and the following disclaimer.
44 // 2. Redistributions in binary form must reproduce the above copyright
45 //    notice, this list of conditions and the following disclaimer in the
46 //    documentation and/or other materials provided with the distribution.
47 // 3. The name of the author may not be used to endorse or promote products
48 //    derived from this software without specific prior written permission.
49 //
50 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
51 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
52 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
53 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
54 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
55 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
56 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
57 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
58 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
59 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60 
61 /* Definitions
62  */
63 #include    "antlr3defs.hpp"
64 
65 ANTLR_BEGIN_NAMESPACE()
66 
67 static const ANTLR_UINT32	ANTLR_STRING_TERMINATOR	= 0xFFFFFFFF;	/**< End-of-string sentinel; same value as the 0xFFFFFFFF terminator (an invalid UTF32 character) that Lexer::matchs() documents for its ANTLR_UCHAR strings. */
68 
69 template<class ImplTraits>	// Runtime base class for lexers; it derives from the recognizer and token source types named by ImplTraits.
70 class  Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
71 			   public ImplTraits::TokenSourceType
72 {
73 public:	// Type aliases, all drawn from ImplTraits or from types that ImplTraits supplies.
74 	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
75 	typedef typename ImplTraits::InputStreamType InputStreamType;
76 	typedef InputStreamType StreamType;
77 	typedef typename InputStreamType::IntStreamType IntStreamType;
78 	typedef typename ImplTraits::CommonTokenType CommonTokenType;
79 	typedef typename ImplTraits::StreamDataType TokenType;
80 	typedef typename ImplTraits::StringType StringType;
81 	typedef typename ImplTraits::StringStreamType StringStreamType;
82 	typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
83 	typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
84 	typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
85 	typedef typename ImplTraits::BitsetListType BitsetListType;
86 	typedef typename ImplTraits::TokenSourceType TokenSourceType;
87 
88 	typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
89 	typedef typename RecognizerType::DebugEventListenerType DebuggerType;
90 
91 private:
92     /** The character stream from which this lexer is receiving
93      *  characters.
94      *  TODO: I may come back to this and implement charstream outside
95      *  the input stream as per the Java implementation.
96      */
97     InputStreamType*		m_input;
98 
99 public:
100 	Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);	// NOTE(review): takes no input stream; presumably one is supplied later via setCharStream() - confirm in antlr3lexer.inl
101 	Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);	// NOTE(review): presumably installs `input` as the initial character stream - confirm in antlr3lexer.inl
102 
103 	InputStreamType* get_input() const;
104 	IntStreamType* get_istream() const;
105 	RecognizerType* get_rec();
106 	const RecognizerType* get_rec() const;
107 	TokenSourceType* get_tokSource();
108 
109 	// Functions used by the code-generation templates (.stg files)
110 	const RecognizerType* get_recognizer() const;
111 	RecognizerSharedStateType* get_lexstate() const;
112 	void set_lexstate( RecognizerSharedStateType* lexstate );
113 	const TokenSourceType* get_tokSource() const;
114 	CommonTokenType* get_ltoken() const;
115 	void set_ltoken( const CommonTokenType* ltoken );
116 	bool hasFailed() const;
117 	ANTLR_INT32 get_backtracking() const;
118 	void inc_backtracking();
119 	void dec_backtracking();
120 	bool get_failedflag() const;
121 	void set_failedflag( bool failed );
122 	InputStreamType* get_strstream() const;
123 	ANTLR_MARKER  index() const;
124 	void	seek(ANTLR_MARKER index);
125 	const CommonTokenType* EOF_Token() const;
126 	bool hasException() const;
127 	ExceptionBaseType* get_exception() const;
128 	void constructEx();
129 	void lrecover();
130 	ANTLR_MARKER mark();
131 	void rewind(ANTLR_MARKER marker);
132 	void rewindLast();
133 	void setText( const StringType& text );
134 	void skip();
135 	RuleMemoType* getRuleMemo() const;
136 	DebuggerType* get_debugger() const;
137 	void setRuleMemo(RuleMemoType* rulememo);
138 	ANTLR_UINT32 LA(ANTLR_INT32 i);
139 	void consume();
140 	void memoize(ANTLR_MARKER	ruleIndex, ANTLR_MARKER	ruleParseStart);
141 	bool haveParsedRule(ANTLR_MARKER	ruleIndex);
142 
143     /** Sets the charstream source for the lexer and causes the lexer to be
144      *  reset. `input` is the character stream to use as that source.
145      */
146     void	setCharStream(InputStreamType* input);
147 
148     /*!
149 	 * \brief
150 	 * Change to a new input stream, remembering the old one.
151 	 *
152 	 * \param input
153 	 * New input stream to install as the current one.
154 	 *
155 	 * Switches the current character input stream to a new one, saving the
156 	 * old one, which we will revert to at the end of this new one.
157 	 *
158 	 * \see popCharStream(), which abandons the current stream and reverts to
159 	 * the previously stacked one.
160 	 *
161 	 */
162     void	pushCharStream(InputStreamType* input);
163 
164 	/*!
165 	 * \brief
166 	 * Stops using the current input stream and reverts to any prior
167 	 * input stream on the stack.
168 	 *
169 	 * Abandons the current input stream, whether it is empty or not, and
170 	 * reverts to the previous stacked input stream.
171 	 *
172 	 * \see pushCharStream(), which installs a new stream and saves the
173 	 * current one.
174 	 *
175 	 * \remark
176 	 * The function fails silently if there are no prior input streams.
177 	 */
178     void	popCharStream();
179 
180     /** Emits (a copy of) the supplied token as the next token in
181      *  the stream.
182      */
183     void	emit(const CommonTokenType* token);
184 
185     /** Constructs a new token from the information stored in the lexer.
186      */
187     CommonTokenType*	emit();
188 
189     /** Attempts to match and consume the specified string from the input
190      *  stream. Note that strings must be passed as terminated arrays of ANTLR_UCHAR. Strings are terminated
191      *  with 0xFFFFFFFF (the value of ANTLR_STRING_TERMINATOR), which is an invalid UTF32 character.
192      */
193     bool	matchs(ANTLR_UCHAR* string);
194 
195     /** Matches and consumes the specified character from the input stream.
196      *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
197      *  implementation is source encoding agnostic and so input streams do not generally need to
198      *  override the default implementation.
199      */
200     bool	matchc(ANTLR_UCHAR c);
201 
202     /** Matches any character in the supplied range (I suppose it could be a token range too
203      *  but this would only be useful if the tokens were in some guaranteed order which is
204      *  only going to happen with a hand crafted token set).
205      */
206     bool	matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);
207 
208     /** Matches the next token/char in the input stream
209      *  regardless of what it actually is.
210      */
211     void		matchAny();
212 
213     /** Recovers from an error found in the input stream.
214      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
215      *  be from a mismatched token that the (*match)() could not recover from.
216      */
217     void		recover();
218 
219     /** Returns the current line number in the input stream.
220      */
221     ANTLR_UINT32	getLine();
222     ANTLR_MARKER	getCharIndex();
223     ANTLR_UINT32	getCharPositionInLine();
224 
225     /** Returns the text so far for the current token being generated.
226      */
227     StringType 	getText();
228 
229 	// Other utility functions
230 	void fillExceptionData( ExceptionBaseType* ex );
231 
232 	/** Default lexer error handler (works for 8 bit streams only!!!)
233 	 */
234 	void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
235 	void exConstruct();
236 	TokenType*	getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
237 								  ANTLR_UINT32	expectedTokenType, BitsetListType*	follow);
238 
239     /** Destructor: frees the resources of a lexer.
240      */
241 	~Lexer();
242 };
243 
244 ANTLR_END_NAMESPACE()
245 
246 #include "antlr3lexer.inl"
247 
248 #endif
249