/** \file
 * Defines the basic structures used to manipulate character
 * streams from any input source. Any character size and encoding
 * can in theory be used, so long as a set of functions is provided that
 * can return a 32 bit integer representation of their characters and efficiently mark and revert
 * to specific offsets into their input streams.
 */
8 #ifndef	_ANTLR_INPUT_HPP
9 #define	_ANTLR_INPUT_HPP
10 
11 // [The "BSD licence"]
12 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
13 
14 //
15 // All rights reserved.
16 //
17 // Redistribution and use in source and binary forms, with or without
18 // modification, are permitted provided that the following conditions
19 // are met:
20 // 1. Redistributions of source code must retain the above copyright
21 //    notice, this list of conditions and the following disclaimer.
22 // 2. Redistributions in binary form must reproduce the above copyright
23 //    notice, this list of conditions and the following disclaimer in the
24 //    documentation and/or other materials provided with the distribution.
25 // 3. The name of the author may not be used to endorse or promote products
26 //    derived from this software without specific prior written permission.
27 //
28 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
33 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
37 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 
39 #include    "antlr3defs.hpp"
40 
41 ANTLR_BEGIN_NAMESPACE()
42 
/// Master context structure for an ANTLR3 C runtime based input stream.
/// \ingroup apistructures
///
/// Calling _LT on this doesn't seem right. You would call it only with a
/// parser / TreeParser, whose respective input streams have that function.
/// Calling it from a lexer will produce a compile time error.
///
48 
49 template<class ImplTraits>
50 class	InputStream :   public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >
51 {
52 public:
53 	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
54 	typedef typename ImplTraits::LexStateType LexStateType;
55 	typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType;
56 	typedef IntStreamType BaseType;
57 	typedef typename ImplTraits::StreamDataType UnitType;
58 	typedef UnitType DataType;
59 	typedef UnitType TokenType;
60 	typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType;
61 	typedef typename ImplTraits::StringType StringType;
62 
63 private:
64     /** Pointer the start of the input string, characters may be
65      *  taken as offsets from here and in original input format encoding.
66      */
67     const DataType*		m_data;
68 
69     /** Pointer to the next character to be consumed from the input data
70      *  This is cast to point at the encoding of the original file that
71      *  was read by the functions installed as pointer in this input stream
72      *  context instance at file/string/whatever load time.
73      */
74     const DataType*		m_nextChar;
75 
76     /** Number of characters that can be consumed at this point in time.
77      *  Mostly this is just what is left in the pre-read buffer, but if the
78      *  input source is a stream such as a socket or something then we may
79      *  call special read code to wait for more input.
80      */
81     ANTLR_UINT32	m_sizeBuf;
82 
83     /** The line number we are traversing in the input file. This gets incremented
84      *  by a newline() call in the lexer grammar actions.
85      */
86     ANTLR_UINT32	m_line;
87 
88     /** Pointer into the input buffer where the current line
89      *  started.
90      */
91     const DataType*		m_currentLine;
92 
93     /** The offset within the current line of the current character
94      */
95     ANTLR_INT32		m_charPositionInLine;
96 
97     /** Tracks how deep mark() calls are nested
98      */
99     ANTLR_UINT32	m_markDepth;
100 
101     /** List of mark() points in the input stream
102      */
103     MarkersType		m_markers;
104 
105     /** File name string, set to pointer to memory if
106      * you set it manually as it will be free()d
107      */
108     StringType		m_fileName;
109 
110     /** File number, needs to be set manually to some file index of your devising.
111      */
112     ANTLR_UINT32	m_fileNo;
113 
114 	/// Character that automatically causes an internal line count
115     ///  increment.
116     ///
117     ANTLR_UCHAR		m_newlineChar;
118 
119     /// Indicates the size, in 8 bit units, of a single character. Note that
120     /// the C runtime does not deal with surrogates as this would be
121     /// slow and complicated. If this is a UTF-8 stream then this field
122     /// will be set to 0. Generally you are best working internally with 32 bit characters
123     /// as this is the most efficient.
124     ///
125     ANTLR_UINT8		m_charByteSize;
126 
127    /** Indicates if the data pointer was allocated by us, and so should be freed
128      *  when the stream dies.
129      */
130     bool			m_isAllocated;
131 
132     /// Indicates the encoding scheme used in this input stream
133     ///
134     ANTLR_UINT32    m_encoding;
135 
136     /* API */
137 public:
138 	InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding);
139 	InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name);
140 	~InputStream();
141 	const DataType* get_data() const;
142 	bool get_isAllocated() const;
143 	const DataType* get_nextChar() const;
144 	ANTLR_UINT32 get_sizeBuf() const;
145 	ANTLR_UINT32 get_line() const;
146 	const DataType* get_currentLine() const;
147 	ANTLR_INT32 get_charPositionInLine() const;
148 	ANTLR_UINT32 get_markDepth() const;
149 	MarkersType& get_markers();
150 	const StringType& get_fileName() const;
151 	ANTLR_UINT32 get_fileNo() const;
152 	ANTLR_UCHAR get_newlineChar() const;
153 	ANTLR_UINT8 get_charByteSize() const;
154 	ANTLR_UINT32 get_encoding() const;
155 
156 	void  set_data( DataType* data );
157 	void  set_isAllocated( bool isAllocated );
158 	void  set_nextChar( const DataType* nextChar );
159 	void  set_sizeBuf( ANTLR_UINT32 sizeBuf );
160 	void  set_line( ANTLR_UINT32 line );
161 	void  set_currentLine( const DataType* currentLine );
162 	void  set_charPositionInLine( ANTLR_INT32 charPositionInLine );
163 	void  set_markDepth( ANTLR_UINT32 markDepth );
164 	void  set_markers( const MarkersType& markers );
165 	void  set_fileName( const StringType& fileName );
166 	void  set_fileNo( ANTLR_UINT32 fileNo );
167 	void  set_newlineChar( ANTLR_UCHAR newlineChar );
168 	void  set_charByteSize( ANTLR_UINT8 charByteSize );
169 	void  set_encoding( ANTLR_UINT32 encoding );
170 
171 	void inc_charPositionInLine();
172 	void inc_line();
173 	void inc_markDepth();
174 
175 	IntStreamType*	get_istream();
176 
177     /** Function that resets the input stream
178      */
179     void	reset();
180 
181     /** Pointer to a function that reuses and resets an input stream by
182      *  supplying a new 'source'
183      */
184     void    reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name);
185 
186 
187     /** Function to return the total size of the input buffer. For streams
188      *  this may be just the total we have available so far. This means of course that
189      *  the input stream must be careful to accumulate enough input so that any backtracking
190      *  can be satisfied.
191      */
192     ANTLR_UINT32	size();
193 
194     /** Function to return a substring of the input stream. String is returned in allocated
195      *  memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form.
196      */
197     StringType	substr(ANTLR_MARKER start, ANTLR_MARKER stop);
198 
199     /** Function to return the current line number in the input stream
200      */
201     ANTLR_UINT32	get_line();
202 
203     /** Function to return the current line buffer in the input stream
204      *  The pointer returned is directly into the input stream so you must copy
205      *  it if you wish to manipulate it without damaging the input stream. Encoding
206      *  is obviously in the same form as the input stream.
207      *  \remark
208      *    - Note taht this function wil lbe inaccurate if setLine is called as there
209      *      is no way at the moment to position the input stream at a particular line
210      *	    number offset.
211      */
212     const DataType*	getLineBuf();
213 
214     /** Function to return the current offset in the current input stream line
215      */
216     ANTLR_UINT32	get_charPositionInLine();
217 
218     /** Function to set the current position in the current line.
219      */
220     void	set_charPositionInLine(ANTLR_UINT32 position);
221 
222     /** Function to override the default newline character that the input stream
223      *  looks for to trigger the line/offset and line buffer recording information.
224      *  \remark
225      *   - By default the chracter '\n' will be installed as the newline trigger character. When this
226      *     character is seen by the consume() function then the current line number is incremented and the
227      *     current line offset is reset to 0. The Pointer for the line of input we are consuming
228      *     is updated to point to the next character after this one in the input stream (which means it
229      *     may become invalid if the last newline character in the file is seen (so watch out).
230      *   - If for some reason you do not want the counters and pointers to be restee, you can set the
231      *     chracter to some impossible character such as '\0' or whatever.
232      *   - This is a single character only, so choose the last character in a sequence of two or more.
233      *   - This is only a simple aid to error reporting - if you have a complicated binary input structure
234      *     it may not be adequate, but you can always override every function in the input stream with your
235      *     own of course, and can even write your own complete input stream set if you like.
236      *   - It is your responsiblity to set a valid character for the input stream type. There is no point
237      *     setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
238      *	   trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
239      */
240     void	set_newLineChar(ANTLR_UINT32 newlineChar);
241 
242 	ANTLR_MARKER index_impl();
243 
244 private:
245 	/** \brief Use the contents of an operating system file as the input
246 	 *         for an input stream.
247 	 *
248 	 * \param fileName Name of operating system file to read.
249 	 * \return
250 	 *	- Pointer to new input stream context upon success
251 	 *	- One of the ANTLR3_ERR_ defines on error.
252 	 */
253 	void createFileStream(const ANTLR_UINT8* fileName);
254 
255 	/** \brief Use the supplied 'string' as input to the stream
256 	 *
257 	 * \param data Pointer to the input data
258 	 * \return
259 	 *	- Pointer to new input stream context upon success
260 	 *	- NULL defines on error.
261 	 */
262 	void createStringStream(const ANTLR_UINT8* data);
263 	void genericSetupStream();
264 
265 	/// Determine endianess of the input stream and install the
266 	/// API required for the encoding in that format.
267 	///
268 	void setupInputStream();
269 
270 };
271 
/** \brief Structure for tracking lexer input states as part of mark()
 *  and rewind() of the lexer.
 */
275 template<class ImplTraits>
276 class	LexState : public ImplTraits::AllocPolicyType
277 {
278 public:
279 	typedef typename ImplTraits::StreamDataType DataType;
280 
281 private:
282         /** Pointer to the next character to be consumed from the input data
283      *  This is cast to point at the encoding of the original file that
284      *  was read by the functions installed as pointer in this input stream
285      *  context instance at file/string/whatever load time.
286      */
287     const DataType*			m_nextChar;
288 
289     /** The line number we are traversing in the input file. This gets incremented
290      *  by a newline() call in the lexer grammer actions.
291      */
292     ANTLR_UINT32	m_line;
293 
294     /** Pointer into the input buffer where the current line
295      *  started.
296      */
297     const DataType*			m_currentLine;
298 
299     /** The offset within the current line of the current character
300      */
301     ANTLR_INT32		m_charPositionInLine;
302 
303 public:
304 	LexState();
305 	const DataType* get_nextChar() const;
306 	ANTLR_UINT32 get_line() const;
307 	const DataType* get_currentLine() const;
308 	ANTLR_INT32 get_charPositionInLine() const;
309 	void  set_nextChar( const DataType* nextChar );
310 	void  set_line( ANTLR_UINT32 line );
311 	void  set_currentLine( const DataType* currentLine );
312 	void  set_charPositionInLine( ANTLR_INT32 charPositionInLine );
313 };
314 
/// Exception type whose message is the fixed text "Null String".
/// NOTE(review): presumably thrown when a stream is given a null input
/// string - the throw site is not in this header.
class ParseNullStringException : public std::exception
{
public:
	// what() must be public: members of a `class` default to private, which
	// would hide the override from code holding the derived type directly.
	/// \return Pointer to the static, NUL terminated text "Null String".
	virtual const char* what() const throw()
	{
		return "Null String";
	}
};
322 
323 ANTLR_END_NAMESPACE()
324 
325 #include "antlr3input.inl"
326 
#endif	/* _ANTLR_INPUT_HPP  */
328