/** \file
 * \brief The ANTLR3 C filestream is used when the source character stream
 * is a filesystem based input set and all the characters in the filestream
 * can be loaded at once into memory and away the lexer goes.
 *
 * A number of initializers are provided in order that various character
 * sets can be supported from input files. The ANTLR3 C runtime expects
 * to deal with UTF32 characters only (the reasons for this are to
 * do with the simplification of C code when using this form of Unicode
 * encoding, though this is not a panacea). More information can be
 * found on this by consulting:
 *   - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
 * where a well grounded discussion of the encoding formats available
 * may be found.
 *
 */
17 
18 // [The "BSD licence"]
19 // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
20 // http://www.temporal-wave.com
21 // http://www.linkedin.com/in/jimidle
22 //
23 // All rights reserved.
24 //
25 // Redistribution and use in source and binary forms, with or without
26 // modification, are permitted provided that the following conditions
27 // are met:
28 // 1. Redistributions of source code must retain the above copyright
29 //    notice, this list of conditions and the following disclaimer.
30 // 2. Redistributions in binary form must reproduce the above copyright
31 //    notice, this list of conditions and the following disclaimer in the
32 //    documentation and/or other materials provided with the distribution.
33 // 3. The name of the author may not be used to endorse or promote products
34 //    derived from this software without specific prior written permission.
35 //
36 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
37 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
39 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
40 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
45 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46 
47 #include    <antlr3.h>
48 
49 static  void                    setupInputStream            (pANTLR3_INPUT_STREAM input);
50 static  pANTLR3_INPUT_STREAM    antlr3CreateFileStream      (pANTLR3_UINT8 fileName);
51 static  pANTLR3_INPUT_STREAM    antlr3CreateStringStream    (pANTLR3_UINT8 data);
52 
53 ANTLR3_API pANTLR3_INPUT_STREAM
antlr3FileStreamNew(pANTLR3_UINT8 fileName,ANTLR3_UINT32 encoding)54 antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
55 {
56     pANTLR3_INPUT_STREAM input;
57 
58     // First order of business is to read the file into some buffer space
59     // as just straight 8 bit bytes. Then we will work out the encoding and
60     // byte order and adjust the API functions that are installed for the
61     // default 8Bit stream accordingly.
62     //
63     input   = antlr3CreateFileStream(fileName);
64     if  (input == NULL)
65     {
66         return NULL;
67     }
68 
69     // We have the data in memory now so we can deal with it according to
70     // the encoding scheme we were given by the user.
71     //
72     input->encoding = encoding;
73 
74     // Now we need to work out the endian type and install any
75     // API functions that differ from 8Bit
76     //
77     setupInputStream(input);
78 
79     // Now we can set up the file name
80     //
81     input->istream->streamName	= input->strFactory->newStr8(input->strFactory, fileName);
82     input->fileName		= input->istream->streamName;
83 
84     return input;
85 }
86 
87 
88 ANTLR3_API pANTLR3_INPUT_STREAM
antlr3StringStreamNew(pANTLR3_UINT8 data,ANTLR3_UINT32 encoding,ANTLR3_UINT32 size,pANTLR3_UINT8 name)89 antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
90 {
91     pANTLR3_INPUT_STREAM    input;
92 
93     // First order of business is to set up the stream and install the data pointer.
94     // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
95     // default 8Bit stream accordingly.
96     //
97     input   = antlr3CreateStringStream(data);
98     if  (input == NULL)
99     {
100         return NULL;
101     }
102 
103     // Size (in bytes) of the given 'string'
104     //
105     input->sizeBuf		= size;
106 
107     // We have the data in memory now so we can deal with it according to
108     // the encoding scheme we were given by the user.
109     //
110     input->encoding = encoding;
111 
112     // Now we need to work out the endian type and install any
113     // API functions that differ from 8Bit
114     //
115     setupInputStream(input);
116 
117     // Now we can set up the file name
118     //
119     input->istream->streamName	= input->strFactory->newStr8(input->strFactory, name);
120     input->fileName		= input->istream->streamName;
121 
122     return input;
123 }
124 
125 
126 /// Determine endianess of the input stream and install the
127 /// API required for the encoding in that format.
128 ///
129 static void
setupInputStream(pANTLR3_INPUT_STREAM input)130 setupInputStream(pANTLR3_INPUT_STREAM input)
131 {
132     ANTLR3_BOOLEAN  isBigEndian;
133 
134     // Used to determine the endianness of the machine we are currently
135     // running on.
136     //
137     ANTLR3_UINT16 bomTest = 0xFEFF;
138 
139     // What endianess is the machine we are running on? If the incoming
140     // encoding endianess is the same as this machine's natural byte order
141     // then we can use more efficient API calls.
142     //
143     if  (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
144     {
145         isBigEndian = ANTLR3_TRUE;
146     }
147     else
148     {
149         isBigEndian = ANTLR3_FALSE;
150     }
151 
152     // What encoding did the user tell us {s}he thought it was? I am going
153     // to get sick of the questions on antlr-interest, I know I am.
154     //
155     switch  (input->encoding)
156     {
157         case    ANTLR3_ENC_UTF8:
158 
159             // See if there is a BOM at the start of this UTF-8 sequence
160             // and just eat it if there is. Windows .TXT files have this for instance
161             // as it identifies UTF-8 even though it is of no consequence for byte order
162             // as UTF-8 does not have a byte order.
163             //
164             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xEF
165                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xBB
166                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xBF
167                 )
168             {
169                 // The UTF8 BOM is present so skip it
170                 //
171                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
172             }
173 
174             // Install the UTF8 input routines
175             //
176             antlr3UTF8SetupStream(input);
177             break;
178 
179         case    ANTLR3_ENC_UTF16:
180 
181             // See if there is a BOM at the start of the input. If not then
182             // we assume that the byte order is the natural order of this
183             // machine (or it is really UCS2). If there is a BOM we determine if the encoding
184             // is the same as the natural order of this machine.
185             //
186             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFE
187                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFF
188                 )
189             {
190                 // BOM Present, indicates Big Endian
191                 //
192                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
193 
194                 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
195             }
196             else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
197                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
198                 )
199             {
200                 // BOM present, indicates Little Endian
201                 //
202                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
203 
204                 antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
205             }
206             else
207             {
208                 // No BOM present, assume local computer byte order
209                 //
210                 antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
211             }
212             break;
213 
214         case    ANTLR3_ENC_UTF32:
215 
216             // See if there is a BOM at the start of the input. If not then
217             // we assume that the byte order is the natural order of this
218             // machine. If there is we determine if the encoding
219             // is the same as the natural order of this machine.
220             //
221             if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0x00
222                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
223                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xFE
224                     &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3))    == 0xFF
225                 )
226             {
227                 // BOM Present, indicates Big Endian
228                 //
229                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
230 
231                 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
232             }
233             else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
234                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
235                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
236                         &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
237                 )
238             {
239                 // BOM present, indicates Little Endian
240                 //
241                 input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
242 
243                 antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
244             }
245             else
246             {
247                 // No BOM present, assume local computer byte order
248                 //
249                 antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
250             }
251             break;
252 
253         case    ANTLR3_ENC_UTF16BE:
254 
255             // Encoding is definately Big Endian with no BOM
256             //
257             antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
258             break;
259 
260         case    ANTLR3_ENC_UTF16LE:
261 
262             // Encoding is definately Little Endian with no BOM
263             //
264             antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
265             break;
266 
267         case    ANTLR3_ENC_UTF32BE:
268 
269             // Encoding is definately Big Endian with no BOM
270             //
271             antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
272             break;
273 
274         case    ANTLR3_ENC_UTF32LE:
275 
276             // Encoding is definately Little Endian with no BOM
277             //
278             antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
279             break;
280 
281         case    ANTLR3_ENC_EBCDIC:
282 
283             // EBCDIC is basically the same as ASCII but with an on the
284             // fly translation to ASCII
285             //
286             antlr3EBCDICSetupStream(input);
287             break;
288 
289         case    ANTLR3_ENC_8BIT:
290         default:
291 
292             // Standard 8bit/ASCII
293             //
294             antlr38BitSetupStream(input);
295             break;
296     }
297 }
298 
299 /** \brief Use the contents of an operating system file as the input
300  *         for an input stream.
301  *
302  * \param fileName Name of operating system file to read.
303  * \return
304  *	- Pointer to new input stream context upon success
305  *	- One of the ANTLR3_ERR_ defines on error.
306  */
307 static pANTLR3_INPUT_STREAM
antlr3CreateFileStream(pANTLR3_UINT8 fileName)308 antlr3CreateFileStream(pANTLR3_UINT8 fileName)
309 {
310 	// Pointer to the input stream we are going to create
311 	//
312 	pANTLR3_INPUT_STREAM    input;
313 	ANTLR3_UINT32	    status;
314 
315 	if	(fileName == NULL)
316 	{
317 		return NULL;
318 	}
319 
320 	// Allocate memory for the input stream structure
321 	//
322 	input   = (pANTLR3_INPUT_STREAM)
323 		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
324 
325 	if	(input == NULL)
326 	{
327 		return	NULL;
328 	}
329 
330 	// Structure was allocated correctly, now we can read the file.
331 	//
332 	status  = antlr3read8Bit(input, fileName);
333 
334 	// Call the common 8 bit input stream handler
335 	// initialization.
336 	//
337 	antlr3GenericSetupStream(input);
338 
339         // However if the file was not there or something then we
340         // need to close. Have to wait until here as we cannot call
341         // close until the API is installed of course.
342         //
343 	if	(status != ANTLR3_SUCCESS)
344 	{
345 		input->close(input);
346 		return	NULL;
347 	}
348 
349 	return  input;
350 }
351 
352 ANTLR3_API ANTLR3_UINT32
antlr3read8Bit(pANTLR3_INPUT_STREAM input,pANTLR3_UINT8 fileName)353 antlr3read8Bit(pANTLR3_INPUT_STREAM    input, pANTLR3_UINT8 fileName)
354 {
355 	ANTLR3_FDSC	    infile;
356 	ANTLR3_UINT32	    fSize;
357 
358 	/* Open the OS file in read binary mode
359 	*/
360 	infile  = antlr3Fopen(fileName, "rb");
361 
362 	/* Check that it was there
363 	*/
364 	if	(infile == NULL)
365 	{
366 		return	(ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
367 	}
368 
369 	/* It was there, so we can read the bytes now
370 	*/
371 	fSize   = antlr3Fsize(fileName);	/* Size of input file	*/
372 
373 	/* Allocate buffer for this input set
374 	*/
375 	input->data	    = ANTLR3_MALLOC((size_t)fSize);
376 	input->sizeBuf  = fSize;
377 
378 	if	(input->data == NULL)
379 	{
380 		return	(ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
381 	}
382 
383 	input->isAllocated	= ANTLR3_TRUE;
384 
385 	/* Now we read the file. Characters are not converted to
386 	* the internal ANTLR encoding until they are read from the buffer
387 	*/
388 	antlr3Fread(infile, fSize, input->data);
389 
390 	/* And close the file handle
391 	*/
392 	antlr3Fclose(infile);
393 
394 	return  ANTLR3_SUCCESS;
395 }
396 
397 /** \brief Open an operating system file and return the descriptor
398  * We just use the common open() and related functions here.
399  * Later we might find better ways on systems
400  * such as Windows and OpenVMS for instance. But the idea is to read the
401  * while file at once anyway, so it may be irrelevant.
402  */
403 ANTLR3_API ANTLR3_FDSC
antlr3Fopen(pANTLR3_UINT8 filename,const char * mode)404 antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
405 {
406     return  (ANTLR3_FDSC)fopen((const char *)filename, mode);
407 }
408 
409 /** \brief Close an operating system file and free any handles
410  *  etc.
411  */
412 ANTLR3_API void
antlr3Fclose(ANTLR3_FDSC fd)413 antlr3Fclose(ANTLR3_FDSC fd)
414 {
415     fclose(fd);
416 }
417 ANTLR3_API ANTLR3_UINT32
antlr3Fsize(pANTLR3_UINT8 fileName)418 antlr3Fsize(pANTLR3_UINT8 fileName)
419 {
420     struct _stat	statbuf;
421 
422     _stat((const char *)fileName, &statbuf);
423 
424     return (ANTLR3_UINT32)statbuf.st_size;
425 }
426 
427 ANTLR3_API ANTLR3_UINT32
antlr3Fread(ANTLR3_FDSC fdsc,ANTLR3_UINT32 count,void * data)428 antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count,  void * data)
429 {
430     return  (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
431 }
432 
433 
434 /** \brief Use the supplied 'string' as input to the stream
435  *
436  * \param data Pointer to the input data
437  * \return
438  *	- Pointer to new input stream context upon success
439  *	- NULL defines on error.
440  */
441 static pANTLR3_INPUT_STREAM
antlr3CreateStringStream(pANTLR3_UINT8 data)442 antlr3CreateStringStream(pANTLR3_UINT8 data)
443 {
444 	// Pointer to the input stream we are going to create
445 	//
446 	pANTLR3_INPUT_STREAM    input;
447 
448 	if	(data == NULL)
449 	{
450 		return NULL;
451 	}
452 
453 	// Allocate memory for the input stream structure
454 	//
455 	input   = (pANTLR3_INPUT_STREAM)
456 		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
457 
458 	if	(input == NULL)
459 	{
460 		return	NULL;
461 	}
462 
463 	// Structure was allocated correctly, now we can install the pointer
464 	//
465         input->data             = data;
466         input->isAllocated	= ANTLR3_FALSE;
467 
468 	// Call the common 8 bit input stream handler
469 	// initialization.
470 	//
471 	antlr3GenericSetupStream(input);
472 
473         return  input;
474 }