1/** \file
2 * \brief The ANTLR3 C filestream is used when the source character stream
3 * is a filesystem based input set and all the characters in the filestream
4 * can be loaded at once into memory and away the lexer goes.
5 *
6 * A number of initializers are provided in order that various character
7 * sets can be supported from input files. The ANTLR3 C runtime expects
 * to deal with UTF32 characters only (the reasons for this are to
 * do with the simplification of C code when using this form of Unicode
 * encoding, though this is not a panacea). More information can be
11 * found on this by consulting:
12 *   - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
13 * Where a well grounded discussion of the encoding formats available
14 * may be found.
15 *
16 */
17
18// [The "BSD licence"]
19// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
20// http://www.temporal-wave.com
21// http://www.linkedin.com/in/jimidle
22//
23// All rights reserved.
24//
25// Redistribution and use in source and binary forms, with or without
26// modification, are permitted provided that the following conditions
27// are met:
28// 1. Redistributions of source code must retain the above copyright
29//    notice, this list of conditions and the following disclaimer.
30// 2. Redistributions in binary form must reproduce the above copyright
31//    notice, this list of conditions and the following disclaimer in the
32//    documentation and/or other materials provided with the distribution.
33// 3. The name of the author may not be used to endorse or promote products
34//    derived from this software without specific prior written permission.
35//
36// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
37// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
38// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
39// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
40// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
42// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
43// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
44// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
45// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
46
47#include    <antlr3.h>
48
49static  void                    setupInputStream            (pANTLR3_INPUT_STREAM input);
50static  pANTLR3_INPUT_STREAM    antlr3CreateFileStream      (pANTLR3_UINT8 fileName);
51static  pANTLR3_INPUT_STREAM    antlr3CreateStringStream    (pANTLR3_UINT8 data);
52
53ANTLR3_API pANTLR3_INPUT_STREAM
54antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
55{
56    pANTLR3_INPUT_STREAM input;
57
58    // First order of business is to read the file into some buffer space
59    // as just straight 8 bit bytes. Then we will work out the encoding and
60    // byte order and adjust the API functions that are installed for the
61    // default 8Bit stream accordingly.
62    //
63    input   = antlr3CreateFileStream(fileName);
64    if  (input == NULL)
65    {
66        return NULL;
67    }
68
69    // We have the data in memory now so we can deal with it according to
70    // the encoding scheme we were given by the user.
71    //
72    input->encoding = encoding;
73
74    // Now we need to work out the endian type and install any
75    // API functions that differ from 8Bit
76    //
77    setupInputStream(input);
78
79    // Now we can set up the file name
80    //
81    input->istream->streamName	= input->strFactory->newStr8(input->strFactory, fileName);
82    input->fileName		= input->istream->streamName;
83
84    return input;
85}
86
87
88ANTLR3_API pANTLR3_INPUT_STREAM
89antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
90{
91    pANTLR3_INPUT_STREAM    input;
92
93    // First order of business is to set up the stream and install the data pointer.
94    // Then we will work out the encoding and byte order and adjust the API functions that are installed for the
95    // default 8Bit stream accordingly.
96    //
97    input   = antlr3CreateStringStream(data);
98    if  (input == NULL)
99    {
100        return NULL;
101    }
102
103    // Size (in bytes) of the given 'string'
104    //
105    input->sizeBuf		= size;
106
107    // We have the data in memory now so we can deal with it according to
108    // the encoding scheme we were given by the user.
109    //
110    input->encoding = encoding;
111
112    // Now we need to work out the endian type and install any
113    // API functions that differ from 8Bit
114    //
115    setupInputStream(input);
116
117    // Now we can set up the file name
118    //
119    input->istream->streamName	= input->strFactory->newStr8(input->strFactory, name);
120    input->fileName		= input->istream->streamName;
121
122    return input;
123}
124
125
126/// Determine endianess of the input stream and install the
127/// API required for the encoding in that format.
128///
129static void
130setupInputStream(pANTLR3_INPUT_STREAM input)
131{
132    ANTLR3_BOOLEAN  isBigEndian;
133
134    // Used to determine the endianness of the machine we are currently
135    // running on.
136    //
137    ANTLR3_UINT16 bomTest = 0xFEFF;
138
139    // What endianess is the machine we are running on? If the incoming
140    // encoding endianess is the same as this machine's natural byte order
141    // then we can use more efficient API calls.
142    //
143    if  (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
144    {
145        isBigEndian = ANTLR3_TRUE;
146    }
147    else
148    {
149        isBigEndian = ANTLR3_FALSE;
150    }
151
152    // What encoding did the user tell us {s}he thought it was? I am going
153    // to get sick of the questions on antlr-interest, I know I am.
154    //
155    switch  (input->encoding)
156    {
157        case    ANTLR3_ENC_UTF8:
158
159            // See if there is a BOM at the start of this UTF-8 sequence
160            // and just eat it if there is. Windows .TXT files have this for instance
161            // as it identifies UTF-8 even though it is of no consequence for byte order
162            // as UTF-8 does not have a byte order.
163            //
164            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xEF
165                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xBB
166                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xBF
167                )
168            {
169                // The UTF8 BOM is present so skip it
170                //
171                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 3);
172            }
173
174            // Install the UTF8 input routines
175            //
176            antlr3UTF8SetupStream(input);
177            break;
178
179        case    ANTLR3_ENC_UTF16:
180
181            // See if there is a BOM at the start of the input. If not then
182            // we assume that the byte order is the natural order of this
183            // machine (or it is really UCS2). If there is a BOM we determine if the encoding
184            // is the same as the natural order of this machine.
185            //
186            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFE
187                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFF
188                )
189            {
190                // BOM Present, indicates Big Endian
191                //
192                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
193
194                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
195            }
196            else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
197                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
198                )
199            {
200                // BOM present, indicates Little Endian
201                //
202                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 2);
203
204                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
205            }
206            else
207            {
208                // No BOM present, assume local computer byte order
209                //
210                antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
211            }
212            break;
213
214        case    ANTLR3_ENC_UTF32:
215
216            // See if there is a BOM at the start of the input. If not then
217            // we assume that the byte order is the natural order of this
218            // machine. If there is we determine if the encoding
219            // is the same as the natural order of this machine.
220            //
221            if  (       (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0x00
222                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
223                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+2))    == 0xFE
224                    &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+3))    == 0xFF
225                )
226            {
227                // BOM Present, indicates Big Endian
228                //
229                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
230
231                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
232            }
233            else if  (      (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar))      == 0xFF
234                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0xFE
235                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
236                        &&  (ANTLR3_UINT8)(*((pANTLR3_UINT8)input->nextChar+1))    == 0x00
237                )
238            {
239                // BOM present, indicates Little Endian
240                //
241                input->nextChar = (void *)((pANTLR3_UINT8)input->nextChar + 4);
242
243                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
244            }
245            else
246            {
247                // No BOM present, assume local computer byte order
248                //
249                antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
250            }
251            break;
252
253        case    ANTLR3_ENC_UTF16BE:
254
255            // Encoding is definately Big Endian with no BOM
256            //
257            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
258            break;
259
260        case    ANTLR3_ENC_UTF16LE:
261
262            // Encoding is definately Little Endian with no BOM
263            //
264            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
265            break;
266
267        case    ANTLR3_ENC_UTF32BE:
268
269            // Encoding is definately Big Endian with no BOM
270            //
271            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
272            break;
273
274        case    ANTLR3_ENC_UTF32LE:
275
276            // Encoding is definately Little Endian with no BOM
277            //
278            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
279            break;
280
281        case    ANTLR3_ENC_EBCDIC:
282
283            // EBCDIC is basically the same as ASCII but with an on the
284            // fly translation to ASCII
285            //
286            antlr3EBCDICSetupStream(input);
287            break;
288
289        case    ANTLR3_ENC_8BIT:
290        default:
291
292            // Standard 8bit/ASCII
293            //
294            antlr38BitSetupStream(input);
295            break;
296    }
297}
298
299/** \brief Use the contents of an operating system file as the input
300 *         for an input stream.
301 *
302 * \param fileName Name of operating system file to read.
303 * \return
304 *	- Pointer to new input stream context upon success
305 *	- One of the ANTLR3_ERR_ defines on error.
306 */
307static pANTLR3_INPUT_STREAM
308antlr3CreateFileStream(pANTLR3_UINT8 fileName)
309{
310	// Pointer to the input stream we are going to create
311	//
312	pANTLR3_INPUT_STREAM    input;
313	ANTLR3_UINT32	    status;
314
315	if	(fileName == NULL)
316	{
317		return NULL;
318	}
319
320	// Allocate memory for the input stream structure
321	//
322	input   = (pANTLR3_INPUT_STREAM)
323		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
324
325	if	(input == NULL)
326	{
327		return	NULL;
328	}
329
330	// Structure was allocated correctly, now we can read the file.
331	//
332	status  = antlr3read8Bit(input, fileName);
333
334	// Call the common 8 bit input stream handler
335	// initialization.
336	//
337	antlr3GenericSetupStream(input);
338
339        // However if the file was not there or something then we
340        // need to close. Have to wait until here as we cannot call
341        // close until the API is installed of course.
342        //
343	if	(status != ANTLR3_SUCCESS)
344	{
345		input->close(input);
346		return	NULL;
347	}
348
349	return  input;
350}
351
352ANTLR3_API ANTLR3_UINT32
353antlr3read8Bit(pANTLR3_INPUT_STREAM    input, pANTLR3_UINT8 fileName)
354{
355	ANTLR3_FDSC	    infile;
356	ANTLR3_UINT32	    fSize;
357
358	/* Open the OS file in read binary mode
359	*/
360	infile  = antlr3Fopen(fileName, "rb");
361
362	/* Check that it was there
363	*/
364	if	(infile == NULL)
365	{
366		return	(ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
367	}
368
369	/* It was there, so we can read the bytes now
370	*/
371	fSize   = antlr3Fsize(fileName);	/* Size of input file	*/
372
373	/* Allocate buffer for this input set
374	*/
375	input->data	    = ANTLR3_MALLOC((size_t)fSize);
376	input->sizeBuf  = fSize;
377
378	if	(input->data == NULL)
379	{
380		return	(ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
381	}
382
383	input->isAllocated	= ANTLR3_TRUE;
384
385	/* Now we read the file. Characters are not converted to
386	* the internal ANTLR encoding until they are read from the buffer
387	*/
388	antlr3Fread(infile, fSize, input->data);
389
390	/* And close the file handle
391	*/
392	antlr3Fclose(infile);
393
394	return  ANTLR3_SUCCESS;
395}
396
397/** \brief Open an operating system file and return the descriptor
398 * We just use the common open() and related functions here.
399 * Later we might find better ways on systems
400 * such as Windows and OpenVMS for instance. But the idea is to read the
401 * while file at once anyway, so it may be irrelevant.
402 */
403ANTLR3_API ANTLR3_FDSC
404antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
405{
406    return  (ANTLR3_FDSC)fopen((const char *)filename, mode);
407}
408
409/** \brief Close an operating system file and free any handles
410 *  etc.
411 */
412ANTLR3_API void
413antlr3Fclose(ANTLR3_FDSC fd)
414{
415    fclose(fd);
416}
417ANTLR3_API ANTLR3_UINT32
418antlr3Fsize(pANTLR3_UINT8 fileName)
419{
420    struct _stat	statbuf;
421
422    _stat((const char *)fileName, &statbuf);
423
424    return (ANTLR3_UINT32)statbuf.st_size;
425}
426
427ANTLR3_API ANTLR3_UINT32
428antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count,  void * data)
429{
430    return  (ANTLR3_UINT32)fread(data, (size_t)count, 1, fdsc);
431}
432
433
434/** \brief Use the supplied 'string' as input to the stream
435 *
436 * \param data Pointer to the input data
437 * \return
438 *	- Pointer to new input stream context upon success
439 *	- NULL defines on error.
440 */
441static pANTLR3_INPUT_STREAM
442antlr3CreateStringStream(pANTLR3_UINT8 data)
443{
444	// Pointer to the input stream we are going to create
445	//
446	pANTLR3_INPUT_STREAM    input;
447
448	if	(data == NULL)
449	{
450		return NULL;
451	}
452
453	// Allocate memory for the input stream structure
454	//
455	input   = (pANTLR3_INPUT_STREAM)
456		ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
457
458	if	(input == NULL)
459	{
460		return	NULL;
461	}
462
463	// Structure was allocated correctly, now we can install the pointer
464	//
465        input->data             = data;
466        input->isAllocated	= ANTLR3_FALSE;
467
468	// Call the common 8 bit input stream handler
469	// initialization.
470	//
471	antlr3GenericSetupStream(input);
472
473        return  input;
474}