1/// \file
2/// Base functions to initialize and manipulate any input stream
3///
4
5// [The "BSD licence"]
6// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
7// http://www.temporal-wave.com
8// http://www.linkedin.com/in/jimidle
9//
10// All rights reserved.
11//
12// Redistribution and use in source and binary forms, with or without
13// modification, are permitted provided that the following conditions
14// are met:
15// 1. Redistributions of source code must retain the above copyright
16//    notice, this list of conditions and the following disclaimer.
17// 2. Redistributions in binary form must reproduce the above copyright
18//    notice, this list of conditions and the following disclaimer in the
19//    documentation and/or other materials provided with the distribution.
20// 3. The name of the author may not be used to endorse or promote products
21//    derived from this software without specific prior written permission.
22//
23// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
24// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
25// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
26// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
27// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
28// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
32// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33
34#include    <antlr3input.h>
35
36// -----------------------------------
37// Generic 8 bit input such as latin-1
38//
39
40// 8Bit INT Stream API
41//
42static	    void	    antlr38BitConsume		(pANTLR3_INT_STREAM is);
43static	    ANTLR3_UCHAR    antlr38BitLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
44static	    ANTLR3_UCHAR    antlr38BitLA_ucase		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
45static	    ANTLR3_MARKER   antlr38BitIndex		(pANTLR3_INT_STREAM is);
46static	    ANTLR3_MARKER   antlr38BitMark		(pANTLR3_INT_STREAM is);
47static	    void	    antlr38BitRewind		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
48static	    void	    antlr38BitRewindLast	(pANTLR3_INT_STREAM is);
49static	    void	    antlr38BitRelease		(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark);
50static	    void	    antlr38BitSeek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
51static	    pANTLR3_STRING  antlr38BitGetSourceName	(pANTLR3_INT_STREAM is);
52
53// 8Bit Charstream API functions
54//
55static	    void	    antlr3InputClose		(pANTLR3_INPUT_STREAM input);
56static	    void	    antlr3InputReset		(pANTLR3_INPUT_STREAM input);
57static      void            antlr38BitReuse            (pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
58static	    void *	    antlr38BitLT		(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt);
59static	    ANTLR3_UINT32   antlr38BitSize		(pANTLR3_INPUT_STREAM input);
60static	    pANTLR3_STRING  antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
61static	    ANTLR3_UINT32   antlr38BitGetLine		(pANTLR3_INPUT_STREAM input);
62static	    void	  * antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input);
63static	    ANTLR3_UINT32   antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input);
64static	    void	    antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line);
65static	    void	    antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position);
66static	    void	    antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar);
67static	    void	    antlr38BitSetUcaseLA	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
68
69// -----------------------------------
70// UTF16 (also covers UCS2)
71//
72// INT Stream API
73//
74static	    void	    antlr3UTF16Consume	        (pANTLR3_INT_STREAM is);
75static	    ANTLR3_UCHAR    antlr3UTF16LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
76static	    void	    antlr3UTF16ConsumeLE        (pANTLR3_INT_STREAM is);
77static	    ANTLR3_UCHAR    antlr3UTF16LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
78static	    void	    antlr3UTF16ConsumeBE        (pANTLR3_INT_STREAM is);
79static	    ANTLR3_UCHAR    antlr3UTF16LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
80static	    ANTLR3_MARKER   antlr3UTF16Index		(pANTLR3_INT_STREAM is);
81static	    void	    antlr3UTF16Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
82
83// UTF16 Charstream API functions
84//
85static	    pANTLR3_STRING	antlr3UTF16Substr	(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
86
87// -----------------------------------
// UTF32 (also covers UCS4)
89//
90// INT Stream API
91//
92static	    void	    antlr3UTF32Consume	        (pANTLR3_INT_STREAM is);
93static	    ANTLR3_UCHAR    antlr3UTF32LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
94static	    ANTLR3_UCHAR    antlr3UTF32LALE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
95static	    ANTLR3_UCHAR    antlr3UTF32LABE		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
96static	    ANTLR3_MARKER   antlr3UTF32Index		(pANTLR3_INT_STREAM is);
97static	    void	    antlr3UTF32Seek		(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint);
98
// UTF32 Charstream API functions
100//
101static	    pANTLR3_STRING  antlr3UTF32Substr	        (pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
102
103// ------------------------------------
104// UTF-8
105//
106static	    void	    antlr3UTF8Consume	        (pANTLR3_INT_STREAM is);
107static	    ANTLR3_UCHAR    antlr3UTF8LA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
108
109// ------------------------------------
110// EBCDIC
111//
112static	    ANTLR3_UCHAR    antlr3EBCDICLA		(pANTLR3_INT_STREAM is, ANTLR3_INT32 la);
113
114/// \brief Common function to setup function interface for an 8 bit input stream.
115///
116/// \param input Input stream context pointer
117///
118/// \remark
119///   - Many of the 8 bit oriented file stream handling functions will be usable
120///     by any or at least some, other input streams. Therefore it is perfectly acceptable
121///     to call this function to install the 8Bit handler then override just those functions
122///     that would not work for the particular input encoding, such as consume for instance.
123///
124void
125antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input)
126{
127    // Build a string factory for this stream
128    //
129    input->strFactory	= antlr3StringFactoryNew(input->encoding);
130
131    // Default stream API set up is for 8Bit, so we are done
132    //
133}
134
135void
136antlr3GenericSetupStream  (pANTLR3_INPUT_STREAM input)
137{
138    /* Install function pointers for an 8 bit input
139     */
140
141    /* Allocate stream interface
142     */
143    input->istream		= antlr3IntStreamNew();
144    input->istream->type        = ANTLR3_CHARSTREAM;
145    input->istream->super       = input;
146
147    /* Intstream API
148     */
149    input->istream->consume	    = antlr38BitConsume;	    // Consume the next 8 bit character in the buffer
150    input->istream->_LA		    = antlr38BitLA;	            // Return the UTF32 character at offset n (1 based)
151    input->istream->index	    = antlr38BitIndex;	            // Current index (offset from first character
152    input->istream->mark	    = antlr38BitMark;		    // Record the current lex state for later restore
153    input->istream->rewind	    = antlr38BitRewind;	            // How to rewind the input
154    input->istream->rewindLast	    = antlr38BitRewindLast;	    // How to rewind the input
155    input->istream->seek	    = antlr38BitSeek;		    // How to seek to a specific point in the stream
156    input->istream->release	    = antlr38BitRelease;	    // Reset marks after mark n
157    input->istream->getSourceName   = antlr38BitGetSourceName;      // Return a string that names the input source
158
159    /* Charstream API
160     */
161    input->close		    =  antlr3InputClose;	    // Close down the stream completely
162    input->free			    =  antlr3InputClose;	    // Synonym for free
163    input->reset		    =  antlr3InputReset;	    // Reset input to start
164    input->reuse                    =  antlr38BitReuse;             // Install a new input string and reset
165    input->_LT			    =  antlr38BitLT;		    // Same as _LA for 8 bit file
166    input->size			    =  antlr38BitSize;		    // Return the size of the input buffer
167    input->substr		    =  antlr38BitSubstr;	    // Return a string from the input stream
168    input->getLine		    =  antlr38BitGetLine;	    // Return the current line number in the input stream
169    input->getLineBuf		    =  antlr38BitGetLineBuf;	    // Return a pointer to the start of the current line being consumed
170    input->getCharPositionInLine    =  antlr38BitGetCharPosition;   // Return the offset into the current line of input
171    input->setLine		    =  antlr38BitSetLine;	    // Set the input stream line number (does not set buffer pointers)
172    input->setCharPositionInLine    =  antlr38BitSetCharPosition;   // Set the offset in to the current line (does not set any pointers)
173    input->SetNewLineChar	    =  antlr38BitSetNewLineChar;    // Set the value of the newline trigger character
174    input->setUcaseLA		    =  antlr38BitSetUcaseLA;        // Changes the LA function to return upper case always
175
176    input->charByteSize		    = 1;		// Size in bytes of characters in this stream.
177
178    /* Initialize entries for tables etc
179     */
180    input->markers  = NULL;
181
182    /* Set up the input stream brand new
183     */
184    input->reset(input);
185
186    /* Install default line separator character (it can be replaced
187     * by the grammar programmer later)
188     */
189    input->SetNewLineChar(input, (ANTLR3_UCHAR)'\n');
190}
191
192static pANTLR3_STRING
193antlr38BitGetSourceName(pANTLR3_INT_STREAM is)
194{
195	return	is->streamName;
196}
197
198/** \brief Close down an input stream and free any memory allocated by it.
199 *
200 * \param input Input stream context pointer
201 */
202static void
203antlr3InputClose(pANTLR3_INPUT_STREAM input)
204{
205    // Close any markers in the input stream
206    //
207    if	(input->markers != NULL)
208    {
209		input->markers->free(input->markers);
210		input->markers = NULL;
211    }
212
213    // Close the string factory
214    //
215    if	(input->strFactory != NULL)
216    {
217		input->strFactory->close(input->strFactory);
218    }
219
220    // Free the input stream buffer if we allocated it
221    //
222    if	(input->isAllocated && input->data != NULL)
223    {
224		ANTLR3_FREE(input->data);
225		input->data = NULL;
226    }
227
228    input->istream->free(input->istream);
229
230    // Finally, free the space for the structure itself
231    //
232    ANTLR3_FREE(input);
233
234    // Done
235    //
236}
237
238static void
239antlr38BitSetUcaseLA		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag)
240{
241	if	(flag)
242	{
243		// Return the upper case version of the characters
244		//
245		input->istream->_LA		    =  antlr38BitLA_ucase;
246	}
247	else
248	{
249		// Return the raw characters as they are in the buffer
250		//
251		input->istream->_LA		    =  antlr38BitLA;
252	}
253}
254
255
256/** \brief Reset a re-startable input stream to the start
257 *
258 * \param input Input stream context pointer
259 */
260static void
261antlr3InputReset(pANTLR3_INPUT_STREAM input)
262{
263
264    input->nextChar		= input->data;	/* Input at first character */
265    input->line			= 1;		/* starts at line 1	    */
266    input->charPositionInLine	= -1;
267    input->currentLine		= input->data;
268    input->markDepth		= 0;		/* Reset markers	    */
269
270    /* Clear out up the markers table if it is there
271     */
272    if	(input->markers != NULL)
273    {
274        input->markers->clear(input->markers);
275    }
276    else
277    {
278        /* Install a new markers table
279         */
280        input->markers  = antlr3VectorNew(0);
281    }
282}
283
284/** Install a new source code in to a working input stream so that the
285 *  input stream can be reused.
286 */
287static void
288antlr38BitReuse(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
289{
290    input->isAllocated	= ANTLR3_FALSE;
291    input->data		= inString;
292    input->sizeBuf	= size;
293
294    // Now we can set up the file name. As we are reusing the stream, there may already
295    // be a string that we can reuse for holding the filename.
296    //
297	if	(input->istream->streamName == NULL)
298	{
299		input->istream->streamName	= input->strFactory->newStr(input->strFactory, name == NULL ? (pANTLR3_UINT8)"-memory-" : name);
300		input->fileName		= input->istream->streamName;
301	}
302	else
303	{
304		input->istream->streamName->set(input->istream->streamName,  (name == NULL ? (const char *)"-memory-" : (const char *)name));
305	}
306
307    input->reset(input);
308}
309
310/** \brief Consume the next character in an 8 bit input stream
311 *
312 * \param input Input stream context pointer
313 */
314static void
315antlr38BitConsume(pANTLR3_INT_STREAM is)
316{
317    pANTLR3_INPUT_STREAM input;
318
319    input   = ((pANTLR3_INPUT_STREAM) (is->super));
320
321    if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
322    {
323	/* Indicate one more character in this line
324	 */
325	input->charPositionInLine++;
326
327	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar)) == input->newlineChar)
328	{
329	    /* Reset for start of a new line of input
330	     */
331	    input->line++;
332	    input->charPositionInLine	= 0;
333	    input->currentLine		= (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
334	}
335
336	/* Increment to next character position
337	 */
338	input->nextChar = (void *)(((pANTLR3_UINT8)input->nextChar) + 1);
339    }
340}
341
342/** \brief Return the input element assuming an 8 bit ascii input
343 *
344 * \param[in] input Input stream context pointer
345 * \param[in] la 1 based offset of next input stream element
346 *
347 * \return Next input character in internal ANTLR3 encoding (UTF32)
348 */
349static ANTLR3_UCHAR
350antlr38BitLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
351{
352    pANTLR3_INPUT_STREAM input;
353
354    input   = ((pANTLR3_INPUT_STREAM) (is->super));
355
356    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
357    {
358		return	ANTLR3_CHARSTREAM_EOF;
359    }
360    else
361    {
362		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT8)input->nextChar + la - 1));
363    }
364}
365
366/** \brief Return the input element assuming an 8 bit input and
367 *         always return the UPPER CASE character.
368 *		   Note that this is 8 bit and so we assume that the toupper
369 *		   function will use the correct locale for 8 bits.
370 *
371 * \param[in] input Input stream context pointer
372 * \param[in] la 1 based offset of next input stream element
373 *
374 * \return Next input character in internal ANTLR3 encoding (UTF32)
375 */
376static ANTLR3_UCHAR
377antlr38BitLA_ucase	(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
378{
379    pANTLR3_INPUT_STREAM input;
380
381    input   = ((pANTLR3_INPUT_STREAM) (is->super));
382
383    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
384    {
385		return	ANTLR3_CHARSTREAM_EOF;
386    }
387    else
388    {
389		return	(ANTLR3_UCHAR)toupper((*((pANTLR3_UINT8)input->nextChar + la - 1)));
390    }
391}
392
393
394/** \brief Return the input element assuming an 8 bit ascii input
395 *
396 * \param[in] input Input stream context pointer
397 * \param[in] lt 1 based offset of next input stream element
398 *
399 * \return Next input character in internal ANTLR3 encoding (UTF32)
400 */
401static void *
402antlr38BitLT(pANTLR3_INPUT_STREAM input, ANTLR3_INT32 lt)
403{
404    /* Casting is horrible but it means no warnings and LT should never be called
405     * on a character stream anyway I think. If it is then, the void * will need to be
406     * cast back in a similar manner. Yuck! But this means that LT for Token streams and
407     * tree streams is correct.
408     */
409    return (ANTLR3_FUNC_PTR(input->istream->_LA(input->istream, lt)));
410}
411
412/** \brief Calculate the current index in the output stream.
413 * \param[in] input Input stream context pointer
414 */
415static ANTLR3_MARKER
416antlr38BitIndex(pANTLR3_INT_STREAM is)
417{
418    pANTLR3_INPUT_STREAM input;
419
420    input   = ((pANTLR3_INPUT_STREAM) (is->super));
421
422    return  (ANTLR3_MARKER)(((pANTLR3_UINT8)input->nextChar));
423}
424
425/** \brief Return the size of the current input stream, as an 8Bit file
426 *   which in this case is the total input. Other implementations may provide
427 *   more sophisticated implementations to deal with non-recoverable streams
428 *   and so on.
429 *
430 * \param[in] input Input stream context pointer
431 */
432static	ANTLR3_UINT32
433antlr38BitSize(pANTLR3_INPUT_STREAM input)
434{
435    return  input->sizeBuf;
436}
437
438/** \brief Mark the current input point in an 8Bit 8 bit stream
439 *  such as a file stream, where all the input is available in the
440 *  buffer.
441 *
442 * \param[in] is Input stream context pointer
443 */
444static ANTLR3_MARKER
445antlr38BitMark	(pANTLR3_INT_STREAM is)
446{
447    pANTLR3_LEX_STATE	    state;
448    pANTLR3_INPUT_STREAM    input;
449
450    input   = ((pANTLR3_INPUT_STREAM) (is->super));
451
452    /* New mark point
453     */
454    input->markDepth++;
455
456    /* See if we are revisiting a mark as we can just reuse the vector
457     * entry if we are, otherwise, we need a new one
458     */
459    if	(input->markDepth > input->markers->count)
460    {
461	state	= ANTLR3_MALLOC(sizeof(ANTLR3_LEX_STATE));
462
463	/* Add it to the table
464	 */
465	input->markers->add(input->markers, state, ANTLR3_FREE_FUNC);	/* No special structure, just free() on delete */
466    }
467    else
468    {
469	state	= (pANTLR3_LEX_STATE)input->markers->get(input->markers, input->markDepth - 1);
470
471	/* Assume no errors for speed, it will just blow up if the table failed
472	 * for some reasons, hence lots of unit tests on the tables ;-)
473	 */
474    }
475
476    /* We have created or retrieved the state, so update it with the current
477     * elements of the lexer state.
478     */
479    state->charPositionInLine	= input->charPositionInLine;
480    state->currentLine		= input->currentLine;
481    state->line			= input->line;
482    state->nextChar		= input->nextChar;
483
484    is->lastMarker  = input->markDepth;
485
486    /* And that's it
487     */
488    return  input->markDepth;
489}
490/** \brief Rewind the lexer input to the state specified by the last produced mark.
491 *
492 * \param[in] input Input stream context pointer
493 *
494 * \remark
495 * Assumes 8 Bit input stream.
496 */
497static void
498antlr38BitRewindLast	(pANTLR3_INT_STREAM is)
499{
500    is->rewind(is, is->lastMarker);
501}
502
503/** \brief Rewind the lexer input to the state specified by the supplied mark.
504 *
505 * \param[in] input Input stream context pointer
506 *
507 * \remark
508 * Assumes 8 Bit input stream.
509 */
510static void
511antlr38BitRewind	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
512{
513    pANTLR3_LEX_STATE	state;
514    pANTLR3_INPUT_STREAM input;
515
516    input   = ((pANTLR3_INPUT_STREAM) is->super);
517
518    /* Perform any clean up of the marks
519     */
520    input->istream->release(input->istream, mark);
521
522    /* Find the supplied mark state
523     */
524    state   = (pANTLR3_LEX_STATE)input->markers->get(input->markers, (ANTLR3_UINT32)(mark - 1));
525
526    /* Seek input pointer to the requested point (note we supply the void *pointer
527     * to whatever is implementing the int stream to seek).
528     */
529    antlr38BitSeek(is, (ANTLR3_MARKER)(state->nextChar));
530
531    /* Reset to the reset of the information in the mark
532     */
533    input->charPositionInLine	= state->charPositionInLine;
534    input->currentLine		= state->currentLine;
535    input->line			= state->line;
536    input->nextChar		= state->nextChar;
537
538    /* And we are done
539     */
540}
541
542/** \brief Rewind the lexer input to the state specified by the supplied mark.
543 *
544 * \param[in] input Input stream context pointer
545 *
546 * \remark
547 * Assumes 8 Bit input stream.
548 */
549static void
550antlr38BitRelease	(pANTLR3_INT_STREAM is, ANTLR3_MARKER mark)
551{
552    pANTLR3_INPUT_STREAM input;
553
554    input   = ((pANTLR3_INPUT_STREAM) (is->super));
555
556    /* We don't do much here in fact as we never free any higher marks in
557     * the hashtable as we just resuse any memory allocated for them.
558     */
559    input->markDepth	= (ANTLR3_UINT32)(mark - 1);
560}
561
562/** \brief Rewind the lexer input to the state specified by the supplied mark.
563 *
564 * \param[in] input Input stream context pointer
565 *
566 * \remark
567 * Assumes 8 Bit input stream.
568 */
569static void
570antlr38BitSeek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
571{
572	ANTLR3_INT32   count;
573	pANTLR3_INPUT_STREAM input;
574
575	input   = ANTLR3_FUNC_PTR(((pANTLR3_INPUT_STREAM) is->super));
576
577	/* If the requested seek point is less than the current
578	* input point, then we assume that we are resetting from a mark
579	* and do not need to scan, but can just set to there.
580	*/
581	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
582	{
583		input->nextChar	= ((pANTLR3_UINT8) seekPoint);
584	}
585	else
586	{
587		count	= (ANTLR3_UINT32)(seekPoint - (ANTLR3_MARKER)(input->nextChar));
588
589		while (count--)
590		{
591			is->consume(is);
592		}
593	}
594}
595/** Return a substring of the 8 bit input stream in
596 *  newly allocated memory.
597 *
598 * \param input Input stream context pointer
599 * \param start Offset in input stream where the string starts
600 * \param stop  Offset in the input stream where the string ends.
601 */
602static pANTLR3_STRING
603antlr38BitSubstr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
604{
605	return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, (ANTLR3_UINT32)(stop - start + 1));
606}
607
608/** \brief Return the line number as understood by the 8 bit input stream.
609 *
610 * \param input Input stream context pointer
611 * \return	Line number in input stream that we believe we are working on.
612 */
613static ANTLR3_UINT32
614antlr38BitGetLine		(pANTLR3_INPUT_STREAM input)
615{
616    return  input->line;
617}
618
619/** Return a pointer into the input stream that points at the start
620 *  of the current input line as triggered by the end of line character installed
621 *  for the stream ('\n' unless told differently).
622 *
623 * \param[in] input
624 */
625static void	  *
626antlr38BitGetLineBuf	(pANTLR3_INPUT_STREAM input)
627{
628    return  input->currentLine;
629}
630
631/** Return the current offset in to the current line in the input stream.
632 *
633 * \param input Input stream context pointer
634 * \return      Current line offset
635 */
636static ANTLR3_UINT32
637antlr38BitGetCharPosition	(pANTLR3_INPUT_STREAM input)
638{
639    return  input->charPositionInLine;
640}
641
642/** Set the current line number as understood by the input stream.
643 *
644 * \param input Input stream context pointer
645 * \param line  Line number to tell the input stream we are on
646 *
647 * \remark
648 *  This function does not change any pointers, it just allows the programmer to set the
649 *  line number according to some external criterion, such as finding a lexed directive
650 *  like: #nnn "file.c" for instance, such that error reporting and so on in is in sync
651 *  with some original source format.
652 */
653static void
654antlr38BitSetLine		(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 line)
655{
656    input->line	= line;
657}
658
659/** Set the current offset in the current line to be a particular setting.
660 *
661 * \param[in] input    Input stream context pointer
662 * \param[in] position New setting for current offset.
663 *
664 * \remark
665 * This does not set the actual pointers in the input stream, it is purely for reporting
666 * purposes and so on as per antlr38BitSetLine();
667 */
668static void
669antlr38BitSetCharPosition	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 position)
670{
671    input->charPositionInLine = position;
672}
673
674/** Set the newline trigger character in the input stream to the supplied parameter.
675 *
676 * \param[in] input	    Input stream context pointer
677 * \param[in] newlineChar   Character to set to be the newline trigger.
678 *
679 * \remark
680 *  - The supplied newLineChar is in UTF32 encoding (which means ASCII and latin1 etc
681 *    are the same encodings), but the input stream catered to by this function is 8 bit
682 *    only, so it is up to the programmer to ensure that the character supplied is valid.
683 */
684static void
685antlr38BitSetNewLineChar	(pANTLR3_INPUT_STREAM input, ANTLR3_UINT32 newlineChar)
686{
687    input->newlineChar	= newlineChar;
688}
689
690
691/// \brief Common function to setup function interface for a UTF16 or UCS2 input stream.
692///
693/// \param input Input stream context pointer
694///
695/// \remark
696///  - Strictly speaking, there is no such thing as a UCS2 input stream as the term
697///    tends to confuse the notions of character encoding, unicode and so on. UCS2 is
698///    essentially UTF16 without any surrogates and so the standard UTF16
699///    input stream is able to handle it without any special code.
700///
701void
702antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
703{
704    // Build a string factory for this stream. This is a UTF16 string factory which is a standard
705    // part of the ANTLR3 string. The string factory is then passed through the whole chain
706    // of lexer->parser->tree->treeparser and so on.
707    //
708    input->strFactory	= antlr3StringFactoryNew(input->encoding);
709
710    // Generic API that does not care about endianess.
711    //
712    input->istream->index	    =  antlr3UTF16Index;            // Calculate current index in input stream, UTF16 based
713    input->substr		    =  antlr3UTF16Substr;	    // Return a string from the input stream
714    input->istream->seek	    =  antlr3UTF16Seek;		    // How to seek to a specific point in the stream
715
716    // We must install different UTF16 routines according to whether the input
717    // is the same endianess as the machine we are executing upon or not. If it is not
718    // then we must install methods that can convert the endianess on the fly as they go
719    //
720
721    switch (machineBigEndian)
722    {
723        case    ANTLR3_TRUE:
724
725            // Machine is Big Endian, if the input is also then install the
726            // methods that do not access input by bytes and reverse them.
727            // Otherwise install endian aware methods.
728            //
729            if  (inputBigEndian == ANTLR3_TRUE)
730            {
731                // Input is machine compatible
732                //
733                input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
734                input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
735            }
736            else
737            {
738                // Need to use methods that know that the input is little endian
739                //
740                input->istream->consume	    =  antlr3UTF16ConsumeLE;	    // Consume the next UTF16 character in the buffer
741                input->istream->_LA         =  antlr3UTF16LALE;		    // Return the UTF32 character at offset n (1 based)
742            }
743            break;
744
745        case    ANTLR3_FALSE:
746
747            // Machine is Little Endian, if the input is also then install the
748            // methods that do not access input by bytes and reverse them.
749            // Otherwise install endian aware methods.
750            //
751            if  (inputBigEndian == ANTLR3_FALSE)
752            {
753                // Input is machine compatible
754                //
755                input->istream->consume	    =  antlr3UTF16Consume;	    // Consume the next UTF16 character in the buffer
756                input->istream->_LA         =  antlr3UTF16LA;		    // Return the UTF32 character at offset n (1 based)
757            }
758            else
759            {
760                // Need to use methods that know that the input is Big Endian
761                //
762                input->istream->consume	    =  antlr3UTF16ConsumeBE;	    // Consume the next UTF16 character in the buffer
763                input->istream->_LA         =  antlr3UTF16LABE;		    // Return the UTF32 character at offset n (1 based)
764            }
765            break;
766    }
767
768
769    input->charByteSize		    = 2;			    // Size in bytes of characters in this stream.
770
771}
772
773/// \brief Consume the next character in a UTF16 input stream
774///
775/// \param input Input stream context pointer
776///
777static void
778antlr3UTF16Consume(pANTLR3_INT_STREAM is)
779{
780	pANTLR3_INPUT_STREAM input;
781        UTF32   ch;
782        UTF32   ch2;
783
784	input   = ((pANTLR3_INPUT_STREAM) (is->super));
785
786        // Buffer size is always in bytes
787        //
788	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
789	{
790		// Indicate one more character in this line
791		//
792		input->charPositionInLine++;
793
794		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
795		{
796			// Reset for start of a new line of input
797			//
798			input->line++;
799			input->charPositionInLine	= 0;
800			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
801		}
802
803		// Increment to next character position, accounting for any surrogates
804		//
805                // Next char in natural machine byte order
806                //
807                ch  = *((UTF16*)input->nextChar);
808
809                // We consumed one 16 bit character
810                //
811		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
812
813                // If we have a surrogate pair then we need to consume
814                // a following valid LO surrogate.
815                //
816                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
817
818                    // If the 16 bits following the high surrogate are in the source buffer...
819                    //
820                    if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
821                    {
822                        // Next character is in natural machine byte order
823                        //
824                        ch2 = *((UTF16*)input->nextChar);
825
826                        // If it's a valid low surrogate, consume it
827                        //
828                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
829                        {
830                            // We consumed one 16 bit character
831                            //
832		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
833                        }
834                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
835                        // it.
836                        //
837                    }
838                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
839                    // it because the buffer ended
840                    //
841                }
842                // Note that we did not check for an invalid low surrogate here, or that fact that the
843                // lo surrogate was missing. We just picked out one 16 bit character unless the character
844                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
845                //
846	}
847}
848
849/// \brief Return the input element assuming an 8 bit ascii input
850///
851/// \param[in] input Input stream context pointer
852/// \param[in] la 1 based offset of next input stream element
853///
854/// \return Next input character in internal ANTLR3 encoding (UTF32)
855///
856static ANTLR3_UCHAR
857antlr3UTF16LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
858{
859	pANTLR3_INPUT_STREAM input;
860        UTF32   ch;
861        UTF32   ch2;
862        UTF16   * nextChar;
863
864        // Find the input interface and where we are currently pointing to
865        // in the input stream
866        //
867	input       = ((pANTLR3_INPUT_STREAM) (is->super));
868        nextChar    = input->nextChar;
869
870        // If a positive offset then advance forward, else retreat
871        //
872        if  (la >= 0)
873        {
874            while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
875            {
876                // Advance our copy of the input pointer
877                //
878                // Next char in natural machine byte order
879                //
880                ch  = *nextChar++;
881
882                // If we have a surrogate pair then we need to consume
883                // a following valid LO surrogate.
884                //
885                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
886                {
887                    // If the 16 bits following the high surrogate are in the source buffer...
888                    //
889                    if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
890                    {
891                        // Next character is in natural machine byte order
892                        //
893                        ch2 = *nextChar;
894
895                        // If it's a valid low surrogate, consume it
896                        //
897                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
898                        {
899                            // We consumed one 16 bit character
900                            //
901		            nextChar++;
902                        }
903                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
904                        // it.
905                        //
906                    }
907                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
908                    // it because the buffer ended
909                    //
910                }
911                // Note that we did not check for an invalid low surrogate here, or that fact that the
912                // lo surrogate was missing. We just picked out one 16 bit character unless the character
913                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
914                //
915            }
916        }
917        else
918        {
919            // We need to go backwards from our input point
920            //
921            while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
922            {
923                // Get the previous 16 bit character
924                //
925                ch = *--nextChar;
926
927                // If we found a low surrogate then go back one more character if
928                // the hi surrogate is there
929                //
930                if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
931                {
932                    ch2 = *(nextChar-1);
933                    if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
934                    {
935                        // Yes, there is a high surrogate to match it so decrement one more and point to that
936                        //
937                        nextChar--;
938                    }
939                }
940            }
941        }
942
943        // Our local copy of nextChar is now pointing to either the correct character or end of file
944        //
945        // Input buffer size is always in bytes
946        //
947	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
948	{
949		return	ANTLR3_CHARSTREAM_EOF;
950	}
951	else
952	{
953            // Pick up the next 16 character (native machine byte order)
954            //
955            ch = *nextChar++;
956
957            // If we have a surrogate pair then we need to consume
958            // a following valid LO surrogate.
959            //
960            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
961            {
962                // If the 16 bits following the high surrogate are in the source buffer...
963                //
964                if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
965                {
966                    // Next character is in natural machine byte order
967                    //
968                    ch2 = *nextChar;
969
970                    // If it's a valid low surrogate, consume it
971                    //
972                    if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
973                    {
974                        // Construct the UTF32 code point
975                        //
976                        ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
977			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
978                    }
979                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
980                    // it.
981                    //
982                }
983                // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
984                // it because the buffer ended
985                //
986            }
987        }
988        return ch;
989}
990
991
992/// \brief Calculate the current index in the output stream.
993/// \param[in] input Input stream context pointer
994///
995static ANTLR3_MARKER
996antlr3UTF16Index(pANTLR3_INT_STREAM is)
997{
998    pANTLR3_INPUT_STREAM input;
999
1000    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1001
1002    return  (ANTLR3_MARKER)(input->nextChar);
1003}
1004
1005/// \brief Rewind the lexer input to the state specified by the supplied mark.
1006///
1007/// \param[in] input Input stream context pointer
1008///
1009/// \remark
1010/// Assumes UTF16 input stream.
1011///
1012static void
1013antlr3UTF16Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1014{
1015	pANTLR3_INPUT_STREAM input;
1016
1017	input   = ((pANTLR3_INPUT_STREAM) is->super);
1018
1019	// If the requested seek point is less than the current
1020	// input point, then we assume that we are resetting from a mark
1021	// and do not need to scan, but can just set to there as rewind will
1022        // reset line numbers and so on.
1023	//
1024	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1025	{
1026		input->nextChar	= (void *)seekPoint;
1027	}
1028	else
1029	{
1030            // Call consume until we reach the asked for seek point or EOF
1031            //
1032            while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1033	    {
1034		is->consume(is);
1035	    }
1036	}
1037}
1038/// \brief Return a substring of the UTF16 input stream in
1039///  newly allocated memory.
1040///
1041/// \param input Input stream context pointer
1042/// \param start Offset in input stream where the string starts
1043/// \param stop  Offset in the input stream where the string ends.
1044///
1045static pANTLR3_STRING
1046antlr3UTF16Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1047{
1048    return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/2) + 1);
1049}
1050
1051/// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
1052/// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
1053/// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
1054/// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
1055/// is fubar but we just ignore that.
1056///
1057/// \param input Input stream context pointer
1058///
1059static void
1060antlr3UTF16ConsumeLE(pANTLR3_INT_STREAM is)
1061{
1062	pANTLR3_INPUT_STREAM input;
1063        UTF32   ch;
1064        UTF32   ch2;
1065
1066	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1067
1068        // Buffer size is always in bytes
1069        //
1070	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1071	{
1072		// Indicate one more character in this line
1073		//
1074		input->charPositionInLine++;
1075
1076		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1077		{
1078			// Reset for start of a new line of input
1079			//
1080			input->line++;
1081			input->charPositionInLine	= 0;
1082			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1083		}
1084
1085		// Increment to next character position, accounting for any surrogates
1086		//
1087                // Next char in litle endian form
1088                //
1089                ch  = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1090
1091                // We consumed one 16 bit character
1092                //
1093		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1094
1095                // If we have a surrogate pair then we need to consume
1096                // a following valid LO surrogate.
1097                //
1098                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1099
1100                    // If the 16 bits following the high surrogate are in the source buffer...
1101                    //
1102                    if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1103                    {
1104                        ch2 = *((pANTLR3_UINT8)input->nextChar) + (*((pANTLR3_UINT8)input->nextChar + 1) <<8);
1105
1106                        // If it's a valid low surrogate, consume it
1107                        //
1108                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1109                        {
1110                            // We consumed one 16 bit character
1111                            //
1112		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1113                        }
1114                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1115                        // it.
1116                        //
1117                    }
1118                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1119                    // it because the buffer ended
1120                    //
1121                }
1122                // Note that we did not check for an invalid low surrogate here, or that fact that the
1123                // lo surrogate was missing. We just picked out one 16 bit character unless the character
1124                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1125                //
1126	}
1127}
1128
1129/// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1130///
1131/// \param[in] input Input stream context pointer
1132/// \param[in] la 1 based offset of next input stream element
1133///
1134/// \return Next input character in internal ANTLR3 encoding (UTF32)
1135///
1136static ANTLR3_UCHAR
1137antlr3UTF16LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1138{
1139	pANTLR3_INPUT_STREAM input;
1140        UTF32           ch;
1141        UTF32           ch2;
1142        pANTLR3_UCHAR   nextChar;
1143
1144        // Find the input interface and where we are currently pointing to
1145        // in the input stream
1146        //
1147	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1148        nextChar    = input->nextChar;
1149
1150        // If a positive offset then advance forward, else retreat
1151        //
1152        if  (la >= 0)
1153        {
1154            while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1155            {
1156                // Advance our copy of the input pointer
1157                //
1158                // Next char in Little Endian byte order
1159                //
1160                ch  = (*nextChar) + (*(nextChar+1) << 8);
1161                nextChar += 2;
1162
1163                // If we have a surrogate pair then we need to consume
1164                // a following valid LO surrogate.
1165                //
1166                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1167                {
1168                    // If the 16 bits following the high surrogate are in the source buffer...
1169                    //
1170                    if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1171                    {
1172                        // Next character is in little endian byte order
1173                        //
1174                        ch2 = (*nextChar) + (*(nextChar+1) << 8);
1175
1176                        // If it's a valid low surrogate, consume it
1177                        //
1178                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1179                        {
1180                            // We consumed one 16 bit character
1181                            //
1182		            nextChar += 2;
1183                        }
1184                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1185                        // it.
1186                        //
1187                    }
1188                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1189                    // it because the buffer ended
1190                    //
1191                }
1192                // Note that we did not check for an invalid low surrogate here, or that fact that the
1193                // lo surrogate was missing. We just picked out one 16 bit character unless the character
1194                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1195                //
1196            }
1197        }
1198        else
1199        {
1200            // We need to go backwards from our input point
1201            //
1202            while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1203            {
1204                // Get the previous 16 bit character
1205                //
1206                ch = (*nextChar - 2) + ((*nextChar -1) << 8);
1207                nextChar -= 2;
1208
1209                // If we found a low surrogate then go back one more character if
1210                // the hi surrogate is there
1211                //
1212                if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1213                {
1214                    ch2 = (*nextChar - 2) + ((*nextChar -1) << 8);
1215                    if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1216                    {
1217                        // Yes, there is a high surrogate to match it so decrement one more and point to that
1218                        //
1219                        nextChar -=2;
1220                    }
1221                }
1222            }
1223        }
1224
1225        // Our local copy of nextChar is now pointing to either the correct character or end of file
1226        //
1227        // Input buffer size is always in bytes
1228        //
1229	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1230	{
1231		return	ANTLR3_CHARSTREAM_EOF;
1232	}
1233	else
1234	{
1235            // Pick up the next 16 character (little endian byte order)
1236            //
1237            ch = (*nextChar) + (*(nextChar+1) << 8);
1238            nextChar += 2;
1239
1240            // If we have a surrogate pair then we need to consume
1241            // a following valid LO surrogate.
1242            //
1243            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1244            {
1245                // If the 16 bits following the high surrogate are in the source buffer...
1246                //
1247                if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1248                {
1249                    // Next character is in little endian byte order
1250                    //
1251                    ch2 = (*nextChar) + (*(nextChar+1) << 8);
1252
1253                    // If it's a valid low surrogate, consume it
1254                    //
1255                    if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1256                    {
1257                        // Construct the UTF32 code point
1258                        //
1259                        ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1260			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1261                    }
1262                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1263                    // it.
1264                    //
1265                }
1266                // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1267                // it because the buffer ended
1268                //
1269            }
1270        }
1271        return ch;
1272}
1273
1274/// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
1275///
1276/// \param input Input stream context pointer
1277///
1278static void
1279antlr3UTF16ConsumeBE(pANTLR3_INT_STREAM is)
1280{
1281	pANTLR3_INPUT_STREAM input;
1282        UTF32   ch;
1283        UTF32   ch2;
1284
1285	input   = ((pANTLR3_INPUT_STREAM) (is->super));
1286
1287        // Buffer size is always in bytes
1288        //
1289	if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1290	{
1291		// Indicate one more character in this line
1292		//
1293		input->charPositionInLine++;
1294
1295		if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT16)input->nextChar)) == input->newlineChar)
1296		{
1297			// Reset for start of a new line of input
1298			//
1299			input->line++;
1300			input->charPositionInLine	= 0;
1301			input->currentLine		= (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1302		}
1303
1304		// Increment to next character position, accounting for any surrogates
1305		//
1306                // Next char in big endian form
1307                //
1308                ch  = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1309
1310                // We consumed one 16 bit character
1311                //
1312		input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1313
1314                // If we have a surrogate pair then we need to consume
1315                // a following valid LO surrogate.
1316                //
1317                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
1318
1319                    // If the 16 bits following the high surrogate are in the source buffer...
1320                    //
1321                    if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1322                    {
1323                        // Big endian
1324                        //
1325                        ch2 = *((pANTLR3_UINT8)input->nextChar + 1) + (*((pANTLR3_UINT8)input->nextChar ) <<8);
1326
1327                        // If it's a valid low surrogate, consume it
1328                        //
1329                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1330                        {
1331                            // We consumed one 16 bit character
1332                            //
1333		            input->nextChar = (void *)(((pANTLR3_UINT16)input->nextChar) + 1);
1334                        }
1335                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1336                        // it.
1337                        //
1338                    }
1339                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1340                    // it because the buffer ended
1341                    //
1342                }
1343                // Note that we did not check for an invalid low surrogate here, or that fact that the
1344                // lo surrogate was missing. We just picked out one 16 bit character unless the character
1345                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1346                //
1347	}
1348}
1349
1350/// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
1351///
1352/// \param[in] input Input stream context pointer
1353/// \param[in] la 1 based offset of next input stream element
1354///
1355/// \return Next input character in internal ANTLR3 encoding (UTF32)
1356///
1357static ANTLR3_UCHAR
1358antlr3UTF16LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1359{
1360	pANTLR3_INPUT_STREAM input;
1361        UTF32           ch;
1362        UTF32           ch2;
1363        pANTLR3_UCHAR   nextChar;
1364
1365        // Find the input interface and where we are currently pointing to
1366        // in the input stream
1367        //
1368	input       = ((pANTLR3_INPUT_STREAM) (is->super));
1369        nextChar    = input->nextChar;
1370
1371        // If a positive offset then advance forward, else retreat
1372        //
1373        if  (la >= 0)
1374        {
1375            while   (--la > 0 && (pANTLR3_UINT8)nextChar < ((pANTLR3_UINT8)input->data) + input->sizeBuf )
1376            {
1377                // Advance our copy of the input pointer
1378                //
1379                // Next char in Big Endian byte order
1380                //
1381                ch  = ((*nextChar) << 8) + *(nextChar+1);
1382                nextChar += 2;
1383
1384                // If we have a surrogate pair then we need to consume
1385                // a following valid LO surrogate.
1386                //
1387                if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1388                {
1389                    // If the 16 bits following the high surrogate are in the source buffer...
1390                    //
1391                    if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1392                    {
1393                        // Next character is in big endian byte order
1394                        //
1395                        ch2 = ((*nextChar) << 8) + *(nextChar+1);
1396
1397                        // If it's a valid low surrogate, consume it
1398                        //
1399                        if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1400                        {
1401                            // We consumed one 16 bit character
1402                            //
1403		            nextChar += 2;
1404                        }
1405                        // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1406                        // it.
1407                        //
1408                    }
1409                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1410                    // it because the buffer ended
1411                    //
1412                }
1413                // Note that we did not check for an invalid low surrogate here, or that fact that the
1414                // lo surrogate was missing. We just picked out one 16 bit character unless the character
1415                // was a valid hi surrogate, in whcih case we consumed two 16 bit characters.
1416                //
1417            }
1418        }
1419        else
1420        {
1421            // We need to go backwards from our input point
1422            //
1423            while   (la++ < 0 && (pANTLR3_UINT8)nextChar > (pANTLR3_UINT8)input->data )
1424            {
1425                // Get the previous 16 bit character
1426                //
1427                ch = ((*nextChar - 2) << 8) + (*nextChar -1);
1428                nextChar -= 2;
1429
1430                // If we found a low surrogate then go back one more character if
1431                // the hi surrogate is there
1432                //
1433                if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END)
1434                {
1435                    ch2 = ((*nextChar - 2) << 8) + (*nextChar -1);
1436                    if (ch2 >= UNI_SUR_HIGH_START && ch2 <= UNI_SUR_HIGH_END)
1437                    {
1438                        // Yes, there is a high surrogate to match it so decrement one more and point to that
1439                        //
1440                        nextChar -=2;
1441                    }
1442                }
1443            }
1444        }
1445
1446        // Our local copy of nextChar is now pointing to either the correct character or end of file
1447        //
1448        // Input buffer size is always in bytes
1449        //
1450	if	( (pANTLR3_UINT8)nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1451	{
1452		return	ANTLR3_CHARSTREAM_EOF;
1453	}
1454	else
1455	{
1456            // Pick up the next 16 character (big endian byte order)
1457            //
1458            ch = ((*nextChar) << 8) + *(nextChar+1);
1459            nextChar += 2;
1460
1461            // If we have a surrogate pair then we need to consume
1462            // a following valid LO surrogate.
1463            //
1464            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END)
1465            {
1466                // If the 16 bits following the high surrogate are in the source buffer...
1467                //
1468                if	((pANTLR3_UINT8)(nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1469                {
1470                    // Next character is in big endian byte order
1471                    //
1472                    ch2 = ((*nextChar) << 8) + *(nextChar+1);
1473
1474                    // If it's a valid low surrogate, consume it
1475                    //
1476                    if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END)
1477                    {
1478                        // Construct the UTF32 code point
1479                        //
1480                        ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
1481			    + (ch2 - UNI_SUR_LOW_START) + halfBase;
1482                    }
1483                    // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1484                    // it.
1485                    //
1486                }
1487                // Note that we ignore a valid hi surrogate that has no lo surrogate to go with
1488                // it because the buffer ended
1489                //
1490            }
1491        }
1492        return ch;
1493}
1494
1495/// \brief Common function to setup function interface for a UTF3 input stream.
1496///
1497/// \param input Input stream context pointer
1498///
1499void
1500antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian)
1501{
1502    // Build a string factory for this stream. This is a UTF32 string factory which is a standard
1503    // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1504    // and so on.
1505    //
1506    input->strFactory	= antlr3StringFactoryNew(input->encoding);
1507
1508    // Generic API that does not care about endianess.
1509    //
1510    input->istream->index	    =  antlr3UTF32Index;            // Calculate current index in input stream, UTF16 based
1511    input->substr		    =  antlr3UTF32Substr;	    // Return a string from the input stream
1512    input->istream->seek	    =  antlr3UTF32Seek;		    // How to seek to a specific point in the stream
1513    input->istream->consume	    =  antlr3UTF32Consume;	    // Consume the next UTF32 character in the buffer
1514
1515    // We must install different UTF32 LA routines according to whether the input
1516    // is the same endianess as the machine we are executing upon or not. If it is not
1517    // then we must install methods that can convert the endianess on the fly as they go
1518    //
1519    switch (machineBigEndian)
1520    {
1521        case    ANTLR3_TRUE:
1522
1523            // Machine is Big Endian, if the input is also then install the
1524            // methods that do not access input by bytes and reverse them.
1525            // Otherwise install endian aware methods.
1526            //
1527            if  (inputBigEndian == ANTLR3_TRUE)
1528            {
1529                // Input is machine compatible
1530                //
1531                input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1532            }
1533            else
1534            {
1535                // Need to use methods that know that the input is little endian
1536                //
1537                input->istream->_LA         =  antlr3UTF32LALE;		    // Return the UTF32 character at offset n (1 based)
1538            }
1539            break;
1540
1541        case    ANTLR3_FALSE:
1542
1543            // Machine is Little Endian, if the input is also then install the
1544            // methods that do not access input by bytes and reverse them.
1545            // Otherwise install endian aware methods.
1546            //
1547            if  (inputBigEndian == ANTLR3_FALSE)
1548            {
1549                // Input is machine compatible
1550                //
1551                input->istream->_LA         =  antlr3UTF32LA;		    // Return the UTF32 character at offset n (1 based)
1552            }
1553            else
1554            {
1555                // Need to use methods that know that the input is Big Endian
1556                //
1557                input->istream->_LA         =  antlr3UTF32LABE;		    // Return the UTF32 character at offset n (1 based)
1558            }
1559            break;
1560    }
1561
1562    input->charByteSize		    = 4;			    // Size in bytes of characters in this stream.
1563}
1564
1565/** \brief Consume the next character in a UTF32 input stream
1566 *
1567 * \param input Input stream context pointer
1568 */
1569static void
1570antlr3UTF32Consume(pANTLR3_INT_STREAM is)
1571{
1572    pANTLR3_INPUT_STREAM input;
1573
1574    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1575
1576    // SizeBuf is always in bytes
1577    //
1578    if	((pANTLR3_UINT8)(input->nextChar) < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1579    {
1580	/* Indicate one more character in this line
1581	 */
1582	input->charPositionInLine++;
1583
1584	if  ((ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar)) == input->newlineChar)
1585	{
1586	    /* Reset for start of a new line of input
1587	     */
1588	    input->line++;
1589	    input->charPositionInLine	= 0;
1590	    input->currentLine		= (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1591	}
1592
1593	/* Increment to next character position
1594	 */
1595	input->nextChar = (void *)(((pANTLR3_UINT32)input->nextChar) + 1);
1596    }
1597}
1598
1599/// \brief Calculate the current index in the output stream.
1600/// \param[in] input Input stream context pointer
1601///
1602static ANTLR3_MARKER
1603antlr3UTF32Index(pANTLR3_INT_STREAM is)
1604{
1605    pANTLR3_INPUT_STREAM input;
1606
1607    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1608
1609    return  (ANTLR3_MARKER)(input->nextChar);
1610}
1611
1612/// \brief Return a substring of the UTF16 input stream in
1613///  newly allocated memory.
1614///
1615/// \param input Input stream context pointer
1616/// \param start Offset in input stream where the string starts
1617/// \param stop  Offset in the input stream where the string ends.
1618///
1619static pANTLR3_STRING
1620antlr3UTF32Substr		(pANTLR3_INPUT_STREAM input, ANTLR3_MARKER start, ANTLR3_MARKER stop)
1621{
1622    return  input->strFactory->newPtr(input->strFactory, (pANTLR3_UINT8)start, ((ANTLR3_UINT32_CAST(stop - start))/4) + 1);
1623}
1624
1625/// \brief Rewind the lexer input to the state specified by the supplied mark.
1626///
1627/// \param[in] input Input stream context pointer
1628///
1629/// \remark
1630/// Assumes UTF32 input stream.
1631///
1632static void
1633antlr3UTF32Seek	(pANTLR3_INT_STREAM is, ANTLR3_MARKER seekPoint)
1634{
1635	pANTLR3_INPUT_STREAM input;
1636
1637	input   = ((pANTLR3_INPUT_STREAM) is->super);
1638
1639	// If the requested seek point is less than the current
1640	// input point, then we assume that we are resetting from a mark
1641	// and do not need to scan, but can just set to there as rewind will
1642        // reset line numbers and so on.
1643	//
1644	if	(seekPoint <= (ANTLR3_MARKER)(input->nextChar))
1645	{
1646		input->nextChar	= (void *)seekPoint;
1647	}
1648	else
1649	{
1650            // Call consume until we reach the asked for seek point or EOF
1651            //
1652            while (is->_LA(is, 1) != ANTLR3_CHARSTREAM_EOF && seekPoint < (ANTLR3_MARKER)input->nextChar)
1653	    {
1654		is->consume(is);
1655	    }
1656	}
1657}
1658
1659/** \brief Return the input element assuming a UTF32 input in natural machine byte order
1660 *
1661 * \param[in] input Input stream context pointer
1662 * \param[in] la 1 based offset of next input stream element
1663 *
1664 * \return Next input character in internal ANTLR3 encoding (UTF32)
1665 */
1666static ANTLR3_UCHAR
1667antlr3UTF32LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1668{
1669    pANTLR3_INPUT_STREAM input;
1670
1671    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1672
1673    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1674    {
1675		return	ANTLR3_CHARSTREAM_EOF;
1676    }
1677    else
1678    {
1679		return	(ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1680    }
1681}
1682
1683/** \brief Return the input element assuming a UTF32 input in little endian byte order
1684 *
1685 * \param[in] input Input stream context pointer
1686 * \param[in] la 1 based offset of next input stream element
1687 *
1688 * \return Next input character in internal ANTLR3 encoding (UTF32)
1689 */
1690static ANTLR3_UCHAR
1691antlr3UTF32LALE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1692{
1693    pANTLR3_INPUT_STREAM input;
1694
1695    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1696
1697    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1698    {
1699		return	ANTLR3_CHARSTREAM_EOF;
1700    }
1701    else
1702    {
1703        ANTLR3_UCHAR   c;
1704
1705        c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1706
1707        // Swap Endianess to Big Endian
1708        //
1709        return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1710    }
1711}
1712
1713/** \brief Return the input element assuming a UTF32 input in big endian byte order
1714 *
1715 * \param[in] input Input stream context pointer
1716 * \param[in] la 1 based offset of next input stream element
1717 *
1718 * \return Next input character in internal ANTLR3 encoding (UTF32)
1719 * \remark This is the same code as LE version but seprated in case there are better optimisations fo rendinan swap
1720 */
1721static ANTLR3_UCHAR
1722antlr3UTF32LABE(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1723{
1724    pANTLR3_INPUT_STREAM input;
1725
1726    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1727
1728    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1729    {
1730		return	ANTLR3_CHARSTREAM_EOF;
1731    }
1732    else
1733    {
1734        ANTLR3_UCHAR   c;
1735
1736        c = (ANTLR3_UCHAR)(*((pANTLR3_UINT32)input->nextChar + la - 1));
1737
1738        // Swap Endianess to Little Endian
1739        //
1740        return (c>>24) | ((c<<8) & 0x00FF0000) | ((c>>8) & 0x0000FF00) | (c<<24);
1741    }
1742}
1743
1744
1745/// \brief Common function to setup function interface for a UTF8 input stream.
1746///
1747/// \param input Input stream context pointer
1748///
1749void
1750antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input)
1751{
1752    // Build a string factory for this stream. This is a UTF16 string factory which is a standard
1753    // part of the ANTLR3 string. The string factory is then passed through the whole chain of lexer->parser->tree->treeparser
1754    // and so on.
1755    //
1756    input->strFactory	= antlr3StringFactoryNew(input->encoding);
1757
1758    // Generic API that does not care about endianess.
1759    //
1760    input->istream->consume	= antlr3UTF8Consume;	// Consume the next UTF32 character in the buffer
1761    input->istream->_LA         = antlr3UTF8LA;         // Return the UTF32 character at offset n (1 based)
1762    input->charByteSize		= 0;	                // Size in bytes of characters in this stream.
1763}
1764
1765// ------------------------------------------------------
1766// Following is from Unicode.org (see antlr3convertutf.c)
1767//
1768
1769/// Index into the table below with the first byte of a UTF-8 sequence to
1770/// get the number of trailing bytes that are supposed to follow it.
1771/// Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
1772/// left as-is for anyone who may want to do such conversion, which was
1773/// allowed in earlier algorithms.
1774///
1775static const ANTLR3_UINT32 trailingBytesForUTF8[256] = {
1776    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1777    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1778    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1779    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1780    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1781    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1782    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1783    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
1784};
1785
1786/// Magic values subtracted from a buffer value during UTF8 conversion.
1787/// This table contains as many values as there might be trailing bytes
1788/// in a UTF-8 sequence.
1789///
1790static const UTF32 offsetsFromUTF8[6] =
1791    {   0x00000000UL, 0x00003080UL, 0x000E2080UL,
1792	0x03C82080UL, 0xFA082080UL, 0x82082080UL
1793    };
1794
1795// End of Unicode.org tables
1796// -------------------------
1797
1798
1799/** \brief Consume the next character in a UTF8 input stream
1800 *
1801 * \param input Input stream context pointer
1802 */
1803static void
1804antlr3UTF8Consume(pANTLR3_INT_STREAM is)
1805{
1806    pANTLR3_INPUT_STREAM    input;
1807    ANTLR3_UINT32           extraBytesToRead;
1808    ANTLR3_UCHAR            ch;
1809    pANTLR3_UINT8           nextChar;
1810
1811    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1812
1813    nextChar = input->nextChar;
1814
1815    if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1816    {
1817	// Indicate one more character in this line
1818	//
1819	input->charPositionInLine++;
1820
1821        // Are there more bytes needed to make up the whole thing?
1822        //
1823        extraBytesToRead = trailingBytesForUTF8[*nextChar];
1824
1825        if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1826        {
1827            input->nextChar = (((pANTLR3_UINT8)input->data) + input->sizeBuf);
1828            return;
1829        }
1830
1831        // Cases deliberately fall through (see note A in antlrconvertutf.c)
1832        // Legal UTF8 is only 4 bytes but 6 bytes could be used in old UTF8 so
1833        // we allow it.
1834        //
1835        ch  = 0;
1836       	switch (extraBytesToRead) {
1837	    case 5: ch += *nextChar++; ch <<= 6;
1838	    case 4: ch += *nextChar++; ch <<= 6;
1839	    case 3: ch += *nextChar++; ch <<= 6;
1840	    case 2: ch += *nextChar++; ch <<= 6;
1841	    case 1: ch += *nextChar++; ch <<= 6;
1842	    case 0: ch += *nextChar++;
1843	}
1844
1845        // Magically correct the input value
1846        //
1847	ch -= offsetsFromUTF8[extraBytesToRead];
1848	if  (ch == input->newlineChar)
1849	{
1850	    /* Reset for start of a new line of input
1851	     */
1852	    input->line++;
1853	    input->charPositionInLine	= 0;
1854	    input->currentLine		= (void *)nextChar;
1855	}
1856
1857        // Update input pointer
1858        //
1859        input->nextChar = nextChar;
1860    }
1861}
1862/** \brief Return the input element assuming a UTF8 input
1863 *
1864 * \param[in] input Input stream context pointer
1865 * \param[in] la 1 based offset of next input stream element
1866 *
1867 * \return Next input character in internal ANTLR3 encoding (UTF32)
1868 */
1869static ANTLR3_UCHAR
1870antlr3UTF8LA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
1871{
1872    pANTLR3_INPUT_STREAM    input;
1873    ANTLR3_UINT32           extraBytesToRead;
1874    ANTLR3_UCHAR            ch;
1875    pANTLR3_UINT8           nextChar;
1876
1877    input   = ((pANTLR3_INPUT_STREAM) (is->super));
1878
1879    nextChar = input->nextChar;
1880
1881    // Do we need to traverse forwards or backwards?
1882    // - LA(0) is treated as LA(1) and we assume that the nextChar is
1883    //   already positioned.
1884    // - LA(n+) ; n>1 means we must traverse forward n-1 characters catering for UTF8 encoding
1885    // - LA(-n) means we must traverse backwards n chracters
1886    //
1887    if (la > 1) {
1888
1889        // Make sure that we have at least one character left before trying to
1890        // loop through the buffer.
1891        //
1892        if	(nextChar < (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1893        {
1894            // Now traverse n-1 characters forward
1895            //
1896            while (--la > 0)
1897            {
1898                // Does the next character require trailing bytes?
1899                // If so advance the pointer by that many bytes as well as advancing
1900                // one position for what will be at least a single byte character.
1901                //
1902                nextChar += trailingBytesForUTF8[*nextChar] + 1;
1903
1904                // Does that calculation take us past the byte length of the buffer?
1905                //
1906                if	(nextChar >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1907                {
1908                    return ANTLR3_CHARSTREAM_EOF;
1909                }
1910            }
1911        }
1912        else
1913        {
1914            return ANTLR3_CHARSTREAM_EOF;
1915        }
1916    }
1917    else
1918    {
1919        // LA is negative so we decrease the pointer by n character positions
1920        //
1921        while   (nextChar > (pANTLR3_UINT8)input->data && la++ < 0)
1922        {
1923            // Traversing backwards in UTF8 means decermenting by one
1924            // then continuing to decrement while ever a character pattern
1925            // is flagged as being a trailing byte of an encoded code point.
1926            // Trailing UTF8 bytes always start with 10 in binary. We assumne that
1927            // the UTF8 is well formed and do not check boundary conditions
1928            //
1929            nextChar--;
1930            while ((*nextChar & 0xC0) == 0x80)
1931            {
1932                nextChar--;
1933            }
1934        }
1935    }
1936
1937    // nextChar is now pointing at the UTF8 encoded character that we need to
1938    // decode and return.
1939    //
1940    // Are there more bytes needed to make up the whole thing?
1941    //
1942    extraBytesToRead = trailingBytesForUTF8[*nextChar];
1943    if	(nextChar + extraBytesToRead >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
1944    {
1945        return ANTLR3_CHARSTREAM_EOF;
1946    }
1947
1948    // Cases deliberately fall through (see note A in antlrconvertutf.c)
1949    //
1950    ch  = 0;
1951    switch (extraBytesToRead) {
1952            case 5: ch += *nextChar++; ch <<= 6;
1953            case 4: ch += *nextChar++; ch <<= 6;
1954            case 3: ch += *nextChar++; ch <<= 6;
1955            case 2: ch += *nextChar++; ch <<= 6;
1956            case 1: ch += *nextChar++; ch <<= 6;
1957            case 0: ch += *nextChar++;
1958    }
1959
1960    // Magically correct the input value
1961    //
1962    ch -= offsetsFromUTF8[extraBytesToRead];
1963
1964    return ch;
1965}
1966
1967// EBCDIC to ASCII conversion table
1968//
1969// This for EBCDIC EDF04 translated to ISO-8859.1 which is the usually accepted POSIX
1970// translation and the character tables are published all over the interweb.
1971//
1972const ANTLR3_UCHAR e2a[256] =
1973{
1974    0x00, 0x01, 0x02, 0x03, 0x85, 0x09, 0x86, 0x7f,
1975    0x87, 0x8d, 0x8e, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
1976    0x10, 0x11, 0x12, 0x13, 0x8f, 0x0a, 0x08, 0x97,
1977    0x18, 0x19, 0x9c, 0x9d, 0x1c, 0x1d, 0x1e, 0x1f,
1978    0x80, 0x81, 0x82, 0x83, 0x84, 0x92, 0x17, 0x1b,
1979    0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x05, 0x06, 0x07,
1980    0x90, 0x91, 0x16, 0x93, 0x94, 0x95, 0x96, 0x04,
1981    0x98, 0x99, 0x9a, 0x9b, 0x14, 0x15, 0x9e, 0x1a,
1982    0x20, 0xa0, 0xe2, 0xe4, 0xe0, 0xe1, 0xe3, 0xe5,
1983    0xe7, 0xf1, 0x60, 0x2e, 0x3c, 0x28, 0x2b, 0x7c,
1984    0x26, 0xe9, 0xea, 0xeb, 0xe8, 0xed, 0xee, 0xef,
1985    0xec, 0xdf, 0x21, 0x24, 0x2a, 0x29, 0x3b, 0x9f,
1986    0x2d, 0x2f, 0xc2, 0xc4, 0xc0, 0xc1, 0xc3, 0xc5,
1987    0xc7, 0xd1, 0x5e, 0x2c, 0x25, 0x5f, 0x3e, 0x3f,
1988    0xf8, 0xc9, 0xca, 0xcb, 0xc8, 0xcd, 0xce, 0xcf,
1989    0xcc, 0xa8, 0x3a, 0x23, 0x40, 0x27, 0x3d, 0x22,
1990    0xd8, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
1991    0x68, 0x69, 0xab, 0xbb, 0xf0, 0xfd, 0xfe, 0xb1,
1992    0xb0, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70,
1993    0x71, 0x72, 0xaa, 0xba, 0xe6, 0xb8, 0xc6, 0xa4,
1994    0xb5, 0xaf, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
1995    0x79, 0x7a, 0xa1, 0xbf, 0xd0, 0xdd, 0xde, 0xae,
1996    0xa2, 0xa3, 0xa5, 0xb7, 0xa9, 0xa7, 0xb6, 0xbc,
1997    0xbd, 0xbe, 0xac, 0x5b, 0x5c, 0x5d, 0xb4, 0xd7,
1998    0xf9, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
1999    0x48, 0x49, 0xad, 0xf4, 0xf6, 0xf2, 0xf3, 0xf5,
2000    0xa6, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
2001    0x51, 0x52, 0xb9, 0xfb, 0xfc, 0xdb, 0xfa, 0xff,
2002    0xd9, 0xf7, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
2003    0x59, 0x5a, 0xb2, 0xd4, 0xd6, 0xd2, 0xd3, 0xd5,
2004    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
2005    0x38, 0x39, 0xb3, 0x7b, 0xdc, 0x7d, 0xda, 0x7e
2006};
2007
2008/// \brief Common function to setup function interface for a EBCDIC input stream.
2009///
2010/// \param input Input stream context pointer
2011///
2012void
2013antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input)
2014{
2015    // EBCDIC streams can use the standard 8 bit string factory
2016    //
2017    input->strFactory	= antlr3StringFactoryNew(input->encoding);
2018
2019    // Generic API that does not care about endianess.
2020    //
2021    input->istream->_LA         = antlr3EBCDICLA;       // Return the UTF32 character at offset n (1 based)
2022    input->charByteSize		= 1;	                // Size in bytes of characters in this stream.
2023}
2024
2025/// \brief Return the input element assuming an 8 bit EBCDIC input
2026///
2027/// \param[in] input Input stream context pointer
2028/// \param[in] la 1 based offset of next input stream element
2029///
2030/// \return Next input character in internal ANTLR3 encoding (UTF32) after translation
2031///         from EBCDIC to ASCII
2032///
2033static ANTLR3_UCHAR
2034antlr3EBCDICLA(pANTLR3_INT_STREAM is, ANTLR3_INT32 la)
2035{
2036    pANTLR3_INPUT_STREAM input;
2037
2038    input   = ((pANTLR3_INPUT_STREAM) (is->super));
2039
2040    if	(( ((pANTLR3_UINT8)input->nextChar) + la - 1) >= (((pANTLR3_UINT8)input->data) + input->sizeBuf))
2041    {
2042        return	ANTLR3_CHARSTREAM_EOF;
2043    }
2044    else
2045    {
2046        // Translate the required character via the constant conversion table
2047        //
2048        return	e2a[(*((pANTLR3_UINT8)input->nextChar + la - 1))];
2049    }
2050}