antlr3input.h revision 324c4644fee44b9898524c09511bd33c3f12e2df
1/** \file
2 * Defines the basic structures used to manipulate character
3 * streams from any input source. Any character size and encoding
4 * can in theory be used, so long as a set of functions is provided that
5 * can return a 32 bit Integer representation of their characters and efficiently mark and revert
6 * to specific offsets into their input streams.
7 */
8#ifndef	_ANTLR3_INPUT_H
9#define	_ANTLR3_INPUT_H
10
11// [The "BSD licence"]
12// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
13// http://www.temporal-wave.com
14// http://www.linkedin.com/in/jimidle
15//
16// All rights reserved.
17//
18// Redistribution and use in source and binary forms, with or without
19// modification, are permitted provided that the following conditions
20// are met:
21// 1. Redistributions of source code must retain the above copyright
22//    notice, this list of conditions and the following disclaimer.
23// 2. Redistributions in binary form must reproduce the above copyright
24//    notice, this list of conditions and the following disclaimer in the
25//    documentation and/or other materials provided with the distribution.
26// 3. The name of the author may not be used to endorse or promote products
27//    derived from this software without specific prior written permission.
28//
29// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
30// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
31// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
32// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
33// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
34// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
38// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39
40#include    <antlr3defs.h>
41#include    <antlr3string.h>
42#include    <antlr3commontoken.h>
43#include    <antlr3intstream.h>
44#include    <antlr3convertutf.h>
45
46#ifdef __cplusplus
47extern "C" {
48#endif
49
50
51
52/// Master context structure for an ANTLR3 C runtime based input stream.
53/// \ingroup apistructures
54/// Groups the input buffer pointers, line/column bookkeeping, mark() state and the function-pointer API.
55typedef	struct	ANTLR3_INPUT_STREAM_struct
56{
57    /** Generic ANTLR3_INT_STREAM interface for this stream. Interfaces that
58     *  provide streams must all provide one, and an ANTLR3_INPUT_STREAM
59     *  is no different.
60     */
61    pANTLR3_INT_STREAM	istream;
62
63    /** Pointer to whatever super structure is providing the INPUT stream. It needs a pointer
64     *  to itself so that this can be passed back to it whenever the api functions
65     *  are called back from this interface.
66     */
67    void	      * super;
68
69    /** Pointer to the start of the input string. Characters may be
70     *  taken as offsets from here and are in the original input format encoding.
71     */
72    void	      *	data;
73
74    /** Indicates if the data pointer was allocated by us, and so should be freed
75     *  when the stream dies.
76     */
77    int			isAllocated;
78
79    /** String factory for this input stream.
80     */
81    pANTLR3_STRING_FACTORY  strFactory;
82
83
84    /** Pointer to the next character to be consumed from the input data.
85     *  This is cast to point at the encoding of the original file that
86     *  was read by the functions installed as pointers in this input stream
87     *  context instance at file/string/whatever load time.
88     */
89    void	      * nextChar;
90
91    /** Number of characters that can be consumed at this point in time.
92     *  Mostly this is just what is left in the pre-read buffer, but if the
93     *  input source is a stream such as a socket or something then we may
94     *  call special read code to wait for more input.
95     */
96    ANTLR3_UINT32	sizeBuf;
97
98    /** The line number we are traversing in the input file. This gets incremented
99     *  by a newline() call in the lexer grammar actions.
100     */
101    ANTLR3_UINT32	line;
102
103    /** Pointer into the input buffer where the current line
104     *  started.
105     */
106    void	      * currentLine;
107
108    /** The offset within the current line of the current character.
109     */
110    ANTLR3_INT32	charPositionInLine;
111
112    /** Tracks how deeply mark() calls are nested.
113     */
114    ANTLR3_UINT32	markDepth;
115
116    /** List of mark() points in the input stream.
117     */
118    pANTLR3_VECTOR	markers;
119
120    /** File name string. If you set it manually, set it to a pointer to
121     *  allocated memory, as it will be free()d.
122     */
123    pANTLR3_STRING	fileName;
124
125    /** File number, needs to be set manually to some file index of your devising.
126     */
127    ANTLR3_UINT32	fileNo;
128
129    /* API */
130
131
132   /** Pointer to function that closes the input stream.
     *  \param input  The input stream context to close.
133     */
134    void		(*close)	(struct	ANTLR3_INPUT_STREAM_struct * input);
    /** NOTE(review): this pointer has no documentation of its own in this header.
     *  It has the same signature as close(); presumably it releases the stream
     *  itself - confirm against the implementation.
     */
135    void		(*free)		(struct	ANTLR3_INPUT_STREAM_struct * input);
136
137    /** Pointer to function that resets the input stream.
     *  \param input  The input stream context to reset.
138     */
139    void		(*reset)	(struct	ANTLR3_INPUT_STREAM_struct * input);
140
141    /** Pointer to a function that reuses and resets an input stream by
142     *  supplying a new 'source'.
     *  \param input     The input stream context to reuse.
     *  \param inString  The new source data.
     *  \param size      Size of inString (units are not stated in this header - confirm).
     *  \param name      Name to associate with the new source (presumably the file name - confirm).
143     */
144    void                (*reuse)        (struct	ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
145
146    /**
147     * Pointer to function that installs a version of LA that always
148     * returns upper case. Only valid for character streams and creates a case
149     * insensitive lexer if the lexer tokens are described in upper case. The
150     * tokens will preserve case in the token text.
     * \param input  The input stream context to modify.
     * \param flag   Presumably enables (true) or disables (false) the upper case LA - confirm.
151     */
152    void		(*setUcaseLA)		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
153
154    /** Pointer to function to return input stream element at 1 based
155     *  offset from nextChar. Same as _LA for char stream, but token
156     *  streams etc. have one of these that does other stuff of course.
     *  \param input  The input stream context.
     *  \param lt     The 1 based offset from nextChar.
157     */
158    void *		(*_LT)		(struct	ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt);
159
160    /** Pointer to function to return the total size of the input buffer. For streams
161     *  this may be just the total we have available so far. This means of course that
162     *  the input stream must be careful to accumulate enough input so that any backtracking
163     *  can be satisfied.
164     */
165    ANTLR3_UINT32	(*size)		(struct ANTLR3_INPUT_STREAM_struct * input);
166
167    /** Pointer to function to return a substring of the input stream. String is returned in allocated
168     *  memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form.
     *  \param input  The input stream context.
     *  \param start  Marker for the start of the substring.
     *  \param stop   Marker for the end of the substring (inclusive or exclusive is not stated here - confirm).
169     */
170    pANTLR3_STRING	(*substr)	(struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
171
172    /** Pointer to function to return the current line number in the input stream.
173     */
174    ANTLR3_UINT32	(*getLine)	(struct ANTLR3_INPUT_STREAM_struct * input);
175
176    /** Pointer to function to return the current line buffer in the input stream.
177     *  The pointer returned is directly into the input stream so you must copy
178     *  it if you wish to manipulate it without damaging the input stream. Encoding
179     *  is obviously in the same form as the input stream.
180     *  \remark
181     *    - Note that this function will be inaccurate if setLine is called as there
182     *      is no way at the moment to position the input stream at a particular line
183     *	    number offset.
184     */
185    void	  *	(*getLineBuf)	(struct ANTLR3_INPUT_STREAM_struct * input);
186
187    /** Pointer to function to return the current offset in the current input stream line.
     *  NOTE(review): the return type is ANTLR3_UINT32 although the charPositionInLine
     *  member of this structure is declared as ANTLR3_INT32.
188     */
189    ANTLR3_UINT32	(*getCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input);
190
191    /** Pointer to function to set the current line number in the input stream.
     *  \param input  The input stream context.
     *  \param line   The line number to install.
192     */
193    void		(*setLine)		  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line);
194
195    /** Pointer to function to set the current position in the current line.
     *  \param input     The input stream context.
     *  \param position  The offset within the current line to install.
196     */
197    void		(*setCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position);
198
199    /** Pointer to function to override the default newline character that the input stream
200     *  looks for to trigger the line/offset and line buffer recording information.
201     *  \remark
202     *   - By default the character '\n' will be installed as the newline trigger character. When this
203     *     character is seen by the consume() function then the current line number is incremented and the
204     *     current line offset is reset to 0. The pointer for the line of input we are consuming
205     *     is updated to point to the next character after this one in the input stream (which means it
206     *     may become invalid if the last newline character in the file is seen, so watch out).
207     *   - If for some reason you do not want the counters and pointers to be reset, you can set the
208     *     character to some impossible character such as '\0' or whatever.
209     *   - This is a single character only, so choose the last character in a sequence of two or more.
210     *   - This is only a simple aid to error reporting - if you have a complicated binary input structure
211     *     it may not be adequate, but you can always override every function in the input stream with your
212     *     own of course, and can even write your own complete input stream set if you like.
213     *   - It is your responsibility to set a valid character for the input stream type. There is no point
214     *     setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
215     *	   trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
     *   - NOTE(review): the parameter is declared ANTLR3_UINT32 while the newlineChar member
     *     is declared ANTLR3_UCHAR; the relationship between those types is defined outside this header.
216     */
217    void		(*SetNewLineChar)	    (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar);
218
219    /// Character that automatically causes an internal line count
220    ///  increment.
221    ///
222    ANTLR3_UCHAR	newlineChar;
223
224    /// Indicates the size, in 8 bit units, of a single character. Note that
225    /// the C runtime does not deal with surrogates as this would be
226    /// slow and complicated. If this is a UTF-8 stream then this field
227    /// will be set to 0. Generally you are best working internally with 32 bit characters
228    /// as this is the most efficient.
229    ///
230    ANTLR3_UINT8	charByteSize;
231
232    /// Indicates the encoding scheme used in this input stream.
233    ///
234    ANTLR3_UINT32       encoding;
235}
236
237    ANTLR3_INPUT_STREAM;
238
239
240/** \brief Structure for tracking lexer input states as part of mark()
241 *  and rewind() of lexer.
 *
 *  Each member has the same name and type as the corresponding member of
 *  ANTLR3_INPUT_STREAM.
242 */
243typedef	struct	ANTLR3_LEX_STATE_struct
244{
245        /** Pointer to the next character to be consumed from the input data.
246     *  This is cast to point at the encoding of the original file that
247     *  was read by the functions installed as pointers in this input stream
248     *  context instance at file/string/whatever load time.
249     */
250    void	      * nextChar;
251
252    /** The line number we are traversing in the input file. This gets incremented
253     *  by a newline() call in the lexer grammar actions.
254     */
255    ANTLR3_UINT32	line;
256
257    /** Pointer into the input buffer where the current line
258     *  started.
259     */
260    void	      * currentLine;
261
262    /** The offset within the current line of the current character.
263     */
264    ANTLR3_INT32	charPositionInLine;
265
266}
267    ANTLR3_LEX_STATE;
268
269/* Prototypes for the stream setup functions declared by this header.
 * Each one takes the input stream context to set up. Their names suggest one
 * function per encoding family (8 bit, UTF-16, UTF-32, UTF-8, EBCDIC) plus a
 * generic one; the implementations are not visible here.
 * NOTE(review): for the UTF-16 and UTF-32 variants, machineBigEndian and
 * inputBigEndian presumably describe the byte order of the host and of the
 * input data respectively - confirm against the implementations.
270 */
271void	    antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input);
272void	    antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
273void	    antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
274void	    antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input);
275void	    antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input);
276void        antlr3GenericSetupStream    (pANTLR3_INPUT_STREAM input);
277#ifdef __cplusplus
278}
279#endif
280
281#endif	/* _ANTLR3_INPUT_H  */
282