/** \file
 * Base interface for any ANTLR3 lexer.
 *
 * An ANTLR3 lexer is built from two sets of components:
 *
 *  - The runtime components that provide common functionality such as
 *    traversing character streams, building tokens for output and so on.
 *  - The generated rules and structure of the actual lexer, which call upon the
 *    runtime components.
10 *
 * A lexer class contains a character input stream, a base recognizer interface
 * (which it will normally implement) and a token source interface (which it also
 * implements). The token source interface is called by a token consumer (such as
 * a parser, but in theory it can be anything that wants a set of abstract
 * tokens in place of a raw character stream).
16 *
 * So then, we set up a lexer in a sequence akin to:
 *
 *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
 *    and initialize it.
 *  - Create a lexer interface and tell it where its input stream is.
 *    This will cause the creation of a base recognizer class, which it will
 *    override with its own implementations of some methods. The lexer creator
 *    can also then in turn override anything it likes.
 *  - The lexer token source interface is then passed to some interface that
 *    knows how to use it, by calling for a next token.
 *  - When a next token is called for, let ze lexing begin.
28 *
29 */
30#ifndef	_ANTLR3_LEXER
31#define	_ANTLR3_LEXER
32
33// [The "BSD licence"]
34// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
35// http://www.temporal-wave.com
36// http://www.linkedin.com/in/jimidle
37//
38// All rights reserved.
39//
40// Redistribution and use in source and binary forms, with or without
41// modification, are permitted provided that the following conditions
42// are met:
43// 1. Redistributions of source code must retain the above copyright
44//    notice, this list of conditions and the following disclaimer.
45// 2. Redistributions in binary form must reproduce the above copyright
46//    notice, this list of conditions and the following disclaimer in the
47//    documentation and/or other materials provided with the distribution.
48// 3. The name of the author may not be used to endorse or promote products
49//    derived from this software without specific prior written permission.
50//
51// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
52// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
53// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
54// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
55// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
56// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
57// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
58// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
59// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
60// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
61
/** Terminator value for strings held as arrays of ANTLR3_UCHAR.
 *
 *  The documentation of the lexer's matchs member in this header states that
 *  such strings are terminated with 0xFFFFFFFF, which is not a valid UTF32
 *  character and therefore cannot be confused with real input.
 */
#define	ANTLR3_STRING_TERMINATOR	0xFFFFFFFF
65
66#include    <antlr3defs.h>
67#include    <antlr3input.h>
68#include    <antlr3commontoken.h>
69#include    <antlr3tokenstream.h>
70#include    <antlr3baserecognizer.h>
71
72#ifdef __cplusplus
73extern "C" {
74#endif
75
76typedef	struct ANTLR3_LEXER_struct
77{
78    /** If there is a super structure that is implementing the
79     *  lexer, then a pointer to it can be stored here in case
80     *  implementing functions are overridden by this super structure.
81     */
82    void	* super;
83
84    /** A generated lexer has an mTokens() function, which needs
85     *  the context pointer of the generated lexer, not the base lexer interface
86     *  this is stored here and initialized by the generated code (or manually
87     *  if this is a manually built lexer.
88     */
89    void	* ctx;
90
91    /** A pointer to the character stream whence this lexer is receiving
92     *  characters.
93     *  TODO: I may come back to this and implement charstream outside
94     *  the input stream as per the java implementation.
95     */
96    pANTLR3_INPUT_STREAM	input;
97
98    /** Pointer to the implementation of a base recognizer, which the lexer
99     *  creates and then overrides with its own lexer oriented functions (the
100     *  default implementation is parser oriented). This also contains a
101     *  token source interface, which the lexer instance will provide to anything
102     *  that needs it, which is anything else that implements a base recognizer,
103     *  such as a parser.
104     */
105    pANTLR3_BASE_RECOGNIZER	rec;
106
107    /** Pointer to a function that sets the charstream source for the lexer and
108     *  causes it to  be reset.
109     */
110    void			(*setCharStream)    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
111
112    /** Pointer to a function that switches the current character input stream to
113     *  a new one, saving the old one, which we will revert to at the end of this
114     *  new one.
115     */
116    void			(*pushCharStream)   (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
117
118    /** Pointer to a function that abandons the current input stream, whether it
119     *  is empty or not and reverts to the previous stacked input stream.
120     */
121    void			(*popCharStream)    (struct ANTLR3_LEXER_struct * lexer);
122
123    /** Pointer to a function that emits the supplied token as the next token in
124     *  the stream.
125     */
126    void			(*emitNew)	    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token);
127
128    /** Pointer to a function that constructs a new token from the lexer stored information
129     */
130    pANTLR3_COMMON_TOKEN	(*emit)		    (struct ANTLR3_LEXER_struct * lexer);
131
132    /** Pointer to the user provided (either manually or through code generation
133     *  function that causes the lexer rules to run the lexing rules and produce
134     *  the next token if there iss one. This is called from nextToken() in the
135     *  pANTLR3_TOKEN_SOURCE. Note that the input parameter for this funciton is
136     *  the generated lexer context (stored in ctx in this interface) it is a generated
137     *  function and expects the context to be the generated lexer.
138     */
139    void	        (*mTokens)		    (void * ctx);
140
141    /** Pointer to a function that attempts to match and consume the specified string from the input
142     *  stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
143     *  with 0xFFFFFFFF, which is an invalid UTF32 character
144     */
145    ANTLR3_BOOLEAN	(*matchs)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string);
146
147    /** Pointer to a function that matches and consumes the specified character from the input stream.
148     *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
149     *  implementation is source encoding agnostic and so input streams do not generally need to
150     *  override the default implmentation.
151     */
152    ANTLR3_BOOLEAN	(*matchc)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c);
153
154    /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
155     *  but this would only be useful if the tokens were in tsome guaranteed order which is
156     *  only going to happen with a hand crafted token set).
157     */
158    ANTLR3_BOOLEAN	(*matchRange)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high);
159
160    /** Pointer to a function that matches the next token/char in the input stream
161     *  regardless of what it actaully is.
162     */
163    void		(*matchAny)	    (struct ANTLR3_LEXER_struct * lexer);
164
165    /** Pointer to a function that recovers from an error found in the input stream.
166     *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
167     *  be from a mismatched token that the (*match)() could not recover from.
168     */
169    void		(*recover)	    (struct ANTLR3_LEXER_struct * lexer);
170
171    /** Pointer to function to return the current line number in the input stream
172     */
173    ANTLR3_UINT32	(*getLine)		(struct ANTLR3_LEXER_struct * lexer);
174    ANTLR3_MARKER	(*getCharIndex)		(struct ANTLR3_LEXER_struct * lexer);
175    ANTLR3_UINT32	(*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer);
176
177    /** Pointer to function to return the text so far for the current token being generated
178     */
179    pANTLR3_STRING	(*getText)	    (struct ANTLR3_LEXER_struct * lexer);
180
181
182    /** Pointer to a function that knows how to free the resources of a lexer
183     */
184    void		(*free)		    (struct ANTLR3_LEXER_struct * lexer);
185
186}
187    ANTLR3_LEXER;
188
189#ifdef __cplusplus
190}
191#endif
192
193#endif
194