1/*
2 [The "BSD license"]
3 Copyright (c) 2005-2009 Terence Parr
4 All rights reserved.
5
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions
8 are met:
9 1. Redistributions of source code must retain the above copyright
10     notice, this list of conditions and the following disclaimer.
11 2. Redistributions in binary form must reproduce the above copyright
12     notice, this list of conditions and the following disclaimer in the
13     documentation and/or other materials provided with the distribution.
14 3. The name of the author may not be used to endorse or promote products
15     derived from this software without specific prior written permission.
16
17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 */
28package org.antlr.runtime;
29
/** A lexer is a recognizer that draws input symbols from a character stream.
 *  Lexer grammars result in a subclass of this object. A Lexer object
 *  uses simplified match() and error recovery mechanisms in the interest
 *  of speed.
 */
35public abstract class Lexer extends BaseRecognizer implements TokenSource {
36	/** Where is the lexer drawing characters from? */
37	protected CharStream input;
38
39	public Lexer() {
40	}
41
42	public Lexer(CharStream input) {
43		this.input = input;
44	}
45
46	public Lexer(CharStream input, RecognizerSharedState state) {
47		super(state);
48		this.input = input;
49	}
50
51	public void reset() {
52		super.reset(); // reset all recognizer state variables
53		// wack Lexer state variables
54		if ( input!=null ) {
55			input.seek(0); // rewind the input
56		}
57		if ( state==null ) {
58			return; // no shared state work to do
59		}
60		state.token = null;
61		state.type = Token.INVALID_TOKEN_TYPE;
62		state.channel = Token.DEFAULT_CHANNEL;
63		state.tokenStartCharIndex = -1;
64		state.tokenStartCharPositionInLine = -1;
65		state.tokenStartLine = -1;
66		state.text = null;
67	}
68
69	/** Return a token from this source; i.e., match a token on the char
70	 *  stream.
71	 */
72	public Token nextToken() {
73		while (true) {
74			state.token = null;
75			state.channel = Token.DEFAULT_CHANNEL;
76			state.tokenStartCharIndex = input.index();
77			state.tokenStartCharPositionInLine = input.getCharPositionInLine();
78			state.tokenStartLine = input.getLine();
79			state.text = null;
80			if ( input.LA(1)==CharStream.EOF ) {
81                Token eof = new CommonToken((CharStream)input,Token.EOF,
82                                            Token.DEFAULT_CHANNEL,
83                                            input.index(),input.index());
84                eof.setLine(getLine());
85                eof.setCharPositionInLine(getCharPositionInLine());
86                return eof;
87			}
88			try {
89				mTokens();
90				if ( state.token==null ) {
91					emit();
92				}
93				else if ( state.token==Token.SKIP_TOKEN ) {
94					continue;
95				}
96				return state.token;
97			}
98			catch (MismatchedRangeException re) {
99				reportError(re);
100				// matchRange() routine has already called recover()
101			}
102			catch (MismatchedTokenException re) {
103				reportError(re);
104				// match() routine has already called recover()
105			}
106			catch (RecognitionException re) {
107				reportError(re);
108				recover(re); // throw out current char and try again
109			}
110		}
111	}
112
113	/** Instruct the lexer to skip creating a token for current lexer rule
114	 *  and look for another token.  nextToken() knows to keep looking when
115	 *  a lexer rule finishes with token set to SKIP_TOKEN.  Recall that
116	 *  if token==null at end of any token rule, it creates one for you
117	 *  and emits it.
118	 */
119	public void skip() {
120		state.token = Token.SKIP_TOKEN;
121	}
122
123	/** This is the lexer entry point that sets instance var 'token' */
124	public abstract void mTokens() throws RecognitionException;
125
126	/** Set the char stream and reset the lexer */
127	public void setCharStream(CharStream input) {
128		this.input = null;
129		reset();
130		this.input = input;
131	}
132
133	public CharStream getCharStream() {
134		return this.input;
135	}
136
137	public String getSourceName() {
138		return input.getSourceName();
139	}
140
141	/** Currently does not support multiple emits per nextToken invocation
142	 *  for efficiency reasons.  Subclass and override this method and
143	 *  nextToken (to push tokens into a list and pull from that list rather
144	 *  than a single variable as this implementation does).
145	 */
146	public void emit(Token token) {
147		state.token = token;
148	}
149
150	/** The standard method called to automatically emit a token at the
151	 *  outermost lexical rule.  The token object should point into the
152	 *  char buffer start..stop.  If there is a text override in 'text',
153	 *  use that to set the token's text.  Override this method to emit
154	 *  custom Token objects.
155	 *
156	 *  If you are building trees, then you should also override
157	 *  Parser or TreeParser.getMissingSymbol().
158	 */
159	public Token emit() {
160		Token t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, getCharIndex()-1);
161		t.setLine(state.tokenStartLine);
162		t.setText(state.text);
163		t.setCharPositionInLine(state.tokenStartCharPositionInLine);
164		emit(t);
165		return t;
166	}
167
168	public void match(String s) throws MismatchedTokenException {
169		int i = 0;
170		while ( i<s.length() ) {
171			if ( input.LA(1)!=s.charAt(i) ) {
172				if ( state.backtracking>0 ) {
173					state.failed = true;
174					return;
175				}
176				MismatchedTokenException mte =
177					new MismatchedTokenException(s.charAt(i), input);
178				recover(mte);
179				throw mte;
180			}
181			i++;
182			input.consume();
183			state.failed = false;
184		}
185	}
186
187	public void matchAny() {
188		input.consume();
189	}
190
191	public void match(int c) throws MismatchedTokenException {
192		if ( input.LA(1)!=c ) {
193			if ( state.backtracking>0 ) {
194				state.failed = true;
195				return;
196			}
197			MismatchedTokenException mte =
198				new MismatchedTokenException(c, input);
199			recover(mte);  // don't really recover; just consume in lexer
200			throw mte;
201		}
202		input.consume();
203		state.failed = false;
204	}
205
206	public void matchRange(int a, int b)
207		throws MismatchedRangeException
208	{
209		if ( input.LA(1)<a || input.LA(1)>b ) {
210			if ( state.backtracking>0 ) {
211				state.failed = true;
212				return;
213			}
214			MismatchedRangeException mre =
215				new MismatchedRangeException(a,b,input);
216			recover(mre);
217			throw mre;
218		}
219		input.consume();
220		state.failed = false;
221	}
222
223	public int getLine() {
224		return input.getLine();
225	}
226
227	public int getCharPositionInLine() {
228		return input.getCharPositionInLine();
229	}
230
231	/** What is the index of the current character of lookahead? */
232	public int getCharIndex() {
233		return input.index();
234	}
235
236	/** Return the text matched so far for the current token or any
237	 *  text override.
238	 */
239	public String getText() {
240		if ( state.text!=null ) {
241			return state.text;
242		}
243		return input.substring(state.tokenStartCharIndex,getCharIndex()-1);
244	}
245
246	/** Set the complete text of this token; it wipes any previous
247	 *  changes to the text.
248	 */
249	public void setText(String text) {
250		state.text = text;
251	}
252
253	public void reportError(RecognitionException e) {
254		/** TODO: not thought about recovery in lexer yet.
255		 *
256		// if we've already reported an error and have not matched a token
257		// yet successfully, don't report any errors.
258		if ( errorRecovery ) {
259			//System.err.print("[SPURIOUS] ");
260			return;
261		}
262		errorRecovery = true;
263		 */
264
265		displayRecognitionError(this.getTokenNames(), e);
266	}
267
268	public String getErrorMessage(RecognitionException e, String[] tokenNames) {
269		String msg = null;
270		if ( e instanceof MismatchedTokenException ) {
271			MismatchedTokenException mte = (MismatchedTokenException)e;
272			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting "+getCharErrorDisplay(mte.expecting);
273		}
274		else if ( e instanceof NoViableAltException ) {
275			NoViableAltException nvae = (NoViableAltException)e;
276			// for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
277			// and "(decision="+nvae.decisionNumber+") and
278			// "state "+nvae.stateNumber
279			msg = "no viable alternative at character "+getCharErrorDisplay(e.c);
280		}
281		else if ( e instanceof EarlyExitException ) {
282			EarlyExitException eee = (EarlyExitException)e;
283			// for development, can add "(decision="+eee.decisionNumber+")"
284			msg = "required (...)+ loop did not match anything at character "+getCharErrorDisplay(e.c);
285		}
286		else if ( e instanceof MismatchedNotSetException ) {
287			MismatchedNotSetException mse = (MismatchedNotSetException)e;
288			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
289		}
290		else if ( e instanceof MismatchedSetException ) {
291			MismatchedSetException mse = (MismatchedSetException)e;
292			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+mse.expecting;
293		}
294		else if ( e instanceof MismatchedRangeException ) {
295			MismatchedRangeException mre = (MismatchedRangeException)e;
296			msg = "mismatched character "+getCharErrorDisplay(e.c)+" expecting set "+
297				  getCharErrorDisplay(mre.a)+".."+getCharErrorDisplay(mre.b);
298		}
299		else {
300			msg = super.getErrorMessage(e, tokenNames);
301		}
302		return msg;
303	}
304
305	public String getCharErrorDisplay(int c) {
306		String s = String.valueOf((char)c);
307		switch ( c ) {
308			case Token.EOF :
309				s = "<EOF>";
310				break;
311			case '\n' :
312				s = "\\n";
313				break;
314			case '\t' :
315				s = "\\t";
316				break;
317			case '\r' :
318				s = "\\r";
319				break;
320		}
321		return "'"+s+"'";
322	}
323
324	/** Lexers can normally match any char in it's vocabulary after matching
325	 *  a token, so do the easy thing and just kill a character and hope
326	 *  it all works out.  You can instead use the rule invocation stack
327	 *  to do sophisticated error recovery if you are in a fragment rule.
328	 */
329	public void recover(RecognitionException re) {
330		//System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
331		//re.printStackTrace();
332		input.consume();
333	}
334
335	public void traceIn(String ruleName, int ruleIndex)  {
336		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
337		super.traceIn(ruleName, ruleIndex, inputSymbol);
338	}
339
340	public void traceOut(String ruleName, int ruleIndex)  {
341		String inputSymbol = ((char)input.LT(1))+" line="+getLine()+":"+getCharPositionInLine();
342		super.traceOut(ruleName, ruleIndex, inputSymbol);
343	}
344}
345