/* Lexer.cs revision 324c4644fee44b9898524c09511bd33c3f12e2df */
/*
 * [The "BSD licence"]
 * Copyright (c) 2005-2008 Terence Parr
 * All rights reserved.
 *
 * Conversion to C#:
 * Copyright (c) 2008-2009 Sam Harwell, Pixel Mine, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

namespace Antlr.Runtime {
    using ConditionalAttribute = System.Diagnostics.ConditionalAttribute;

    /** <summary>
     *  A lexer is a recognizer that draws input symbols from a character stream.
     *  Lexer grammars result in a subclass of this object. A Lexer object
     *  uses simplified Match() and error recovery mechanisms in the interest
     *  of speed.
     *  </summary>
     */
    public abstract class Lexer : BaseRecognizer, ITokenSource {
        /** <summary>Where is the lexer drawing characters from?</summary> */
        protected ICharStream input;

        /** <summary>
         *  Create a lexer with no character stream attached. Most members
         *  dereference the stream, so assign <see cref="CharStream"/> before lexing.
         *  </summary>
         */
        public Lexer() {
        }

        /** <summary>Create a lexer that reads from <paramref name="input"/>.</summary>
         *  <param name="input">The character stream to tokenize.</param>
         */
        public Lexer(ICharStream input) {
            this.input = input;
        }

        /** <summary>
         *  Create a lexer that reads from <paramref name="input"/> and uses the
         *  supplied recognizer state, which is handed to the base class.
         *  </summary>
         *  <param name="input">The character stream to tokenize.</param>
         *  <param name="state">The recognizer state passed to the base constructor.</param>
         */
        public Lexer(ICharStream input, RecognizerSharedState state)
            : base(state) {
            this.input = input;
        }

        #region Properties
        /** <summary>
         *  Gets the text matched so far for the current token, or the text
         *  override if one has been set. Setting this property replaces the
         *  complete text of the token and wipes any previous changes to it.
         *  </summary>
         */
        public string Text {
            get {
                if (state.text != null) {
                    return state.text;
                }
                // No override: take the characters from the token start up to the current position.
                return input.Substring(state.tokenStartCharIndex, CharIndex - state.tokenStartCharIndex);
            }
            set {
                state.text = value;
            }
        }

        /** <summary>Gets or sets the current line, forwarded to the character stream.</summary> */
        public int Line {
            get {
                return input.Line;
            }
            set {
                input.Line = value;
            }
        }

        /** <summary>Gets or sets the current position within the line, forwarded to the character stream.</summary> */
        public int CharPositionInLine {
            get {
                return input.CharPositionInLine;
            }
            set {
                input.CharPositionInLine = value;
            }
        }
        #endregion

        /** <summary>
         *  Reset the recognizer state, rewind the input (if any) to index 0 and,
         *  when a state object exists, clear the per-token lexer fields.
         *  </summary>
         */
        public override void Reset() {
            base.Reset(); // reset all recognizer state variables
            // whack Lexer state variables
            if (input != null) {
                input.Seek(0); // rewind the input
            }
            if (state == null) {
                return; // no shared state work to do
            }
            state.token = null;
            state.type = TokenTypes.Invalid;
            state.channel = TokenChannels.Default;
            state.tokenStartCharIndex = -1;
            state.tokenStartCharPositionInLine = -1;
            state.tokenStartLine = -1;
            state.text = null;
        }

        /** <summary>
         *  Return a token from this source; i.e., match a token on the char stream.
         *  </summary>
         *
         *  <remarks>
         *  Loops until a token can be returned. At end of input an end-of-file
         *  token is created and returned. Rules that finish with the token set
         *  to Tokens.Skip are ignored and lexing continues. Recognition errors
         *  are reported and lexing resumes after recovery.
         *  </remarks>
         *
         *  <returns>The next token, or an end-of-file token at end of input.</returns>
         */
        public virtual IToken NextToken() {
            for (; ; ) {
                int tokenStart = input.Index;
                state.token = null;
                state.channel = TokenChannels.Default;
                state.tokenStartCharIndex = tokenStart;
                state.tokenStartCharPositionInLine = input.CharPositionInLine;
                state.tokenStartLine = input.Line;
                state.text = null;
                if (input.LA(1) == CharStreamConstants.EndOfFile) {
                    IToken eof = new CommonToken(input, CharStreamConstants.EndOfFile, TokenChannels.Default, input.Index, input.Index);
                    eof.Line = Line;
                    eof.CharPositionInLine = CharPositionInLine;
                    return eof;
                }
                try {
                    mTokens();
                    if (state.token == null) {
                        Emit();
                    } else if (state.token == Tokens.Skip) {
                        continue;
                    }
                    return state.token;
                } catch (NoViableAltException nva) {
                    ReportError(nva);
                    Recover(nva); // throw out current char and try again
                } catch (RecognitionException re) {
                    ReportError(re);
                    // Match() and MatchRange() call Recover() before throwing, so the
                    // input has normally advanced already. An exception raised before
                    // anything was consumed would leave the input where this iteration
                    // started and repeat the same failure forever, so force progress
                    // in that case only.
                    if (input.Index == tokenStart) {
                        Recover(re);
                    }
                }
            }
        }

        /** <summary>
         *  Instruct the lexer to skip creating a token for the current lexer rule
         *  and look for another token. NextToken() knows to keep looking when
         *  a lexer rule finishes with the token set to Tokens.Skip. Recall that
         *  if the token is null at the end of any token rule, NextToken() creates
         *  one for you and emits it.
         *  </summary>
         */
        public virtual void Skip() {
            state.token = Tokens.Skip;
        }

        /** <summary>This is the lexer entry point that sets instance var 'token'</summary> */
        public abstract void mTokens();

        /** <summary>
         *  Gets the char stream the lexer reads from. Setting it resets the
         *  lexer and then installs the new stream.
         *  </summary>
         */
        public virtual ICharStream CharStream {
            get {
                return input;
            }
            set {
                // Detach the current stream first: Reset() rewinds whatever stream is
                // attached, and neither the outgoing nor the incoming one should be moved.
                input = null;
                Reset();
                input = value;
            }
        }

        /** <summary>Gets the source name reported by the character stream.</summary> */
        public override string SourceName {
            get {
                return input.SourceName;
            }
        }

        /** <summary>
         *  Currently does not support multiple emits per NextToken invocation
         *  for efficiency reasons. Subclass and override this method and
         *  NextToken (to push tokens into a list and pull from that list rather
         *  than a single variable as this implementation does).
         *  </summary>
         *  <param name="token">The token to record as the current token.</param>
         */
        public virtual void Emit(IToken token) {
            state.token = token;
        }

        /** <summary>
         *  The standard method called to automatically emit a token at the
         *  outermost lexical rule. The token object points into the char
         *  buffer from the token start index to the character before the
         *  current index. If there is a text override in 'text', it is used
         *  as the token's text. Override this method to emit custom token
         *  objects.
         *  </summary>
         *
         *  <remarks>
         *  If you are building trees, then you should also override
         *  Parser or TreeParser.getMissingSymbol().
         *  </remarks>
         *
         *  <returns>The token that was created and emitted.</returns>
         */
        public virtual IToken Emit() {
            IToken t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, CharIndex - 1);
            t.Line = state.tokenStartLine;
            t.Text = state.text;
            t.CharPositionInLine = state.tokenStartCharPositionInLine;
            Emit(t);
            return t;
        }

        /** <summary>
         *  Match the characters of <paramref name="s"/> one after another against
         *  the input, consuming each character that matches.
         *  </summary>
         *
         *  <remarks>
         *  On the first mismatch while backtracking, state.failed is set and the
         *  method returns without throwing. Otherwise a MismatchedTokenException
         *  is created, Recover() is called, and the exception is thrown.
         *  Characters matched before the mismatch stay consumed.
         *  </remarks>
         *
         *  <param name="s">The exact character sequence expected next in the input.</param>
         */
        public virtual void Match(string s) {
            for (int i = 0; i < s.Length; i++) {
                if (input.LA(1) != s[i]) {
                    if (state.backtracking > 0) {
                        state.failed = true;
                        return;
                    }
                    MismatchedTokenException mte = new MismatchedTokenException(s[i], input, TokenNames);
                    Recover(mte);
                    throw mte;
                }
                input.Consume();
                state.failed = false;
            }
        }

        /** <summary>Consume one character from the input without checking it.</summary> */
        public virtual void MatchAny() {
            input.Consume();
        }

        /** <summary>
         *  Match the single character <paramref name="c"/> and consume it.
         *  </summary>
         *
         *  <remarks>
         *  On a mismatch while backtracking, state.failed is set and the method
         *  returns without throwing. Otherwise a MismatchedTokenException is
         *  created, Recover() is called, and the exception is thrown.
         *  </remarks>
         *
         *  <param name="c">The character value expected next in the input.</param>
         */
        public virtual void Match(int c) {
            if (input.LA(1) != c) {
                if (state.backtracking > 0) {
                    state.failed = true;
                    return;
                }
                MismatchedTokenException mte = new MismatchedTokenException(c, input, TokenNames);
                Recover(mte);  // don't really recover; just consume in lexer
                throw mte;
            }
            input.Consume();
            state.failed = false;
        }

        /** <summary>
         *  Match one character in the inclusive range
         *  <paramref name="a"/>..<paramref name="b"/> and consume it.
         *  </summary>
         *
         *  <remarks>
         *  On a mismatch while backtracking, state.failed is set and the method
         *  returns without throwing. Otherwise a MismatchedRangeException is
         *  created, Recover() is called, and the exception is thrown.
         *  </remarks>
         *
         *  <param name="a">Lowest accepted character value.</param>
         *  <param name="b">Highest accepted character value.</param>
         */
        public virtual void MatchRange(int a, int b) {
            int lookahead = input.LA(1);
            if (lookahead < a || lookahead > b) {
                if (state.backtracking > 0) {
                    state.failed = true;
                    return;
                }
                MismatchedRangeException mre = new MismatchedRangeException(a, b, input);
                Recover(mre);
                throw mre;
            }
            input.Consume();
            state.failed = false;
        }

        /** <summary>What is the index of the current character of lookahead?</summary> */
        public virtual int CharIndex {
            get {
                return input.Index;
            }
        }

        /** <summary>Report a recognition error by displaying it.</summary>
         *  <param name="e">The error to report.</param>
         */
        public override void ReportError(RecognitionException e) {
            // TODO: not thought about recovery in lexer yet. The idea that was left
            // disabled here: once an error has been reported and no token has been
            // matched successfully since, suppress further reports as spurious
            // (an 'errorRecovery' flag that is set on the first report).

            DisplayRecognitionError(this.TokenNames, e);
        }

        /** <summary>
         *  Build the message for a recognition error, phrased in terms of
         *  characters for the exception types the lexer knows about. Any other
         *  exception type is delegated to the base implementation.
         *  </summary>
         *  <param name="e">The error to describe.</param>
         *  <param name="tokenNames">Token names, used only by the base implementation.</param>
         *  <returns>The error message text.</returns>
         */
        public override string GetErrorMessage(RecognitionException e, string[] tokenNames) {
            // The order of the type checks is significant and mirrors the original chain.
            MismatchedTokenException mte = e as MismatchedTokenException;
            if (mte != null) {
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting " + GetCharErrorDisplay(mte.Expecting);
            }
            if (e is NoViableAltException) {
                return "no viable alternative at character " + GetCharErrorDisplay(e.Character);
            }
            if (e is EarlyExitException) {
                return "required (...)+ loop did not match anything at character " + GetCharErrorDisplay(e.Character);
            }
            MismatchedNotSetException mnse = e as MismatchedNotSetException;
            if (mnse != null) {
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " + mnse.Expecting;
            }
            MismatchedSetException mse = e as MismatchedSetException;
            if (mse != null) {
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " + mse.Expecting;
            }
            MismatchedRangeException mre = e as MismatchedRangeException;
            if (mre != null) {
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " +
                       GetCharErrorDisplay(mre.A) + ".." + GetCharErrorDisplay(mre.B);
            }
            return base.GetErrorMessage(e, tokenNames);
        }

        /** <summary>
         *  Render a character value for use in an error message: end of file,
         *  newline, tab and carriage return get readable spellings, anything else
         *  is shown as the character itself. The result is wrapped in single quotes.
         *  </summary>
         *  <param name="c">The character value to display.</param>
         *  <returns>The quoted display form of <paramref name="c"/>.</returns>
         */
        public virtual string GetCharErrorDisplay(int c) {
            string s;
            switch (c) {
                case TokenTypes.EndOfFile:
                    s = "<EOF>";
                    break;
                case '\n':
                    s = "\\n";
                    break;
                case '\t':
                    s = "\\t";
                    break;
                case '\r':
                    s = "\\r";
                    break;
                default:
                    // Convert only here, so the special values above never go through
                    // the int-to-char cast (which can overflow in a checked build).
                    s = ((char)c).ToString();
                    break;
            }
            return "'" + s + "'";
        }

        /** <summary>
         *  Lexers can normally match any char in its vocabulary after matching
         *  a token, so do the easy thing and just kill a character and hope
         *  it all works out. You can instead use the rule invocation stack
         *  to do sophisticated error recovery if you are in a fragment rule.
         *  </summary>
         *  <param name="re">The error being recovered from; not used by this implementation.</param>
         */
        public virtual void Recover(RecognitionException re) {
            input.Consume();
        }

        /** <summary>
         *  Trace entry into a rule, reporting the next input character together
         *  with the current line and position. Marked conditional on ANTLR_TRACE.
         *  </summary>
         *  <param name="ruleName">Name of the rule being entered.</param>
         *  <param name="ruleIndex">Index of the rule being entered.</param>
         */
        [Conditional("ANTLR_TRACE")]
        public virtual void TraceIn(string ruleName, int ruleIndex) {
            string inputSymbol = ((char)input.LT(1)) + " line=" + Line + ":" + CharPositionInLine;
            base.TraceIn(ruleName, ruleIndex, inputSymbol);
        }

        /** <summary>
         *  Trace exit from a rule, reporting the next input character together
         *  with the current line and position. Marked conditional on ANTLR_TRACE.
         *  </summary>
         *  <param name="ruleName">Name of the rule being left.</param>
         *  <param name="ruleIndex">Index of the rule being left.</param>
         */
        [Conditional("ANTLR_TRACE")]
        public virtual void TraceOut(string ruleName, int ruleIndex) {
            string inputSymbol = ((char)input.LT(1)) + " line=" + Line + ":" + CharPositionInLine;
            base.TraceOut(ruleName, ruleIndex, inputSymbol);
        }
    }
}
356