// Lexer.cs revision 324c4644fee44b9898524c09511bd33c3f12e2df
/*
 * [The "BSD licence"]
 * Copyright (c) 2005-2008 Terence Parr
 * All rights reserved.
 *
 * Conversion to C#:
 * Copyright (c) 2008-2009 Sam Harwell, Pixel Mine, Inc.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. The name of the author may not be used to endorse or promote products
 *    derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

namespace Antlr.Runtime {
    using ConditionalAttribute = System.Diagnostics.ConditionalAttribute;

    /** <summary>
     *  A lexer is a recognizer that draws input symbols from a character stream.
     *  Lexer grammars result in a subclass of this object. A Lexer object
     *  uses simplified match() and error recovery mechanisms in the interest
     *  of speed.
     *  </summary>
     */
    public abstract class Lexer : BaseRecognizer, ITokenSource {
        /** <summary>Where is the lexer drawing characters from?</summary> */
        protected ICharStream input;

        /** <summary>Create a lexer with no character stream attached yet.</summary> */
        public Lexer() {
        }

        /** <summary>Create a lexer that reads from <paramref name="input"/>.</summary> */
        public Lexer(ICharStream input) {
            this.input = input;
        }

        /** <summary>
         *  Create a lexer that reads from <paramref name="input"/> and passes
         *  <paramref name="state"/> to the base recognizer.
         *  </summary>
         */
        public Lexer(ICharStream input, RecognizerSharedState state)
            : base(state) {
            this.input = input;
        }

        #region Properties
        /** <summary>
         *  Gets the text matched so far for the current token, or the text
         *  override if one has been set. Setting this property replaces the
         *  complete text of the token; it wipes any previous changes to the text.
         *  </summary>
         */
        public string Text {
            get {
                if (state.text != null) {
                    return state.text;
                }
                // No override: slice the input from the token start up to the current index.
                return input.Substring(state.tokenStartCharIndex, CharIndex - state.tokenStartCharIndex);
            }
            set {
                state.text = value;
            }
        }

        /** <summary>Gets or sets the line reported by the underlying character stream.</summary> */
        public int Line {
            get {
                return input.Line;
            }
            set {
                input.Line = value;
            }
        }

        /** <summary>Gets or sets the in-line character position reported by the underlying character stream.</summary> */
        public int CharPositionInLine {
            get {
                return input.CharPositionInLine;
            }
            set {
                input.CharPositionInLine = value;
            }
        }
        #endregion

        /** <summary>
         *  Reset the recognizer state, rewind the input (when there is one) and
         *  clear the per-token lexer state (when there is a shared state object).
         *  </summary>
         */
        public override void Reset() {
            base.Reset(); // reset all recognizer state variables
            // whack Lexer state variables
            if (input != null) {
                input.Seek(0); // rewind the input
            }
            if (state == null) {
                return; // no shared state work to do
            }
            state.token = null;
            state.type = TokenTypes.Invalid;
            state.channel = TokenChannels.Default;
            state.tokenStartCharIndex = -1;
            state.tokenStartCharPositionInLine = -1;
            state.tokenStartLine = -1;
            state.text = null;
        }

        /** <summary>
         *  Return a token from this source; i.e., match a token on the char stream.
         *  </summary>
         *
         *  <remarks>
         *  Loops until a token can be returned: rules that finish with the token
         *  set to Tokens.Skip, and rules that raise a RecognitionException, cause
         *  another attempt. At end of input an EOF token is returned.
         *  </remarks>
         */
        public virtual IToken NextToken() {
            for (; ; ) {
                // Start every attempt with fresh per-token state.
                state.token = null;
                state.channel = TokenChannels.Default;
                state.tokenStartCharIndex = input.Index;
                state.tokenStartCharPositionInLine = input.CharPositionInLine;
                state.tokenStartLine = input.Line;
                state.text = null;
                if (input.LA(1) == CharStreamConstants.EndOfFile) {
                    IToken eof = new CommonToken(input, CharStreamConstants.EndOfFile, TokenChannels.Default, input.Index, input.Index);
                    eof.Line = Line;
                    eof.CharPositionInLine = CharPositionInLine;
                    return eof;
                }
                try {
                    mTokens();
                    if (state.token == null) {
                        // The rule did not emit anything itself; build the default token.
                        Emit();
                    } else if (state.token == Tokens.Skip) {
                        continue;
                    }
                    return state.token;
                } catch (NoViableAltException nva) {
                    ReportError(nva);
                    Recover(nva); // throw out current char and try again
                } catch (RecognitionException re) {
                    ReportError(re);
                    // the Match() routine has already called Recover()
                }
            }
        }

        /** <summary>
         *  Instruct the lexer to skip creating a token for current lexer rule
         *  and look for another token. NextToken() knows to keep looking when
         *  a lexer rule finishes with token set to Tokens.Skip. Recall that
         *  if token==null at end of any token rule, it creates one for you
         *  and emits it.
         *  </summary>
         */
        public virtual void Skip() {
            state.token = Tokens.Skip;
        }

        /** <summary>This is the lexer entry point that sets instance var 'token'</summary> */
        public abstract void mTokens();

        /** <summary>
         *  Gets the character stream this lexer reads from. Setting it resets
         *  the lexer and then installs the new stream.
         *  </summary>
         */
        public virtual ICharStream CharStream {
            get {
                return input;
            }
            set {
                // Detach the old stream first so Reset() does not seek it.
                input = null;
                Reset();
                input = value;
            }
        }

        /** <summary>Gets the source name reported by the underlying character stream.</summary> */
        public override string SourceName {
            get {
                return input.SourceName;
            }
        }

        /** <summary>
         *  Currently does not support multiple emits per NextToken invocation
         *  for efficiency reasons. Subclass and override this method and
         *  NextToken (to push tokens into a list and pull from that list rather
         *  than a single variable as this implementation does).
         *  </summary>
         */
        public virtual void Emit(IToken token) {
            state.token = token;
        }

        /** <summary>
         *  The standard method called to automatically emit a token at the
         *  outermost lexical rule. The token object should point into the
         *  char buffer start..stop. If there is a text override in 'text',
         *  use that to set the token's text. Override this method to emit
         *  custom Token objects.
         *  </summary>
         *
         *  <remarks>
         *  If you are building trees, then you should also override
         *  Parser or TreeParser.getMissingSymbol().
         *  </remarks>
         *
         *  <returns>The token that was created and emitted.</returns>
         */
        public virtual IToken Emit() {
            // The stop index is inclusive, hence CharIndex - 1.
            IToken t = new CommonToken(input, state.type, state.channel, state.tokenStartCharIndex, CharIndex - 1);
            t.Line = state.tokenStartLine;
            t.Text = state.text;
            t.CharPositionInLine = state.tokenStartCharPositionInLine;
            Emit(t);
            return t;
        }

        /** <summary>
         *  Match the characters of <paramref name="s"/> one by one against the input,
         *  consuming each character that matches.
         *  </summary>
         *
         *  <remarks>
         *  On a mismatch while backtracking, state.failed is set and the method
         *  returns; otherwise Recover() is called and a MismatchedTokenException
         *  is thrown.
         *  </remarks>
         */
        public virtual void Match(string s) {
            for (int i = 0; i < s.Length; i++) {
                if (input.LA(1) != s[i]) {
                    if (state.backtracking > 0) {
                        state.failed = true;
                        return;
                    }
                    MismatchedTokenException mte = new MismatchedTokenException(s[i], input, TokenNames);
                    Recover(mte);
                    throw mte;
                }
                input.Consume();
                state.failed = false;
            }
        }

        /** <summary>Consume the current input character, whatever it is.</summary> */
        public virtual void MatchAny() {
            input.Consume();
        }

        /** <summary>
         *  Match the single character <paramref name="c"/> against the input and consume it.
         *  </summary>
         *
         *  <remarks>
         *  On a mismatch while backtracking, state.failed is set and the method
         *  returns; otherwise Recover() is called and a MismatchedTokenException
         *  is thrown.
         *  </remarks>
         */
        public virtual void Match(int c) {
            if (input.LA(1) != c) {
                if (state.backtracking > 0) {
                    state.failed = true;
                    return;
                }
                MismatchedTokenException mte = new MismatchedTokenException(c, input, TokenNames);
                Recover(mte); // don't really recover; just consume in lexer
                throw mte;
            }
            input.Consume();
            state.failed = false;
        }

        /** <summary>
         *  Match one input character in the inclusive range
         *  <paramref name="a"/>..<paramref name="b"/> and consume it.
         *  </summary>
         *
         *  <remarks>
         *  On a mismatch while backtracking, state.failed is set and the method
         *  returns; otherwise Recover() is called and a MismatchedRangeException
         *  is thrown.
         *  </remarks>
         */
        public virtual void MatchRange(int a, int b) {
            // Read the lookahead once and use it for both bounds.
            int lookahead = input.LA(1);
            if (lookahead < a || lookahead > b) {
                if (state.backtracking > 0) {
                    state.failed = true;
                    return;
                }
                MismatchedRangeException mre = new MismatchedRangeException(a, b, input);
                Recover(mre);
                throw mre;
            }
            input.Consume();
            state.failed = false;
        }

        /** <summary>What is the index of the current character of lookahead?</summary> */
        public virtual int CharIndex {
            get {
                return input.Index;
            }
        }

        /** <summary>Report a recognition error by displaying it.</summary> */
        public override void ReportError(RecognitionException e) {
            /* TODO: not thought about recovery in lexer yet.
             *
            // if we've already reported an error and have not matched a token
            // yet successfully, don't report any errors.
            if ( errorRecovery ) {
                //System.err.print("[SPURIOUS] ");
                return;
            }
            errorRecovery = true;
             */

            DisplayRecognitionError(this.TokenNames, e);
        }

        /** <summary>
         *  Build the message for a lexer error, phrased in terms of characters.
         *  Exception types not handled here are delegated to the base class.
         *  </summary>
         */
        public override string GetErrorMessage(RecognitionException e, string[] tokenNames) {
            // The order of the type tests is significant and is kept as in the
            // original chain (e.g. MismatchedNotSetException before MismatchedSetException).
            if (e is MismatchedTokenException) {
                MismatchedTokenException mte = (MismatchedTokenException)e;
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting " + GetCharErrorDisplay(mte.Expecting);
            }
            if (e is NoViableAltException) {
                // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>"
                // and "(decision="+nvae.decisionNumber+") and
                // "state "+nvae.stateNumber
                return "no viable alternative at character " + GetCharErrorDisplay(e.Character);
            }
            if (e is EarlyExitException) {
                // for development, can add "(decision="+eee.decisionNumber+")"
                return "required (...)+ loop did not match anything at character " + GetCharErrorDisplay(e.Character);
            }
            if (e is MismatchedNotSetException) {
                MismatchedNotSetException mnse = (MismatchedNotSetException)e;
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " + mnse.Expecting;
            }
            if (e is MismatchedSetException) {
                MismatchedSetException mse = (MismatchedSetException)e;
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " + mse.Expecting;
            }
            if (e is MismatchedRangeException) {
                MismatchedRangeException mre = (MismatchedRangeException)e;
                return "mismatched character " + GetCharErrorDisplay(e.Character) + " expecting set " +
                    GetCharErrorDisplay(mre.A) + ".." + GetCharErrorDisplay(mre.B);
            }
            return base.GetErrorMessage(e, tokenNames);
        }

        /** <summary>
         *  Return a single-quoted, printable form of character <paramref name="c"/>
         *  for use in error messages: EOF, newline, tab and carriage return
         *  are shown symbolically.
         *  </summary>
         */
        public virtual string GetCharErrorDisplay(int c) {
            string s;
            switch (c) {
            case TokenTypes.EndOfFile:
                s = "<EOF>";
                break;
            case '\n':
                s = "\\n";
                break;
            case '\t':
                s = "\\t";
                break;
            case '\r':
                s = "\\r";
                break;
            default:
                // Convert only when no symbolic form applies; unchecked so that a
                // value outside the char range cannot throw in a checked build.
                s = unchecked((char)c).ToString();
                break;
            }
            return "'" + s + "'";
        }

        /** <summary>
         *  Lexers can normally match any char in its vocabulary after matching
         *  a token, so do the easy thing and just kill a character and hope
         *  it all works out. You can instead use the rule invocation stack
         *  to do sophisticated error recovery if you are in a fragment rule.
         *  </summary>
         */
        public virtual void Recover(RecognitionException re) {
            //System.out.println("consuming char "+(char)input.LA(1)+" during recovery");
            //re.printStackTrace();
            input.Consume();
        }

        /** <summary>Trace entry into a rule; only called when ANTLR_TRACE is defined.</summary> */
        [Conditional("ANTLR_TRACE")]
        public virtual void TraceIn(string ruleName, int ruleIndex) {
            base.TraceIn(ruleName, ruleIndex, DescribeLookahead());
        }

        /** <summary>Trace exit from a rule; only called when ANTLR_TRACE is defined.</summary> */
        [Conditional("ANTLR_TRACE")]
        public virtual void TraceOut(string ruleName, int ruleIndex) {
            base.TraceOut(ruleName, ruleIndex, DescribeLookahead());
        }

        /** <summary>Format the next input character plus the current line and position for trace output.</summary> */
        private string DescribeLookahead() {
            // unchecked: the lookahead may be a value outside the char range.
            return unchecked((char)input.LT(1)) + " line=" + Line + ":" + CharPositionInLine;
        }
    }
}