AsmLexer.cpp revision d556fd129026f6e3fa6ea9c2c70ba489bff18954
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/MC/MCAsmInfo.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/Support/SMLoc.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27 isAtStartOfLine = true; 28} 29 30AsmLexer::~AsmLexer() { 31} 32 33void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 34 CurBuf = buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf->getBufferStart(); 40 41 TokStart = 0; 42} 43 44/// ReturnError - Set the error to the specified string at the specified 45/// location. This is defined to always return AsmToken::Error. 46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50} 51 52int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr-1 != CurBuf->getBufferEnd()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67} 68 69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70/// 71/// The leading integral digit sequence and dot should have already been 72/// consumed, some or all of the fractional digit sequence *can* have been 73/// consumed. 74AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92} 93 94/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 95static bool IsIdentifierChar(char c) { 96 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 97} 98AsmToken AsmLexer::LexIdentifier() { 99 // Check for floating point literals. 100 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 101 // Disambiguate a .1243foo identifier from a floating literal. 102 while (isdigit(*CurPtr)) 103 ++CurPtr; 104 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 105 return LexFloatLiteral(); 106 } 107 108 while (IsIdentifierChar(*CurPtr)) 109 ++CurPtr; 110 111 // Handle . as a special case. 112 if (CurPtr == TokStart+1 && TokStart[0] == '.') 113 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 114 115 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 116} 117 118/// LexSlash: Slash: / 119/// C-Style Comment: /* ... */ 120AsmToken AsmLexer::LexSlash() { 121 switch (*CurPtr) { 122 case '*': break; // C style comment. 123 case '/': return ++CurPtr, LexLineComment(); 124 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 125 } 126 127 // C Style comment. 128 ++CurPtr; // skip the star. 129 while (1) { 130 int CurChar = getNextChar(); 131 switch (CurChar) { 132 case EOF: 133 return ReturnError(TokStart, "unterminated comment"); 134 case '*': 135 // End of the comment? 136 if (CurPtr[0] != '/') break; 137 138 ++CurPtr; // End the */. 139 return LexToken(); 140 } 141 } 142} 143 144/// LexLineComment: Comment: #[^\n]* 145/// : //[^\n]* 146AsmToken AsmLexer::LexLineComment() { 147 // FIXME: This is broken if we happen to a comment at the end of a file, which 148 // was .included, and which doesn't end with a newline. 149 int CurChar = getNextChar(); 150 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 151 CurChar = getNextChar(); 152 153 if (CurChar == EOF) 154 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 155 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 156} 157 158static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 159 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 160 CurPtr += 2; 161 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 162 CurPtr += 3; 163} 164 165// Look ahead to search for first non-hex digit, if it's [hH], then we treat the 166// integer as a hexadecimal, possibly with leading zeroes. 167static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 168 const char *FirstHex = 0; 169 const char *LookAhead = CurPtr; 170 while (1) { 171 if (isdigit(*LookAhead)) { 172 ++LookAhead; 173 } else if (isxdigit(*LookAhead)) { 174 if (!FirstHex) 175 FirstHex = LookAhead; 176 ++LookAhead; 177 } else { 178 break; 179 } 180 } 181 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 182 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 183 if (isHex) 184 return 16; 185 return DefaultRadix; 186} 187 188/// LexDigit: First character is [0-9]. 189/// Local Label: [0-9][:] 190/// Forward/Backward Label: [0-9][fb] 191/// Binary integer: 0b[01]+ 192/// Octal integer: 0[0-7]+ 193/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 194/// Decimal integer: [1-9][0-9]* 195AsmToken AsmLexer::LexDigit() { 196 // Decimal integer: [1-9][0-9]* 197 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 198 unsigned Radix = doLookAhead(CurPtr, 10); 199 bool isHex = Radix == 16; 200 // Check for floating point literals. 201 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 202 ++CurPtr; 203 return LexFloatLiteral(); 204 } 205 206 StringRef Result(TokStart, CurPtr - TokStart); 207 208 long long Value; 209 if (Result.getAsInteger(Radix, Value)) { 210 // Allow positive values that are too large to fit into a signed 64-bit 211 // integer, but that do fit in an unsigned one, we just convert them over. 212 unsigned long long UValue; 213 if (Result.getAsInteger(Radix, UValue)) 214 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 215 "invalid hexdecimal number"); 216 Value = (long long)UValue; 217 } 218 219 // Consume the [bB][hH]. 220 if (Radix == 2 || Radix == 16) 221 ++CurPtr; 222 223 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 224 // suffixes on integer literals. 225 SkipIgnoredIntegerSuffix(CurPtr); 226 227 return AsmToken(AsmToken::Integer, Result, Value); 228 } 229 230 if (*CurPtr == 'b') { 231 ++CurPtr; 232 // See if we actually have "0b" as part of something like "jmp 0b\n" 233 if (!isdigit(CurPtr[0])) { 234 --CurPtr; 235 StringRef Result(TokStart, CurPtr - TokStart); 236 return AsmToken(AsmToken::Integer, Result, 0); 237 } 238 const char *NumStart = CurPtr; 239 while (CurPtr[0] == '0' || CurPtr[0] == '1') 240 ++CurPtr; 241 242 // Requires at least one binary digit. 243 if (CurPtr == NumStart) 244 return ReturnError(TokStart, "invalid binary number"); 245 246 StringRef Result(TokStart, CurPtr - TokStart); 247 248 long long Value; 249 if (Result.substr(2).getAsInteger(2, Value)) 250 return ReturnError(TokStart, "invalid binary number"); 251 252 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 253 // suffixes on integer literals. 254 SkipIgnoredIntegerSuffix(CurPtr); 255 256 return AsmToken(AsmToken::Integer, Result, Value); 257 } 258 259 if (*CurPtr == 'x') { 260 ++CurPtr; 261 const char *NumStart = CurPtr; 262 while (isxdigit(CurPtr[0])) 263 ++CurPtr; 264 265 // Requires at least one hex digit. 266 if (CurPtr == NumStart) 267 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 268 269 unsigned long long Result; 270 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 271 return ReturnError(TokStart, "invalid hexadecimal number"); 272 273 // Consume the optional [hH]. 274 if (*CurPtr == 'h' || *CurPtr == 'H') 275 ++CurPtr; 276 277 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 278 // suffixes on integer literals. 279 SkipIgnoredIntegerSuffix(CurPtr); 280 281 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 282 (int64_t)Result); 283 } 284 285 // Either octal or hexidecimal. 286 long long Value; 287 unsigned Radix = doLookAhead(CurPtr, 8); 288 StringRef Result(TokStart, CurPtr - TokStart); 289 if (Result.getAsInteger(Radix, Value)) 290 return ReturnError(TokStart, "invalid octal number"); 291 292 // Consume the [hH]. 293 if (Radix == 16) 294 ++CurPtr; 295 296 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 297 // suffixes on integer literals. 298 SkipIgnoredIntegerSuffix(CurPtr); 299 300 return AsmToken(AsmToken::Integer, Result, Value); 301} 302 303/// LexSingleQuote: Integer: 'b' 304AsmToken AsmLexer::LexSingleQuote() { 305 int CurChar = getNextChar(); 306 307 if (CurChar == '\\') 308 CurChar = getNextChar(); 309 310 if (CurChar == EOF) 311 return ReturnError(TokStart, "unterminated single quote"); 312 313 CurChar = getNextChar(); 314 315 if (CurChar != '\'') 316 return ReturnError(TokStart, "single quote way too long"); 317 318 // The idea here being that 'c' is basically just an integral 319 // constant. 320 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 321 long long Value; 322 323 if (Res.startswith("\'\\")) { 324 char theChar = Res[2]; 325 switch (theChar) { 326 default: Value = theChar; break; 327 case '\'': Value = '\''; break; 328 case 't': Value = '\t'; break; 329 case 'n': Value = '\n'; break; 330 case 'b': Value = '\b'; break; 331 } 332 } else 333 Value = TokStart[1]; 334 335 return AsmToken(AsmToken::Integer, Res, Value); 336} 337 338 339/// LexQuote: String: "..." 340AsmToken AsmLexer::LexQuote() { 341 int CurChar = getNextChar(); 342 // TODO: does gas allow multiline string constants? 343 while (CurChar != '"') { 344 if (CurChar == '\\') { 345 // Allow \", etc. 346 CurChar = getNextChar(); 347 } 348 349 if (CurChar == EOF) 350 return ReturnError(TokStart, "unterminated string constant"); 351 352 CurChar = getNextChar(); 353 } 354 355 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 356} 357 358StringRef AsmLexer::LexUntilEndOfStatement() { 359 TokStart = CurPtr; 360 361 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 362 !isAtStatementSeparator(CurPtr) && // End of statement marker. 363 *CurPtr != '\n' && 364 *CurPtr != '\r' && 365 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 366 ++CurPtr; 367 } 368 return StringRef(TokStart, CurPtr-TokStart); 369} 370 371StringRef AsmLexer::LexUntilEndOfLine() { 372 TokStart = CurPtr; 373 374 while (*CurPtr != '\n' && 375 *CurPtr != '\r' && 376 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 377 ++CurPtr; 378 } 379 return StringRef(TokStart, CurPtr-TokStart); 380} 381 382bool AsmLexer::isAtStartOfComment(char Char) { 383 // FIXME: This won't work for multi-character comment indicators like "//". 384 return Char == *MAI.getCommentString(); 385} 386 387bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 388 return strncmp(Ptr, MAI.getSeparatorString(), 389 strlen(MAI.getSeparatorString())) == 0; 390} 391 392AsmToken AsmLexer::LexToken() { 393 TokStart = CurPtr; 394 // This always consumes at least one character. 395 int CurChar = getNextChar(); 396 397 if (isAtStartOfComment(CurChar)) { 398 // If this comment starts with a '#', then return the Hash token and let 399 // the assembler parser see if it can be parsed as a cpp line filename 400 // comment. We do this only if we are at the start of a line. 401 if (CurChar == '#' && isAtStartOfLine) 402 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 403 isAtStartOfLine = true; 404 return LexLineComment(); 405 } 406 if (isAtStatementSeparator(TokStart)) { 407 CurPtr += strlen(MAI.getSeparatorString()) - 1; 408 return AsmToken(AsmToken::EndOfStatement, 409 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 410 } 411 412 // If we're missing a newline at EOF, make sure we still get an 413 // EndOfStatement token before the Eof token. 414 if (CurChar == EOF && !isAtStartOfLine) { 415 isAtStartOfLine = true; 416 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 417 } 418 419 isAtStartOfLine = false; 420 switch (CurChar) { 421 default: 422 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 423 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 424 return LexIdentifier(); 425 426 // Unknown character, emit an error. 427 return ReturnError(TokStart, "invalid character in input"); 428 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 429 case 0: 430 case ' ': 431 case '\t': 432 if (SkipSpace) { 433 // Ignore whitespace. 434 return LexToken(); 435 } else { 436 int len = 1; 437 while (*CurPtr==' ' || *CurPtr=='\t') { 438 CurPtr++; 439 len++; 440 } 441 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 442 } 443 case '\n': // FALL THROUGH. 444 case '\r': 445 isAtStartOfLine = true; 446 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 447 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 448 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 449 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 450 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 451 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 452 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 453 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 454 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 455 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 456 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 457 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 458 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 459 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 460 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 461 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 462 case '=': 463 if (*CurPtr == '=') 464 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 465 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 466 case '|': 467 if (*CurPtr == '|') 468 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 469 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 470 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 471 case '&': 472 if (*CurPtr == '&') 473 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 474 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 475 case '!': 476 if (*CurPtr == '=') 477 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 478 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 479 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 480 case '/': return LexSlash(); 481 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 482 case '\'': return LexSingleQuote(); 483 case '"': return LexQuote(); 484 case '0': case '1': case '2': case '3': case '4': 485 case '5': case '6': case '7': case '8': case '9': 486 return LexDigit(); 487 case '<': 488 switch (*CurPtr) { 489 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 490 StringRef(TokStart, 2)); 491 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 492 StringRef(TokStart, 2)); 493 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 494 StringRef(TokStart, 2)); 495 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 496 } 497 case '>': 498 switch (*CurPtr) { 499 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 500 StringRef(TokStart, 2)); 501 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 502 StringRef(TokStart, 2)); 503 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 504 } 505 506 // TODO: Quoted identifiers (objc methods etc) 507 // local labels: [0-9][:] 508 // Forward/backward labels: [0-9][fb] 509 // Integers, fp constants, character constants. 510 } 511} 512