AsmLexer.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/MC/MCAsmInfo.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/Support/SMLoc.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = nullptr; 26 CurPtr = nullptr; 27 isAtStartOfLine = true; 28 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@"); 29} 30 31AsmLexer::~AsmLexer() { 32} 33 34void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 35 CurBuf = buf; 36 37 if (ptr) 38 CurPtr = ptr; 39 else 40 CurPtr = CurBuf->getBufferStart(); 41 42 TokStart = nullptr; 43} 44 45/// ReturnError - Set the error to the specified string at the specified 46/// location. This is defined to always return AsmToken::Error. 47AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 48 SetError(SMLoc::getFromPointer(Loc), Msg); 49 50 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 51} 52 53int AsmLexer::getNextChar() { 54 char CurChar = *CurPtr++; 55 switch (CurChar) { 56 default: 57 return (unsigned char)CurChar; 58 case 0: 59 // A nul character in the stream is either the end of the current buffer or 60 // a random nul in the file. Disambiguate that here. 61 if (CurPtr-1 != CurBuf->getBufferEnd()) 62 return 0; // Just whitespace. 63 64 // Otherwise, return end of file. 65 --CurPtr; // Another call to lex will return EOF again. 66 return EOF; 67 } 68} 69 70/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 71/// 72/// The leading integral digit sequence and dot should have already been 73/// consumed, some or all of the fractional digit sequence *can* have been 74/// consumed. 75AsmToken AsmLexer::LexFloatLiteral() { 76 // Skip the fractional digit sequence. 77 while (isdigit(*CurPtr)) 78 ++CurPtr; 79 80 // Check for exponent; we intentionally accept a slighlty wider set of 81 // literals here and rely on the upstream client to reject invalid ones (e.g., 82 // "1e+"). 83 if (*CurPtr == 'e' || *CurPtr == 'E') { 84 ++CurPtr; 85 if (*CurPtr == '-' || *CurPtr == '+') 86 ++CurPtr; 87 while (isdigit(*CurPtr)) 88 ++CurPtr; 89 } 90 91 return AsmToken(AsmToken::Real, 92 StringRef(TokStart, CurPtr - TokStart)); 93} 94 95/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+ 96/// while making sure there are enough actual digits around for the constant to 97/// be valid. 98/// 99/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed 100/// before we get here. 101AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) { 102 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') && 103 "unexpected parse state in floating hex"); 104 bool NoFracDigits = true; 105 106 // Skip the fractional part if there is one 107 if (*CurPtr == '.') { 108 ++CurPtr; 109 110 const char *FracStart = CurPtr; 111 while (isxdigit(*CurPtr)) 112 ++CurPtr; 113 114 NoFracDigits = CurPtr == FracStart; 115 } 116 117 if (NoIntDigits && NoFracDigits) 118 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 119 "expected at least one significand digit"); 120 121 // Make sure we do have some kind of proper exponent part 122 if (*CurPtr != 'p' && *CurPtr != 'P') 123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 124 "expected exponent part 'p'"); 125 ++CurPtr; 126 127 if (*CurPtr == '+' || *CurPtr == '-') 128 ++CurPtr; 129 130 // N.b. exponent digits are *not* hex 131 const char *ExpStart = CurPtr; 132 while (isdigit(*CurPtr)) 133 ++CurPtr; 134 135 if (CurPtr == ExpStart) 136 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: " 137 "expected at least one exponent digit"); 138 139 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart)); 140} 141 142/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]* 143static bool IsIdentifierChar(char c, bool AllowAt) { 144 return isalnum(c) || c == '_' || c == '$' || c == '.' || 145 (c == '@' && AllowAt) || c == '?'; 146} 147AsmToken AsmLexer::LexIdentifier() { 148 // Check for floating point literals. 149 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 150 // Disambiguate a .1243foo identifier from a floating literal. 151 while (isdigit(*CurPtr)) 152 ++CurPtr; 153 if (*CurPtr == 'e' || *CurPtr == 'E' || 154 !IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) 155 return LexFloatLiteral(); 156 } 157 158 while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier)) 159 ++CurPtr; 160 161 // Handle . as a special case. 162 if (CurPtr == TokStart+1 && TokStart[0] == '.') 163 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 164 165 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 166} 167 168/// LexSlash: Slash: / 169/// C-Style Comment: /* ... */ 170AsmToken AsmLexer::LexSlash() { 171 switch (*CurPtr) { 172 case '*': break; // C style comment. 173 case '/': return ++CurPtr, LexLineComment(); 174 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 175 } 176 177 // C Style comment. 178 ++CurPtr; // skip the star. 179 while (1) { 180 int CurChar = getNextChar(); 181 switch (CurChar) { 182 case EOF: 183 return ReturnError(TokStart, "unterminated comment"); 184 case '*': 185 // End of the comment? 186 if (CurPtr[0] != '/') break; 187 188 ++CurPtr; // End the */. 189 return LexToken(); 190 } 191 } 192} 193 194/// LexLineComment: Comment: #[^\n]* 195/// : //[^\n]* 196AsmToken AsmLexer::LexLineComment() { 197 // FIXME: This is broken if we happen to a comment at the end of a file, which 198 // was .included, and which doesn't end with a newline. 199 int CurChar = getNextChar(); 200 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 201 CurChar = getNextChar(); 202 203 if (CurChar == EOF) 204 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 205 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 206} 207 208static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 209 // Skip ULL, UL, U, L and LL suffices. 210 if (CurPtr[0] == 'U') 211 ++CurPtr; 212 if (CurPtr[0] == 'L') 213 ++CurPtr; 214 if (CurPtr[0] == 'L') 215 ++CurPtr; 216} 217 218// Look ahead to search for first non-hex digit, if it's [hH], then we treat the 219// integer as a hexadecimal, possibly with leading zeroes. 220static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 221 const char *FirstHex = nullptr; 222 const char *LookAhead = CurPtr; 223 while (1) { 224 if (isdigit(*LookAhead)) { 225 ++LookAhead; 226 } else if (isxdigit(*LookAhead)) { 227 if (!FirstHex) 228 FirstHex = LookAhead; 229 ++LookAhead; 230 } else { 231 break; 232 } 233 } 234 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 235 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 236 if (isHex) 237 return 16; 238 return DefaultRadix; 239} 240 241static AsmToken intToken(StringRef Ref, APInt &Value) 242{ 243 if (Value.isIntN(64)) 244 return AsmToken(AsmToken::Integer, Ref, Value); 245 return AsmToken(AsmToken::BigNum, Ref, Value); 246} 247 248/// LexDigit: First character is [0-9]. 249/// Local Label: [0-9][:] 250/// Forward/Backward Label: [0-9][fb] 251/// Binary integer: 0b[01]+ 252/// Octal integer: 0[0-7]+ 253/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 254/// Decimal integer: [1-9][0-9]* 255AsmToken AsmLexer::LexDigit() { 256 // Decimal integer: [1-9][0-9]* 257 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 258 unsigned Radix = doLookAhead(CurPtr, 10); 259 bool isHex = Radix == 16; 260 // Check for floating point literals. 261 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 262 ++CurPtr; 263 return LexFloatLiteral(); 264 } 265 266 StringRef Result(TokStart, CurPtr - TokStart); 267 268 APInt Value(128, 0, true); 269 if (Result.getAsInteger(Radix, Value)) 270 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 271 "invalid hexdecimal number"); 272 273 // Consume the [bB][hH]. 274 if (Radix == 2 || Radix == 16) 275 ++CurPtr; 276 277 // The darwin/x86 (and x86-64) assembler accepts and ignores type 278 // suffices on integer literals. 279 SkipIgnoredIntegerSuffix(CurPtr); 280 281 return intToken(Result, Value); 282 } 283 284 if (*CurPtr == 'b') { 285 ++CurPtr; 286 // See if we actually have "0b" as part of something like "jmp 0b\n" 287 if (!isdigit(CurPtr[0])) { 288 --CurPtr; 289 StringRef Result(TokStart, CurPtr - TokStart); 290 return AsmToken(AsmToken::Integer, Result, 0); 291 } 292 const char *NumStart = CurPtr; 293 while (CurPtr[0] == '0' || CurPtr[0] == '1') 294 ++CurPtr; 295 296 // Requires at least one binary digit. 297 if (CurPtr == NumStart) 298 return ReturnError(TokStart, "invalid binary number"); 299 300 StringRef Result(TokStart, CurPtr - TokStart); 301 302 APInt Value(128, 0, true); 303 if (Result.substr(2).getAsInteger(2, Value)) 304 return ReturnError(TokStart, "invalid binary number"); 305 306 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 307 // suffixes on integer literals. 308 SkipIgnoredIntegerSuffix(CurPtr); 309 310 return intToken(Result, Value); 311 } 312 313 if (*CurPtr == 'x') { 314 ++CurPtr; 315 const char *NumStart = CurPtr; 316 while (isxdigit(CurPtr[0])) 317 ++CurPtr; 318 319 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be 320 // diagnosed by LexHexFloatLiteral). 321 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P') 322 return LexHexFloatLiteral(NumStart == CurPtr); 323 324 // Otherwise requires at least one hex digit. 325 if (CurPtr == NumStart) 326 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 327 328 APInt Result(128, 0); 329 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 330 return ReturnError(TokStart, "invalid hexadecimal number"); 331 332 // Consume the optional [hH]. 333 if (*CurPtr == 'h' || *CurPtr == 'H') 334 ++CurPtr; 335 336 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 337 // suffixes on integer literals. 338 SkipIgnoredIntegerSuffix(CurPtr); 339 340 return intToken(StringRef(TokStart, CurPtr - TokStart), Result); 341 } 342 343 // Either octal or hexadecimal. 344 APInt Value(128, 0, true); 345 unsigned Radix = doLookAhead(CurPtr, 8); 346 bool isHex = Radix == 16; 347 StringRef Result(TokStart, CurPtr - TokStart); 348 if (Result.getAsInteger(Radix, Value)) 349 return ReturnError(TokStart, !isHex ? "invalid octal number" : 350 "invalid hexdecimal number"); 351 352 // Consume the [hH]. 353 if (Radix == 16) 354 ++CurPtr; 355 356 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 357 // suffixes on integer literals. 358 SkipIgnoredIntegerSuffix(CurPtr); 359 360 return intToken(Result, Value); 361} 362 363/// LexSingleQuote: Integer: 'b' 364AsmToken AsmLexer::LexSingleQuote() { 365 int CurChar = getNextChar(); 366 367 if (CurChar == '\\') 368 CurChar = getNextChar(); 369 370 if (CurChar == EOF) 371 return ReturnError(TokStart, "unterminated single quote"); 372 373 CurChar = getNextChar(); 374 375 if (CurChar != '\'') 376 return ReturnError(TokStart, "single quote way too long"); 377 378 // The idea here being that 'c' is basically just an integral 379 // constant. 380 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 381 long long Value; 382 383 if (Res.startswith("\'\\")) { 384 char theChar = Res[2]; 385 switch (theChar) { 386 default: Value = theChar; break; 387 case '\'': Value = '\''; break; 388 case 't': Value = '\t'; break; 389 case 'n': Value = '\n'; break; 390 case 'b': Value = '\b'; break; 391 } 392 } else 393 Value = TokStart[1]; 394 395 return AsmToken(AsmToken::Integer, Res, Value); 396} 397 398 399/// LexQuote: String: "..." 400AsmToken AsmLexer::LexQuote() { 401 int CurChar = getNextChar(); 402 // TODO: does gas allow multiline string constants? 403 while (CurChar != '"') { 404 if (CurChar == '\\') { 405 // Allow \", etc. 406 CurChar = getNextChar(); 407 } 408 409 if (CurChar == EOF) 410 return ReturnError(TokStart, "unterminated string constant"); 411 412 CurChar = getNextChar(); 413 } 414 415 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 416} 417 418StringRef AsmLexer::LexUntilEndOfStatement() { 419 TokStart = CurPtr; 420 421 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 422 !isAtStatementSeparator(CurPtr) && // End of statement marker. 423 *CurPtr != '\n' && 424 *CurPtr != '\r' && 425 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 426 ++CurPtr; 427 } 428 return StringRef(TokStart, CurPtr-TokStart); 429} 430 431StringRef AsmLexer::LexUntilEndOfLine() { 432 TokStart = CurPtr; 433 434 while (*CurPtr != '\n' && 435 *CurPtr != '\r' && 436 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 437 ++CurPtr; 438 } 439 return StringRef(TokStart, CurPtr-TokStart); 440} 441 442const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) { 443 const char *SavedTokStart = TokStart; 444 const char *SavedCurPtr = CurPtr; 445 bool SavedAtStartOfLine = isAtStartOfLine; 446 bool SavedSkipSpace = SkipSpace; 447 448 std::string SavedErr = getErr(); 449 SMLoc SavedErrLoc = getErrLoc(); 450 451 SkipSpace = ShouldSkipSpace; 452 AsmToken Token = LexToken(); 453 454 SetError(SavedErrLoc, SavedErr); 455 456 SkipSpace = SavedSkipSpace; 457 isAtStartOfLine = SavedAtStartOfLine; 458 CurPtr = SavedCurPtr; 459 TokStart = SavedTokStart; 460 461 return Token; 462} 463 464bool AsmLexer::isAtStartOfComment(char Char) { 465 // FIXME: This won't work for multi-character comment indicators like "//". 466 return Char == *MAI.getCommentString(); 467} 468 469bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 470 return strncmp(Ptr, MAI.getSeparatorString(), 471 strlen(MAI.getSeparatorString())) == 0; 472} 473 474AsmToken AsmLexer::LexToken() { 475 TokStart = CurPtr; 476 // This always consumes at least one character. 477 int CurChar = getNextChar(); 478 479 if (isAtStartOfComment(CurChar)) { 480 // If this comment starts with a '#', then return the Hash token and let 481 // the assembler parser see if it can be parsed as a cpp line filename 482 // comment. We do this only if we are at the start of a line. 483 if (CurChar == '#' && isAtStartOfLine) 484 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 485 isAtStartOfLine = true; 486 return LexLineComment(); 487 } 488 if (isAtStatementSeparator(TokStart)) { 489 CurPtr += strlen(MAI.getSeparatorString()) - 1; 490 return AsmToken(AsmToken::EndOfStatement, 491 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 492 } 493 494 // If we're missing a newline at EOF, make sure we still get an 495 // EndOfStatement token before the Eof token. 496 if (CurChar == EOF && !isAtStartOfLine) { 497 isAtStartOfLine = true; 498 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 499 } 500 501 isAtStartOfLine = false; 502 switch (CurChar) { 503 default: 504 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 505 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 506 return LexIdentifier(); 507 508 // Unknown character, emit an error. 509 return ReturnError(TokStart, "invalid character in input"); 510 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 511 case 0: 512 case ' ': 513 case '\t': 514 if (SkipSpace) { 515 // Ignore whitespace. 516 return LexToken(); 517 } else { 518 int len = 1; 519 while (*CurPtr==' ' || *CurPtr=='\t') { 520 CurPtr++; 521 len++; 522 } 523 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 524 } 525 case '\n': // FALL THROUGH. 526 case '\r': 527 isAtStartOfLine = true; 528 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 529 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 530 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 531 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 532 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 533 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 534 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 535 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 536 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 537 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 538 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 539 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 540 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 541 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 542 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 543 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 544 case '=': 545 if (*CurPtr == '=') 546 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 547 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 548 case '|': 549 if (*CurPtr == '|') 550 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 551 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 552 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 553 case '&': 554 if (*CurPtr == '&') 555 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 556 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 557 case '!': 558 if (*CurPtr == '=') 559 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 560 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 561 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 562 case '/': return LexSlash(); 563 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 564 case '\'': return LexSingleQuote(); 565 case '"': return LexQuote(); 566 case '0': case '1': case '2': case '3': case '4': 567 case '5': case '6': case '7': case '8': case '9': 568 return LexDigit(); 569 case '<': 570 switch (*CurPtr) { 571 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 572 StringRef(TokStart, 2)); 573 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 574 StringRef(TokStart, 2)); 575 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 576 StringRef(TokStart, 2)); 577 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 578 } 579 case '>': 580 switch (*CurPtr) { 581 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 582 StringRef(TokStart, 2)); 583 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 584 StringRef(TokStart, 2)); 585 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 586 } 587 588 // TODO: Quoted identifiers (objc methods etc) 589 // local labels: [0-9][:] 590 // Forward/backward labels: [0-9][fb] 591 // Integers, fp constants, character constants. 592 } 593} 594