AsmLexer.cpp revision 50e75bfc29269def44981ab5f109334d95f55007
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/MC/MCAsmInfo.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/Support/SMLoc.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27 isAtStartOfLine = true; 28} 29 30AsmLexer::~AsmLexer() { 31} 32 33void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 34 CurBuf = buf; 35 36 if (ptr) 37 CurPtr = ptr; 38 else 39 CurPtr = CurBuf->getBufferStart(); 40 41 TokStart = 0; 42} 43 44/// ReturnError - Set the error to the specified string at the specified 45/// location. This is defined to always return AsmToken::Error. 46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 47 SetError(SMLoc::getFromPointer(Loc), Msg); 48 49 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 50} 51 52int AsmLexer::getNextChar() { 53 char CurChar = *CurPtr++; 54 switch (CurChar) { 55 default: 56 return (unsigned char)CurChar; 57 case 0: 58 // A nul character in the stream is either the end of the current buffer or 59 // a random nul in the file. Disambiguate that here. 60 if (CurPtr-1 != CurBuf->getBufferEnd()) 61 return 0; // Just whitespace. 62 63 // Otherwise, return end of file. 64 --CurPtr; // Another call to lex will return EOF again. 65 return EOF; 66 } 67} 68 69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 70/// 71/// The leading integral digit sequence and dot should have already been 72/// consumed, some or all of the fractional digit sequence *can* have been 73/// consumed. 74AsmToken AsmLexer::LexFloatLiteral() { 75 // Skip the fractional digit sequence. 76 while (isdigit(*CurPtr)) 77 ++CurPtr; 78 79 // Check for exponent; we intentionally accept a slighlty wider set of 80 // literals here and rely on the upstream client to reject invalid ones (e.g., 81 // "1e+"). 82 if (*CurPtr == 'e' || *CurPtr == 'E') { 83 ++CurPtr; 84 if (*CurPtr == '-' || *CurPtr == '+') 85 ++CurPtr; 86 while (isdigit(*CurPtr)) 87 ++CurPtr; 88 } 89 90 return AsmToken(AsmToken::Real, 91 StringRef(TokStart, CurPtr - TokStart)); 92} 93 94/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 95static bool IsIdentifierChar(char c) { 96 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 97} 98AsmToken AsmLexer::LexIdentifier() { 99 // Check for floating point literals. 100 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 101 // Disambiguate a .1243foo identifier from a floating literal. 102 while (isdigit(*CurPtr)) 103 ++CurPtr; 104 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 105 return LexFloatLiteral(); 106 } 107 108 while (IsIdentifierChar(*CurPtr)) 109 ++CurPtr; 110 111 // Handle . as a special case. 112 if (CurPtr == TokStart+1 && TokStart[0] == '.') 113 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 114 115 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 116} 117 118/// LexSlash: Slash: / 119/// C-Style Comment: /* ... */ 120AsmToken AsmLexer::LexSlash() { 121 switch (*CurPtr) { 122 case '*': break; // C style comment. 123 case '/': return ++CurPtr, LexLineComment(); 124 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 125 } 126 127 // C Style comment. 128 ++CurPtr; // skip the star. 129 while (1) { 130 int CurChar = getNextChar(); 131 switch (CurChar) { 132 case EOF: 133 return ReturnError(TokStart, "unterminated comment"); 134 case '*': 135 // End of the comment? 136 if (CurPtr[0] != '/') break; 137 138 ++CurPtr; // End the */. 139 return LexToken(); 140 } 141 } 142} 143 144/// LexLineComment: Comment: #[^\n]* 145/// : //[^\n]* 146AsmToken AsmLexer::LexLineComment() { 147 // FIXME: This is broken if we happen to a comment at the end of a file, which 148 // was .included, and which doesn't end with a newline. 149 int CurChar = getNextChar(); 150 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 151 CurChar = getNextChar(); 152 153 if (CurChar == EOF) 154 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 155 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 156} 157 158static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 159 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 160 CurPtr += 2; 161 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 162 CurPtr += 3; 163} 164 165// Look ahead to search for first non-hex digit, if it's [hH], then we treat the 166// integer as a hexadecimal, possibly with leading zeroes. 167static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) { 168 const char *FirstHex = 0; 169 const char *LookAhead = CurPtr; 170 while (1) { 171 if (isdigit(*LookAhead)) { 172 ++LookAhead; 173 } else if (isxdigit(*LookAhead)) { 174 if (!FirstHex) 175 FirstHex = LookAhead; 176 ++LookAhead; 177 } else { 178 break; 179 } 180 } 181 bool isHex = *LookAhead == 'h' || *LookAhead == 'H'; 182 CurPtr = isHex || !FirstHex ? LookAhead : FirstHex; 183 if (isHex) 184 return 16; 185 return DefaultRadix; 186} 187 188/// LexDigit: First character is [0-9]. 189/// Local Label: [0-9][:] 190/// Forward/Backward Label: [0-9][fb] 191/// Binary integer: 0b[01]+ 192/// Octal integer: 0[0-7]+ 193/// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH] 194/// Decimal integer: [1-9][0-9]* 195AsmToken AsmLexer::LexDigit() { 196 // Decimal integer: [1-9][0-9]* 197 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 198 unsigned Radix = doLookAhead(CurPtr, 10); 199 bool isHex = Radix == 16; 200 // Check for floating point literals. 201 if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) { 202 ++CurPtr; 203 return LexFloatLiteral(); 204 } 205 206 StringRef Result(TokStart, CurPtr - TokStart); 207 208 long long Value; 209 if (Result.getAsInteger(Radix, Value)) { 210 // Allow positive values that are too large to fit into a signed 64-bit 211 // integer, but that do fit in an unsigned one, we just convert them over. 212 unsigned long long UValue; 213 if (Result.getAsInteger(Radix, UValue)) 214 return ReturnError(TokStart, !isHex ? "invalid decimal number" : 215 "invalid hexdecimal number"); 216 Value = (long long)UValue; 217 } 218 219 // Consume the [bB][hH]. 220 if (Radix == 2 || Radix == 16) 221 ++CurPtr; 222 223 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 224 // suffixes on integer literals. 225 SkipIgnoredIntegerSuffix(CurPtr); 226 227 return AsmToken(AsmToken::Integer, Result, Value); 228 } 229 230 if (*CurPtr == 'b') { 231 ++CurPtr; 232 // See if we actually have "0b" as part of something like "jmp 0b\n" 233 if (!isdigit(CurPtr[0])) { 234 --CurPtr; 235 StringRef Result(TokStart, CurPtr - TokStart); 236 return AsmToken(AsmToken::Integer, Result, 0); 237 } 238 const char *NumStart = CurPtr; 239 while (CurPtr[0] == '0' || CurPtr[0] == '1') 240 ++CurPtr; 241 242 // Requires at least one binary digit. 243 if (CurPtr == NumStart) 244 return ReturnError(TokStart, "invalid binary number"); 245 246 StringRef Result(TokStart, CurPtr - TokStart); 247 248 long long Value; 249 if (Result.substr(2).getAsInteger(2, Value)) 250 return ReturnError(TokStart, "invalid binary number"); 251 252 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 253 // suffixes on integer literals. 254 SkipIgnoredIntegerSuffix(CurPtr); 255 256 return AsmToken(AsmToken::Integer, Result, Value); 257 } 258 259 if (*CurPtr == 'x') { 260 ++CurPtr; 261 const char *NumStart = CurPtr; 262 while (isxdigit(CurPtr[0])) 263 ++CurPtr; 264 265 // Requires at least one hex digit. 266 if (CurPtr == NumStart) 267 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 268 269 unsigned long long Result; 270 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 271 return ReturnError(TokStart, "invalid hexadecimal number"); 272 273 // Consume the optional [hH]. 274 if (*CurPtr == 'h' || *CurPtr == 'H') 275 ++CurPtr; 276 277 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 278 // suffixes on integer literals. 279 SkipIgnoredIntegerSuffix(CurPtr); 280 281 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 282 (int64_t)Result); 283 } 284 285 // Either octal or hexadecimal. 286 long long Value; 287 unsigned Radix = doLookAhead(CurPtr, 8); 288 bool isHex = Radix == 16; 289 StringRef Result(TokStart, CurPtr - TokStart); 290 if (Result.getAsInteger(Radix, Value)) 291 return ReturnError(TokStart, !isHex ? "invalid octal number" : 292 "invalid hexdecimal number"); 293 294 // Consume the [hH]. 295 if (Radix == 16) 296 ++CurPtr; 297 298 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 299 // suffixes on integer literals. 300 SkipIgnoredIntegerSuffix(CurPtr); 301 302 return AsmToken(AsmToken::Integer, Result, Value); 303} 304 305/// LexSingleQuote: Integer: 'b' 306AsmToken AsmLexer::LexSingleQuote() { 307 int CurChar = getNextChar(); 308 309 if (CurChar == '\\') 310 CurChar = getNextChar(); 311 312 if (CurChar == EOF) 313 return ReturnError(TokStart, "unterminated single quote"); 314 315 CurChar = getNextChar(); 316 317 if (CurChar != '\'') 318 return ReturnError(TokStart, "single quote way too long"); 319 320 // The idea here being that 'c' is basically just an integral 321 // constant. 322 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 323 long long Value; 324 325 if (Res.startswith("\'\\")) { 326 char theChar = Res[2]; 327 switch (theChar) { 328 default: Value = theChar; break; 329 case '\'': Value = '\''; break; 330 case 't': Value = '\t'; break; 331 case 'n': Value = '\n'; break; 332 case 'b': Value = '\b'; break; 333 } 334 } else 335 Value = TokStart[1]; 336 337 return AsmToken(AsmToken::Integer, Res, Value); 338} 339 340 341/// LexQuote: String: "..." 342AsmToken AsmLexer::LexQuote() { 343 int CurChar = getNextChar(); 344 // TODO: does gas allow multiline string constants? 345 while (CurChar != '"') { 346 if (CurChar == '\\') { 347 // Allow \", etc. 348 CurChar = getNextChar(); 349 } 350 351 if (CurChar == EOF) 352 return ReturnError(TokStart, "unterminated string constant"); 353 354 CurChar = getNextChar(); 355 } 356 357 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 358} 359 360StringRef AsmLexer::LexUntilEndOfStatement() { 361 TokStart = CurPtr; 362 363 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 364 !isAtStatementSeparator(CurPtr) && // End of statement marker. 365 *CurPtr != '\n' && 366 *CurPtr != '\r' && 367 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 368 ++CurPtr; 369 } 370 return StringRef(TokStart, CurPtr-TokStart); 371} 372 373StringRef AsmLexer::LexUntilEndOfLine() { 374 TokStart = CurPtr; 375 376 while (*CurPtr != '\n' && 377 *CurPtr != '\r' && 378 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 379 ++CurPtr; 380 } 381 return StringRef(TokStart, CurPtr-TokStart); 382} 383 384bool AsmLexer::isAtStartOfComment(char Char) { 385 // FIXME: This won't work for multi-character comment indicators like "//". 386 return Char == *MAI.getCommentString(); 387} 388 389bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 390 return strncmp(Ptr, MAI.getSeparatorString(), 391 strlen(MAI.getSeparatorString())) == 0; 392} 393 394AsmToken AsmLexer::LexToken() { 395 TokStart = CurPtr; 396 // This always consumes at least one character. 397 int CurChar = getNextChar(); 398 399 if (isAtStartOfComment(CurChar)) { 400 // If this comment starts with a '#', then return the Hash token and let 401 // the assembler parser see if it can be parsed as a cpp line filename 402 // comment. We do this only if we are at the start of a line. 403 if (CurChar == '#' && isAtStartOfLine) 404 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 405 isAtStartOfLine = true; 406 return LexLineComment(); 407 } 408 if (isAtStatementSeparator(TokStart)) { 409 CurPtr += strlen(MAI.getSeparatorString()) - 1; 410 return AsmToken(AsmToken::EndOfStatement, 411 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 412 } 413 414 // If we're missing a newline at EOF, make sure we still get an 415 // EndOfStatement token before the Eof token. 416 if (CurChar == EOF && !isAtStartOfLine) { 417 isAtStartOfLine = true; 418 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 419 } 420 421 isAtStartOfLine = false; 422 switch (CurChar) { 423 default: 424 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 425 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 426 return LexIdentifier(); 427 428 // Unknown character, emit an error. 429 return ReturnError(TokStart, "invalid character in input"); 430 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 431 case 0: 432 case ' ': 433 case '\t': 434 if (SkipSpace) { 435 // Ignore whitespace. 436 return LexToken(); 437 } else { 438 int len = 1; 439 while (*CurPtr==' ' || *CurPtr=='\t') { 440 CurPtr++; 441 len++; 442 } 443 return AsmToken(AsmToken::Space, StringRef(TokStart, len)); 444 } 445 case '\n': // FALL THROUGH. 446 case '\r': 447 isAtStartOfLine = true; 448 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 449 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 450 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 451 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 452 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 453 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 454 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 455 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 456 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 457 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 458 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 459 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 460 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 461 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 462 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 463 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 464 case '=': 465 if (*CurPtr == '=') 466 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 467 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 468 case '|': 469 if (*CurPtr == '|') 470 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 471 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 472 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 473 case '&': 474 if (*CurPtr == '&') 475 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 476 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 477 case '!': 478 if (*CurPtr == '=') 479 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 480 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 481 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 482 case '/': return LexSlash(); 483 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 484 case '\'': return LexSingleQuote(); 485 case '"': return LexQuote(); 486 case '0': case '1': case '2': case '3': case '4': 487 case '5': case '6': case '7': case '8': case '9': 488 return LexDigit(); 489 case '<': 490 switch (*CurPtr) { 491 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 492 StringRef(TokStart, 2)); 493 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 494 StringRef(TokStart, 2)); 495 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 496 StringRef(TokStart, 2)); 497 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 498 } 499 case '>': 500 switch (*CurPtr) { 501 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 502 StringRef(TokStart, 2)); 503 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 504 StringRef(TokStart, 2)); 505 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 506 } 507 508 // TODO: Quoted identifiers (objc methods etc) 509 // local labels: [0-9][:] 510 // Forward/backward labels: [0-9][fb] 511 // Integers, fp constants, character constants. 512 } 513} 514