// AsmLexer.cpp revision f1c21a8da6ed27a6ab4944e30bbeb4bd3ee08a71
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/Support/SMLoc.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/MC/MCAsmInfo.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27} 28 29AsmLexer::~AsmLexer() { 30} 31 32void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 33 CurBuf = buf; 34 35 if (ptr) 36 CurPtr = ptr; 37 else 38 CurPtr = CurBuf->getBufferStart(); 39 40 TokStart = 0; 41} 42 43/// ReturnError - Set the error to the specified string at the specified 44/// location. This is defined to always return AsmToken::Error. 45AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 46 SetError(SMLoc::getFromPointer(Loc), Msg); 47 48 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 49} 50 51int AsmLexer::getNextChar() { 52 char CurChar = *CurPtr++; 53 switch (CurChar) { 54 default: 55 return (unsigned char)CurChar; 56 case 0: 57 // A nul character in the stream is either the end of the current buffer or 58 // a random nul in the file. Disambiguate that here. 59 if (CurPtr-1 != CurBuf->getBufferEnd()) 60 return 0; // Just whitespace. 61 62 // Otherwise, return end of file. 63 --CurPtr; // Another call to lex will return EOF again. 64 return EOF; 65 } 66} 67 68/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 
69/// 70/// The leading integral digit sequence and dot should have already been 71/// consumed, some or all of the fractional digit sequence *can* have been 72/// consumed. 73AsmToken AsmLexer::LexFloatLiteral() { 74 // Skip the fractional digit sequence. 75 while (isdigit(*CurPtr)) 76 ++CurPtr; 77 78 // Check for exponent; we intentionally accept a slighlty wider set of 79 // literals here and rely on the upstream client to reject invalid ones (e.g., 80 // "1e+"). 81 if (*CurPtr == 'e' || *CurPtr == 'E') { 82 ++CurPtr; 83 if (*CurPtr == '-' || *CurPtr == '+') 84 ++CurPtr; 85 while (isdigit(*CurPtr)) 86 ++CurPtr; 87 } 88 89 return AsmToken(AsmToken::Real, 90 StringRef(TokStart, CurPtr - TokStart)); 91} 92 93/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 94static bool IsIdentifierChar(char c) { 95 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 96} 97AsmToken AsmLexer::LexIdentifier() { 98 // Check for floating point literals. 99 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 100 // Disambiguate a .1243foo identifier from a floating literal. 101 while (isdigit(*CurPtr)) 102 ++CurPtr; 103 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 104 return LexFloatLiteral(); 105 } 106 107 while (IsIdentifierChar(*CurPtr)) 108 ++CurPtr; 109 110 // Handle . as a special case. 111 if (CurPtr == TokStart+1 && TokStart[0] == '.') 112 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 113 114 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 115} 116 117/// LexSlash: Slash: / 118/// C-Style Comment: /* ... */ 119AsmToken AsmLexer::LexSlash() { 120 switch (*CurPtr) { 121 case '*': break; // C style comment. 122 case '/': return ++CurPtr, LexLineComment(); 123 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 124 } 125 126 // C Style comment. 127 ++CurPtr; // skip the star. 
128 while (1) { 129 int CurChar = getNextChar(); 130 switch (CurChar) { 131 case EOF: 132 return ReturnError(TokStart, "unterminated comment"); 133 case '*': 134 // End of the comment? 135 if (CurPtr[0] != '/') break; 136 137 ++CurPtr; // End the */. 138 return LexToken(); 139 } 140 } 141} 142 143/// LexLineComment: Comment: #[^\n]* 144/// : //[^\n]* 145AsmToken AsmLexer::LexLineComment() { 146 // FIXME: This is broken if we happen to a comment at the end of a file, which 147 // was .included, and which doesn't end with a newline. 148 int CurChar = getNextChar(); 149 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF) 150 CurChar = getNextChar(); 151 152 if (CurChar == EOF) 153 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 154 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 155} 156 157static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 158 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 159 CurPtr += 2; 160 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 161 CurPtr += 3; 162} 163 164/// LexDigit: First character is [0-9]. 165/// Local Label: [0-9][:] 166/// Forward/Backward Label: [0-9][fb] 167/// Binary integer: 0b[01]+ 168/// Octal integer: 0[0-7]+ 169/// Hex integer: 0x[0-9a-fA-F]+ 170/// Decimal integer: [1-9][0-9]* 171AsmToken AsmLexer::LexDigit() { 172 // Decimal integer: [1-9][0-9]* 173 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 174 while (isdigit(*CurPtr)) 175 ++CurPtr; 176 177 // Check for floating point literals. 178 if (*CurPtr == '.' || *CurPtr == 'e') { 179 ++CurPtr; 180 return LexFloatLiteral(); 181 } 182 183 StringRef Result(TokStart, CurPtr - TokStart); 184 185 long long Value; 186 if (Result.getAsInteger(10, Value)) { 187 // Allow positive values that are too large to fit into a signed 64-bit 188 // integer, but that do fit in an unsigned one, we just convert them over. 
189 unsigned long long UValue; 190 if (Result.getAsInteger(10, UValue)) 191 return ReturnError(TokStart, "invalid decimal number"); 192 Value = (long long)UValue; 193 } 194 195 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 196 // suffixes on integer literals. 197 SkipIgnoredIntegerSuffix(CurPtr); 198 199 return AsmToken(AsmToken::Integer, Result, Value); 200 } 201 202 if (*CurPtr == 'b') { 203 ++CurPtr; 204 // See if we actually have "0b" as part of something like "jmp 0b\n" 205 if (!isdigit(CurPtr[0])) { 206 --CurPtr; 207 StringRef Result(TokStart, CurPtr - TokStart); 208 return AsmToken(AsmToken::Integer, Result, 0); 209 } 210 const char *NumStart = CurPtr; 211 while (CurPtr[0] == '0' || CurPtr[0] == '1') 212 ++CurPtr; 213 214 // Requires at least one binary digit. 215 if (CurPtr == NumStart) 216 return ReturnError(TokStart, "invalid binary number"); 217 218 StringRef Result(TokStart, CurPtr - TokStart); 219 220 long long Value; 221 if (Result.substr(2).getAsInteger(2, Value)) 222 return ReturnError(TokStart, "invalid binary number"); 223 224 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 225 // suffixes on integer literals. 226 SkipIgnoredIntegerSuffix(CurPtr); 227 228 return AsmToken(AsmToken::Integer, Result, Value); 229 } 230 231 if (*CurPtr == 'x') { 232 ++CurPtr; 233 const char *NumStart = CurPtr; 234 while (isxdigit(CurPtr[0])) 235 ++CurPtr; 236 237 // Requires at least one hex digit. 238 if (CurPtr == NumStart) 239 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 240 241 unsigned long long Result; 242 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 243 return ReturnError(TokStart, "invalid hexadecimal number"); 244 245 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 246 // suffixes on integer literals. 
247 SkipIgnoredIntegerSuffix(CurPtr); 248 249 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 250 (int64_t)Result); 251 } 252 253 // Must be an octal number, it starts with 0. 254 while (*CurPtr >= '0' && *CurPtr <= '9') 255 ++CurPtr; 256 257 StringRef Result(TokStart, CurPtr - TokStart); 258 long long Value; 259 if (Result.getAsInteger(8, Value)) 260 return ReturnError(TokStart, "invalid octal number"); 261 262 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 263 // suffixes on integer literals. 264 SkipIgnoredIntegerSuffix(CurPtr); 265 266 return AsmToken(AsmToken::Integer, Result, Value); 267} 268 269/// LexSingleQuote: Integer: 'b' 270AsmToken AsmLexer::LexSingleQuote() { 271 int CurChar = getNextChar(); 272 273 if (CurChar == '\\') 274 CurChar = getNextChar(); 275 276 if (CurChar == EOF) 277 return ReturnError(TokStart, "unterminated single quote"); 278 279 CurChar = getNextChar(); 280 281 if (CurChar != '\'') 282 return ReturnError(TokStart, "single quote way too long"); 283 284 // The idea here being that 'c' is basically just an integral 285 // constant. 286 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 287 long long Value; 288 289 if (Res.startswith("\'\\")) { 290 char theChar = Res[2]; 291 switch (theChar) { 292 default: Value = theChar; break; 293 case '\'': Value = '\''; break; 294 case 't': Value = '\t'; break; 295 case 'n': Value = '\n'; break; 296 case 'b': Value = '\b'; break; 297 } 298 } else 299 Value = TokStart[1]; 300 301 return AsmToken(AsmToken::Integer, Res, Value); 302} 303 304 305/// LexQuote: String: "..." 306AsmToken AsmLexer::LexQuote() { 307 int CurChar = getNextChar(); 308 // TODO: does gas allow multiline string constants? 309 while (CurChar != '"') { 310 if (CurChar == '\\') { 311 // Allow \", etc. 
312 CurChar = getNextChar(); 313 } 314 315 if (CurChar == EOF) 316 return ReturnError(TokStart, "unterminated string constant"); 317 318 CurChar = getNextChar(); 319 } 320 321 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 322} 323 324StringRef AsmLexer::LexUntilEndOfStatement() { 325 TokStart = CurPtr; 326 327 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 328 !isAtStatementSeparator(CurPtr) && // End of statement marker. 329 *CurPtr != '\n' && 330 *CurPtr != '\r' && 331 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 332 ++CurPtr; 333 } 334 return StringRef(TokStart, CurPtr-TokStart); 335} 336 337StringRef AsmLexer::LexUntilEndOfLine() { 338 TokStart = CurPtr; 339 340 while (*CurPtr != '\n' && 341 *CurPtr != '\r' && 342 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 343 ++CurPtr; 344 } 345 return StringRef(TokStart, CurPtr-TokStart); 346} 347 348bool AsmLexer::isAtStartOfComment(char Char) { 349 // FIXME: This won't work for multi-character comment indicators like "//". 350 return Char == *MAI.getCommentString(); 351} 352 353bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 354 return strncmp(Ptr, MAI.getSeparatorString(), 355 strlen(MAI.getSeparatorString())) == 0; 356} 357 358AsmToken AsmLexer::LexToken() { 359 static bool isAtStartOfLine = true; 360 TokStart = CurPtr; 361 // This always consumes at least one character. 362 int CurChar = getNextChar(); 363 364 if (isAtStartOfComment(CurChar)) { 365 // If this comment starts with a '#', then return the Hash token and let 366 // the assembler parser see if it can be parsed as a cpp line filename 367 // comment. We do this only if we are at the start of a line. 
368 if (CurChar == '#' && isAtStartOfLine) 369 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 370 isAtStartOfLine = true; 371 return LexLineComment(); 372 } 373 if (isAtStatementSeparator(TokStart)) { 374 CurPtr += strlen(MAI.getSeparatorString()) - 1; 375 return AsmToken(AsmToken::EndOfStatement, 376 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 377 } 378 isAtStartOfLine = false; 379 380 switch (CurChar) { 381 default: 382 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 383 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 384 return LexIdentifier(); 385 386 // Unknown character, emit an error. 387 return ReturnError(TokStart, "invalid character in input"); 388 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 389 case 0: 390 case ' ': 391 case '\t': 392 // Ignore whitespace. 393 return LexToken(); 394 case '\n': // FALL THROUGH. 395 case '\r': 396 isAtStartOfLine = true; 397 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 398 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 399 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 400 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 401 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 402 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 403 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 404 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 405 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 406 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 407 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 408 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 409 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 410 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 411 case '@': return AsmToken(AsmToken::At, 
StringRef(TokStart, 1)); 412 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 413 case '=': 414 if (*CurPtr == '=') 415 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 416 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 417 case '|': 418 if (*CurPtr == '|') 419 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 420 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 421 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 422 case '&': 423 if (*CurPtr == '&') 424 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 425 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 426 case '!': 427 if (*CurPtr == '=') 428 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 429 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 430 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 431 case '/': return LexSlash(); 432 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 433 case '\'': return LexSingleQuote(); 434 case '"': return LexQuote(); 435 case '0': case '1': case '2': case '3': case '4': 436 case '5': case '6': case '7': case '8': case '9': 437 return LexDigit(); 438 case '<': 439 switch (*CurPtr) { 440 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 441 StringRef(TokStart, 2)); 442 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 443 StringRef(TokStart, 2)); 444 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 445 StringRef(TokStart, 2)); 446 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 447 } 448 case '>': 449 switch (*CurPtr) { 450 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 451 StringRef(TokStart, 2)); 452 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 453 StringRef(TokStart, 2)); 454 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 455 } 456 457 // TODO: Quoted identifiers (objc methods 
etc) 458 // local labels: [0-9][:] 459 // Forward/backward labels: [0-9][fb] 460 // Integers, fp constants, character constants. 461 } 462} 463