AsmLexer.cpp revision 653664471333f316020e96dd3d664f4984f66a65
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/Support/SMLoc.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/MC/MCAsmInfo.h" 18#include <cctype> 19#include <cerrno> 20#include <cstdio> 21#include <cstdlib> 22using namespace llvm; 23 24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 25 CurBuf = NULL; 26 CurPtr = NULL; 27} 28 29AsmLexer::~AsmLexer() { 30} 31 32void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 33 CurBuf = buf; 34 35 if (ptr) 36 CurPtr = ptr; 37 else 38 CurPtr = CurBuf->getBufferStart(); 39 40 TokStart = 0; 41} 42 43/// ReturnError - Set the error to the specified string at the specified 44/// location. This is defined to always return AsmToken::Error. 45AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 46 SetError(SMLoc::getFromPointer(Loc), Msg); 47 48 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 49} 50 51int AsmLexer::getNextChar() { 52 char CurChar = *CurPtr++; 53 switch (CurChar) { 54 default: 55 return (unsigned char)CurChar; 56 case 0: 57 // A nul character in the stream is either the end of the current buffer or 58 // a random nul in the file. Disambiguate that here. 59 if (CurPtr-1 != CurBuf->getBufferEnd()) 60 return 0; // Just whitespace. 61 62 // Otherwise, return end of file. 63 --CurPtr; // Another call to lex will return EOF again. 64 return EOF; 65 } 66} 67 68/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)? 69/// 70/// The leading integral digit sequence and dot should have already been 71/// consumed, some or all of the fractional digit sequence *can* have been 72/// consumed. 73AsmToken AsmLexer::LexFloatLiteral() { 74 // Skip the fractional digit sequence. 75 while (isdigit(*CurPtr)) 76 ++CurPtr; 77 78 // Check for exponent; we intentionally accept a slighlty wider set of 79 // literals here and rely on the upstream client to reject invalid ones (e.g., 80 // "1e+"). 81 if (*CurPtr == 'e' || *CurPtr == 'E') { 82 ++CurPtr; 83 if (*CurPtr == '-' || *CurPtr == '+') 84 ++CurPtr; 85 while (isdigit(*CurPtr)) 86 ++CurPtr; 87 } 88 89 return AsmToken(AsmToken::Real, 90 StringRef(TokStart, CurPtr - TokStart)); 91} 92 93/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 94static bool IsIdentifierChar(char c) { 95 return isalnum(c) || c == '_' || c == '$' || c == '.' || c == '@'; 96} 97AsmToken AsmLexer::LexIdentifier() { 98 // Check for floating point literals. 99 if (CurPtr[-1] == '.' && isdigit(*CurPtr)) { 100 // Disambiguate a .1243foo identifier from a floating literal. 101 while (isdigit(*CurPtr)) 102 ++CurPtr; 103 if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr)) 104 return LexFloatLiteral(); 105 } 106 107 while (IsIdentifierChar(*CurPtr)) 108 ++CurPtr; 109 110 // Handle . as a special case. 111 if (CurPtr == TokStart+1 && TokStart[0] == '.') 112 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 113 114 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 115} 116 117/// LexSlash: Slash: / 118/// C-Style Comment: /* ... */ 119AsmToken AsmLexer::LexSlash() { 120 switch (*CurPtr) { 121 case '*': break; // C style comment. 122 case '/': return ++CurPtr, LexLineComment(); 123 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1)); 124 } 125 126 // C Style comment. 127 ++CurPtr; // skip the star. 128 while (1) { 129 int CurChar = getNextChar(); 130 switch (CurChar) { 131 case EOF: 132 return ReturnError(TokStart, "unterminated comment"); 133 case '*': 134 // End of the comment? 135 if (CurPtr[0] != '/') break; 136 137 ++CurPtr; // End the */. 138 return LexToken(); 139 } 140 } 141} 142 143/// LexLineComment: Comment: #[^\n]* 144/// : //[^\n]* 145AsmToken AsmLexer::LexLineComment() { 146 // FIXME: This is broken if we happen to a comment at the end of a file, which 147 // was .included, and which doesn't end with a newline. 148 int CurChar = getNextChar(); 149 while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF) 150 CurChar = getNextChar(); 151 152 if (CurChar == EOF) 153 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 154 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 155} 156 157static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 158 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 159 CurPtr += 2; 160 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 161 CurPtr += 3; 162} 163 164/// LexDigit: First character is [0-9]. 165/// Local Label: [0-9][:] 166/// Forward/Backward Label: [0-9][fb] 167/// Binary integer: 0b[01]+ 168/// Octal integer: 0[0-7]+ 169/// Hex integer: 0x[0-9a-fA-F]+ 170/// Decimal integer: [1-9][0-9]* 171AsmToken AsmLexer::LexDigit() { 172 // Decimal integer: [1-9][0-9]* 173 if (CurPtr[-1] != '0' || CurPtr[0] == '.') { 174 while (isdigit(*CurPtr)) 175 ++CurPtr; 176 177 // Check for floating point literals. 178 if (*CurPtr == '.' || *CurPtr == 'e') { 179 ++CurPtr; 180 return LexFloatLiteral(); 181 } 182 183 StringRef Result(TokStart, CurPtr - TokStart); 184 185 long long Value; 186 if (Result.getAsInteger(10, Value)) { 187 // Allow positive values that are too large to fit into a signed 64-bit 188 // integer, but that do fit in an unsigned one, we just convert them over. 189 unsigned long long UValue; 190 if (Result.getAsInteger(10, UValue)) 191 return ReturnError(TokStart, "invalid decimal number"); 192 Value = (long long)UValue; 193 } 194 195 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 196 // suffixes on integer literals. 197 SkipIgnoredIntegerSuffix(CurPtr); 198 199 return AsmToken(AsmToken::Integer, Result, Value); 200 } 201 202 if (*CurPtr == 'b') { 203 ++CurPtr; 204 // See if we actually have "0b" as part of something like "jmp 0b\n" 205 if (!isdigit(CurPtr[0])) { 206 --CurPtr; 207 StringRef Result(TokStart, CurPtr - TokStart); 208 return AsmToken(AsmToken::Integer, Result, 0); 209 } 210 const char *NumStart = CurPtr; 211 while (CurPtr[0] == '0' || CurPtr[0] == '1') 212 ++CurPtr; 213 214 // Requires at least one binary digit. 215 if (CurPtr == NumStart) 216 return ReturnError(TokStart, "invalid binary number"); 217 218 StringRef Result(TokStart, CurPtr - TokStart); 219 220 long long Value; 221 if (Result.substr(2).getAsInteger(2, Value)) 222 return ReturnError(TokStart, "invalid binary number"); 223 224 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 225 // suffixes on integer literals. 226 SkipIgnoredIntegerSuffix(CurPtr); 227 228 return AsmToken(AsmToken::Integer, Result, Value); 229 } 230 231 if (*CurPtr == 'x') { 232 ++CurPtr; 233 const char *NumStart = CurPtr; 234 while (isxdigit(CurPtr[0])) 235 ++CurPtr; 236 237 // Requires at least one hex digit. 238 if (CurPtr == NumStart) 239 return ReturnError(CurPtr-2, "invalid hexadecimal number"); 240 241 unsigned long long Result; 242 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 243 return ReturnError(TokStart, "invalid hexadecimal number"); 244 245 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 246 // suffixes on integer literals. 247 SkipIgnoredIntegerSuffix(CurPtr); 248 249 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 250 (int64_t)Result); 251 } 252 253 // Must be an octal number, it starts with 0. 254 while (*CurPtr >= '0' && *CurPtr <= '9') 255 ++CurPtr; 256 257 StringRef Result(TokStart, CurPtr - TokStart); 258 long long Value; 259 if (Result.getAsInteger(8, Value)) 260 return ReturnError(TokStart, "invalid octal number"); 261 262 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 263 // suffixes on integer literals. 264 SkipIgnoredIntegerSuffix(CurPtr); 265 266 return AsmToken(AsmToken::Integer, Result, Value); 267} 268 269/// LexSingleQuote: Integer: 'b' 270AsmToken AsmLexer::LexSingleQuote() { 271 int CurChar = getNextChar(); 272 273 if (CurChar == '\\') 274 CurChar = getNextChar(); 275 276 if (CurChar == EOF) 277 return ReturnError(TokStart, "unterminated single quote"); 278 279 CurChar = getNextChar(); 280 281 if (CurChar != '\'') 282 return ReturnError(TokStart, "single quote way too long"); 283 284 // The idea here being that 'c' is basically just an integral 285 // constant. 286 StringRef Res = StringRef(TokStart,CurPtr - TokStart); 287 long long Value; 288 289 if (Res.startswith("\'\\")) { 290 char theChar = Res[2]; 291 switch (theChar) { 292 default: Value = theChar; break; 293 case '\'': Value = '\''; break; 294 case 't': Value = '\t'; break; 295 case 'n': Value = '\n'; break; 296 case 'b': Value = '\b'; break; 297 } 298 } else 299 Value = TokStart[1]; 300 301 return AsmToken(AsmToken::Integer, Res, Value); 302} 303 304 305/// LexQuote: String: "..." 306AsmToken AsmLexer::LexQuote() { 307 int CurChar = getNextChar(); 308 // TODO: does gas allow multiline string constants? 309 while (CurChar != '"') { 310 if (CurChar == '\\') { 311 // Allow \", etc. 312 CurChar = getNextChar(); 313 } 314 315 if (CurChar == EOF) 316 return ReturnError(TokStart, "unterminated string constant"); 317 318 CurChar = getNextChar(); 319 } 320 321 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 322} 323 324StringRef AsmLexer::LexUntilEndOfStatement() { 325 TokStart = CurPtr; 326 327 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 328 !isAtStatementSeparator(CurPtr) && // End of statement marker. 329 *CurPtr != '\n' && 330 *CurPtr != '\r' && 331 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 332 ++CurPtr; 333 } 334 return StringRef(TokStart, CurPtr-TokStart); 335} 336 337bool AsmLexer::isAtStartOfComment(char Char) { 338 // FIXME: This won't work for multi-character comment indicators like "//". 339 return Char == *MAI.getCommentString(); 340} 341 342bool AsmLexer::isAtStatementSeparator(const char *Ptr) { 343 return strncmp(Ptr, MAI.getSeparatorString(), 344 strlen(MAI.getSeparatorString())) == 0; 345} 346 347AsmToken AsmLexer::LexToken() { 348 TokStart = CurPtr; 349 // This always consumes at least one character. 350 int CurChar = getNextChar(); 351 352 if (isAtStartOfComment(CurChar)) 353 return LexLineComment(); 354 if (isAtStatementSeparator(TokStart)) { 355 CurPtr += strlen(MAI.getSeparatorString()) - 1; 356 return AsmToken(AsmToken::EndOfStatement, 357 StringRef(TokStart, strlen(MAI.getSeparatorString()))); 358 } 359 360 switch (CurChar) { 361 default: 362 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 363 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 364 return LexIdentifier(); 365 366 // Unknown character, emit an error. 367 return ReturnError(TokStart, "invalid character in input"); 368 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 369 case 0: 370 case ' ': 371 case '\t': 372 // Ignore whitespace. 373 return LexToken(); 374 case '\n': // FALL THROUGH. 375 case '\r': 376 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 377 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 378 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 379 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 380 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 381 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 382 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 383 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 384 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 385 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 386 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 387 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 388 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 389 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 390 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 391 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1)); 392 case '=': 393 if (*CurPtr == '=') 394 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 395 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 396 case '|': 397 if (*CurPtr == '|') 398 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 399 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 400 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 401 case '&': 402 if (*CurPtr == '&') 403 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 404 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 405 case '!': 406 if (*CurPtr == '=') 407 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 408 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 409 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 410 case '/': return LexSlash(); 411 case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 412 case '\'': return LexSingleQuote(); 413 case '"': return LexQuote(); 414 case '0': case '1': case '2': case '3': case '4': 415 case '5': case '6': case '7': case '8': case '9': 416 return LexDigit(); 417 case '<': 418 switch (*CurPtr) { 419 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 420 StringRef(TokStart, 2)); 421 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 422 StringRef(TokStart, 2)); 423 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 424 StringRef(TokStart, 2)); 425 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 426 } 427 case '>': 428 switch (*CurPtr) { 429 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 430 StringRef(TokStart, 2)); 431 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 432 StringRef(TokStart, 2)); 433 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 434 } 435 436 // TODO: Quoted identifiers (objc methods etc) 437 // local labels: [0-9][:] 438 // Forward/backward labels: [0-9][fb] 439 // Integers, fp constants, character constants. 440 } 441} 442