// AsmLexer.cpp revision a78c67e9bbf6ff0253945f3ba5bc178ece76d886
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This class implements the lexer for assembly files. 11// 12//===----------------------------------------------------------------------===// 13 14#include "llvm/MC/MCParser/AsmLexer.h" 15#include "llvm/Support/SMLoc.h" 16#include "llvm/Support/MemoryBuffer.h" 17#include "llvm/MC/MCAsmInfo.h" 18#include <cerrno> 19#include <cstdio> 20#include <cstdlib> 21using namespace llvm; 22 23AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI) { 24 CurBuf = NULL; 25 CurPtr = NULL; 26} 27 28AsmLexer::~AsmLexer() { 29} 30 31void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) { 32 CurBuf = buf; 33 34 if (ptr) 35 CurPtr = ptr; 36 else 37 CurPtr = CurBuf->getBufferStart(); 38 39 TokStart = 0; 40} 41 42/// ReturnError - Set the error to the specified string at the specified 43/// location. This is defined to always return AsmToken::Error. 44AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) { 45 SetError(SMLoc::getFromPointer(Loc), Msg); 46 47 return AsmToken(AsmToken::Error, StringRef(Loc, 0)); 48} 49 50int AsmLexer::getNextChar() { 51 char CurChar = *CurPtr++; 52 switch (CurChar) { 53 default: 54 return (unsigned char)CurChar; 55 case 0: 56 // A nul character in the stream is either the end of the current buffer or 57 // a random nul in the file. Disambiguate that here. 58 if (CurPtr-1 != CurBuf->getBufferEnd()) 59 return 0; // Just whitespace. 60 61 // Otherwise, return end of file. 62 --CurPtr; // Another call to lex will return EOF again. 
63 return EOF; 64 } 65} 66 67/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 68AsmToken AsmLexer::LexIdentifier() { 69 while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$' || 70 *CurPtr == '.' || *CurPtr == '@') 71 ++CurPtr; 72 73 // Handle . as a special case. 74 if (CurPtr == TokStart+1 && TokStart[0] == '.') 75 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1)); 76 77 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart)); 78} 79 80/// LexSlash: Slash: / 81/// C-Style Comment: /* ... */ 82AsmToken AsmLexer::LexSlash() { 83 switch (*CurPtr) { 84 case '*': break; // C style comment. 85 case '/': return ++CurPtr, LexLineComment(); 86 default: return AsmToken(AsmToken::Slash, StringRef(CurPtr, 1)); 87 } 88 89 // C Style comment. 90 ++CurPtr; // skip the star. 91 while (1) { 92 int CurChar = getNextChar(); 93 switch (CurChar) { 94 case EOF: 95 return ReturnError(TokStart, "unterminated comment"); 96 case '*': 97 // End of the comment? 98 if (CurPtr[0] != '/') break; 99 100 ++CurPtr; // End the */. 101 return LexToken(); 102 } 103 } 104} 105 106/// LexLineComment: Comment: #[^\n]* 107/// : //[^\n]* 108AsmToken AsmLexer::LexLineComment() { 109 // FIXME: This is broken if we happen to a comment at the end of a file, which 110 // was .included, and which doesn't end with a newline. 111 int CurChar = getNextChar(); 112 while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF) 113 CurChar = getNextChar(); 114 115 if (CurChar == EOF) 116 return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0)); 117 return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0)); 118} 119 120static void SkipIgnoredIntegerSuffix(const char *&CurPtr) { 121 if (CurPtr[0] == 'L' && CurPtr[1] == 'L') 122 CurPtr += 2; 123 if (CurPtr[0] == 'U' && CurPtr[1] == 'L' && CurPtr[2] == 'L') 124 CurPtr += 3; 125} 126 127 128/// LexDigit: First character is [0-9]. 
129/// Local Label: [0-9][:] 130/// Forward/Backward Label: [0-9][fb] 131/// Binary integer: 0b[01]+ 132/// Octal integer: 0[0-7]+ 133/// Hex integer: 0x[0-9a-fA-F]+ 134/// Decimal integer: [1-9][0-9]* 135/// TODO: FP literal. 136AsmToken AsmLexer::LexDigit() { 137 // Decimal integer: [1-9][0-9]* 138 if (CurPtr[-1] != '0') { 139 while (isdigit(*CurPtr)) 140 ++CurPtr; 141 142 StringRef Result(TokStart, CurPtr - TokStart); 143 144 long long Value; 145 if (Result.getAsInteger(10, Value)) { 146 // We have to handle minint_as_a_positive_value specially, because 147 // - minint_as_a_positive_value = minint and it is valid. 148 if (Result == "9223372036854775808") 149 Value = -9223372036854775808ULL; 150 else 151 return ReturnError(TokStart, "Invalid decimal number"); 152 } 153 154 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 155 // suffixes on integer literals. 156 SkipIgnoredIntegerSuffix(CurPtr); 157 158 return AsmToken(AsmToken::Integer, Result, Value); 159 } 160 161 if (*CurPtr == 'b') { 162 ++CurPtr; 163 // See if we actually have "0b" as part of something like "jmp 0b\n" 164 if (!isdigit(CurPtr[0])) { 165 --CurPtr; 166 StringRef Result(TokStart, CurPtr - TokStart); 167 return AsmToken(AsmToken::Integer, Result, 0); 168 } 169 const char *NumStart = CurPtr; 170 while (CurPtr[0] == '0' || CurPtr[0] == '1') 171 ++CurPtr; 172 173 // Requires at least one binary digit. 174 if (CurPtr == NumStart) 175 return ReturnError(TokStart, "Invalid binary number"); 176 177 StringRef Result(TokStart, CurPtr - TokStart); 178 179 long long Value; 180 if (Result.substr(2).getAsInteger(2, Value)) 181 return ReturnError(TokStart, "Invalid binary number"); 182 183 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 184 // suffixes on integer literals. 
185 SkipIgnoredIntegerSuffix(CurPtr); 186 187 return AsmToken(AsmToken::Integer, Result, Value); 188 } 189 190 if (*CurPtr == 'x') { 191 ++CurPtr; 192 const char *NumStart = CurPtr; 193 while (isxdigit(CurPtr[0])) 194 ++CurPtr; 195 196 // Requires at least one hex digit. 197 if (CurPtr == NumStart) 198 return ReturnError(CurPtr-2, "Invalid hexadecimal number"); 199 200 unsigned long long Result; 201 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result)) 202 return ReturnError(TokStart, "Invalid hexadecimal number"); 203 204 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 205 // suffixes on integer literals. 206 SkipIgnoredIntegerSuffix(CurPtr); 207 208 return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart), 209 (int64_t)Result); 210 } 211 212 // Must be an octal number, it starts with 0. 213 while (*CurPtr >= '0' && *CurPtr <= '7') 214 ++CurPtr; 215 216 StringRef Result(TokStart, CurPtr - TokStart); 217 long long Value; 218 if (Result.getAsInteger(8, Value)) 219 return ReturnError(TokStart, "Invalid octal number"); 220 221 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL 222 // suffixes on integer literals. 223 SkipIgnoredIntegerSuffix(CurPtr); 224 225 return AsmToken(AsmToken::Integer, Result, Value); 226} 227 228/// LexQuote: String: "..." 229AsmToken AsmLexer::LexQuote() { 230 int CurChar = getNextChar(); 231 // TODO: does gas allow multiline string constants? 232 while (CurChar != '"') { 233 if (CurChar == '\\') { 234 // Allow \", etc. 235 CurChar = getNextChar(); 236 } 237 238 if (CurChar == EOF) 239 return ReturnError(TokStart, "unterminated string constant"); 240 241 CurChar = getNextChar(); 242 } 243 244 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart)); 245} 246 247StringRef AsmLexer::LexUntilEndOfStatement() { 248 TokStart = CurPtr; 249 250 while (!isAtStartOfComment(*CurPtr) && // Start of line comment. 251 *CurPtr != ';' && // End of statement marker. 
252 *CurPtr != '\n' && 253 *CurPtr != '\r' && 254 (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) { 255 ++CurPtr; 256 } 257 return StringRef(TokStart, CurPtr-TokStart); 258} 259 260bool AsmLexer::isAtStartOfComment(char Char) { 261 // FIXME: This won't work for multi-character comment indicators like "//". 262 return Char == *MAI.getCommentString(); 263} 264 265AsmToken AsmLexer::LexToken() { 266 TokStart = CurPtr; 267 // This always consumes at least one character. 268 int CurChar = getNextChar(); 269 270 if (isAtStartOfComment(CurChar)) 271 return LexLineComment(); 272 273 switch (CurChar) { 274 default: 275 // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]* 276 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.') 277 return LexIdentifier(); 278 279 // Unknown character, emit an error. 280 return ReturnError(TokStart, "invalid character in input"); 281 case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0)); 282 case 0: 283 case ' ': 284 case '\t': 285 // Ignore whitespace. 286 return LexToken(); 287 case '\n': // FALL THROUGH. 288 case '\r': // FALL THROUGH. 
289 case ';': return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1)); 290 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1)); 291 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1)); 292 case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1)); 293 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1)); 294 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1)); 295 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1)); 296 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1)); 297 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1)); 298 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1)); 299 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1)); 300 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1)); 301 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1)); 302 case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1)); 303 case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1)); 304 case '=': 305 if (*CurPtr == '=') 306 return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2)); 307 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1)); 308 case '|': 309 if (*CurPtr == '|') 310 return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2)); 311 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1)); 312 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1)); 313 case '&': 314 if (*CurPtr == '&') 315 return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2)); 316 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1)); 317 case '!': 318 if (*CurPtr == '=') 319 return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2)); 320 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1)); 321 case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1)); 322 case '/': return LexSlash(); 323 case '#': return 
AsmToken(AsmToken::Hash, StringRef(TokStart, 1)); 324 case '"': return LexQuote(); 325 case '0': case '1': case '2': case '3': case '4': 326 case '5': case '6': case '7': case '8': case '9': 327 return LexDigit(); 328 case '<': 329 switch (*CurPtr) { 330 case '<': return ++CurPtr, AsmToken(AsmToken::LessLess, 331 StringRef(TokStart, 2)); 332 case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual, 333 StringRef(TokStart, 2)); 334 case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater, 335 StringRef(TokStart, 2)); 336 default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1)); 337 } 338 case '>': 339 switch (*CurPtr) { 340 case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater, 341 StringRef(TokStart, 2)); 342 case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual, 343 StringRef(TokStart, 2)); 344 default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1)); 345 } 346 347 // TODO: Quoted identifiers (objc methods etc) 348 // local labels: [0-9][:] 349 // Forward/backward labels: [0-9][fb] 350 // Integers, fp constants, character constants. 351 } 352} 353