1//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// Implement the Lexer for TableGen. 11// 12//===----------------------------------------------------------------------===// 13 14#include "TGLexer.h" 15#include "llvm/TableGen/Error.h" 16#include "llvm/Support/SourceMgr.h" 17#include "llvm/Support/MemoryBuffer.h" 18#include "llvm/Config/config.h" 19#include "llvm/ADT/StringSwitch.h" 20#include "llvm/ADT/Twine.h" 21#include <cctype> 22#include <cstdio> 23#include <cstdlib> 24#include <cstring> 25#include <cerrno> 26using namespace llvm; 27 28TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) { 29 CurBuffer = 0; 30 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 31 CurPtr = CurBuf->getBufferStart(); 32 TokStart = 0; 33} 34 35SMLoc TGLexer::getLoc() const { 36 return SMLoc::getFromPointer(TokStart); 37} 38 39/// ReturnError - Set the error to the specified string at the specified 40/// location. This is defined to always return tgtok::Error. 41tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) { 42 PrintError(Loc, Msg); 43 return tgtok::Error; 44} 45 46int TGLexer::getNextChar() { 47 char CurChar = *CurPtr++; 48 switch (CurChar) { 49 default: 50 return (unsigned char)CurChar; 51 case 0: { 52 // A nul character in the stream is either the end of the current buffer or 53 // a random nul in the file. Disambiguate that here. 54 if (CurPtr-1 != CurBuf->getBufferEnd()) 55 return 0; // Just whitespace. 56 57 // If this is the end of an included file, pop the parent file off the 58 // include stack. 59 SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer); 60 if (ParentIncludeLoc != SMLoc()) { 61 CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc); 62 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 63 CurPtr = ParentIncludeLoc.getPointer(); 64 return getNextChar(); 65 } 66 67 // Otherwise, return end of file. 68 --CurPtr; // Another call to lex will return EOF again. 69 return EOF; 70 } 71 case '\n': 72 case '\r': 73 // Handle the newline character by ignoring it and incrementing the line 74 // count. However, be careful about 'dos style' files with \n\r in them. 75 // Only treat a \n\r or \r\n as a single line. 76 if ((*CurPtr == '\n' || (*CurPtr == '\r')) && 77 *CurPtr != CurChar) 78 ++CurPtr; // Eat the two char newline sequence. 79 return '\n'; 80 } 81} 82 83tgtok::TokKind TGLexer::LexToken() { 84 TokStart = CurPtr; 85 // This always consumes at least one character. 86 int CurChar = getNextChar(); 87 88 switch (CurChar) { 89 default: 90 // Handle letters: [a-zA-Z_#] 91 if (isalpha(CurChar) || CurChar == '_' || CurChar == '#') 92 return LexIdentifier(); 93 94 // Unknown character, emit an error. 95 return ReturnError(TokStart, "Unexpected character"); 96 case EOF: return tgtok::Eof; 97 case ':': return tgtok::colon; 98 case ';': return tgtok::semi; 99 case '.': return tgtok::period; 100 case ',': return tgtok::comma; 101 case '<': return tgtok::less; 102 case '>': return tgtok::greater; 103 case ']': return tgtok::r_square; 104 case '{': return tgtok::l_brace; 105 case '}': return tgtok::r_brace; 106 case '(': return tgtok::l_paren; 107 case ')': return tgtok::r_paren; 108 case '=': return tgtok::equal; 109 case '?': return tgtok::question; 110 111 case 0: 112 case ' ': 113 case '\t': 114 case '\n': 115 case '\r': 116 // Ignore whitespace. 117 return LexToken(); 118 case '/': 119 // If this is the start of a // comment, skip until the end of the line or 120 // the end of the buffer. 121 if (*CurPtr == '/') 122 SkipBCPLComment(); 123 else if (*CurPtr == '*') { 124 if (SkipCComment()) 125 return tgtok::Error; 126 } else // Otherwise, this is an error. 127 return ReturnError(TokStart, "Unexpected character"); 128 return LexToken(); 129 case '-': case '+': 130 case '0': case '1': case '2': case '3': case '4': case '5': case '6': 131 case '7': case '8': case '9': 132 return LexNumber(); 133 case '"': return LexString(); 134 case '$': return LexVarName(); 135 case '[': return LexBracket(); 136 case '!': return LexExclaim(); 137 } 138} 139 140/// LexString - Lex "[^"]*" 141tgtok::TokKind TGLexer::LexString() { 142 const char *StrStart = CurPtr; 143 144 CurStrVal = ""; 145 146 while (*CurPtr != '"') { 147 // If we hit the end of the buffer, report an error. 148 if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd()) 149 return ReturnError(StrStart, "End of file in string literal"); 150 151 if (*CurPtr == '\n' || *CurPtr == '\r') 152 return ReturnError(StrStart, "End of line in string literal"); 153 154 if (*CurPtr != '\\') { 155 CurStrVal += *CurPtr++; 156 continue; 157 } 158 159 ++CurPtr; 160 161 switch (*CurPtr) { 162 case '\\': case '\'': case '"': 163 // These turn into their literal character. 164 CurStrVal += *CurPtr++; 165 break; 166 case 't': 167 CurStrVal += '\t'; 168 ++CurPtr; 169 break; 170 case 'n': 171 CurStrVal += '\n'; 172 ++CurPtr; 173 break; 174 175 case '\n': 176 case '\r': 177 return ReturnError(CurPtr, "escaped newlines not supported in tblgen"); 178 179 // If we hit the end of the buffer, report an error. 180 case '\0': 181 if (CurPtr == CurBuf->getBufferEnd()) 182 return ReturnError(StrStart, "End of file in string literal"); 183 // FALL THROUGH 184 default: 185 return ReturnError(CurPtr, "invalid escape in string literal"); 186 } 187 } 188 189 ++CurPtr; 190 return tgtok::StrVal; 191} 192 193tgtok::TokKind TGLexer::LexVarName() { 194 if (!isalpha(CurPtr[0]) && CurPtr[0] != '_') 195 return ReturnError(TokStart, "Invalid variable name"); 196 197 // Otherwise, we're ok, consume the rest of the characters. 198 const char *VarNameStart = CurPtr++; 199 200 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_') 201 ++CurPtr; 202 203 CurStrVal.assign(VarNameStart, CurPtr); 204 return tgtok::VarName; 205} 206 207 208tgtok::TokKind TGLexer::LexIdentifier() { 209 // The first letter is [a-zA-Z_#]. 210 const char *IdentStart = TokStart; 211 212 // Match the rest of the identifier regex: [0-9a-zA-Z_#]* 213 while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_' || 214 *CurPtr == '#') 215 ++CurPtr; 216 217 // Check to see if this identifier is a keyword. 218 StringRef Str(IdentStart, CurPtr-IdentStart); 219 220 if (Str == "include") { 221 if (LexInclude()) return tgtok::Error; 222 return Lex(); 223 } 224 225 tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str) 226 .Case("int", tgtok::Int) 227 .Case("bit", tgtok::Bit) 228 .Case("bits", tgtok::Bits) 229 .Case("string", tgtok::String) 230 .Case("list", tgtok::List) 231 .Case("code", tgtok::Code) 232 .Case("dag", tgtok::Dag) 233 .Case("class", tgtok::Class) 234 .Case("def", tgtok::Def) 235 .Case("defm", tgtok::Defm) 236 .Case("multiclass", tgtok::MultiClass) 237 .Case("field", tgtok::Field) 238 .Case("let", tgtok::Let) 239 .Case("in", tgtok::In) 240 .Default(tgtok::Id); 241 242 if (Kind == tgtok::Id) 243 CurStrVal.assign(Str.begin(), Str.end()); 244 return Kind; 245} 246 247/// LexInclude - We just read the "include" token. Get the string token that 248/// comes next and enter the include. 249bool TGLexer::LexInclude() { 250 // The token after the include must be a string. 251 tgtok::TokKind Tok = LexToken(); 252 if (Tok == tgtok::Error) return true; 253 if (Tok != tgtok::StrVal) { 254 PrintError(getLoc(), "Expected filename after include"); 255 return true; 256 } 257 258 // Get the string. 259 std::string Filename = CurStrVal; 260 std::string IncludedFile; 261 262 263 CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr), 264 IncludedFile); 265 if (CurBuffer == -1) { 266 PrintError(getLoc(), "Could not find include file '" + Filename + "'"); 267 return true; 268 } 269 270 Dependencies.push_back(IncludedFile); 271 // Save the line number and lex buffer of the includer. 272 CurBuf = SrcMgr.getMemoryBuffer(CurBuffer); 273 CurPtr = CurBuf->getBufferStart(); 274 return false; 275} 276 277void TGLexer::SkipBCPLComment() { 278 ++CurPtr; // skip the second slash. 279 while (1) { 280 switch (*CurPtr) { 281 case '\n': 282 case '\r': 283 return; // Newline is end of comment. 284 case 0: 285 // If this is the end of the buffer, end the comment. 286 if (CurPtr == CurBuf->getBufferEnd()) 287 return; 288 break; 289 } 290 // Otherwise, skip the character. 291 ++CurPtr; 292 } 293} 294 295/// SkipCComment - This skips C-style /**/ comments. The only difference from C 296/// is that we allow nesting. 297bool TGLexer::SkipCComment() { 298 ++CurPtr; // skip the star. 299 unsigned CommentDepth = 1; 300 301 while (1) { 302 int CurChar = getNextChar(); 303 switch (CurChar) { 304 case EOF: 305 PrintError(TokStart, "Unterminated comment!"); 306 return true; 307 case '*': 308 // End of the comment? 309 if (CurPtr[0] != '/') break; 310 311 ++CurPtr; // End the */. 312 if (--CommentDepth == 0) 313 return false; 314 break; 315 case '/': 316 // Start of a nested comment? 317 if (CurPtr[0] != '*') break; 318 ++CurPtr; 319 ++CommentDepth; 320 break; 321 } 322 } 323} 324 325/// LexNumber - Lex: 326/// [-+]?[0-9]+ 327/// 0x[0-9a-fA-F]+ 328/// 0b[01]+ 329tgtok::TokKind TGLexer::LexNumber() { 330 if (CurPtr[-1] == '0') { 331 if (CurPtr[0] == 'x') { 332 ++CurPtr; 333 const char *NumStart = CurPtr; 334 while (isxdigit(CurPtr[0])) 335 ++CurPtr; 336 337 // Requires at least one hex digit. 338 if (CurPtr == NumStart) 339 return ReturnError(TokStart, "Invalid hexadecimal number"); 340 341 errno = 0; 342 CurIntVal = strtoll(NumStart, 0, 16); 343 if (errno == EINVAL) 344 return ReturnError(TokStart, "Invalid hexadecimal number"); 345 if (errno == ERANGE) { 346 errno = 0; 347 CurIntVal = (int64_t)strtoull(NumStart, 0, 16); 348 if (errno == EINVAL) 349 return ReturnError(TokStart, "Invalid hexadecimal number"); 350 if (errno == ERANGE) 351 return ReturnError(TokStart, "Hexadecimal number out of range"); 352 } 353 return tgtok::IntVal; 354 } else if (CurPtr[0] == 'b') { 355 ++CurPtr; 356 const char *NumStart = CurPtr; 357 while (CurPtr[0] == '0' || CurPtr[0] == '1') 358 ++CurPtr; 359 360 // Requires at least one binary digit. 361 if (CurPtr == NumStart) 362 return ReturnError(CurPtr-2, "Invalid binary number"); 363 CurIntVal = strtoll(NumStart, 0, 2); 364 return tgtok::IntVal; 365 } 366 } 367 368 // Check for a sign without a digit. 369 if (!isdigit(CurPtr[0])) { 370 if (CurPtr[-1] == '-') 371 return tgtok::minus; 372 else if (CurPtr[-1] == '+') 373 return tgtok::plus; 374 } 375 376 while (isdigit(CurPtr[0])) 377 ++CurPtr; 378 CurIntVal = strtoll(TokStart, 0, 10); 379 return tgtok::IntVal; 380} 381 382/// LexBracket - We just read '['. If this is a code block, return it, 383/// otherwise return the bracket. Match: '[' and '[{ ( [^}]+ | }[^]] )* }]' 384tgtok::TokKind TGLexer::LexBracket() { 385 if (CurPtr[0] != '{') 386 return tgtok::l_square; 387 ++CurPtr; 388 const char *CodeStart = CurPtr; 389 while (1) { 390 int Char = getNextChar(); 391 if (Char == EOF) break; 392 393 if (Char != '}') continue; 394 395 Char = getNextChar(); 396 if (Char == EOF) break; 397 if (Char == ']') { 398 CurStrVal.assign(CodeStart, CurPtr-2); 399 return tgtok::CodeFragment; 400 } 401 } 402 403 return ReturnError(CodeStart-2, "Unterminated Code Block"); 404} 405 406/// LexExclaim - Lex '!' and '![a-zA-Z]+'. 407tgtok::TokKind TGLexer::LexExclaim() { 408 if (!isalpha(*CurPtr)) 409 return ReturnError(CurPtr - 1, "Invalid \"!operator\""); 410 411 const char *Start = CurPtr++; 412 while (isalpha(*CurPtr)) 413 ++CurPtr; 414 415 // Check to see which operator this is. 416 tgtok::TokKind Kind = 417 StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start)) 418 .Case("eq", tgtok::XEq) 419 .Case("if", tgtok::XIf) 420 .Case("head", tgtok::XHead) 421 .Case("tail", tgtok::XTail) 422 .Case("con", tgtok::XConcat) 423 .Case("shl", tgtok::XSHL) 424 .Case("sra", tgtok::XSRA) 425 .Case("srl", tgtok::XSRL) 426 .Case("cast", tgtok::XCast) 427 .Case("empty", tgtok::XEmpty) 428 .Case("subst", tgtok::XSubst) 429 .Case("foreach", tgtok::XForEach) 430 .Case("strconcat", tgtok::XStrConcat) 431 .Default(tgtok::Error); 432 433 return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator"); 434} 435 436