1//===- TGLexer.cpp - Lexer for TableGen -----------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// Implement the Lexer for TableGen.
11//
12//===----------------------------------------------------------------------===//
13
14#include "TGLexer.h"
15#include "llvm/TableGen/Error.h"
16#include "llvm/Support/SourceMgr.h"
17#include "llvm/Support/MemoryBuffer.h"
18#include "llvm/ADT/StringSwitch.h"
19#include "llvm/ADT/Twine.h"
20#include <cctype>
21#include <cstdio>
22#include <cstdlib>
23#include <cstring>
24#include <cerrno>
25
26#include "llvm/Config/config.h" // for strtoull()/strtoll() define
27
28using namespace llvm;
29
30TGLexer::TGLexer(SourceMgr &SM) : SrcMgr(SM) {
31  CurBuffer = 0;
32  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
33  CurPtr = CurBuf->getBufferStart();
34  TokStart = 0;
35}
36
37SMLoc TGLexer::getLoc() const {
38  return SMLoc::getFromPointer(TokStart);
39}
40
41/// ReturnError - Set the error to the specified string at the specified
42/// location.  This is defined to always return tgtok::Error.
43tgtok::TokKind TGLexer::ReturnError(const char *Loc, const Twine &Msg) {
44  PrintError(Loc, Msg);
45  return tgtok::Error;
46}
47
48int TGLexer::getNextChar() {
49  char CurChar = *CurPtr++;
50  switch (CurChar) {
51  default:
52    return (unsigned char)CurChar;
53  case 0: {
54    // A nul character in the stream is either the end of the current buffer or
55    // a random nul in the file.  Disambiguate that here.
56    if (CurPtr-1 != CurBuf->getBufferEnd())
57      return 0;  // Just whitespace.
58
59    // If this is the end of an included file, pop the parent file off the
60    // include stack.
61    SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
62    if (ParentIncludeLoc != SMLoc()) {
63      CurBuffer = SrcMgr.FindBufferContainingLoc(ParentIncludeLoc);
64      CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
65      CurPtr = ParentIncludeLoc.getPointer();
66      return getNextChar();
67    }
68
69    // Otherwise, return end of file.
70    --CurPtr;  // Another call to lex will return EOF again.
71    return EOF;
72  }
73  case '\n':
74  case '\r':
75    // Handle the newline character by ignoring it and incrementing the line
76    // count.  However, be careful about 'dos style' files with \n\r in them.
77    // Only treat a \n\r or \r\n as a single line.
78    if ((*CurPtr == '\n' || (*CurPtr == '\r')) &&
79        *CurPtr != CurChar)
80      ++CurPtr;  // Eat the two char newline sequence.
81    return '\n';
82  }
83}
84
85int TGLexer::peekNextChar(int Index) {
86  return *(CurPtr + Index);
87}
88
89tgtok::TokKind TGLexer::LexToken() {
90  TokStart = CurPtr;
91  // This always consumes at least one character.
92  int CurChar = getNextChar();
93
94  switch (CurChar) {
95  default:
96    // Handle letters: [a-zA-Z_]
97    if (isalpha(CurChar) || CurChar == '_')
98      return LexIdentifier();
99
100    // Unknown character, emit an error.
101    return ReturnError(TokStart, "Unexpected character");
102  case EOF: return tgtok::Eof;
103  case ':': return tgtok::colon;
104  case ';': return tgtok::semi;
105  case '.': return tgtok::period;
106  case ',': return tgtok::comma;
107  case '<': return tgtok::less;
108  case '>': return tgtok::greater;
109  case ']': return tgtok::r_square;
110  case '{': return tgtok::l_brace;
111  case '}': return tgtok::r_brace;
112  case '(': return tgtok::l_paren;
113  case ')': return tgtok::r_paren;
114  case '=': return tgtok::equal;
115  case '?': return tgtok::question;
116  case '#': return tgtok::paste;
117
118  case 0:
119  case ' ':
120  case '\t':
121  case '\n':
122  case '\r':
123    // Ignore whitespace.
124    return LexToken();
125  case '/':
126    // If this is the start of a // comment, skip until the end of the line or
127    // the end of the buffer.
128    if (*CurPtr == '/')
129      SkipBCPLComment();
130    else if (*CurPtr == '*') {
131      if (SkipCComment())
132        return tgtok::Error;
133    } else // Otherwise, this is an error.
134      return ReturnError(TokStart, "Unexpected character");
135    return LexToken();
136  case '-': case '+':
137  case '0': case '1': case '2': case '3': case '4': case '5': case '6':
138  case '7': case '8': case '9': {
139    int NextChar = 0;
140    if (isdigit(CurChar)) {
141      // Allow identifiers to start with a number if it is followed by
142      // an identifier.  This can happen with paste operations like
143      // foo#8i.
144      int i = 0;
145      do {
146        NextChar = peekNextChar(i++);
147      } while (isdigit(NextChar));
148
149      if (NextChar == 'x' || NextChar == 'b') {
150        // If this is [0-9]b[01] or [0-9]x[0-9A-fa-f] this is most
151        // likely a number.
152        int NextNextChar = peekNextChar(i);
153        switch (NextNextChar) {
154        default:
155          break;
156        case '0': case '1':
157          if (NextChar == 'b')
158            return LexNumber();
159          // Fallthrough
160        case '2': case '3': case '4': case '5':
161        case '6': case '7': case '8': case '9':
162        case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
163        case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
164          if (NextChar == 'x')
165            return LexNumber();
166          break;
167        }
168      }
169    }
170
171    if (isalpha(NextChar) || NextChar == '_')
172      return LexIdentifier();
173
174    return LexNumber();
175  }
176  case '"': return LexString();
177  case '$': return LexVarName();
178  case '[': return LexBracket();
179  case '!': return LexExclaim();
180  }
181}
182
183/// LexString - Lex "[^"]*"
184tgtok::TokKind TGLexer::LexString() {
185  const char *StrStart = CurPtr;
186
187  CurStrVal = "";
188
189  while (*CurPtr != '"') {
190    // If we hit the end of the buffer, report an error.
191    if (*CurPtr == 0 && CurPtr == CurBuf->getBufferEnd())
192      return ReturnError(StrStart, "End of file in string literal");
193
194    if (*CurPtr == '\n' || *CurPtr == '\r')
195      return ReturnError(StrStart, "End of line in string literal");
196
197    if (*CurPtr != '\\') {
198      CurStrVal += *CurPtr++;
199      continue;
200    }
201
202    ++CurPtr;
203
204    switch (*CurPtr) {
205    case '\\': case '\'': case '"':
206      // These turn into their literal character.
207      CurStrVal += *CurPtr++;
208      break;
209    case 't':
210      CurStrVal += '\t';
211      ++CurPtr;
212      break;
213    case 'n':
214      CurStrVal += '\n';
215      ++CurPtr;
216      break;
217
218    case '\n':
219    case '\r':
220      return ReturnError(CurPtr, "escaped newlines not supported in tblgen");
221
222    // If we hit the end of the buffer, report an error.
223    case '\0':
224      if (CurPtr == CurBuf->getBufferEnd())
225        return ReturnError(StrStart, "End of file in string literal");
226      // FALL THROUGH
227    default:
228      return ReturnError(CurPtr, "invalid escape in string literal");
229    }
230  }
231
232  ++CurPtr;
233  return tgtok::StrVal;
234}
235
236tgtok::TokKind TGLexer::LexVarName() {
237  if (!isalpha(CurPtr[0]) && CurPtr[0] != '_')
238    return ReturnError(TokStart, "Invalid variable name");
239
240  // Otherwise, we're ok, consume the rest of the characters.
241  const char *VarNameStart = CurPtr++;
242
243  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
244    ++CurPtr;
245
246  CurStrVal.assign(VarNameStart, CurPtr);
247  return tgtok::VarName;
248}
249
250
251tgtok::TokKind TGLexer::LexIdentifier() {
252  // The first letter is [a-zA-Z_#].
253  const char *IdentStart = TokStart;
254
255  // Match the rest of the identifier regex: [0-9a-zA-Z_#]*
256  while (isalpha(*CurPtr) || isdigit(*CurPtr) || *CurPtr == '_')
257    ++CurPtr;
258
259  // Check to see if this identifier is a keyword.
260  StringRef Str(IdentStart, CurPtr-IdentStart);
261
262  if (Str == "include") {
263    if (LexInclude()) return tgtok::Error;
264    return Lex();
265  }
266
267  tgtok::TokKind Kind = StringSwitch<tgtok::TokKind>(Str)
268    .Case("int", tgtok::Int)
269    .Case("bit", tgtok::Bit)
270    .Case("bits", tgtok::Bits)
271    .Case("string", tgtok::String)
272    .Case("list", tgtok::List)
273    .Case("code", tgtok::Code)
274    .Case("dag", tgtok::Dag)
275    .Case("class", tgtok::Class)
276    .Case("def", tgtok::Def)
277    .Case("foreach", tgtok::Foreach)
278    .Case("defm", tgtok::Defm)
279    .Case("multiclass", tgtok::MultiClass)
280    .Case("field", tgtok::Field)
281    .Case("let", tgtok::Let)
282    .Case("in", tgtok::In)
283    .Default(tgtok::Id);
284
285  if (Kind == tgtok::Id)
286    CurStrVal.assign(Str.begin(), Str.end());
287  return Kind;
288}
289
290/// LexInclude - We just read the "include" token.  Get the string token that
291/// comes next and enter the include.
292bool TGLexer::LexInclude() {
293  // The token after the include must be a string.
294  tgtok::TokKind Tok = LexToken();
295  if (Tok == tgtok::Error) return true;
296  if (Tok != tgtok::StrVal) {
297    PrintError(getLoc(), "Expected filename after include");
298    return true;
299  }
300
301  // Get the string.
302  std::string Filename = CurStrVal;
303  std::string IncludedFile;
304
305
306  CurBuffer = SrcMgr.AddIncludeFile(Filename, SMLoc::getFromPointer(CurPtr),
307                                    IncludedFile);
308  if (CurBuffer == -1) {
309    PrintError(getLoc(), "Could not find include file '" + Filename + "'");
310    return true;
311  }
312
313  Dependencies.push_back(IncludedFile);
314  // Save the line number and lex buffer of the includer.
315  CurBuf = SrcMgr.getMemoryBuffer(CurBuffer);
316  CurPtr = CurBuf->getBufferStart();
317  return false;
318}
319
320void TGLexer::SkipBCPLComment() {
321  ++CurPtr;  // skip the second slash.
322  while (1) {
323    switch (*CurPtr) {
324    case '\n':
325    case '\r':
326      return;  // Newline is end of comment.
327    case 0:
328      // If this is the end of the buffer, end the comment.
329      if (CurPtr == CurBuf->getBufferEnd())
330        return;
331      break;
332    }
333    // Otherwise, skip the character.
334    ++CurPtr;
335  }
336}
337
338/// SkipCComment - This skips C-style /**/ comments.  The only difference from C
339/// is that we allow nesting.
340bool TGLexer::SkipCComment() {
341  ++CurPtr;  // skip the star.
342  unsigned CommentDepth = 1;
343
344  while (1) {
345    int CurChar = getNextChar();
346    switch (CurChar) {
347    case EOF:
348      PrintError(TokStart, "Unterminated comment!");
349      return true;
350    case '*':
351      // End of the comment?
352      if (CurPtr[0] != '/') break;
353
354      ++CurPtr;   // End the */.
355      if (--CommentDepth == 0)
356        return false;
357      break;
358    case '/':
359      // Start of a nested comment?
360      if (CurPtr[0] != '*') break;
361      ++CurPtr;
362      ++CommentDepth;
363      break;
364    }
365  }
366}
367
368/// LexNumber - Lex:
369///    [-+]?[0-9]+
370///    0x[0-9a-fA-F]+
371///    0b[01]+
372tgtok::TokKind TGLexer::LexNumber() {
373  if (CurPtr[-1] == '0') {
374    if (CurPtr[0] == 'x') {
375      ++CurPtr;
376      const char *NumStart = CurPtr;
377      while (isxdigit(CurPtr[0]))
378        ++CurPtr;
379
380      // Requires at least one hex digit.
381      if (CurPtr == NumStart)
382        return ReturnError(TokStart, "Invalid hexadecimal number");
383
384      errno = 0;
385      CurIntVal = strtoll(NumStart, 0, 16);
386      if (errno == EINVAL)
387        return ReturnError(TokStart, "Invalid hexadecimal number");
388      if (errno == ERANGE) {
389        errno = 0;
390        CurIntVal = (int64_t)strtoull(NumStart, 0, 16);
391        if (errno == EINVAL)
392          return ReturnError(TokStart, "Invalid hexadecimal number");
393        if (errno == ERANGE)
394          return ReturnError(TokStart, "Hexadecimal number out of range");
395      }
396      return tgtok::IntVal;
397    } else if (CurPtr[0] == 'b') {
398      ++CurPtr;
399      const char *NumStart = CurPtr;
400      while (CurPtr[0] == '0' || CurPtr[0] == '1')
401        ++CurPtr;
402
403      // Requires at least one binary digit.
404      if (CurPtr == NumStart)
405        return ReturnError(CurPtr-2, "Invalid binary number");
406      CurIntVal = strtoll(NumStart, 0, 2);
407      return tgtok::IntVal;
408    }
409  }
410
411  // Check for a sign without a digit.
412  if (!isdigit(CurPtr[0])) {
413    if (CurPtr[-1] == '-')
414      return tgtok::minus;
415    else if (CurPtr[-1] == '+')
416      return tgtok::plus;
417  }
418
419  while (isdigit(CurPtr[0]))
420    ++CurPtr;
421  CurIntVal = strtoll(TokStart, 0, 10);
422  return tgtok::IntVal;
423}
424
425/// LexBracket - We just read '['.  If this is a code block, return it,
426/// otherwise return the bracket.  Match: '[' and '[{ ( [^}]+ | }[^]] )* }]'
427tgtok::TokKind TGLexer::LexBracket() {
428  if (CurPtr[0] != '{')
429    return tgtok::l_square;
430  ++CurPtr;
431  const char *CodeStart = CurPtr;
432  while (1) {
433    int Char = getNextChar();
434    if (Char == EOF) break;
435
436    if (Char != '}') continue;
437
438    Char = getNextChar();
439    if (Char == EOF) break;
440    if (Char == ']') {
441      CurStrVal.assign(CodeStart, CurPtr-2);
442      return tgtok::CodeFragment;
443    }
444  }
445
446  return ReturnError(CodeStart-2, "Unterminated Code Block");
447}
448
449/// LexExclaim - Lex '!' and '![a-zA-Z]+'.
450tgtok::TokKind TGLexer::LexExclaim() {
451  if (!isalpha(*CurPtr))
452    return ReturnError(CurPtr - 1, "Invalid \"!operator\"");
453
454  const char *Start = CurPtr++;
455  while (isalpha(*CurPtr))
456    ++CurPtr;
457
458  // Check to see which operator this is.
459  tgtok::TokKind Kind =
460    StringSwitch<tgtok::TokKind>(StringRef(Start, CurPtr - Start))
461    .Case("eq", tgtok::XEq)
462    .Case("if", tgtok::XIf)
463    .Case("head", tgtok::XHead)
464    .Case("tail", tgtok::XTail)
465    .Case("con", tgtok::XConcat)
466    .Case("shl", tgtok::XSHL)
467    .Case("sra", tgtok::XSRA)
468    .Case("srl", tgtok::XSRL)
469    .Case("cast", tgtok::XCast)
470    .Case("empty", tgtok::XEmpty)
471    .Case("subst", tgtok::XSubst)
472    .Case("foreach", tgtok::XForEach)
473    .Case("strconcat", tgtok::XStrConcat)
474    .Default(tgtok::Error);
475
476  return Kind != tgtok::Error ? Kind : ReturnError(Start-1, "Unknown operator");
477}
478
479