1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/MC/MCAsmInfo.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/Support/SMLoc.h"
18#include <cctype>
19#include <cerrno>
20#include <cstdio>
21#include <cstdlib>
22using namespace llvm;
23
24AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
25  CurPtr = nullptr;
26  isAtStartOfLine = true;
27  AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
28}
29
30AsmLexer::~AsmLexer() {
31}
32
33void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
34  CurBuf = Buf;
35
36  if (ptr)
37    CurPtr = ptr;
38  else
39    CurPtr = CurBuf.begin();
40
41  TokStart = nullptr;
42}
43
44/// ReturnError - Set the error to the specified string at the specified
45/// location.  This is defined to always return AsmToken::Error.
46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
47  SetError(SMLoc::getFromPointer(Loc), Msg);
48
49  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
50}
51
52int AsmLexer::getNextChar() {
53  char CurChar = *CurPtr++;
54  switch (CurChar) {
55  default:
56    return (unsigned char)CurChar;
57  case 0:
58    // A nul character in the stream is either the end of the current buffer or
59    // a random nul in the file.  Disambiguate that here.
60    if (CurPtr - 1 != CurBuf.end())
61      return 0;  // Just whitespace.
62
63    // Otherwise, return end of file.
64    --CurPtr;  // Another call to lex will return EOF again.
65    return EOF;
66  }
67}
68
69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
70///
71/// The leading integral digit sequence and dot should have already been
72/// consumed, some or all of the fractional digit sequence *can* have been
73/// consumed.
74AsmToken AsmLexer::LexFloatLiteral() {
75  // Skip the fractional digit sequence.
76  while (isdigit(*CurPtr))
77    ++CurPtr;
78
79  // Check for exponent; we intentionally accept a slighlty wider set of
80  // literals here and rely on the upstream client to reject invalid ones (e.g.,
81  // "1e+").
82  if (*CurPtr == 'e' || *CurPtr == 'E') {
83    ++CurPtr;
84    if (*CurPtr == '-' || *CurPtr == '+')
85      ++CurPtr;
86    while (isdigit(*CurPtr))
87      ++CurPtr;
88  }
89
90  return AsmToken(AsmToken::Real,
91                  StringRef(TokStart, CurPtr - TokStart));
92}
93
94/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95/// while making sure there are enough actual digits around for the constant to
96/// be valid.
97///
98/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99/// before we get here.
100AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102         "unexpected parse state in floating hex");
103  bool NoFracDigits = true;
104
105  // Skip the fractional part if there is one
106  if (*CurPtr == '.') {
107    ++CurPtr;
108
109    const char *FracStart = CurPtr;
110    while (isxdigit(*CurPtr))
111      ++CurPtr;
112
113    NoFracDigits = CurPtr == FracStart;
114  }
115
116  if (NoIntDigits && NoFracDigits)
117    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118                                 "expected at least one significand digit");
119
120  // Make sure we do have some kind of proper exponent part
121  if (*CurPtr != 'p' && *CurPtr != 'P')
122    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123                                 "expected exponent part 'p'");
124  ++CurPtr;
125
126  if (*CurPtr == '+' || *CurPtr == '-')
127    ++CurPtr;
128
129  // N.b. exponent digits are *not* hex
130  const char *ExpStart = CurPtr;
131  while (isdigit(*CurPtr))
132    ++CurPtr;
133
134  if (CurPtr == ExpStart)
135    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136                                 "expected at least one exponent digit");
137
138  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
139}
140
/// Returns true if \p c may appear after the first character of an identifier,
/// i.e. it belongs to the tail class of the identifier grammar
/// [a-zA-Z_.][a-zA-Z0-9_$.@?]*.  '@' is accepted only when \p AllowAt is true.
static bool IsIdentifierChar(char c, bool AllowAt) {
  // isalnum() is only defined for values representable as unsigned char (or
  // EOF); a plain char holding a byte >= 0x80 may be negative.
  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
         c == '.' || (c == '@' && AllowAt) || c == '?';
}
146AsmToken AsmLexer::LexIdentifier() {
147  // Check for floating point literals.
148  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
149    // Disambiguate a .1243foo identifier from a floating literal.
150    while (isdigit(*CurPtr))
151      ++CurPtr;
152    if (*CurPtr == 'e' || *CurPtr == 'E' ||
153        !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
154      return LexFloatLiteral();
155  }
156
157  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
158    ++CurPtr;
159
160  // Handle . as a special case.
161  if (CurPtr == TokStart+1 && TokStart[0] == '.')
162    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
163
164  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
165}
166
167/// LexSlash: Slash: /
168///           C-Style Comment: /* ... */
169AsmToken AsmLexer::LexSlash() {
170  switch (*CurPtr) {
171  case '*': break; // C style comment.
172  case '/': return ++CurPtr, LexLineComment();
173  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
174  }
175
176  // C Style comment.
177  ++CurPtr;  // skip the star.
178  while (1) {
179    int CurChar = getNextChar();
180    switch (CurChar) {
181    case EOF:
182      return ReturnError(TokStart, "unterminated comment");
183    case '*':
184      // End of the comment?
185      if (CurPtr[0] != '/') break;
186
187      ++CurPtr;   // End the */.
188      return LexToken();
189    }
190  }
191}
192
193/// LexLineComment: Comment: #[^\n]*
194///                        : //[^\n]*
195AsmToken AsmLexer::LexLineComment() {
196  // FIXME: This is broken if we happen to a comment at the end of a file, which
197  // was .included, and which doesn't end with a newline.
198  int CurChar = getNextChar();
199  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
200    CurChar = getNextChar();
201
202  if (CurChar == EOF)
203    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
204  return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
205}
206
/// Advances \p CurPtr past an ignored integer suffix: an optional 'U' followed
/// by at most two 'L's (i.e. U, L, UL, LL or ULL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  if (*CurPtr == 'U')
    ++CurPtr;

  for (int NumLs = 0; NumLs < 2 && *CurPtr == 'L'; ++NumLs)
    ++CurPtr;
}
216
// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
// integer as a hexadecimal, possibly with leading zeroes.
//
// On return CurPtr points at the [hH] suffix when one was found (radix 16 is
// returned).  Otherwise DefaultRadix is returned and CurPtr points at the first
// non-decimal hex letter seen, or at the first non-digit when there was none.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = nullptr;
  const char *LookAhead = CurPtr;
  for (;; ++LookAhead) {
    // The <cctype> classifiers are only defined for values representable as
    // unsigned char (or EOF); a plain char >= 0x80 may be negative.
    const unsigned char C = static_cast<unsigned char>(*LookAhead);
    if (isdigit(C))
      continue;
    if (!isxdigit(C))
      break;
    // A hex letter: remember the first one in case this is not a hex number.
    if (!FirstHex)
      FirstHex = LookAhead;
  }
  const bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
  CurPtr = (isHex || !FirstHex) ? LookAhead : FirstHex;
  return isHex ? 16 : DefaultRadix;
}
239
240static AsmToken intToken(StringRef Ref, APInt &Value)
241{
242  if (Value.isIntN(64))
243    return AsmToken(AsmToken::Integer, Ref, Value);
244  return AsmToken(AsmToken::BigNum, Ref, Value);
245}
246
247/// LexDigit: First character is [0-9].
248///   Local Label: [0-9][:]
249///   Forward/Backward Label: [0-9][fb]
250///   Binary integer: 0b[01]+
251///   Octal integer: 0[0-7]+
252///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
253///   Decimal integer: [1-9][0-9]*
254AsmToken AsmLexer::LexDigit() {
255  // Decimal integer: [1-9][0-9]*
256  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
257    unsigned Radix = doLookAhead(CurPtr, 10);
258    bool isHex = Radix == 16;
259    // Check for floating point literals.
260    if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
261      ++CurPtr;
262      return LexFloatLiteral();
263    }
264
265    StringRef Result(TokStart, CurPtr - TokStart);
266
267    APInt Value(128, 0, true);
268    if (Result.getAsInteger(Radix, Value))
269      return ReturnError(TokStart, !isHex ? "invalid decimal number" :
270                           "invalid hexdecimal number");
271
272    // Consume the [bB][hH].
273    if (Radix == 2 || Radix == 16)
274      ++CurPtr;
275
276    // The darwin/x86 (and x86-64) assembler accepts and ignores type
277    // suffices on integer literals.
278    SkipIgnoredIntegerSuffix(CurPtr);
279
280    return intToken(Result, Value);
281  }
282
283  if (*CurPtr == 'b') {
284    ++CurPtr;
285    // See if we actually have "0b" as part of something like "jmp 0b\n"
286    if (!isdigit(CurPtr[0])) {
287      --CurPtr;
288      StringRef Result(TokStart, CurPtr - TokStart);
289      return AsmToken(AsmToken::Integer, Result, 0);
290    }
291    const char *NumStart = CurPtr;
292    while (CurPtr[0] == '0' || CurPtr[0] == '1')
293      ++CurPtr;
294
295    // Requires at least one binary digit.
296    if (CurPtr == NumStart)
297      return ReturnError(TokStart, "invalid binary number");
298
299    StringRef Result(TokStart, CurPtr - TokStart);
300
301    APInt Value(128, 0, true);
302    if (Result.substr(2).getAsInteger(2, Value))
303      return ReturnError(TokStart, "invalid binary number");
304
305    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
306    // suffixes on integer literals.
307    SkipIgnoredIntegerSuffix(CurPtr);
308
309    return intToken(Result, Value);
310  }
311
312  if (*CurPtr == 'x') {
313    ++CurPtr;
314    const char *NumStart = CurPtr;
315    while (isxdigit(CurPtr[0]))
316      ++CurPtr;
317
318    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
319    // diagnosed by LexHexFloatLiteral).
320    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
321      return LexHexFloatLiteral(NumStart == CurPtr);
322
323    // Otherwise requires at least one hex digit.
324    if (CurPtr == NumStart)
325      return ReturnError(CurPtr-2, "invalid hexadecimal number");
326
327    APInt Result(128, 0);
328    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
329      return ReturnError(TokStart, "invalid hexadecimal number");
330
331    // Consume the optional [hH].
332    if (*CurPtr == 'h' || *CurPtr == 'H')
333      ++CurPtr;
334
335    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
336    // suffixes on integer literals.
337    SkipIgnoredIntegerSuffix(CurPtr);
338
339    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
340  }
341
342  // Either octal or hexadecimal.
343  APInt Value(128, 0, true);
344  unsigned Radix = doLookAhead(CurPtr, 8);
345  bool isHex = Radix == 16;
346  StringRef Result(TokStart, CurPtr - TokStart);
347  if (Result.getAsInteger(Radix, Value))
348    return ReturnError(TokStart, !isHex ? "invalid octal number" :
349                       "invalid hexdecimal number");
350
351  // Consume the [hH].
352  if (Radix == 16)
353    ++CurPtr;
354
355  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
356  // suffixes on integer literals.
357  SkipIgnoredIntegerSuffix(CurPtr);
358
359  return intToken(Result, Value);
360}
361
362/// LexSingleQuote: Integer: 'b'
363AsmToken AsmLexer::LexSingleQuote() {
364  int CurChar = getNextChar();
365
366  if (CurChar == '\\')
367    CurChar = getNextChar();
368
369  if (CurChar == EOF)
370    return ReturnError(TokStart, "unterminated single quote");
371
372  CurChar = getNextChar();
373
374  if (CurChar != '\'')
375    return ReturnError(TokStart, "single quote way too long");
376
377  // The idea here being that 'c' is basically just an integral
378  // constant.
379  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
380  long long Value;
381
382  if (Res.startswith("\'\\")) {
383    char theChar = Res[2];
384    switch (theChar) {
385      default: Value = theChar; break;
386      case '\'': Value = '\''; break;
387      case 't': Value = '\t'; break;
388      case 'n': Value = '\n'; break;
389      case 'b': Value = '\b'; break;
390    }
391  } else
392    Value = TokStart[1];
393
394  return AsmToken(AsmToken::Integer, Res, Value);
395}
396
397
398/// LexQuote: String: "..."
399AsmToken AsmLexer::LexQuote() {
400  int CurChar = getNextChar();
401  // TODO: does gas allow multiline string constants?
402  while (CurChar != '"') {
403    if (CurChar == '\\') {
404      // Allow \", etc.
405      CurChar = getNextChar();
406    }
407
408    if (CurChar == EOF)
409      return ReturnError(TokStart, "unterminated string constant");
410
411    CurChar = getNextChar();
412  }
413
414  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
415}
416
417StringRef AsmLexer::LexUntilEndOfStatement() {
418  TokStart = CurPtr;
419
420  while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
421         !isAtStatementSeparator(CurPtr) && // End of statement marker.
422         *CurPtr != '\n' && *CurPtr != '\r' &&
423         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
424    ++CurPtr;
425  }
426  return StringRef(TokStart, CurPtr-TokStart);
427}
428
429StringRef AsmLexer::LexUntilEndOfLine() {
430  TokStart = CurPtr;
431
432  while (*CurPtr != '\n' && *CurPtr != '\r' &&
433         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
434    ++CurPtr;
435  }
436  return StringRef(TokStart, CurPtr-TokStart);
437}
438
439size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
440                            bool ShouldSkipSpace) {
441  const char *SavedTokStart = TokStart;
442  const char *SavedCurPtr = CurPtr;
443  bool SavedAtStartOfLine = isAtStartOfLine;
444  bool SavedSkipSpace = SkipSpace;
445
446  std::string SavedErr = getErr();
447  SMLoc SavedErrLoc = getErrLoc();
448
449  SkipSpace = ShouldSkipSpace;
450
451  size_t ReadCount;
452  for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
453    AsmToken Token = LexToken();
454
455    Buf[ReadCount] = Token;
456
457    if (Token.is(AsmToken::Eof))
458      break;
459  }
460
461  SetError(SavedErrLoc, SavedErr);
462
463  SkipSpace = SavedSkipSpace;
464  isAtStartOfLine = SavedAtStartOfLine;
465  CurPtr = SavedCurPtr;
466  TokStart = SavedTokStart;
467
468  return ReadCount;
469}
470
471bool AsmLexer::isAtStartOfComment(const char *Ptr) {
472  const char *CommentString = MAI.getCommentString();
473
474  if (CommentString[1] == '\0')
475    return CommentString[0] == Ptr[0];
476
477  // FIXME: special case for the bogus "##" comment string in X86MCAsmInfoDarwin
478  if (CommentString[1] == '#')
479    return CommentString[0] == Ptr[0];
480
481  return strncmp(Ptr, CommentString, strlen(CommentString)) == 0;
482}
483
484bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
485  return strncmp(Ptr, MAI.getSeparatorString(),
486                 strlen(MAI.getSeparatorString())) == 0;
487}
488
489AsmToken AsmLexer::LexToken() {
490  TokStart = CurPtr;
491  // This always consumes at least one character.
492  int CurChar = getNextChar();
493
494  if (isAtStartOfComment(TokStart)) {
495    // If this comment starts with a '#', then return the Hash token and let
496    // the assembler parser see if it can be parsed as a cpp line filename
497    // comment. We do this only if we are at the start of a line.
498    if (CurChar == '#' && isAtStartOfLine)
499      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
500    isAtStartOfLine = true;
501    return LexLineComment();
502  }
503  if (isAtStatementSeparator(TokStart)) {
504    CurPtr += strlen(MAI.getSeparatorString()) - 1;
505    return AsmToken(AsmToken::EndOfStatement,
506                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
507  }
508
509  // If we're missing a newline at EOF, make sure we still get an
510  // EndOfStatement token before the Eof token.
511  if (CurChar == EOF && !isAtStartOfLine) {
512    isAtStartOfLine = true;
513    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
514  }
515
516  isAtStartOfLine = false;
517  switch (CurChar) {
518  default:
519    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
520    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
521      return LexIdentifier();
522
523    // Unknown character, emit an error.
524    return ReturnError(TokStart, "invalid character in input");
525  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
526  case 0:
527  case ' ':
528  case '\t':
529    if (SkipSpace) {
530      // Ignore whitespace.
531      return LexToken();
532    } else {
533      int len = 1;
534      while (*CurPtr==' ' || *CurPtr=='\t') {
535        CurPtr++;
536        len++;
537      }
538      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
539    }
540  case '\n': // FALL THROUGH.
541  case '\r':
542    isAtStartOfLine = true;
543    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
544  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
545  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
546  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
547  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
548  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
549  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
550  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
551  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
552  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
553  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
554  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
555  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
556  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
557  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
558  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
559  case '=':
560    if (*CurPtr == '=')
561      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
562    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
563  case '|':
564    if (*CurPtr == '|')
565      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
566    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
567  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
568  case '&':
569    if (*CurPtr == '&')
570      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
571    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
572  case '!':
573    if (*CurPtr == '=')
574      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
575    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
576  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
577  case '/': return LexSlash();
578  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
579  case '\'': return LexSingleQuote();
580  case '"': return LexQuote();
581  case '0': case '1': case '2': case '3': case '4':
582  case '5': case '6': case '7': case '8': case '9':
583    return LexDigit();
584  case '<':
585    switch (*CurPtr) {
586    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
587                                        StringRef(TokStart, 2));
588    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
589                                        StringRef(TokStart, 2));
590    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
591                                        StringRef(TokStart, 2));
592    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
593    }
594  case '>':
595    switch (*CurPtr) {
596    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
597                                        StringRef(TokStart, 2));
598    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
599                                        StringRef(TokStart, 2));
600    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
601    }
602
603  // TODO: Quoted identifiers (objc methods etc)
604  // local labels: [0-9][:]
605  // Forward/backward labels: [0-9][fb]
606  // Integers, fp constants, character constants.
607  }
608}
609