AsmLexer.cpp revision cd81d94322a39503e4a3e87b6ee03d4fcb3465fb
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/MC/MCAsmInfo.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/Support/SMLoc.h"
18#include <cctype>
19#include <cerrno>
20#include <cstdio>
21#include <cstdlib>
22using namespace llvm;
23
24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
25  CurPtr = nullptr;
26  isAtStartOfLine = true;
27  AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
28}
29
30AsmLexer::~AsmLexer() {
31}
32
33void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
34  CurBuf = Buf;
35
36  if (ptr)
37    CurPtr = ptr;
38  else
39    CurPtr = CurBuf.begin();
40
41  TokStart = nullptr;
42}
43
44/// ReturnError - Set the error to the specified string at the specified
45/// location.  This is defined to always return AsmToken::Error.
46AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
47  SetError(SMLoc::getFromPointer(Loc), Msg);
48
49  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
50}
51
52int AsmLexer::getNextChar() {
53  char CurChar = *CurPtr++;
54  switch (CurChar) {
55  default:
56    return (unsigned char)CurChar;
57  case 0:
58    // A nul character in the stream is either the end of the current buffer or
59    // a random nul in the file.  Disambiguate that here.
60    if (CurPtr - 1 != CurBuf.end())
61      return 0;  // Just whitespace.
62
63    // Otherwise, return end of file.
64    --CurPtr;  // Another call to lex will return EOF again.
65    return EOF;
66  }
67}
68
69/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
70///
71/// The leading integral digit sequence and dot should have already been
72/// consumed, some or all of the fractional digit sequence *can* have been
73/// consumed.
74AsmToken AsmLexer::LexFloatLiteral() {
75  // Skip the fractional digit sequence.
76  while (isdigit(*CurPtr))
77    ++CurPtr;
78
79  // Check for exponent; we intentionally accept a slighlty wider set of
80  // literals here and rely on the upstream client to reject invalid ones (e.g.,
81  // "1e+").
82  if (*CurPtr == 'e' || *CurPtr == 'E') {
83    ++CurPtr;
84    if (*CurPtr == '-' || *CurPtr == '+')
85      ++CurPtr;
86    while (isdigit(*CurPtr))
87      ++CurPtr;
88  }
89
90  return AsmToken(AsmToken::Real,
91                  StringRef(TokStart, CurPtr - TokStart));
92}
93
94/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
95/// while making sure there are enough actual digits around for the constant to
96/// be valid.
97///
98/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
99/// before we get here.
100AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
101  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
102         "unexpected parse state in floating hex");
103  bool NoFracDigits = true;
104
105  // Skip the fractional part if there is one
106  if (*CurPtr == '.') {
107    ++CurPtr;
108
109    const char *FracStart = CurPtr;
110    while (isxdigit(*CurPtr))
111      ++CurPtr;
112
113    NoFracDigits = CurPtr == FracStart;
114  }
115
116  if (NoIntDigits && NoFracDigits)
117    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
118                                 "expected at least one significand digit");
119
120  // Make sure we do have some kind of proper exponent part
121  if (*CurPtr != 'p' && *CurPtr != 'P')
122    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
123                                 "expected exponent part 'p'");
124  ++CurPtr;
125
126  if (*CurPtr == '+' || *CurPtr == '-')
127    ++CurPtr;
128
129  // N.b. exponent digits are *not* hex
130  const char *ExpStart = CurPtr;
131  while (isdigit(*CurPtr))
132    ++CurPtr;
133
134  if (CurPtr == ExpStart)
135    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
136                                 "expected at least one exponent digit");
137
138  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
139}
140
/// Returns true if \p c may appear after the first character of an
/// identifier: an alphanumeric character or one of '_', '$', '.', '?'.
/// '@' is accepted as well, but only when \p AllowAt is set.
static bool IsIdentifierChar(char c, bool AllowAt) {
  // isalnum() is undefined for negative arguments other than EOF, so a byte
  // with the high bit set has to be converted to unsigned char first.
  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
         c == '.' || (c == '@' && AllowAt) || c == '?';
}
146AsmToken AsmLexer::LexIdentifier() {
147  // Check for floating point literals.
148  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
149    // Disambiguate a .1243foo identifier from a floating literal.
150    while (isdigit(*CurPtr))
151      ++CurPtr;
152    if (*CurPtr == 'e' || *CurPtr == 'E' ||
153        !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
154      return LexFloatLiteral();
155  }
156
157  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
158    ++CurPtr;
159
160  // Handle . as a special case.
161  if (CurPtr == TokStart+1 && TokStart[0] == '.')
162    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
163
164  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
165}
166
167/// LexSlash: Slash: /
168///           C-Style Comment: /* ... */
169AsmToken AsmLexer::LexSlash() {
170  switch (*CurPtr) {
171  case '*': break; // C style comment.
172  case '/': return ++CurPtr, LexLineComment();
173  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
174  }
175
176  // C Style comment.
177  ++CurPtr;  // skip the star.
178  while (1) {
179    int CurChar = getNextChar();
180    switch (CurChar) {
181    case EOF:
182      return ReturnError(TokStart, "unterminated comment");
183    case '*':
184      // End of the comment?
185      if (CurPtr[0] != '/') break;
186
187      ++CurPtr;   // End the */.
188      return LexToken();
189    }
190  }
191}
192
193/// LexLineComment: Comment: #[^\n]*
194///                        : //[^\n]*
195AsmToken AsmLexer::LexLineComment() {
196  // FIXME: This is broken if we happen to a comment at the end of a file, which
197  // was .included, and which doesn't end with a newline.
198  int CurChar = getNextChar();
199  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
200    CurChar = getNextChar();
201
202  if (CurChar == EOF)
203    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
204  return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
205}
206
/// Advance \p CurPtr past an optional integer type suffix: at most one 'U'
/// followed by at most two 'L's (so U, L, LL, UL and ULL are all skipped).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Cursor = CurPtr;

  if (*Cursor == 'U')
    ++Cursor;

  // Up to two 'L's may follow.
  for (int Remaining = 2; Remaining > 0 && *Cursor == 'L'; --Remaining)
    ++Cursor;

  CurPtr = Cursor;
}
216
// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
// integer as a hexadecimal, possibly with leading zeroes.
//
// On return CurPtr points at the [hH] suffix when one was found.  Otherwise it
// points at the first hex-only digit (a-f/A-F) if the run contained one, or
// just past the decimal digits if it did not.  Returns 16 when the suffix is
// present and DefaultRadix otherwise.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = nullptr;
  const char *LookAhead = CurPtr;
  while (1) {
    // isdigit()/isxdigit() are undefined for negative values, so classify the
    // byte as an unsigned char.
    const unsigned char C = static_cast<unsigned char>(*LookAhead);
    if (isdigit(C)) {
      ++LookAhead;
    } else if (isxdigit(C)) {
      if (!FirstHex)
        FirstHex = LookAhead;
      ++LookAhead;
    } else {
      break;
    }
  }
  bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
  CurPtr = isHex || !FirstHex ? LookAhead : FirstHex;
  if (isHex)
    return 16;
  return DefaultRadix;
}
239
240static AsmToken intToken(StringRef Ref, APInt &Value)
241{
242  if (Value.isIntN(64))
243    return AsmToken(AsmToken::Integer, Ref, Value);
244  return AsmToken(AsmToken::BigNum, Ref, Value);
245}
246
247/// LexDigit: First character is [0-9].
248///   Local Label: [0-9][:]
249///   Forward/Backward Label: [0-9][fb]
250///   Binary integer: 0b[01]+
251///   Octal integer: 0[0-7]+
252///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
253///   Decimal integer: [1-9][0-9]*
254AsmToken AsmLexer::LexDigit() {
255  // Decimal integer: [1-9][0-9]*
256  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
257    unsigned Radix = doLookAhead(CurPtr, 10);
258    bool isHex = Radix == 16;
259    // Check for floating point literals.
260    if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
261      ++CurPtr;
262      return LexFloatLiteral();
263    }
264
265    StringRef Result(TokStart, CurPtr - TokStart);
266
267    APInt Value(128, 0, true);
268    if (Result.getAsInteger(Radix, Value))
269      return ReturnError(TokStart, !isHex ? "invalid decimal number" :
270                           "invalid hexdecimal number");
271
272    // Consume the [bB][hH].
273    if (Radix == 2 || Radix == 16)
274      ++CurPtr;
275
276    // The darwin/x86 (and x86-64) assembler accepts and ignores type
277    // suffices on integer literals.
278    SkipIgnoredIntegerSuffix(CurPtr);
279
280    return intToken(Result, Value);
281  }
282
283  if (*CurPtr == 'b') {
284    ++CurPtr;
285    // See if we actually have "0b" as part of something like "jmp 0b\n"
286    if (!isdigit(CurPtr[0])) {
287      --CurPtr;
288      StringRef Result(TokStart, CurPtr - TokStart);
289      return AsmToken(AsmToken::Integer, Result, 0);
290    }
291    const char *NumStart = CurPtr;
292    while (CurPtr[0] == '0' || CurPtr[0] == '1')
293      ++CurPtr;
294
295    // Requires at least one binary digit.
296    if (CurPtr == NumStart)
297      return ReturnError(TokStart, "invalid binary number");
298
299    StringRef Result(TokStart, CurPtr - TokStart);
300
301    APInt Value(128, 0, true);
302    if (Result.substr(2).getAsInteger(2, Value))
303      return ReturnError(TokStart, "invalid binary number");
304
305    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
306    // suffixes on integer literals.
307    SkipIgnoredIntegerSuffix(CurPtr);
308
309    return intToken(Result, Value);
310  }
311
312  if (*CurPtr == 'x') {
313    ++CurPtr;
314    const char *NumStart = CurPtr;
315    while (isxdigit(CurPtr[0]))
316      ++CurPtr;
317
318    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
319    // diagnosed by LexHexFloatLiteral).
320    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
321      return LexHexFloatLiteral(NumStart == CurPtr);
322
323    // Otherwise requires at least one hex digit.
324    if (CurPtr == NumStart)
325      return ReturnError(CurPtr-2, "invalid hexadecimal number");
326
327    APInt Result(128, 0);
328    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
329      return ReturnError(TokStart, "invalid hexadecimal number");
330
331    // Consume the optional [hH].
332    if (*CurPtr == 'h' || *CurPtr == 'H')
333      ++CurPtr;
334
335    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
336    // suffixes on integer literals.
337    SkipIgnoredIntegerSuffix(CurPtr);
338
339    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
340  }
341
342  // Either octal or hexadecimal.
343  APInt Value(128, 0, true);
344  unsigned Radix = doLookAhead(CurPtr, 8);
345  bool isHex = Radix == 16;
346  StringRef Result(TokStart, CurPtr - TokStart);
347  if (Result.getAsInteger(Radix, Value))
348    return ReturnError(TokStart, !isHex ? "invalid octal number" :
349                       "invalid hexdecimal number");
350
351  // Consume the [hH].
352  if (Radix == 16)
353    ++CurPtr;
354
355  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
356  // suffixes on integer literals.
357  SkipIgnoredIntegerSuffix(CurPtr);
358
359  return intToken(Result, Value);
360}
361
362/// LexSingleQuote: Integer: 'b'
363AsmToken AsmLexer::LexSingleQuote() {
364  int CurChar = getNextChar();
365
366  if (CurChar == '\\')
367    CurChar = getNextChar();
368
369  if (CurChar == EOF)
370    return ReturnError(TokStart, "unterminated single quote");
371
372  CurChar = getNextChar();
373
374  if (CurChar != '\'')
375    return ReturnError(TokStart, "single quote way too long");
376
377  // The idea here being that 'c' is basically just an integral
378  // constant.
379  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
380  long long Value;
381
382  if (Res.startswith("\'\\")) {
383    char theChar = Res[2];
384    switch (theChar) {
385      default: Value = theChar; break;
386      case '\'': Value = '\''; break;
387      case 't': Value = '\t'; break;
388      case 'n': Value = '\n'; break;
389      case 'b': Value = '\b'; break;
390    }
391  } else
392    Value = TokStart[1];
393
394  return AsmToken(AsmToken::Integer, Res, Value);
395}
396
397
398/// LexQuote: String: "..."
399AsmToken AsmLexer::LexQuote() {
400  int CurChar = getNextChar();
401  // TODO: does gas allow multiline string constants?
402  while (CurChar != '"') {
403    if (CurChar == '\\') {
404      // Allow \", etc.
405      CurChar = getNextChar();
406    }
407
408    if (CurChar == EOF)
409      return ReturnError(TokStart, "unterminated string constant");
410
411    CurChar = getNextChar();
412  }
413
414  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
415}
416
417StringRef AsmLexer::LexUntilEndOfStatement() {
418  TokStart = CurPtr;
419
420  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
421         !isAtStatementSeparator(CurPtr) && // End of statement marker.
422         *CurPtr != '\n' && *CurPtr != '\r' &&
423         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
424    ++CurPtr;
425  }
426  return StringRef(TokStart, CurPtr-TokStart);
427}
428
429StringRef AsmLexer::LexUntilEndOfLine() {
430  TokStart = CurPtr;
431
432  while (*CurPtr != '\n' && *CurPtr != '\r' &&
433         (*CurPtr != 0 || CurPtr != CurBuf.end())) {
434    ++CurPtr;
435  }
436  return StringRef(TokStart, CurPtr-TokStart);
437}
438
439const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) {
440  const char *SavedTokStart = TokStart;
441  const char *SavedCurPtr = CurPtr;
442  bool SavedAtStartOfLine = isAtStartOfLine;
443  bool SavedSkipSpace = SkipSpace;
444
445  std::string SavedErr = getErr();
446  SMLoc SavedErrLoc = getErrLoc();
447
448  SkipSpace = ShouldSkipSpace;
449  AsmToken Token = LexToken();
450
451  SetError(SavedErrLoc, SavedErr);
452
453  SkipSpace = SavedSkipSpace;
454  isAtStartOfLine = SavedAtStartOfLine;
455  CurPtr = SavedCurPtr;
456  TokStart = SavedTokStart;
457
458  return Token;
459}
460
461bool AsmLexer::isAtStartOfComment(char Char) {
462  // FIXME: This won't work for multi-character comment indicators like "//".
463  return Char == *MAI.getCommentString();
464}
465
466bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
467  return strncmp(Ptr, MAI.getSeparatorString(),
468                 strlen(MAI.getSeparatorString())) == 0;
469}
470
471AsmToken AsmLexer::LexToken() {
472  TokStart = CurPtr;
473  // This always consumes at least one character.
474  int CurChar = getNextChar();
475
476  if (isAtStartOfComment(CurChar)) {
477    // If this comment starts with a '#', then return the Hash token and let
478    // the assembler parser see if it can be parsed as a cpp line filename
479    // comment. We do this only if we are at the start of a line.
480    if (CurChar == '#' && isAtStartOfLine)
481      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
482    isAtStartOfLine = true;
483    return LexLineComment();
484  }
485  if (isAtStatementSeparator(TokStart)) {
486    CurPtr += strlen(MAI.getSeparatorString()) - 1;
487    return AsmToken(AsmToken::EndOfStatement,
488                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
489  }
490
491  // If we're missing a newline at EOF, make sure we still get an
492  // EndOfStatement token before the Eof token.
493  if (CurChar == EOF && !isAtStartOfLine) {
494    isAtStartOfLine = true;
495    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
496  }
497
498  isAtStartOfLine = false;
499  switch (CurChar) {
500  default:
501    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
502    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
503      return LexIdentifier();
504
505    // Unknown character, emit an error.
506    return ReturnError(TokStart, "invalid character in input");
507  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
508  case 0:
509  case ' ':
510  case '\t':
511    if (SkipSpace) {
512      // Ignore whitespace.
513      return LexToken();
514    } else {
515      int len = 1;
516      while (*CurPtr==' ' || *CurPtr=='\t') {
517        CurPtr++;
518        len++;
519      }
520      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
521    }
522  case '\n': // FALL THROUGH.
523  case '\r':
524    isAtStartOfLine = true;
525    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
526  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
527  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
528  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
529  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
530  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
531  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
532  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
533  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
534  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
535  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
536  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
537  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
538  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
539  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
540  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
541  case '=':
542    if (*CurPtr == '=')
543      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
544    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
545  case '|':
546    if (*CurPtr == '|')
547      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
548    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
549  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
550  case '&':
551    if (*CurPtr == '&')
552      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
553    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
554  case '!':
555    if (*CurPtr == '=')
556      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
557    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
558  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
559  case '/': return LexSlash();
560  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
561  case '\'': return LexSingleQuote();
562  case '"': return LexQuote();
563  case '0': case '1': case '2': case '3': case '4':
564  case '5': case '6': case '7': case '8': case '9':
565    return LexDigit();
566  case '<':
567    switch (*CurPtr) {
568    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
569                                        StringRef(TokStart, 2));
570    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
571                                        StringRef(TokStart, 2));
572    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
573                                        StringRef(TokStart, 2));
574    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
575    }
576  case '>':
577    switch (*CurPtr) {
578    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
579                                        StringRef(TokStart, 2));
580    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
581                                        StringRef(TokStart, 2));
582    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
583    }
584
585  // TODO: Quoted identifiers (objc methods etc)
586  // local labels: [0-9][:]
587  // Forward/backward labels: [0-9][fb]
588  // Integers, fp constants, character constants.
589  }
590}
591