AsmLexer.cpp revision dce4a407a24b04eebc6a376f8e62b41aaa7b071f
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/MC/MCAsmInfo.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/Support/SMLoc.h"
18#include <cctype>
19#include <cerrno>
20#include <cstdio>
21#include <cstdlib>
22using namespace llvm;
23
24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
25  CurBuf = nullptr;
26  CurPtr = nullptr;
27  isAtStartOfLine = true;
28  AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
29}
30
31AsmLexer::~AsmLexer() {
32}
33
34void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
35  CurBuf = buf;
36
37  if (ptr)
38    CurPtr = ptr;
39  else
40    CurPtr = CurBuf->getBufferStart();
41
42  TokStart = nullptr;
43}
44
45/// ReturnError - Set the error to the specified string at the specified
46/// location.  This is defined to always return AsmToken::Error.
47AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
48  SetError(SMLoc::getFromPointer(Loc), Msg);
49
50  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
51}
52
53int AsmLexer::getNextChar() {
54  char CurChar = *CurPtr++;
55  switch (CurChar) {
56  default:
57    return (unsigned char)CurChar;
58  case 0:
59    // A nul character in the stream is either the end of the current buffer or
60    // a random nul in the file.  Disambiguate that here.
61    if (CurPtr-1 != CurBuf->getBufferEnd())
62      return 0;  // Just whitespace.
63
64    // Otherwise, return end of file.
65    --CurPtr;  // Another call to lex will return EOF again.
66    return EOF;
67  }
68}
69
70/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
71///
72/// The leading integral digit sequence and dot should have already been
73/// consumed, some or all of the fractional digit sequence *can* have been
74/// consumed.
75AsmToken AsmLexer::LexFloatLiteral() {
76  // Skip the fractional digit sequence.
77  while (isdigit(*CurPtr))
78    ++CurPtr;
79
80  // Check for exponent; we intentionally accept a slighlty wider set of
81  // literals here and rely on the upstream client to reject invalid ones (e.g.,
82  // "1e+").
83  if (*CurPtr == 'e' || *CurPtr == 'E') {
84    ++CurPtr;
85    if (*CurPtr == '-' || *CurPtr == '+')
86      ++CurPtr;
87    while (isdigit(*CurPtr))
88      ++CurPtr;
89  }
90
91  return AsmToken(AsmToken::Real,
92                  StringRef(TokStart, CurPtr - TokStart));
93}
94
95/// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
96/// while making sure there are enough actual digits around for the constant to
97/// be valid.
98///
99/// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
100/// before we get here.
101AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
102  assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
103         "unexpected parse state in floating hex");
104  bool NoFracDigits = true;
105
106  // Skip the fractional part if there is one
107  if (*CurPtr == '.') {
108    ++CurPtr;
109
110    const char *FracStart = CurPtr;
111    while (isxdigit(*CurPtr))
112      ++CurPtr;
113
114    NoFracDigits = CurPtr == FracStart;
115  }
116
117  if (NoIntDigits && NoFracDigits)
118    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
119                                 "expected at least one significand digit");
120
121  // Make sure we do have some kind of proper exponent part
122  if (*CurPtr != 'p' && *CurPtr != 'P')
123    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124                                 "expected exponent part 'p'");
125  ++CurPtr;
126
127  if (*CurPtr == '+' || *CurPtr == '-')
128    ++CurPtr;
129
130  // N.b. exponent digits are *not* hex
131  const char *ExpStart = CurPtr;
132  while (isdigit(*CurPtr))
133    ++CurPtr;
134
135  if (CurPtr == ExpStart)
136    return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
137                                 "expected at least one exponent digit");
138
139  return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
140}
141
/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@?]*
///
/// IsIdentifierChar - Return true if \p c belongs to the trailing character
/// class of the grammar above.  '@' only counts when \p AllowAt is true.
static bool IsIdentifierChar(char c, bool AllowAt) {
  // isalnum() is undefined for negative arguments, which a plain char can
  // produce for bytes >= 0x80 when char is signed; convert first.
  if (isalnum(static_cast<unsigned char>(c)))
    return true;

  switch (c) {
  case '_':
  case '$':
  case '.':
  case '?':
    return true;
  case '@':
    return AllowAt;
  default:
    return false;
  }
}
147AsmToken AsmLexer::LexIdentifier() {
148  // Check for floating point literals.
149  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
150    // Disambiguate a .1243foo identifier from a floating literal.
151    while (isdigit(*CurPtr))
152      ++CurPtr;
153    if (*CurPtr == 'e' || *CurPtr == 'E' ||
154        !IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
155      return LexFloatLiteral();
156  }
157
158  while (IsIdentifierChar(*CurPtr, AllowAtInIdentifier))
159    ++CurPtr;
160
161  // Handle . as a special case.
162  if (CurPtr == TokStart+1 && TokStart[0] == '.')
163    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
164
165  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
166}
167
168/// LexSlash: Slash: /
169///           C-Style Comment: /* ... */
170AsmToken AsmLexer::LexSlash() {
171  switch (*CurPtr) {
172  case '*': break; // C style comment.
173  case '/': return ++CurPtr, LexLineComment();
174  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
175  }
176
177  // C Style comment.
178  ++CurPtr;  // skip the star.
179  while (1) {
180    int CurChar = getNextChar();
181    switch (CurChar) {
182    case EOF:
183      return ReturnError(TokStart, "unterminated comment");
184    case '*':
185      // End of the comment?
186      if (CurPtr[0] != '/') break;
187
188      ++CurPtr;   // End the */.
189      return LexToken();
190    }
191  }
192}
193
194/// LexLineComment: Comment: #[^\n]*
195///                        : //[^\n]*
196AsmToken AsmLexer::LexLineComment() {
197  // FIXME: This is broken if we happen to a comment at the end of a file, which
198  // was .included, and which doesn't end with a newline.
199  int CurChar = getNextChar();
200  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
201    CurChar = getNextChar();
202
203  if (CurChar == EOF)
204    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
205  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
206}
207
/// Advance \p CurPtr past an optional integer type suffix: at most one 'U'
/// followed by at most two 'L's (i.e. U, L, LL, UL or ULL).  Only the
/// uppercase spellings are recognised.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Cursor = CurPtr;

  if (*Cursor == 'U')
    ++Cursor;

  // Up to two consecutive 'L's.
  for (int Count = 0; Count < 2 && *Cursor == 'L'; ++Count)
    ++Cursor;

  CurPtr = Cursor;
}
217
// Look ahead to search for first non-hex digit, if it's [hH], then we treat the
// integer as a hexadecimal, possibly with leading zeroes.
//
// On return CurPtr has been moved as follows:
//   - to the [hH] suffix when one terminates the hex-digit run;
//   - otherwise to the first non-decimal hex digit ([a-fA-F]) of the run, if
//     there is one;
//   - otherwise to the end of the decimal digit run.
//
// Returns 16 when the [hH] suffix is present and DefaultRadix otherwise.
static unsigned doLookAhead(const char *&CurPtr, unsigned DefaultRadix) {
  const char *FirstHex = nullptr;
  const char *LookAhead = CurPtr;
  for (;; ++LookAhead) {
    // isdigit()/isxdigit() are undefined for negative arguments, which a
    // plain char can produce for bytes >= 0x80; convert first.
    const unsigned char Ch = static_cast<unsigned char>(*LookAhead);
    if (isdigit(Ch))
      continue;
    if (!isxdigit(Ch))
      break;
    // A hex-only digit: remember the first one seen.
    if (!FirstHex)
      FirstHex = LookAhead;
  }

  const bool isHex = *LookAhead == 'h' || *LookAhead == 'H';
  CurPtr = (isHex || !FirstHex) ? LookAhead : FirstHex;
  return isHex ? 16 : DefaultRadix;
}
240
241static AsmToken intToken(StringRef Ref, APInt &Value)
242{
243  if (Value.isIntN(64))
244    return AsmToken(AsmToken::Integer, Ref, Value);
245  return AsmToken(AsmToken::BigNum, Ref, Value);
246}
247
248/// LexDigit: First character is [0-9].
249///   Local Label: [0-9][:]
250///   Forward/Backward Label: [0-9][fb]
251///   Binary integer: 0b[01]+
252///   Octal integer: 0[0-7]+
253///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
254///   Decimal integer: [1-9][0-9]*
255AsmToken AsmLexer::LexDigit() {
256  // Decimal integer: [1-9][0-9]*
257  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
258    unsigned Radix = doLookAhead(CurPtr, 10);
259    bool isHex = Radix == 16;
260    // Check for floating point literals.
261    if (!isHex && (*CurPtr == '.' || *CurPtr == 'e')) {
262      ++CurPtr;
263      return LexFloatLiteral();
264    }
265
266    StringRef Result(TokStart, CurPtr - TokStart);
267
268    APInt Value(128, 0, true);
269    if (Result.getAsInteger(Radix, Value))
270      return ReturnError(TokStart, !isHex ? "invalid decimal number" :
271                           "invalid hexdecimal number");
272
273    // Consume the [bB][hH].
274    if (Radix == 2 || Radix == 16)
275      ++CurPtr;
276
277    // The darwin/x86 (and x86-64) assembler accepts and ignores type
278    // suffices on integer literals.
279    SkipIgnoredIntegerSuffix(CurPtr);
280
281    return intToken(Result, Value);
282  }
283
284  if (*CurPtr == 'b') {
285    ++CurPtr;
286    // See if we actually have "0b" as part of something like "jmp 0b\n"
287    if (!isdigit(CurPtr[0])) {
288      --CurPtr;
289      StringRef Result(TokStart, CurPtr - TokStart);
290      return AsmToken(AsmToken::Integer, Result, 0);
291    }
292    const char *NumStart = CurPtr;
293    while (CurPtr[0] == '0' || CurPtr[0] == '1')
294      ++CurPtr;
295
296    // Requires at least one binary digit.
297    if (CurPtr == NumStart)
298      return ReturnError(TokStart, "invalid binary number");
299
300    StringRef Result(TokStart, CurPtr - TokStart);
301
302    APInt Value(128, 0, true);
303    if (Result.substr(2).getAsInteger(2, Value))
304      return ReturnError(TokStart, "invalid binary number");
305
306    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
307    // suffixes on integer literals.
308    SkipIgnoredIntegerSuffix(CurPtr);
309
310    return intToken(Result, Value);
311  }
312
313  if (*CurPtr == 'x') {
314    ++CurPtr;
315    const char *NumStart = CurPtr;
316    while (isxdigit(CurPtr[0]))
317      ++CurPtr;
318
319    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
320    // diagnosed by LexHexFloatLiteral).
321    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
322      return LexHexFloatLiteral(NumStart == CurPtr);
323
324    // Otherwise requires at least one hex digit.
325    if (CurPtr == NumStart)
326      return ReturnError(CurPtr-2, "invalid hexadecimal number");
327
328    APInt Result(128, 0);
329    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
330      return ReturnError(TokStart, "invalid hexadecimal number");
331
332    // Consume the optional [hH].
333    if (*CurPtr == 'h' || *CurPtr == 'H')
334      ++CurPtr;
335
336    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
337    // suffixes on integer literals.
338    SkipIgnoredIntegerSuffix(CurPtr);
339
340    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
341  }
342
343  // Either octal or hexadecimal.
344  APInt Value(128, 0, true);
345  unsigned Radix = doLookAhead(CurPtr, 8);
346  bool isHex = Radix == 16;
347  StringRef Result(TokStart, CurPtr - TokStart);
348  if (Result.getAsInteger(Radix, Value))
349    return ReturnError(TokStart, !isHex ? "invalid octal number" :
350                       "invalid hexdecimal number");
351
352  // Consume the [hH].
353  if (Radix == 16)
354    ++CurPtr;
355
356  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
357  // suffixes on integer literals.
358  SkipIgnoredIntegerSuffix(CurPtr);
359
360  return intToken(Result, Value);
361}
362
363/// LexSingleQuote: Integer: 'b'
364AsmToken AsmLexer::LexSingleQuote() {
365  int CurChar = getNextChar();
366
367  if (CurChar == '\\')
368    CurChar = getNextChar();
369
370  if (CurChar == EOF)
371    return ReturnError(TokStart, "unterminated single quote");
372
373  CurChar = getNextChar();
374
375  if (CurChar != '\'')
376    return ReturnError(TokStart, "single quote way too long");
377
378  // The idea here being that 'c' is basically just an integral
379  // constant.
380  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
381  long long Value;
382
383  if (Res.startswith("\'\\")) {
384    char theChar = Res[2];
385    switch (theChar) {
386      default: Value = theChar; break;
387      case '\'': Value = '\''; break;
388      case 't': Value = '\t'; break;
389      case 'n': Value = '\n'; break;
390      case 'b': Value = '\b'; break;
391    }
392  } else
393    Value = TokStart[1];
394
395  return AsmToken(AsmToken::Integer, Res, Value);
396}
397
398
399/// LexQuote: String: "..."
400AsmToken AsmLexer::LexQuote() {
401  int CurChar = getNextChar();
402  // TODO: does gas allow multiline string constants?
403  while (CurChar != '"') {
404    if (CurChar == '\\') {
405      // Allow \", etc.
406      CurChar = getNextChar();
407    }
408
409    if (CurChar == EOF)
410      return ReturnError(TokStart, "unterminated string constant");
411
412    CurChar = getNextChar();
413  }
414
415  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
416}
417
418StringRef AsmLexer::LexUntilEndOfStatement() {
419  TokStart = CurPtr;
420
421  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
422         !isAtStatementSeparator(CurPtr) && // End of statement marker.
423         *CurPtr != '\n' &&
424         *CurPtr != '\r' &&
425         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
426    ++CurPtr;
427  }
428  return StringRef(TokStart, CurPtr-TokStart);
429}
430
431StringRef AsmLexer::LexUntilEndOfLine() {
432  TokStart = CurPtr;
433
434  while (*CurPtr != '\n' &&
435         *CurPtr != '\r' &&
436         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
437    ++CurPtr;
438  }
439  return StringRef(TokStart, CurPtr-TokStart);
440}
441
442const AsmToken AsmLexer::peekTok(bool ShouldSkipSpace) {
443  const char *SavedTokStart = TokStart;
444  const char *SavedCurPtr = CurPtr;
445  bool SavedAtStartOfLine = isAtStartOfLine;
446  bool SavedSkipSpace = SkipSpace;
447
448  std::string SavedErr = getErr();
449  SMLoc SavedErrLoc = getErrLoc();
450
451  SkipSpace = ShouldSkipSpace;
452  AsmToken Token = LexToken();
453
454  SetError(SavedErrLoc, SavedErr);
455
456  SkipSpace = SavedSkipSpace;
457  isAtStartOfLine = SavedAtStartOfLine;
458  CurPtr = SavedCurPtr;
459  TokStart = SavedTokStart;
460
461  return Token;
462}
463
464bool AsmLexer::isAtStartOfComment(char Char) {
465  // FIXME: This won't work for multi-character comment indicators like "//".
466  return Char == *MAI.getCommentString();
467}
468
469bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
470  return strncmp(Ptr, MAI.getSeparatorString(),
471                 strlen(MAI.getSeparatorString())) == 0;
472}
473
474AsmToken AsmLexer::LexToken() {
475  TokStart = CurPtr;
476  // This always consumes at least one character.
477  int CurChar = getNextChar();
478
479  if (isAtStartOfComment(CurChar)) {
480    // If this comment starts with a '#', then return the Hash token and let
481    // the assembler parser see if it can be parsed as a cpp line filename
482    // comment. We do this only if we are at the start of a line.
483    if (CurChar == '#' && isAtStartOfLine)
484      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
485    isAtStartOfLine = true;
486    return LexLineComment();
487  }
488  if (isAtStatementSeparator(TokStart)) {
489    CurPtr += strlen(MAI.getSeparatorString()) - 1;
490    return AsmToken(AsmToken::EndOfStatement,
491                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
492  }
493
494  // If we're missing a newline at EOF, make sure we still get an
495  // EndOfStatement token before the Eof token.
496  if (CurChar == EOF && !isAtStartOfLine) {
497    isAtStartOfLine = true;
498    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
499  }
500
501  isAtStartOfLine = false;
502  switch (CurChar) {
503  default:
504    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
505    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
506      return LexIdentifier();
507
508    // Unknown character, emit an error.
509    return ReturnError(TokStart, "invalid character in input");
510  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
511  case 0:
512  case ' ':
513  case '\t':
514    if (SkipSpace) {
515      // Ignore whitespace.
516      return LexToken();
517    } else {
518      int len = 1;
519      while (*CurPtr==' ' || *CurPtr=='\t') {
520        CurPtr++;
521        len++;
522      }
523      return AsmToken(AsmToken::Space, StringRef(TokStart, len));
524    }
525  case '\n': // FALL THROUGH.
526  case '\r':
527    isAtStartOfLine = true;
528    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
529  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
530  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
531  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
532  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
533  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
534  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
535  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
536  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
537  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
538  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
539  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
540  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
541  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
542  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
543  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
544  case '=':
545    if (*CurPtr == '=')
546      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
547    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
548  case '|':
549    if (*CurPtr == '|')
550      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
551    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
552  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
553  case '&':
554    if (*CurPtr == '&')
555      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
556    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
557  case '!':
558    if (*CurPtr == '=')
559      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
560    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
561  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
562  case '/': return LexSlash();
563  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
564  case '\'': return LexSingleQuote();
565  case '"': return LexQuote();
566  case '0': case '1': case '2': case '3': case '4':
567  case '5': case '6': case '7': case '8': case '9':
568    return LexDigit();
569  case '<':
570    switch (*CurPtr) {
571    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
572                                        StringRef(TokStart, 2));
573    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
574                                        StringRef(TokStart, 2));
575    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
576                                        StringRef(TokStart, 2));
577    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
578    }
579  case '>':
580    switch (*CurPtr) {
581    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
582                                        StringRef(TokStart, 2));
583    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
584                                        StringRef(TokStart, 2));
585    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
586    }
587
588  // TODO: Quoted identifiers (objc methods etc)
589  // local labels: [0-9][:]
590  // Forward/backward labels: [0-9][fb]
591  // Integers, fp constants, character constants.
592  }
593}
594