AsmLexer.cpp revision f1c21a8da6ed27a6ab4944e30bbeb4bd3ee08a71
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/Support/SMLoc.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/MC/MCAsmInfo.h"
18#include <cctype>
19#include <cerrno>
20#include <cstdio>
21#include <cstdlib>
22using namespace llvm;
23
24AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
25  CurBuf = NULL;
26  CurPtr = NULL;
27}
28
29AsmLexer::~AsmLexer() {
30}
31
32void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
33  CurBuf = buf;
34
35  if (ptr)
36    CurPtr = ptr;
37  else
38    CurPtr = CurBuf->getBufferStart();
39
40  TokStart = 0;
41}
42
43/// ReturnError - Set the error to the specified string at the specified
44/// location.  This is defined to always return AsmToken::Error.
45AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
46  SetError(SMLoc::getFromPointer(Loc), Msg);
47
48  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
49}
50
51int AsmLexer::getNextChar() {
52  char CurChar = *CurPtr++;
53  switch (CurChar) {
54  default:
55    return (unsigned char)CurChar;
56  case 0:
57    // A nul character in the stream is either the end of the current buffer or
58    // a random nul in the file.  Disambiguate that here.
59    if (CurPtr-1 != CurBuf->getBufferEnd())
60      return 0;  // Just whitespace.
61
62    // Otherwise, return end of file.
63    --CurPtr;  // Another call to lex will return EOF again.
64    return EOF;
65  }
66}
67
68/// LexFloatLiteral: [0-9]*[.][0-9]*([eE][+-]?[0-9]*)?
69///
70/// The leading integral digit sequence and dot should have already been
71/// consumed, some or all of the fractional digit sequence *can* have been
72/// consumed.
73AsmToken AsmLexer::LexFloatLiteral() {
74  // Skip the fractional digit sequence.
75  while (isdigit(*CurPtr))
76    ++CurPtr;
77
78  // Check for exponent; we intentionally accept a slighlty wider set of
79  // literals here and rely on the upstream client to reject invalid ones (e.g.,
80  // "1e+").
81  if (*CurPtr == 'e' || *CurPtr == 'E') {
82    ++CurPtr;
83    if (*CurPtr == '-' || *CurPtr == '+')
84      ++CurPtr;
85    while (isdigit(*CurPtr))
86      ++CurPtr;
87  }
88
89  return AsmToken(AsmToken::Real,
90                  StringRef(TokStart, CurPtr - TokStart));
91}
92
/// Return true if \p c may continue an identifier, i.e. it matches
/// [a-zA-Z0-9_$.@], the tail class of the identifier grammar
/// [a-zA-Z_.][a-zA-Z0-9_$.@]*.
static bool IsIdentifierChar(char c) {
  // isalnum() is only defined for EOF and unsigned char values; a plain char
  // can be negative for bytes >= 0x80, so convert before classifying.
  return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
         c == '.' || c == '@';
}
97AsmToken AsmLexer::LexIdentifier() {
98  // Check for floating point literals.
99  if (CurPtr[-1] == '.' && isdigit(*CurPtr)) {
100    // Disambiguate a .1243foo identifier from a floating literal.
101    while (isdigit(*CurPtr))
102      ++CurPtr;
103    if (*CurPtr == 'e' || *CurPtr == 'E' || !IsIdentifierChar(*CurPtr))
104      return LexFloatLiteral();
105  }
106
107  while (IsIdentifierChar(*CurPtr))
108    ++CurPtr;
109
110  // Handle . as a special case.
111  if (CurPtr == TokStart+1 && TokStart[0] == '.')
112    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
113
114  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
115}
116
117/// LexSlash: Slash: /
118///           C-Style Comment: /* ... */
119AsmToken AsmLexer::LexSlash() {
120  switch (*CurPtr) {
121  case '*': break; // C style comment.
122  case '/': return ++CurPtr, LexLineComment();
123  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr-1, 1));
124  }
125
126  // C Style comment.
127  ++CurPtr;  // skip the star.
128  while (1) {
129    int CurChar = getNextChar();
130    switch (CurChar) {
131    case EOF:
132      return ReturnError(TokStart, "unterminated comment");
133    case '*':
134      // End of the comment?
135      if (CurPtr[0] != '/') break;
136
137      ++CurPtr;   // End the */.
138      return LexToken();
139    }
140  }
141}
142
143/// LexLineComment: Comment: #[^\n]*
144///                        : //[^\n]*
145AsmToken AsmLexer::LexLineComment() {
146  // FIXME: This is broken if we happen to a comment at the end of a file, which
147  // was .included, and which doesn't end with a newline.
148  int CurChar = getNextChar();
149  while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
150    CurChar = getNextChar();
151
152  if (CurChar == EOF)
153    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
154  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
155}
156
/// Advance \p CurPtr past an "LL" suffix and then past a "ULL" suffix, when
/// present at the cursor.  The two checks are made in that order and are
/// independent of each other, so "LL" directly followed by "ULL" is skipped
/// as a whole.  Anything else leaves \p CurPtr unchanged.
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  const char *Pos = CurPtr;

  const bool HasLL = Pos[0] == 'L' && Pos[1] == 'L';
  if (HasLL)
    Pos += 2;

  const bool HasULL = Pos[0] == 'U' && Pos[1] == 'L' && Pos[2] == 'L';
  if (HasULL)
    Pos += 3;

  CurPtr = Pos;
}
163
164/// LexDigit: First character is [0-9].
165///   Local Label: [0-9][:]
166///   Forward/Backward Label: [0-9][fb]
167///   Binary integer: 0b[01]+
168///   Octal integer: 0[0-7]+
169///   Hex integer: 0x[0-9a-fA-F]+
170///   Decimal integer: [1-9][0-9]*
171AsmToken AsmLexer::LexDigit() {
172  // Decimal integer: [1-9][0-9]*
173  if (CurPtr[-1] != '0' || CurPtr[0] == '.') {
174    while (isdigit(*CurPtr))
175      ++CurPtr;
176
177    // Check for floating point literals.
178    if (*CurPtr == '.' || *CurPtr == 'e') {
179      ++CurPtr;
180      return LexFloatLiteral();
181    }
182
183    StringRef Result(TokStart, CurPtr - TokStart);
184
185    long long Value;
186    if (Result.getAsInteger(10, Value)) {
187      // Allow positive values that are too large to fit into a signed 64-bit
188      // integer, but that do fit in an unsigned one, we just convert them over.
189      unsigned long long UValue;
190      if (Result.getAsInteger(10, UValue))
191        return ReturnError(TokStart, "invalid decimal number");
192      Value = (long long)UValue;
193    }
194
195    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
196    // suffixes on integer literals.
197    SkipIgnoredIntegerSuffix(CurPtr);
198
199    return AsmToken(AsmToken::Integer, Result, Value);
200  }
201
202  if (*CurPtr == 'b') {
203    ++CurPtr;
204    // See if we actually have "0b" as part of something like "jmp 0b\n"
205    if (!isdigit(CurPtr[0])) {
206      --CurPtr;
207      StringRef Result(TokStart, CurPtr - TokStart);
208      return AsmToken(AsmToken::Integer, Result, 0);
209    }
210    const char *NumStart = CurPtr;
211    while (CurPtr[0] == '0' || CurPtr[0] == '1')
212      ++CurPtr;
213
214    // Requires at least one binary digit.
215    if (CurPtr == NumStart)
216      return ReturnError(TokStart, "invalid binary number");
217
218    StringRef Result(TokStart, CurPtr - TokStart);
219
220    long long Value;
221    if (Result.substr(2).getAsInteger(2, Value))
222      return ReturnError(TokStart, "invalid binary number");
223
224    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
225    // suffixes on integer literals.
226    SkipIgnoredIntegerSuffix(CurPtr);
227
228    return AsmToken(AsmToken::Integer, Result, Value);
229  }
230
231  if (*CurPtr == 'x') {
232    ++CurPtr;
233    const char *NumStart = CurPtr;
234    while (isxdigit(CurPtr[0]))
235      ++CurPtr;
236
237    // Requires at least one hex digit.
238    if (CurPtr == NumStart)
239      return ReturnError(CurPtr-2, "invalid hexadecimal number");
240
241    unsigned long long Result;
242    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
243      return ReturnError(TokStart, "invalid hexadecimal number");
244
245    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
246    // suffixes on integer literals.
247    SkipIgnoredIntegerSuffix(CurPtr);
248
249    return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
250                    (int64_t)Result);
251  }
252
253  // Must be an octal number, it starts with 0.
254  while (*CurPtr >= '0' && *CurPtr <= '9')
255    ++CurPtr;
256
257  StringRef Result(TokStart, CurPtr - TokStart);
258  long long Value;
259  if (Result.getAsInteger(8, Value))
260    return ReturnError(TokStart, "invalid octal number");
261
262  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
263  // suffixes on integer literals.
264  SkipIgnoredIntegerSuffix(CurPtr);
265
266  return AsmToken(AsmToken::Integer, Result, Value);
267}
268
269/// LexSingleQuote: Integer: 'b'
270AsmToken AsmLexer::LexSingleQuote() {
271  int CurChar = getNextChar();
272
273  if (CurChar == '\\')
274    CurChar = getNextChar();
275
276  if (CurChar == EOF)
277    return ReturnError(TokStart, "unterminated single quote");
278
279  CurChar = getNextChar();
280
281  if (CurChar != '\'')
282    return ReturnError(TokStart, "single quote way too long");
283
284  // The idea here being that 'c' is basically just an integral
285  // constant.
286  StringRef Res = StringRef(TokStart,CurPtr - TokStart);
287  long long Value;
288
289  if (Res.startswith("\'\\")) {
290    char theChar = Res[2];
291    switch (theChar) {
292      default: Value = theChar; break;
293      case '\'': Value = '\''; break;
294      case 't': Value = '\t'; break;
295      case 'n': Value = '\n'; break;
296      case 'b': Value = '\b'; break;
297    }
298  } else
299    Value = TokStart[1];
300
301  return AsmToken(AsmToken::Integer, Res, Value);
302}
303
304
305/// LexQuote: String: "..."
306AsmToken AsmLexer::LexQuote() {
307  int CurChar = getNextChar();
308  // TODO: does gas allow multiline string constants?
309  while (CurChar != '"') {
310    if (CurChar == '\\') {
311      // Allow \", etc.
312      CurChar = getNextChar();
313    }
314
315    if (CurChar == EOF)
316      return ReturnError(TokStart, "unterminated string constant");
317
318    CurChar = getNextChar();
319  }
320
321  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
322}
323
324StringRef AsmLexer::LexUntilEndOfStatement() {
325  TokStart = CurPtr;
326
327  while (!isAtStartOfComment(*CurPtr) &&    // Start of line comment.
328         !isAtStatementSeparator(CurPtr) && // End of statement marker.
329         *CurPtr != '\n' &&
330         *CurPtr != '\r' &&
331         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
332    ++CurPtr;
333  }
334  return StringRef(TokStart, CurPtr-TokStart);
335}
336
337StringRef AsmLexer::LexUntilEndOfLine() {
338  TokStart = CurPtr;
339
340  while (*CurPtr != '\n' &&
341         *CurPtr != '\r' &&
342         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
343    ++CurPtr;
344  }
345  return StringRef(TokStart, CurPtr-TokStart);
346}
347
348bool AsmLexer::isAtStartOfComment(char Char) {
349  // FIXME: This won't work for multi-character comment indicators like "//".
350  return Char == *MAI.getCommentString();
351}
352
353bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
354  return strncmp(Ptr, MAI.getSeparatorString(),
355                 strlen(MAI.getSeparatorString())) == 0;
356}
357
358AsmToken AsmLexer::LexToken() {
359  static bool isAtStartOfLine = true;
360  TokStart = CurPtr;
361  // This always consumes at least one character.
362  int CurChar = getNextChar();
363
364  if (isAtStartOfComment(CurChar)) {
365    // If this comment starts with a '#', then return the Hash token and let
366    // the assembler parser see if it can be parsed as a cpp line filename
367    // comment. We do this only if we are at the start of a line.
368    if (CurChar == '#' && isAtStartOfLine)
369      return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
370    isAtStartOfLine = true;
371    return LexLineComment();
372  }
373  if (isAtStatementSeparator(TokStart)) {
374    CurPtr += strlen(MAI.getSeparatorString()) - 1;
375    return AsmToken(AsmToken::EndOfStatement,
376                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
377  }
378  isAtStartOfLine = false;
379
380  switch (CurChar) {
381  default:
382    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
383    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
384      return LexIdentifier();
385
386    // Unknown character, emit an error.
387    return ReturnError(TokStart, "invalid character in input");
388  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
389  case 0:
390  case ' ':
391  case '\t':
392    // Ignore whitespace.
393    return LexToken();
394  case '\n': // FALL THROUGH.
395  case '\r':
396    isAtStartOfLine = true;
397    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
398  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
399  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
400  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
401  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
402  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
403  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
404  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
405  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
406  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
407  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
408  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
409  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
410  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
411  case '@': return AsmToken(AsmToken::At, StringRef(TokStart, 1));
412  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
413  case '=':
414    if (*CurPtr == '=')
415      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
416    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
417  case '|':
418    if (*CurPtr == '|')
419      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
420    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
421  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
422  case '&':
423    if (*CurPtr == '&')
424      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
425    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
426  case '!':
427    if (*CurPtr == '=')
428      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
429    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
430  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
431  case '/': return LexSlash();
432  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
433  case '\'': return LexSingleQuote();
434  case '"': return LexQuote();
435  case '0': case '1': case '2': case '3': case '4':
436  case '5': case '6': case '7': case '8': case '9':
437    return LexDigit();
438  case '<':
439    switch (*CurPtr) {
440    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
441                                        StringRef(TokStart, 2));
442    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
443                                        StringRef(TokStart, 2));
444    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
445                                        StringRef(TokStart, 2));
446    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
447    }
448  case '>':
449    switch (*CurPtr) {
450    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
451                                        StringRef(TokStart, 2));
452    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
453                                        StringRef(TokStart, 2));
454    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
455    }
456
457  // TODO: Quoted identifiers (objc methods etc)
458  // local labels: [0-9][:]
459  // Forward/backward labels: [0-9][fb]
460  // Integers, fp constants, character constants.
461  }
462}
463