AsmLexer.cpp revision d8ba292c9bc4e0927ea21304d735e27a43d296a6
1//===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This class implements the lexer for assembly files.
11//
12//===----------------------------------------------------------------------===//
13
14#include "llvm/MC/MCParser/AsmLexer.h"
15#include "llvm/Support/SMLoc.h"
16#include "llvm/Support/MemoryBuffer.h"
17#include "llvm/MC/MCAsmInfo.h"
18#include <cerrno>
19#include <cstdio>
20#include <cstdlib>
21using namespace llvm;
22
23AsmLexer::AsmLexer(const MCAsmInfo &_MAI) : MAI(_MAI)  {
24  CurBuf = NULL;
25  CurPtr = NULL;
26  TokStart = 0;
27}
28
29AsmLexer::~AsmLexer() {
30}
31
32void AsmLexer::setBuffer(const MemoryBuffer *buf, const char *ptr) {
33  CurBuf = buf;
34
35  if (ptr)
36    CurPtr = ptr;
37  else
38    CurPtr = CurBuf->getBufferStart();
39
40  TokStart = 0;
41}
42
43SMLoc AsmLexer::getLoc() const {
44  return SMLoc::getFromPointer(TokStart);
45}
46
47/// ReturnError - Set the error to the specified string at the specified
48/// location.  This is defined to always return AsmToken::Error.
49AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
50  SetError(SMLoc::getFromPointer(Loc), Msg);
51
52  return AsmToken(AsmToken::Error, StringRef(Loc, 0));
53}
54
55int AsmLexer::getNextChar() {
56  char CurChar = *CurPtr++;
57  switch (CurChar) {
58  default:
59    return (unsigned char)CurChar;
60  case 0:
61    // A nul character in the stream is either the end of the current buffer or
62    // a random nul in the file.  Disambiguate that here.
63    if (CurPtr-1 != CurBuf->getBufferEnd())
64      return 0;  // Just whitespace.
65
66    // Otherwise, return end of file.
67    --CurPtr;  // Another call to lex will return EOF again.
68    return EOF;
69  }
70}
71
72/// LexIdentifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
73AsmToken AsmLexer::LexIdentifier() {
74  while (isalnum(*CurPtr) || *CurPtr == '_' || *CurPtr == '$' ||
75         *CurPtr == '.' || *CurPtr == '@')
76    ++CurPtr;
77
78  // Handle . as a special case.
79  if (CurPtr == TokStart+1 && TokStart[0] == '.')
80    return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
81
82  return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
83}
84
85/// LexSlash: Slash: /
86///           C-Style Comment: /* ... */
87AsmToken AsmLexer::LexSlash() {
88  switch (*CurPtr) {
89  case '*': break; // C style comment.
90  case '/': return ++CurPtr, LexLineComment();
91  default:  return AsmToken(AsmToken::Slash, StringRef(CurPtr, 1));
92  }
93
94  // C Style comment.
95  ++CurPtr;  // skip the star.
96  while (1) {
97    int CurChar = getNextChar();
98    switch (CurChar) {
99    case EOF:
100      return ReturnError(TokStart, "unterminated comment");
101    case '*':
102      // End of the comment?
103      if (CurPtr[0] != '/') break;
104
105      ++CurPtr;   // End the */.
106      return LexToken();
107    }
108  }
109}
110
111/// LexLineComment: Comment: #[^\n]*
112///                        : //[^\n]*
113AsmToken AsmLexer::LexLineComment() {
114  // FIXME: This is broken if we happen to a comment at the end of a file, which
115  // was .included, and which doesn't end with a newline.
116  int CurChar = getNextChar();
117  while (CurChar != '\n' && CurChar != '\n' && CurChar != EOF)
118    CurChar = getNextChar();
119
120  if (CurChar == EOF)
121    return AsmToken(AsmToken::Eof, StringRef(CurPtr, 0));
122  return AsmToken(AsmToken::EndOfStatement, StringRef(CurPtr, 0));
123}
124
125
126/// LexDigit: First character is [0-9].
127///   Local Label: [0-9][:]
128///   Forward/Backward Label: [0-9][fb]
129///   Binary integer: 0b[01]+
130///   Octal integer: 0[0-7]+
131///   Hex integer: 0x[0-9a-fA-F]+
132///   Decimal integer: [1-9][0-9]*
133/// TODO: FP literal.
134AsmToken AsmLexer::LexDigit() {
135  // Decimal integer: [1-9][0-9]*
136  if (CurPtr[-1] != '0') {
137    while (isdigit(*CurPtr))
138      ++CurPtr;
139
140    StringRef Result(TokStart, CurPtr - TokStart);
141
142    long long Value;
143    if (Result.getAsInteger(10, Value)) {
144      // We have to handle minint_as_a_positive_value specially, because
145      // - minint_as_a_positive_value = minint and it is valid.
146      if (Result == "9223372036854775808")
147        Value = -9223372036854775808ULL;
148      else
149        return ReturnError(TokStart, "Invalid decimal number");
150    }
151    return AsmToken(AsmToken::Integer, Result, Value);
152  }
153
154  if (*CurPtr == 'b') {
155    ++CurPtr;
156    // See if we actually have "0b" as part of something like "jmp 0b\n"
157    if (CurPtr[0] == '\n') {
158      --CurPtr;
159      StringRef Result(TokStart, CurPtr - TokStart);
160      return AsmToken(AsmToken::Integer, Result, 0);
161    }
162    const char *NumStart = CurPtr;
163    while (CurPtr[0] == '0' || CurPtr[0] == '1')
164      ++CurPtr;
165
166    // Requires at least one binary digit.
167    if (CurPtr == NumStart)
168      return ReturnError(TokStart, "Invalid binary number");
169
170    StringRef Result(TokStart, CurPtr - TokStart);
171
172    long long Value;
173    if (Result.getAsInteger(2, Value))
174      return ReturnError(TokStart, "Invalid binary number");
175
176    return AsmToken(AsmToken::Integer, Result, Value);
177  }
178
179  if (*CurPtr == 'x') {
180    ++CurPtr;
181    const char *NumStart = CurPtr;
182    while (isxdigit(CurPtr[0]))
183      ++CurPtr;
184
185    // Requires at least one hex digit.
186    if (CurPtr == NumStart)
187      return ReturnError(CurPtr-2, "Invalid hexadecimal number");
188
189    unsigned long long Result;
190    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
191      return ReturnError(TokStart, "Invalid hexadecimal number");
192
193    return AsmToken(AsmToken::Integer, StringRef(TokStart, CurPtr - TokStart),
194                    (int64_t)Result);
195  }
196
197  // Must be an octal number, it starts with 0.
198  while (*CurPtr >= '0' && *CurPtr <= '7')
199    ++CurPtr;
200
201  StringRef Result(TokStart, CurPtr - TokStart);
202  long long Value;
203  if (Result.getAsInteger(8, Value))
204    return ReturnError(TokStart, "Invalid octal number");
205
206  return AsmToken(AsmToken::Integer, Result, Value);
207}
208
209/// LexQuote: String: "..."
210AsmToken AsmLexer::LexQuote() {
211  int CurChar = getNextChar();
212  // TODO: does gas allow multiline string constants?
213  while (CurChar != '"') {
214    if (CurChar == '\\') {
215      // Allow \", etc.
216      CurChar = getNextChar();
217    }
218
219    if (CurChar == EOF)
220      return ReturnError(TokStart, "unterminated string constant");
221
222    CurChar = getNextChar();
223  }
224
225  return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
226}
227
228StringRef AsmLexer::LexUntilEndOfStatement() {
229  TokStart = CurPtr;
230
231  while (!isAtStartOfComment(*CurPtr) && // Start of line comment.
232	  *CurPtr != ';' &&  // End of statement marker.
233         *CurPtr != '\n' &&
234         *CurPtr != '\r' &&
235         (*CurPtr != 0 || CurPtr != CurBuf->getBufferEnd())) {
236    ++CurPtr;
237  }
238  return StringRef(TokStart, CurPtr-TokStart);
239}
240
241bool AsmLexer::isAtStartOfComment(char Char) {
242  // FIXME: This won't work for multi-character comment indicators like "//".
243  return Char == *MAI.getCommentString();
244}
245
246AsmToken AsmLexer::LexToken() {
247  TokStart = CurPtr;
248  // This always consumes at least one character.
249  int CurChar = getNextChar();
250
251  if (isAtStartOfComment(CurChar))
252    return LexLineComment();
253
254  switch (CurChar) {
255  default:
256    // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
257    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
258      return LexIdentifier();
259
260    // Unknown character, emit an error.
261    return ReturnError(TokStart, "invalid character in input");
262  case EOF: return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
263  case 0:
264  case ' ':
265  case '\t':
266    // Ignore whitespace.
267    return LexToken();
268  case '\n': // FALL THROUGH.
269  case '\r': // FALL THROUGH.
270  case ';': return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
271  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
272  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
273  case '-': return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
274  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
275  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
276  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
277  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
278  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
279  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
280  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
281  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
282  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
283  case '$': return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
284  case '=':
285    if (*CurPtr == '=')
286      return ++CurPtr, AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
287    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
288  case '|':
289    if (*CurPtr == '|')
290      return ++CurPtr, AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
291    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
292  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
293  case '&':
294    if (*CurPtr == '&')
295      return ++CurPtr, AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
296    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
297  case '!':
298    if (*CurPtr == '=')
299      return ++CurPtr, AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
300    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
301  case '%': return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
302  case '/': return LexSlash();
303  case '#': return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
304  case '"': return LexQuote();
305  case '0': case '1': case '2': case '3': case '4':
306  case '5': case '6': case '7': case '8': case '9':
307    return LexDigit();
308  case '<':
309    switch (*CurPtr) {
310    case '<': return ++CurPtr, AsmToken(AsmToken::LessLess,
311                                        StringRef(TokStart, 2));
312    case '=': return ++CurPtr, AsmToken(AsmToken::LessEqual,
313                                        StringRef(TokStart, 2));
314    case '>': return ++CurPtr, AsmToken(AsmToken::LessGreater,
315                                        StringRef(TokStart, 2));
316    default: return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
317    }
318  case '>':
319    switch (*CurPtr) {
320    case '>': return ++CurPtr, AsmToken(AsmToken::GreaterGreater,
321                                        StringRef(TokStart, 2));
322    case '=': return ++CurPtr, AsmToken(AsmToken::GreaterEqual,
323                                        StringRef(TokStart, 2));
324    default: return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
325    }
326
327  // TODO: Quoted identifiers (objc methods etc)
328  // local labels: [0-9][:]
329  // Forward/backward labels: [0-9][fb]
330  // Integers, fp constants, character constants.
331  }
332}
333