Lexer.h revision 6aad4a31b35df07fe818f193fcfd3c0197aea467
//===--- Lexer.h - C Language Family Lexer ----------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file defines the Lexer interface.
//
//===----------------------------------------------------------------------===//
131c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
141c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#ifndef LLVM_CLANG_LEXER_H
151c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#define LLVM_CLANG_LEXER_H
161c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
171c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#include "clang/Basic/LangOptions.h"
181c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#include "clang/Lex/PreprocessorLexer.h"
191c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#include "llvm/ADT/SmallVector.h"
201c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#include <cassert>
211c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák#include <string>
221c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
231c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšáknamespace clang {
// Forward declarations of types that the Lexer interface below refers to.
class DiagnosticsEngine;
class SourceManager;
class Preprocessor;
class DiagnosticBuilder;
281c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,
  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,
  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};
411c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
/// Lexer - This provides a simple interface that turns a text buffer into a
/// stream of tokens.  This provides no support for file reading or buffering,
/// or buffering/seeking of tokens; only forward lexing is supported.  It relies
/// on the specified Preprocessor object to handle preprocessor directives, etc.
461c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšákclass Lexer : public PreprocessorLexer {
471c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  virtual void anchor();
481c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
491c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  //===--------------------------------------------------------------------===//
501c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // Constant configuration values for this lexer.
511c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  const char *BufferStart;       // Start of the buffer.
521c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  const char *BufferEnd;         // End of the buffer.
531c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  SourceLocation FileLoc;        // Location for start of file.
541c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  LangOptions LangOpts;          // LangOpts enabled by this language (cache).
551c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  bool Is_PragmaLexer;           // True if lexer for _Pragma handling.
561c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
57e945fb04d04c33da5e77d22d739c5740a522a61eTom Stellard  //===--------------------------------------------------------------------===//
581c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // Context-specific lexing flags set by the preprocessor.
591c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  //
601c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
611c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
621c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// and return them as tokens.  This is used for -C and -CC modes, and
631c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// whitespace preservation can be useful for some clients that want to lex
641c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// the file in raw mode and get every character from the file.
651c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  ///
661c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// When this is set to 2 it returns comments and whitespace.  When set to 1
671c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// it returns comments, when it is set to 0 it returns normal tokens only.
681c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  unsigned char ExtendedTokenMode;
691c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
701c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  //===--------------------------------------------------------------------===//
711c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // Context that changes as the file is lexed.
721c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // NOTE: any state that mutates when in raw mode must have save/restore code
731c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // in Lexer::isNextPPTokenLParen.
741c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
751c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // BufferPtr - Current pointer into the buffer.  This is the next character
761c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // to be lexed.
771c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  const char *BufferPtr;
781c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
791c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // IsAtStartOfLine - True if the next lexed token should get the "start of
801c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // line" flag set on it.
811c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  bool IsAtStartOfLine;
821c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
831c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  // CurrentConflictMarkerState - The kind of conflict marker we are handling.
841c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  ConflictMarkerKind CurrentConflictMarkerState;
851c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
861c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
871c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
881c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  friend class Preprocessor;
891c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
901c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
911c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšákpublic:
921c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
931c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// Lexer constructor - Create a new lexer object for the specified buffer
941c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// with the specified preprocessor managing the lexing process.  This lexer
951c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// assumes that the associated file buffer and Preprocessor objects will
961c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// outlive it, so it doesn't take ownership of either of them.
971c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);
981c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
991c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// Lexer constructor - Create a new raw lexer object.  This object is only
1001c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
1011c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// text range will outlive it, so it doesn't take ownership of it.
1021c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
1031c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák        const char *BufStart, const char *BufPtr, const char *BufEnd);
1041c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1051c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// Lexer constructor - Create a new raw lexer object.  This object is only
1061c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
1071c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// text range will outlive it, so it doesn't take ownership of it.
1081c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
1091c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák        const SourceManager &SM, const LangOptions &LangOpts);
1101c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1111c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
1121c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// _Pragma expansion.  This has a variety of magic semantics that this method
1131c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// sets up.  It returns a new'd Lexer that must be delete'd when done.
1141c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
1151c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák                                   SourceLocation ExpansionLocStart,
1161c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák                                   SourceLocation ExpansionLocEnd,
1171c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák                                   unsigned TokLen, Preprocessor &PP);
1181c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1191c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1201c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// getLangOpts - Return the language features currently enabled.
1211c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// NOTE: this lexer modifies features as a file is parsed!
1221c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  const LangOptions &getLangOpts() const { return LangOpts; }
1231c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1241c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// getFileLoc - Return the File Location for the file we are lexing out of.
1251c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// The physical location encodes the location where the characters come from,
1261c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// the virtual location encodes where we should *claim* the characters came
1271c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// from.  Currently this is only used by _Pragma handling.
1281c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  SourceLocation getFileLoc() const { return FileLoc; }
1291c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1301c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// Lex - Return the next token in the file.  If this is the end of file, it
1311c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// return the tok::eof token.  This implicitly involves the preprocessor.
1321c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  void Lex(Token &Result) {
1331c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    // Start a new token.
1341c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    Result.startToken();
1351c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1361c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    // NOTE, any changes here should also change code after calls to
1371c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    // Preprocessor::HandleDirective
1381c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    if (IsAtStartOfLine) {
1391c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák      Result.setFlag(Token::StartOfLine);
1401c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák      IsAtStartOfLine = false;
1411c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    }
142b2df031a959f36743527b9abc89913ce4f895de3Tom Stellard
1431c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    // Get a token.  Note that this may delete the current lexer if the end of
1441c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    // file is reached.
1451c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    LexTokenInternal(Result);
1461c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  }
1471c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1481c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
1491c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  bool isPragmaLexer() const { return Is_PragmaLexer; }
1501c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1511c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// IndirectLex - An indirect call to 'Lex' that can be invoked via
1521c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  ///  the PreprocessorLexer interface.
1531c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  void IndirectLex(Token &Result) { Lex(Result); }
1541c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1551c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
1561c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// associated preprocessor object.  Return true if the 'next character to
1571c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// read' pointer points at the end of the lexer buffer, false otherwise.
1581c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  bool LexFromRawLexer(Token &Result) {
1591c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    assert(LexingRawMode && "Not already in raw mode!");
1601c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    Lex(Result);
161e945fb04d04c33da5e77d22d739c5740a522a61eTom Stellard    // Note that lexing to the end of the buffer doesn't implicitly delete the
162befcce264c8bf8fdac233e6a01cadc595a1d11d3Tom Stellard    // lexer when in raw mode.
1631c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    return BufferPtr == BufferEnd;
1641c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  }
1651c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák
1661c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
1671c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// every character in the file, including whitespace and comments.  This
1681c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// should only be used in raw mode, as the preprocessor is not prepared to
1691c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  /// deal with the excess tokens.
1701c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  bool isKeepWhitespaceMode() const {
1711c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák    return ExtendedTokenMode > 1;
1721c2c4ddbd1e97bfd13430521e5c09cb5ce8e36e6Marek Olšák  }
173
174  /// SetKeepWhitespaceMode - This method lets clients enable or disable
175  /// whitespace retention mode.
176  void SetKeepWhitespaceMode(bool Val) {
177    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
178           "Can only retain whitespace in raw mode or -traditional-cpp");
179    ExtendedTokenMode = Val ? 2 : 0;
180  }
181
182  /// inKeepCommentMode - Return true if the lexer should return comments as
183  /// tokens.
184  bool inKeepCommentMode() const {
185    return ExtendedTokenMode > 0;
186  }
187
188  /// SetCommentRetentionMode - Change the comment retention mode of the lexer
189  /// to the specified mode.  This is really only useful when lexing in raw
190  /// mode, because otherwise the lexer needs to manage this.
191  void SetCommentRetentionState(bool Mode) {
192    assert(!isKeepWhitespaceMode() &&
193           "Can't play with comment retention state when retaining whitespace");
194    ExtendedTokenMode = Mode ? 1 : 0;
195  }
196
197  /// Sets the extended token mode back to its initial value, according to the
198  /// language options and preprocessor. This controls whether the lexer
199  /// produces comment and whitespace tokens.
200  ///
201  /// This requires the lexer to have an associated preprocessor. A standalone
202  /// lexer has nothing to reset to.
203  void resetExtendedTokenMode();
204
205  const char *getBufferStart() const { return BufferStart; }
206
207  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
208  /// uninterpreted string.  This switches the lexer out of directive mode.
209  void ReadToEndOfLine(SmallVectorImpl<char> *Result = 0);
210
211
212  /// Diag - Forwarding function for diagnostics.  This translate a source
213  /// position in the current buffer into a SourceLocation object for rendering.
214  DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
215
216  /// getSourceLocation - Return a source location identifier for the specified
217  /// offset in the current file.
218  SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
219
220  /// getSourceLocation - Return a source location for the next character in
221  /// the current file.
222  SourceLocation getSourceLocation() { return getSourceLocation(BufferPtr); }
223
224  /// \brief Return the current location in the buffer.
225  const char *getBufferLocation() const { return BufferPtr; }
226
227  /// Stringify - Convert the specified string into a C string by escaping '\'
228  /// and " characters.  This does not add surrounding ""'s to the string.
229  /// If Charify is true, this escapes the ' character instead of ".
230  static std::string Stringify(const std::string &Str, bool Charify = false);
231
232  /// Stringify - Convert the specified string into a C string by escaping '\'
233  /// and " characters.  This does not add surrounding ""'s to the string.
234  static void Stringify(SmallVectorImpl<char> &Str);
235
236
237  /// getSpelling - This method is used to get the spelling of a token into a
238  /// preallocated buffer, instead of as an std::string.  The caller is required
239  /// to allocate enough space for the token, which is guaranteed to be at least
240  /// Tok.getLength() bytes long.  The length of the actual result is returned.
241  ///
242  /// Note that this method may do two possible things: it may either fill in
243  /// the buffer specified with characters, or it may *change the input pointer*
244  /// to point to a constant buffer with the data already in it (avoiding a
245  /// copy).  The caller is not allowed to modify the returned buffer pointer
246  /// if an internal buffer is returned.
247  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
248                              const SourceManager &SourceMgr,
249                              const LangOptions &LangOpts,
250                              bool *Invalid = 0);
251
252  /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
253  /// token is the characters used to represent the token in the source file
254  /// after trigraph expansion and escaped-newline folding.  In particular, this
255  /// wants to get the true, uncanonicalized, spelling of things like digraphs
256  /// UCNs, etc.
257  static std::string getSpelling(const Token &Tok,
258                                 const SourceManager &SourceMgr,
259                                 const LangOptions &LangOpts,
260                                 bool *Invalid = 0);
261
262  /// getSpelling - This method is used to get the spelling of the
263  /// token at the given source location.  If, as is usually true, it
264  /// is not necessary to copy any data, then the returned string may
265  /// not point into the provided buffer.
266  ///
267  /// This method lexes at the expansion depth of the given
268  /// location and does not jump to the expansion or spelling
269  /// location.
270  static StringRef getSpelling(SourceLocation loc,
271                                     SmallVectorImpl<char> &buffer,
272                                     const SourceManager &SourceMgr,
273                                     const LangOptions &LangOpts,
274                                     bool *invalid = 0);
275
276  /// MeasureTokenLength - Relex the token at the specified location and return
277  /// its length in bytes in the input file.  If the token needs cleaning (e.g.
278  /// includes a trigraph or an escaped newline) then this count includes bytes
279  /// that are part of that.
280  static unsigned MeasureTokenLength(SourceLocation Loc,
281                                     const SourceManager &SM,
282                                     const LangOptions &LangOpts);
283
284  /// \brief Relex the token at the specified location.
285  /// \returns true if there was a failure, false on success.
286  static bool getRawToken(SourceLocation Loc, Token &Result,
287                          const SourceManager &SM,
288                          const LangOptions &LangOpts);
289
290  /// \brief Given a location any where in a source buffer, find the location
291  /// that corresponds to the beginning of the token in which the original
292  /// source location lands.
293  static SourceLocation GetBeginningOfToken(SourceLocation Loc,
294                                            const SourceManager &SM,
295                                            const LangOptions &LangOpts);
296
297  /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
298  /// location at the start of a token, return a new location that specifies a
299  /// character within the token.  This handles trigraphs and escaped newlines.
300  static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
301                                                unsigned Character,
302                                                const SourceManager &SM,
303                                                const LangOptions &LangOpts);
304
305  /// \brief Computes the source location just past the end of the
306  /// token at this source location.
307  ///
308  /// This routine can be used to produce a source location that
309  /// points just past the end of the token referenced by \p Loc, and
310  /// is generally used when a diagnostic needs to point just after a
311  /// token where it expected something different that it received. If
312  /// the returned source location would not be meaningful (e.g., if
313  /// it points into a macro), this routine returns an invalid
314  /// source location.
315  ///
316  /// \param Offset an offset from the end of the token, where the source
317  /// location should refer to. The default offset (0) produces a source
318  /// location pointing just past the end of the token; an offset of 1 produces
319  /// a source location pointing to the last character in the token, etc.
320  static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
321                                            const SourceManager &SM,
322                                            const LangOptions &LangOpts);
323
324  /// \brief Returns true if the given MacroID location points at the first
325  /// token of the macro expansion.
326  ///
327  /// \param MacroBegin If non-null and function returns true, it is set to
328  /// begin location of the macro.
329  static bool isAtStartOfMacroExpansion(SourceLocation loc,
330                                        const SourceManager &SM,
331                                        const LangOptions &LangOpts,
332                                        SourceLocation *MacroBegin = 0);
333
334  /// \brief Returns true if the given MacroID location points at the last
335  /// token of the macro expansion.
336  ///
337  /// \param MacroEnd If non-null and function returns true, it is set to
338  /// end location of the macro.
339  static bool isAtEndOfMacroExpansion(SourceLocation loc,
340                                      const SourceManager &SM,
341                                      const LangOptions &LangOpts,
342                                      SourceLocation *MacroEnd = 0);
343
344  /// \brief Accepts a range and returns a character range with file locations.
345  ///
346  /// Returns a null range if a part of the range resides inside a macro
347  /// expansion or the range does not reside on the same FileID.
348  ///
349  /// This function is trying to deal with macros and return a range based on
350  /// file locations. The cases where it can successfully handle macros are:
351  ///
352  /// -begin or end range lies at the start or end of a macro expansion, in
353  ///  which case the location will be set to the expansion point, e.g:
354  ///    \#define M 1 2
355  ///    a M
356  /// If you have a range [a, 2] (where 2 came from the macro), the function
357  /// will return a range for "a M"
358  /// if you have range [a, 1], the function will fail because the range
359  /// overlaps with only a part of the macro
360  ///
361  /// -The macro is a function macro and the range can be mapped to the macro
362  ///  arguments, e.g:
363  ///    \#define M 1 2
364  ///    \#define FM(x) x
365  ///    FM(a b M)
366  /// if you have range [b, 2], the function will return the file range "b M"
367  /// inside the macro arguments.
368  /// if you have range [a, 2], the function will return the file range
369  /// "FM(a b M)" since the range includes all of the macro expansion.
370  static CharSourceRange makeFileCharRange(CharSourceRange Range,
371                                           const SourceManager &SM,
372                                           const LangOptions &LangOpts);
373
374  /// \brief Returns a string for the source that the range encompasses.
375  static StringRef getSourceText(CharSourceRange Range,
376                                 const SourceManager &SM,
377                                 const LangOptions &LangOpts,
378                                 bool *Invalid = 0);
379
380  /// \brief Retrieve the name of the immediate macro expansion.
381  ///
382  /// This routine starts from a source location, and finds the name of the macro
383  /// responsible for its immediate expansion. It looks through any intervening
384  /// macro argument expansions to compute this. It returns a StringRef which
385  /// refers to the SourceManager-owned buffer of the source where that macro
386  /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
387  static StringRef getImmediateMacroName(SourceLocation Loc,
388                                         const SourceManager &SM,
389                                         const LangOptions &LangOpts);
390
391  /// \brief Compute the preamble of the given file.
392  ///
393  /// The preamble of a file contains the initial comments, include directives,
394  /// and other preprocessor directives that occur before the code in this
395  /// particular file actually begins. The preamble of the main source file is
396  /// a potential prefix header.
397  ///
398  /// \param Buffer The memory buffer containing the file's contents.
399  ///
400  /// \param MaxLines If non-zero, restrict the length of the preamble
401  /// to fewer than this number of lines.
402  ///
403  /// \returns The offset into the file where the preamble ends and the rest
404  /// of the file begins along with a boolean value indicating whether
405  /// the preamble ends at the beginning of a new line.
406  static std::pair<unsigned, bool>
407  ComputePreamble(const llvm::MemoryBuffer *Buffer, const LangOptions &LangOpts,
408                  unsigned MaxLines = 0);
409
410  /// \brief Checks that the given token is the first token that occurs after
411  /// the given location (this excludes comments and whitespace). Returns the
412  /// location immediately after the specified token. If the token is not found
413  /// or the location is inside a macro, the returned source location will be
414  /// invalid.
415  static SourceLocation findLocationAfterToken(SourceLocation loc,
416                                         tok::TokenKind TKind,
417                                         const SourceManager &SM,
418                                         const LangOptions &LangOpts,
419                                         bool SkipTrailingWhitespaceAndNewLine);
420
421  /// \brief Returns true if the given character could appear in an identifier.
422  static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);
423
424  /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
425  /// emit a warning.
426  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
427                                          const LangOptions &LangOpts) {
428    // If this is not a trigraph and not a UCN or escaped newline, return
429    // quickly.
430    if (isObviouslySimpleCharacter(Ptr[0])) {
431      Size = 1;
432      return *Ptr;
433    }
434
435    Size = 0;
436    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
437  }
438
439  //===--------------------------------------------------------------------===//
440  // Internal implementation interfaces.
441private:
442
443  /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
444  /// by Lex.
445  ///
446  void LexTokenInternal(Token &Result);
447
448  /// Given that a token begins with the Unicode character \p C, figure out
449  /// what kind of token it is and dispatch to the appropriate lexing helper
450  /// function.
451  void LexUnicode(Token &Result, uint32_t C, const char *CurPtr);
452
453  /// FormTokenWithChars - When we lex a token, we have identified a span
454  /// starting at BufferPtr, going to TokEnd that forms the token.  This method
455  /// takes that range and assigns it to the token as its location and size.  In
456  /// addition, since tokens cannot overlap, this also updates BufferPtr to be
457  /// TokEnd.
458  void FormTokenWithChars(Token &Result, const char *TokEnd,
459                          tok::TokenKind Kind) {
460    unsigned TokLen = TokEnd-BufferPtr;
461    Result.setLength(TokLen);
462    Result.setLocation(getSourceLocation(BufferPtr, TokLen));
463    Result.setKind(Kind);
464    BufferPtr = TokEnd;
465  }
466
467  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return a
468  /// tok::l_paren token, 0 if it is something else and 2 if there are no more
469  /// tokens in the buffer controlled by this lexer.
     ///
     /// The result is a three-way code rather than a boolean, so callers need to
     /// compare it against 0, 1, and 2 explicitly: a plain truthiness test would
     /// treat "no more tokens" (2) the same as "next token is an l_paren" (1).
470  unsigned isNextPPTokenLParen();
471
472  //===--------------------------------------------------------------------===//
473  // Lexer character reading interfaces.
474
475  // This lexer is built on two interfaces for reading characters, both of which
476  // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
477  // when we know that we will be reading a character from the input buffer and
478  // that this character will be part of the result token. This occurs in (e.g.)
479  // string processing, because we know we need to read until we find the
480  // closing '"' character.
481  //
482  // The second interface is the combination of getCharAndSize with
483  // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
484  // returning it and its size.  If the lexer decides that this character is
485  // part of the current token, it calls ConsumeChar on it.  This two stage
486  // approach allows us to emit diagnostics for characters (e.g. warnings about
487  // trigraphs), knowing that they are only emitted if the character is
488  // consumed.
489
/// isObviouslySimpleCharacter - Conservative test for characters that are
/// identical in translation phase 1 and translation phase 3.  A false result
/// does not prove that the character needs mapping, but a true result
/// guarantees that it does not.
static bool isObviouslySimpleCharacter(char C) {
  // Only '?' and '\\' are rejected; per the callers' comments these are the
  // bytes that can introduce a trigraph, a UCN, or an escaped newline.
  switch (C) {
  case '?':
  case '\\':
    return false;
  default:
    return true;
  }
}
497
498  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
499  /// advance over it, and return it.  This is tricky in several cases.  Here we
500  /// just handle the trivial case and fall-back to the non-inlined
501  /// getCharAndSizeSlow method to handle the hard case.
502  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
503    // If this is not a trigraph and not a UCN or escaped newline, return
504    // quickly.
505    if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
506
507    unsigned Size = 0;
508    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
509    Ptr += Size;
510    return C;
511  }
512
513  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
514  /// and added to a given token, check to see if there are diagnostics that
515  /// need to be emitted or flags that need to be set on the token.  If so, do
516  /// it.
517  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
518    // Normal case, we consumed exactly one token.  Just return it.
519    if (Size == 1)
520      return Ptr+Size;
521
522    // Otherwise, re-lex the character with a current token, allowing
523    // diagnostics to be emitted and flags to be set.
524    Size = 0;
525    getCharAndSizeSlow(Ptr, Size, &Tok);
526    return Ptr+Size;
527  }
528
529  /// getCharAndSize - Peek a single 'character' from the specified buffer,
530  /// get its size, and return it.  This is tricky in several cases.  Here we
531  /// just handle the trivial case and fall-back to the non-inlined
532  /// getCharAndSizeSlow method to handle the hard case.
533  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
534    // If this is not a trigraph and not a UCN or escaped newline, return
535    // quickly.
536    if (isObviouslySimpleCharacter(Ptr[0])) {
537      Size = 1;
538      return *Ptr;
539    }
540
541    Size = 0;
542    return getCharAndSizeSlow(Ptr, Size);
543  }
544
545  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
546  /// method.
     ///
     /// \param Ptr  Buffer position of the character to read.
     /// \param Size In/out byte count.  The inline callers in this class set it
     ///             to 0 before the call and afterwards use it as the size of
     ///             the character.
     /// \param Tok  Optional token; defaults to null.  getAndAdvanceChar and
     ///             ConsumeChar pass the current token so that diagnostics can
     ///             be emitted and token flags set, while getCharAndSize omits it.
547  char getCharAndSizeSlow(const char *Ptr, unsigned &Size, Token *Tok = 0);
548
549  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
550  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
551  /// to this function.
     ///
     /// \param P Position immediately after the backslash (per the precondition
     ///          above, P[-1] is the "\").
     /// \return The escaped newline's size, or 0 when \p P does not start one.
     ///         NOTE(review): whether the size counts the backslash itself is
     ///         not visible here -- confirm against the definition.
552  static unsigned getEscapedNewLineSize(const char *P);
553
554  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
555  /// them), skip over them and return the first non-escaped-newline found,
556  /// otherwise return P.
     ///
     /// \param P Buffer position to examine.
     /// \return Pointer to the first character that is not part of an escaped
     ///         newline; this is \p P itself when nothing was skipped.
557  static const char *SkipEscapedNewLines(const char *P);
558
559  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
560  /// diagnostic.
     ///
     /// Declared static, so no Lexer instance is needed to call it.  The inline
     /// getCharAndSizeNoWarn forwards to it with \p Size preset to 0 and returns
     /// its result unchanged.
561  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
562                                       const LangOptions &LangOpts);
563
564  //===--------------------------------------------------------------------===//
565  // Other lexer functions.
566
     // NOTE(review): undocumented in this header.  Judging by the signature it
     // presumably advances the lexer by \p Bytes bytes and records whether the
     // new position is at the start of a line -- confirm against the definition.
567  void SkipBytes(unsigned Bytes, bool StartOfLine);
568
     // NOTE(review): undocumented in this header.  Presumably lexes a
     // user-defined-literal suffix beginning at \p CurPtr and returns the
     // position after it -- confirm against the definition.
569  const char *LexUDSuffix(Token &Result, const char *CurPtr);
570
571  // Helper functions to lex the remainder of a token of the specific type.
     // Each one takes the token being formed (Result) and a position in the
     // source buffer (CurPtr); the string, raw-string, and character-constant
     // helpers additionally take the tok::TokenKind to use.
     // NOTE(review): the meaning of LexEndOfFile's bool result is not documented
     // here -- confirm against the definition.
572  void LexIdentifier         (Token &Result, const char *CurPtr);
573  void LexNumericConstant    (Token &Result, const char *CurPtr);
574  void LexStringLiteral      (Token &Result, const char *CurPtr,
575                              tok::TokenKind Kind);
576  void LexRawStringLiteral   (Token &Result, const char *CurPtr,
577                              tok::TokenKind Kind);
578  void LexAngledStringLiteral(Token &Result, const char *CurPtr);
579  void LexCharConstant       (Token &Result, const char *CurPtr,
580                              tok::TokenKind Kind);
581  bool LexEndOfFile          (Token &Result, const char *CurPtr);
582
     // Whitespace and comment helpers.  NOTE(review): the meaning of the bool
     // results is not documented in this header -- confirm against the
     // definitions before relying on it.
583  bool SkipWhitespace        (Token &Result, const char *CurPtr);
584  bool SkipLineComment       (Token &Result, const char *CurPtr);
585  bool SkipBlockComment      (Token &Result, const char *CurPtr);
586  bool SaveLineComment       (Token &Result, const char *CurPtr);
587
     // NOTE(review): judging by the names, these detect and handle conflict
     // markers at \p CurPtr -- confirm against the definitions.
588  bool IsStartOfConflictMarker(const char *CurPtr);
589  bool HandleEndOfConflictMarker(const char *CurPtr);
590
     // NOTE(review): presumably reports whether \p CurPtr is the position at
     // which code completion was requested; const, so it does not modify the
     // lexer -- confirm the exact condition against the definition.
591  bool isCodeCompletionPoint(const char *CurPtr) const;
592  void cutOffLexing() { BufferPtr = BufferEnd; }
593
594  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
     ///< NOTE(review): undocumented in this header.  Presumably reports whether
     ///< the text beginning at \p Start is a hexadecimal literal under
     ///< \p LangOpts -- confirm against the definition.
595
596
597  /// Read a universal character name.
598  ///
599  /// \param CurPtr The position in the source buffer after the initial '\'.
600  ///               If the UCN is syntactically well-formed (but not necessarily
601  ///               valid), this parameter will be updated to point to the
602  ///               character after the UCN.
603  /// \param SlashLoc The position in the source buffer of the '\'.
604  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
605  ///            and handle token formation in the caller.
606  ///
607  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
608  ///         invalid.
     ///
     /// Note that \p CurPtr is taken by reference and may be advanced even when
     /// 0 is returned: it is updated for any syntactically well-formed UCN,
     /// whereas the return value is 0 for any invalid one.  Callers should
     /// therefore check the return value rather than whether \p CurPtr moved.
609  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);
610};
611
612
613}  // end namespace clang
614
615#endif
616