//===--- Lexer.h - C Language Family Lexer ----------------------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the Lexer interface.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_LEX_LEXER_H
#define LLVM_CLANG_LEX_LEXER_H

#include "clang/Basic/LangOptions.h"
#include "clang/Lex/PreprocessorLexer.h"
#include "llvm/ADT/SmallVector.h"
#include <cassert>
#include <string>

namespace clang {
class DiagnosticsEngine;
class SourceManager;
class Preprocessor;
class DiagnosticBuilder;

/// ConflictMarkerKind - Kinds of conflict marker which the lexer might be
/// recovering from.
enum ConflictMarkerKind {
  /// Not within a conflict marker.
  CMK_None,
  /// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
  /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  CMK_Normal,
  /// A Perforce-style conflict marker, initiated by 4 ">"s,
  /// separated by 4 "="s, and terminated by 4 "<"s.
  CMK_Perforce
};

/// Describes the bounds (start, size) of the preamble and a flag required by
/// PreprocessorOptions::PrecompiledPreambleBytes.
/// The preamble includes the BOM, if any.
struct PreambleBounds {
  PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
      : Size(Size),
        PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}

  /// \brief Size of the preamble in bytes.
  unsigned Size;
  /// \brief Whether the preamble ends at the start of a new line.
  ///
  /// Used to inform the lexer as to whether it's starting at the beginning of
  /// a line after skipping the preamble.
  bool PreambleEndsAtStartOfLine;
};

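// Illustrative note (editorial sketch, not part of the original header):
// PreambleBounds values are normally produced by Lexer::ComputePreamble(),
// declared further below. A minimal sketch, assuming 'Code' holds the main
// file's contents and 'LangOpts' has been configured by the caller:
//
//   llvm::StringRef Code = /* contents of the main file */ "";
//   LangOptions LangOpts;
//   PreambleBounds Bounds = Lexer::ComputePreamble(Code, LangOpts);
//   // Bounds.Size is the byte offset at which the preamble ends;
//   // Bounds.PreambleEndsAtStartOfLine reports whether lexing resumes at
//   // the beginning of a line.
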
/// Lexer - This provides a simple interface that turns a text buffer into a
/// stream of tokens. This provides no support for file reading or buffering,
/// or for buffering/seeking of tokens; only forward lexing is supported. It
/// relies on the specified Preprocessor object to handle preprocessor
/// directives, etc.
class Lexer : public PreprocessorLexer {
  void anchor() override;

  //===--------------------------------------------------------------------===//
  // Constant configuration values for this lexer.
  const char *BufferStart;       // Start of the buffer.
  const char *BufferEnd;         // End of the buffer.
  SourceLocation FileLoc;        // Location for start of file.
  LangOptions LangOpts;          // LangOpts enabled by this language (cache).
  bool Is_PragmaLexer;           // True if lexer for _Pragma handling.

  //===--------------------------------------------------------------------===//
  // Context-specific lexing flags set by the preprocessor.
  //

  /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
  /// and return them as tokens. This is used for -C and -CC modes, and
  /// whitespace preservation can be useful for some clients that want to lex
  /// the file in raw mode and get every character from the file.
  ///
  /// When this is set to 2 it returns comments and whitespace. When set to 1
  /// it returns comments; when it is set to 0 it returns normal tokens only.
  unsigned char ExtendedTokenMode;

  //===--------------------------------------------------------------------===//
  // Context that changes as the file is lexed.
  // NOTE: any state that mutates when in raw mode must have save/restore code
  // in Lexer::isNextPPTokenLParen.

  // BufferPtr - Current pointer into the buffer. This is the next character
  // to be lexed.
  const char *BufferPtr;

  // IsAtStartOfLine - True if the next lexed token should get the "start of
  // line" flag set on it.
  bool IsAtStartOfLine;

  bool IsAtPhysicalStartOfLine;

  bool HasLeadingSpace;

  bool HasLeadingEmptyMacro;

  // CurrentConflictMarkerState - The kind of conflict marker we are handling.
  ConflictMarkerKind CurrentConflictMarkerState;

  Lexer(const Lexer &) = delete;
  void operator=(const Lexer &) = delete;
  friend class Preprocessor;

  void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);

public:
  /// Lexer constructor - Create a new lexer object for the specified buffer
  /// with the specified preprocessor managing the lexing process. This lexer
  /// assumes that the associated file buffer and Preprocessor objects will
  /// outlive it, so it doesn't take ownership of either of them.
  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer, Preprocessor &PP);

  /// Lexer constructor - Create a new raw lexer object. This object is only
  /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
  /// text range will outlive it, so it doesn't take ownership of it.
  Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
        const char *BufStart, const char *BufPtr, const char *BufEnd);

  /// Lexer constructor - Create a new raw lexer object. This object is only
  /// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the
  /// text range will outlive it, so it doesn't take ownership of it.
  Lexer(FileID FID, const llvm::MemoryBuffer *InputBuffer,
        const SourceManager &SM, const LangOptions &LangOpts);

  /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  /// _Pragma expansion. This has a variety of magic semantics that this
  /// method sets up. It returns a new'd Lexer that must be delete'd when done.
  static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
                                   SourceLocation ExpansionLocStart,
                                   SourceLocation ExpansionLocEnd,
                                   unsigned TokLen, Preprocessor &PP);

  /// getLangOpts - Return the language features currently enabled.
  /// NOTE: this lexer modifies features as a file is parsed!
  const LangOptions &getLangOpts() const { return LangOpts; }

  /// getFileLoc - Return the File Location for the file we are lexing out of.
  /// The physical location encodes where the characters come from; the
  /// virtual location encodes where we should *claim* the characters came
  /// from. Currently this is only used by _Pragma handling.
  SourceLocation getFileLoc() const { return FileLoc; }

private:
  /// Lex - Return the next token in the file. If this is the end of file, it
  /// returns the tok::eof token. This implicitly involves the preprocessor.
  bool Lex(Token &Result);

public:
  /// isPragmaLexer - Returns true if this Lexer is being used to lex a pragma.
  bool isPragmaLexer() const { return Is_PragmaLexer; }

private:
  /// IndirectLex - An indirect call to 'Lex' that can be invoked via
  /// the PreprocessorLexer interface.
  void IndirectLex(Token &Result) override { Lex(Result); }

public:
  /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
  /// associated preprocessor object). Return true if the 'next character to
  /// read' pointer points at the end of the lexer buffer, false otherwise.
  bool LexFromRawLexer(Token &Result) {
    assert(LexingRawMode && "Not already in raw mode!");
    Lex(Result);
    // Note that lexing to the end of the buffer doesn't implicitly delete the
    // lexer when in raw mode.
    return BufferPtr == BufferEnd;
  }

  /// isKeepWhitespaceMode - Return true if the lexer should return tokens for
  /// every character in the file, including whitespace and comments. This
  /// should only be used in raw mode, as the preprocessor is not prepared to
  /// deal with the excess tokens.
  bool isKeepWhitespaceMode() const {
    return ExtendedTokenMode > 1;
  }

  /// SetKeepWhitespaceMode - This method lets clients enable or disable
  /// whitespace retention mode.
  void SetKeepWhitespaceMode(bool Val) {
    assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
           "Can only retain whitespace in raw mode or -traditional-cpp");
    ExtendedTokenMode = Val ? 2 : 0;
  }

  /// inKeepCommentMode - Return true if the lexer should return comments as
  /// tokens.
  bool inKeepCommentMode() const {
    return ExtendedTokenMode > 0;
  }

  /// SetCommentRetentionState - Change the comment retention mode of the
  /// lexer to the specified mode. This is really only useful when lexing in
  /// raw mode, because otherwise the lexer needs to manage this.
  void SetCommentRetentionState(bool Mode) {
    assert(!isKeepWhitespaceMode() &&
           "Can't play with comment retention state when retaining whitespace");
    ExtendedTokenMode = Mode ? 1 : 0;
  }

  /// Sets the extended token mode back to its initial value, according to the
  /// language options and preprocessor. This controls whether the lexer
  /// produces comment and whitespace tokens.
  ///
  /// This requires the lexer to have an associated preprocessor. A standalone
  /// lexer has nothing to reset to.
  void resetExtendedTokenMode();

  /// Gets source code buffer.
  StringRef getBuffer() const {
    return StringRef(BufferStart, BufferEnd - BufferStart);
  }

  /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  /// uninterpreted string. This switches the lexer out of directive mode.
  void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);

  /// Diag - Forwarding function for diagnostics. This translates a source
  /// position in the current buffer into a SourceLocation object for
  /// rendering.
  DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;

  /// getSourceLocation - Return a source location identifier for the specified
  /// offset in the current file.
  SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;

  /// getSourceLocation - Return a source location for the next character in
  /// the current file.
  SourceLocation getSourceLocation() override {
    return getSourceLocation(BufferPtr);
  }
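
  // Example (editorial sketch, not part of the original interface): driving a
  // raw lexer created with one of the raw-lexer constructors above. 'SM',
  // 'FID', and 'LangOpts' are assumed to be provided by the caller, and
  // SourceManager::getBuffer(FID) is assumed to yield the file's buffer; the
  // raw constructors put the lexer into raw mode, which LexFromRawLexer
  // requires.
  //
  //   Lexer RawLex(FID, SM.getBuffer(FID), SM, LangOpts);
  //   RawLex.SetCommentRetentionState(true); // also produce comment tokens
  //   Token Tok;
  //   do {
  //     RawLex.LexFromRawLexer(Tok);
  //     // ... inspect Tok.getKind(), Tok.getLocation(), Tok.getLength() ...
  //   } while (Tok.isNot(tok::eof));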

  /// \brief Return the current location in the buffer.
  const char *getBufferLocation() const { return BufferPtr; }

  /// Stringify - Convert the specified string into a C string by escaping '\'
  /// and " characters. This does not add surrounding ""'s to the string.
  /// If Charify is true, this escapes the ' character instead of ".
  static std::string Stringify(StringRef Str, bool Charify = false);

  /// Stringify - Convert the specified string into a C string by escaping '\'
  /// and " characters. This does not add surrounding ""'s to the string.
  static void Stringify(SmallVectorImpl<char> &Str);

  /// getSpelling - This method is used to get the spelling of a token into a
  /// preallocated buffer, instead of as an std::string. The caller is required
  /// to allocate enough space for the token, which is guaranteed to be at
  /// least Tok.getLength() bytes long. The length of the actual result is
  /// returned.
  ///
  /// Note that this method may do two possible things: it may either fill in
  /// the buffer specified with characters, or it may *change the input
  /// pointer* to point to a constant buffer with the data already in it
  /// (avoiding a copy). The caller is not allowed to modify the returned
  /// buffer pointer if an internal buffer is returned.
  static unsigned getSpelling(const Token &Tok, const char *&Buffer,
                              const SourceManager &SourceMgr,
                              const LangOptions &LangOpts,
                              bool *Invalid = nullptr);

  /// getSpelling() - Return the 'spelling' of the Tok token. The spelling of
  /// a token is the characters used to represent the token in the source file
  /// after trigraph expansion and escaped-newline folding. In particular, this
  /// wants to get the true, uncanonicalized, spelling of things like digraphs,
  /// UCNs, etc.
  static std::string getSpelling(const Token &Tok,
                                 const SourceManager &SourceMgr,
                                 const LangOptions &LangOpts,
                                 bool *Invalid = nullptr);

  /// getSpelling - This method is used to get the spelling of the
  /// token at the given source location. If, as is usually true, it
  /// is not necessary to copy any data, then the returned string may
  /// not point into the provided buffer.
  ///
  /// This method lexes at the expansion depth of the given
  /// location and does not jump to the expansion or spelling
  /// location.
  static StringRef getSpelling(SourceLocation loc,
                               SmallVectorImpl<char> &buffer,
                               const SourceManager &SourceMgr,
                               const LangOptions &LangOpts,
                               bool *invalid = nullptr);

  /// MeasureTokenLength - Relex the token at the specified location and
  /// return its length in bytes in the input file. If the token needs
  /// cleaning (e.g. includes a trigraph or an escaped newline) then this
  /// count includes bytes that are part of that.
  static unsigned MeasureTokenLength(SourceLocation Loc,
                                     const SourceManager &SM,
                                     const LangOptions &LangOpts);

  /// \brief Relex the token at the specified location.
  /// \returns true if there was a failure, false on success.
  static bool getRawToken(SourceLocation Loc, Token &Result,
                          const SourceManager &SM,
                          const LangOptions &LangOpts,
                          bool IgnoreWhiteSpace = false);
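
  // Example (editorial sketch): using the buffer-reusing getSpelling()
  // overload above. On return, 'Ptr' may point either into 'Buffer' or into a
  // constant buffer elsewhere (avoiding a copy), so the result must be treated
  // as read-only. 'Tok', 'SM', and 'LangOpts' are assumed to come from the
  // surrounding code.
  //
  //   llvm::SmallVector<char, 64> Buffer;
  //   Buffer.resize(Tok.getLength());
  //   const char *Ptr = Buffer.data();
  //   unsigned Len = Lexer::getSpelling(Tok, Ptr, SM, LangOpts);
  //   StringRef Spelling(Ptr, Len);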

  /// \brief Given a location anywhere in a source buffer, find the location
  /// that corresponds to the beginning of the token in which the original
  /// source location lands.
  static SourceLocation GetBeginningOfToken(SourceLocation Loc,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts);

  /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
  /// location at the start of a token, return a new location that specifies a
  /// character within the token. This handles trigraphs and escaped newlines.
  static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
                                                unsigned Character,
                                                const SourceManager &SM,
                                                const LangOptions &LangOpts);

  /// \brief Computes the source location just past the end of the
  /// token at this source location.
  ///
  /// This routine can be used to produce a source location that
  /// points just past the end of the token referenced by \p Loc, and
  /// is generally used when a diagnostic needs to point just after a
  /// token where it expected something different from what it received. If
  /// the returned source location would not be meaningful (e.g., if
  /// it points into a macro), this routine returns an invalid
  /// source location.
  ///
  /// \param Offset an offset from the end of the token, where the source
  /// location should refer to. The default offset (0) produces a source
  /// location pointing just past the end of the token; an offset of 1
  /// produces a source location pointing to the last character in the token,
  /// etc.
  static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                            const SourceManager &SM,
                                            const LangOptions &LangOpts);

  /// \brief Given a token range, produce a corresponding CharSourceRange that
  /// is not a token range. This allows the source range to be used by
  /// components that don't have access to the lexer and thus can't find the
  /// end of the range for themselves.
  static CharSourceRange getAsCharRange(SourceRange Range,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts) {
    SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
    return End.isInvalid() ? CharSourceRange()
                           : CharSourceRange::getCharRange(
                                 Range.getBegin(), End.getLocWithOffset(-1));
  }
  static CharSourceRange getAsCharRange(CharSourceRange Range,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts) {
    return Range.isTokenRange()
               ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
               : Range;
  }

  /// \brief Returns true if the given MacroID location points at the first
  /// token of the macro expansion.
  ///
  /// \param MacroBegin If non-null and the function returns true, it is set
  /// to the begin location of the macro.
  static bool isAtStartOfMacroExpansion(SourceLocation loc,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        SourceLocation *MacroBegin = nullptr);

  /// \brief Returns true if the given MacroID location points at the last
  /// token of the macro expansion.
  ///
  /// \param MacroEnd If non-null and the function returns true, it is set to
  /// the end location of the macro.
  static bool isAtEndOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroEnd = nullptr);
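
  // Example (editorial sketch): a typical use of getLocForEndOfToken() above
  // is computing the insertion point for a fix-it just past a token. 'Diag',
  // 'Loc', 'SM', and 'LangOpts' are assumed to be available in the
  // surrounding code, and the diagnostic ID is illustrative only.
  //
  //   SourceLocation End = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
  //   if (End.isValid())
  //     Diag(End, diag::err_expected_semi /* hypothetical ID */)
  //         << FixItHint::CreateInsertion(End, ";");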
  ///
  /// This function is trying to deal with macros and return a range based on
  /// file locations. The cases where it can successfully handle macros are:
  ///
  /// -begin or end range lies at the start or end of a macro expansion, in
  ///  which case the location will be set to the expansion point, e.g.:
  ///    \#define M 1 2
  ///    a M
  /// If you have a range [a, 2] (where 2 came from the macro), the function
  /// will return a range for "a M".
  /// If you have a range [a, 1], the function will fail because the range
  /// overlaps with only a part of the macro.
  ///
  /// -The macro is a function macro and the range can be mapped to the macro
  ///  arguments, e.g.:
  ///    \#define M 1 2
  ///    \#define FM(x) x
  ///    FM(a b M)
  /// If you have a range [b, 2], the function will return the file range
  /// "b M" inside the macro arguments.
  /// If you have a range [a, 2], the function will return the file range
  /// "FM(a b M)" since the range includes all of the macro expansion.
  static CharSourceRange makeFileCharRange(CharSourceRange Range,
                                           const SourceManager &SM,
                                           const LangOptions &LangOpts);

  /// \brief Returns a string for the source that the range encompasses.
  static StringRef getSourceText(CharSourceRange Range,
                                 const SourceManager &SM,
                                 const LangOptions &LangOpts,
                                 bool *Invalid = nullptr);

  /// \brief Retrieve the name of the immediate macro expansion.
  ///
  /// This routine starts from a source location, and finds the name of the
  /// macro responsible for its immediate expansion. It looks through any
  /// intervening macro argument expansions to compute this. It returns a
  /// StringRef which refers to the SourceManager-owned buffer of the source
  /// where that macro name is spelled. Thus, the result shouldn't outlive
  /// that SourceManager.
  static StringRef getImmediateMacroName(SourceLocation Loc,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts);

  /// \brief Retrieve the name of the immediate macro expansion.
  ///
  /// This routine starts from a source location, and finds the name of the
  /// macro responsible for its immediate expansion. It looks through any
  /// intervening macro argument expansions to compute this. It returns a
  /// StringRef which refers to the SourceManager-owned buffer of the source
  /// where that macro name is spelled. Thus, the result shouldn't outlive
  /// that SourceManager.
  ///
  /// This differs from Lexer::getImmediateMacroName in that any macro
  /// argument location will result in the topmost function macro that
  /// accepted it, e.g.
  /// \code
  ///   MAC1( MAC2(foo) )
  /// \endcode
  /// For the location of the 'foo' token, this function will return "MAC1"
  /// while Lexer::getImmediateMacroName will return "MAC2".
  static StringRef getImmediateMacroNameForDiagnostics(
      SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
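
  // Example (editorial sketch): extracting the text covered by an AST node
  // with getSourceText() above. 'E' is assumed to be an Expr* and 'SM' and
  // 'LangOpts' to come from the ASTContext; the token range is converted to a
  // character range first.
  //
  //   CharSourceRange Range =
  //       CharSourceRange::getTokenRange(E->getSourceRange());
  //   StringRef Text = Lexer::getSourceText(Range, SM, LangOpts);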

  /// \brief Compute the preamble of the given file.
  ///
  /// The preamble of a file contains the initial comments, include directives,
  /// and other preprocessor directives that occur before the code in this
  /// particular file actually begins. The preamble of the main source file is
  /// a potential prefix header.
  ///
  /// \param Buffer The memory buffer containing the file's contents.
  ///
  /// \param MaxLines If non-zero, restrict the length of the preamble
  /// to fewer than this number of lines.
  ///
  /// \returns The offset into the file where the preamble ends and the rest
  /// of the file begins, along with a boolean value indicating whether
  /// the preamble ends at the beginning of a new line.
  static PreambleBounds ComputePreamble(StringRef Buffer,
                                        const LangOptions &LangOpts,
                                        unsigned MaxLines = 0);

  /// \brief Checks that the given token is the first token that occurs after
  /// the given location (this excludes comments and whitespace). Returns the
  /// location immediately after the specified token. If the token is not
  /// found or the location is inside a macro, the returned source location
  /// will be invalid.
  static SourceLocation findLocationAfterToken(SourceLocation loc,
                                               tok::TokenKind TKind,
                                               const SourceManager &SM,
                                               const LangOptions &LangOpts,
                                               bool SkipTrailingWhitespaceAndNewLine);

  /// \brief Returns true if the given character could appear in an identifier.
  static bool isIdentifierBodyChar(char c, const LangOptions &LangOpts);

  /// \brief Checks whether the newline pointed to by Str is preceded by an
  /// escape sequence.
  static bool isNewLineEscaped(const char *BufferStart, const char *Str);

  /// getCharAndSizeNoWarn - Like the getCharAndSize method, but does not ever
  /// emit a warning.
  static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
                                          const LangOptions &LangOpts) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) {
      Size = 1;
      return *Ptr;
    }

    Size = 0;
    return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
  }

  /// Returns the leading whitespace for the line that corresponds to the
  /// given location \p Loc.
  static StringRef getIndentationForLine(SourceLocation Loc,
                                         const SourceManager &SM);
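
  // Example (editorial sketch): findLocationAfterToken() above is handy for
  // placing a fix-it after the semicolon that follows a statement. 'S' is
  // assumed to be a Stmt*, and 'SM' and 'LangOpts' to be available in the
  // surrounding code.
  //
  //   SourceLocation AfterSemi = Lexer::findLocationAfterToken(
  //       S->getLocEnd(), tok::semi, SM, LangOpts,
  //       /*SkipTrailingWhitespaceAndNewLine=*/true);
  //   if (AfterSemi.isInvalid()) {
  //     // Token not found, or the location is inside a macro expansion.
  //   }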

  //===--------------------------------------------------------------------===//
  // Internal implementation interfaces.
private:
  /// LexTokenInternal - Internal interface to lex a preprocessing token.
  /// Called by Lex.
  bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);

  bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);

  /// Given that a token begins with the Unicode character \p C, figure out
  /// what kind of token it is and dispatch to the appropriate lexing helper
  /// function.
  bool LexUnicode(Token &Result, uint32_t C, const char *CurPtr);

  /// FormTokenWithChars - When we lex a token, we have identified a span
  /// starting at BufferPtr, going to TokEnd that forms the token. This method
  /// takes that range and assigns it to the token as its location and size.
  /// In addition, since tokens cannot overlap, this also updates BufferPtr to
  /// be TokEnd.
  void FormTokenWithChars(Token &Result, const char *TokEnd,
                          tok::TokenKind Kind) {
    unsigned TokLen = TokEnd - BufferPtr;
    Result.setLength(TokLen);
    Result.setLocation(getSourceLocation(BufferPtr, TokLen));
    Result.setKind(Kind);
    BufferPtr = TokEnd;
  }

  /// isNextPPTokenLParen - Return 1 if the next unexpanded token will return
  /// a tok::l_paren token, 0 if it is something else, and 2 if there are no
  /// more tokens in the buffer controlled by this lexer.
  unsigned isNextPPTokenLParen();

  //===--------------------------------------------------------------------===//
  // Lexer character reading interfaces.

  // This lexer is built on two interfaces for reading characters, both of
  // which automatically provide phase 1/2 translation. getAndAdvanceChar is
  // used when we know that we will be reading a character from the input
  // buffer and that this character will be part of the result token. This
  // occurs in (e.g.) string processing, because we know we need to read until
  // we find the closing '"' character.
  //
  // The second interface is the combination of getCharAndSize with
  // ConsumeChar. getCharAndSize reads a phase 1/2 translated character,
  // returning it and its size. If the lexer decides that this character is
  // part of the current token, it calls ConsumeChar on it. This two-stage
  // approach allows us to emit diagnostics for characters (e.g. warnings
  // about trigraphs), knowing that they are only emitted if the character is
  // consumed.

  /// isObviouslySimpleCharacter - Return true if the specified character is
  /// obviously the same in translation phase 1 and translation phase 3. This
  /// can return false for characters that end up being the same, but it will
  /// never return true for something that needs to be mapped.
  static bool isObviouslySimpleCharacter(char C) {
    return C != '?' && C != '\\';
  }

  /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
  /// advance over it, and return it. This is tricky in several cases. Here we
  /// just handle the trivial case and fall back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;

    unsigned Size = 0;
    char C = getCharAndSizeSlow(Ptr, Size, &Tok);
    Ptr += Size;
    return C;
  }

  /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  /// and added to a given token, check to see if there are diagnostics that
  /// need to be emitted or flags that need to be set on the token. If so, do
  /// it.
  const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
    // Normal case, we consumed exactly one character. Just return it.
    if (Size == 1)
      return Ptr + Size;

    // Otherwise, re-lex the character with a current token, allowing
    // diagnostics to be emitted and flags to be set.
    Size = 0;
    getCharAndSizeSlow(Ptr, Size, &Tok);
    return Ptr + Size;
  }

  /// getCharAndSize - Peek a single 'character' from the specified buffer,
  /// get its size, and return it. This is tricky in several cases. Here we
  /// just handle the trivial case and fall back to the non-inlined
  /// getCharAndSizeSlow method to handle the hard case.
  inline char getCharAndSize(const char *Ptr, unsigned &Size) {
    // If this is not a trigraph and not a UCN or escaped newline, return
    // quickly.
    if (isObviouslySimpleCharacter(Ptr[0])) {
      Size = 1;
      return *Ptr;
    }

    Size = 0;
    return getCharAndSizeSlow(Ptr, Size);
  }
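
  // Example (editorial sketch): the peek-then-consume pattern described in
  // the comment block above, as it might appear inside a lexing helper.
  // 'CurPtr', 'Result', and 'Kind' are assumed to be the usual in-progress
  // token state of the caller; the '+=' check is purely illustrative.
  //
  //   unsigned Size;
  //   char C = getCharAndSize(CurPtr, Size);
  //   if (C == '=') {
  //     // The character belongs to this token: consume it, emitting any
  //     // pending trigraph/escaped-newline diagnostics and setting flags.
  //     CurPtr = ConsumeChar(CurPtr, Size, Result);
  //     Kind = tok::plusequal;
  //   }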

  /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  /// method.
  char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                          Token *Tok = nullptr);

  /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  /// to this function.
  static unsigned getEscapedNewLineSize(const char *P);

  /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  /// them), skip over them and return the first non-escaped-newline found,
  /// otherwise return P.
  static const char *SkipEscapedNewLines(const char *P);

  /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  /// diagnostic.
  static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                       const LangOptions &LangOpts);

  //===--------------------------------------------------------------------===//
  // Other lexer functions.

  void SetByteOffset(unsigned Offset, bool StartOfLine);

  void PropagateLineStartLeadingSpaceInfo(Token &Result);

  const char *LexUDSuffix(Token &Result, const char *CurPtr,
                          bool IsStringLiteral);

  // Helper functions to lex the remainder of a token of the specific type.
  bool LexIdentifier         (Token &Result, const char *CurPtr);
  bool LexNumericConstant    (Token &Result, const char *CurPtr);
  bool LexStringLiteral      (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  bool LexCharConstant       (Token &Result, const char *CurPtr,
                              tok::TokenKind Kind);
  bool LexEndOfFile          (Token &Result, const char *CurPtr);
  bool SkipWhitespace        (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipLineComment       (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SkipBlockComment      (Token &Result, const char *CurPtr,
                              bool &TokAtPhysicalStartOfLine);
  bool SaveLineComment       (Token &Result, const char *CurPtr);

  bool IsStartOfConflictMarker(const char *CurPtr);
  bool HandleEndOfConflictMarker(const char *CurPtr);

  bool lexEditorPlaceholder(Token &Result, const char *CurPtr);

  bool isCodeCompletionPoint(const char *CurPtr) const;
  void cutOffLexing() { BufferPtr = BufferEnd; }

  bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);

  /// Read a universal character name.
  ///
  /// \param CurPtr The position in the source buffer after the initial '\'.
  ///               If the UCN is syntactically well-formed (but not
  ///               necessarily valid), this parameter will be updated to
  ///               point to the character after the UCN.
  /// \param SlashLoc The position in the source buffer of the '\'.
  /// \param Tok The token being formed. Pass \c NULL to suppress diagnostics
  ///            and handle token formation in the caller.
  ///
  /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  ///         invalid.
  uint32_t tryReadUCN(const char *&CurPtr, const char *SlashLoc, Token *Tok);

  /// \brief Try to consume a UCN as part of an identifier at the current
  /// location.
  /// \param CurPtr Initially points to the range of characters in the source
  ///               buffer containing the '\'. Updated to point past the end
  ///               of the UCN on success.
  /// \param Size The number of characters occupied by the '\' (including
  ///             trigraphs and escaped newlines).
  /// \param Result The token being produced. Marked as containing a UCN on
  ///               success.
  /// \return \c true if a UCN was lexed and it produced an acceptable
  ///         identifier character, \c false otherwise.
  bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
                               Token &Result);

  /// \brief Try to consume an identifier character encoded in UTF-8.
  /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  ///               sequence. On success, updated to point past the end of it.
  /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  ///         character was lexed, \c false otherwise.
  bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
};

} // end namespace clang

#endif