// CommentLexer.h revision 3f38bf2d441fac379c427f86153fbb0cb41256c6
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines lexer for structured comments and supporting token class. 11// 12//===----------------------------------------------------------------------===// 13 14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15#define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17#include "clang/Basic/SourceManager.h" 18#include "llvm/ADT/StringRef.h" 19#include "llvm/ADT/SmallString.h" 20#include "llvm/ADT/SmallVector.h" 21#include "llvm/Support/Allocator.h" 22#include "llvm/Support/raw_ostream.h" 23 24namespace clang { 25namespace comments { 26 27class Lexer; 28class TextTokenRetokenizer; 29 30namespace tok { 31enum TokenKind { 32 eof, 33 newline, 34 text, 35 command, 36 verbatim_block_begin, 37 verbatim_block_line, 38 verbatim_block_end, 39 verbatim_line_name, 40 verbatim_line_text, 41 html_start_tag, // <tag 42 html_ident, // attr 43 html_equals, // = 44 html_quoted_string, // "blah\"blah" or 'blah\'blah' 45 html_greater, // > 46 html_slash_greater, // /> 47 html_end_tag // </tag 48}; 49} // end namespace tok 50 51class CommentOptions { 52public: 53 bool Markdown; 54}; 55 56/// \brief Comment token. 57class Token { 58 friend class Lexer; 59 friend class TextTokenRetokenizer; 60 61 /// The location of the token. 62 SourceLocation Loc; 63 64 /// The actual kind of the token. 65 tok::TokenKind Kind; 66 67 /// Length of the token spelling in comment. Can be 0 for synthenized 68 /// tokens. 69 unsigned Length; 70 71 /// Contains text value associated with a token. 
72 const char *TextPtr1; 73 unsigned TextLen1; 74 75public: 76 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 77 void setLocation(SourceLocation SL) { Loc = SL; } 78 79 SourceLocation getEndLocation() const LLVM_READONLY { 80 if (Length == 0 || Length == 1) 81 return Loc; 82 return Loc.getLocWithOffset(Length - 1); 83 } 84 85 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 86 void setKind(tok::TokenKind K) { Kind = K; } 87 88 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 89 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 90 91 unsigned getLength() const LLVM_READONLY { return Length; } 92 void setLength(unsigned L) { Length = L; } 93 94 StringRef getText() const LLVM_READONLY { 95 assert(is(tok::text)); 96 return StringRef(TextPtr1, TextLen1); 97 } 98 99 void setText(StringRef Text) { 100 assert(is(tok::text)); 101 TextPtr1 = Text.data(); 102 TextLen1 = Text.size(); 103 } 104 105 StringRef getCommandName() const LLVM_READONLY { 106 assert(is(tok::command)); 107 return StringRef(TextPtr1, TextLen1); 108 } 109 110 void setCommandName(StringRef Name) { 111 assert(is(tok::command)); 112 TextPtr1 = Name.data(); 113 TextLen1 = Name.size(); 114 } 115 116 StringRef getVerbatimBlockName() const LLVM_READONLY { 117 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 118 return StringRef(TextPtr1, TextLen1); 119 } 120 121 void setVerbatimBlockName(StringRef Name) { 122 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 123 TextPtr1 = Name.data(); 124 TextLen1 = Name.size(); 125 } 126 127 StringRef getVerbatimBlockText() const LLVM_READONLY { 128 assert(is(tok::verbatim_block_line)); 129 return StringRef(TextPtr1, TextLen1); 130 } 131 132 void setVerbatimBlockText(StringRef Text) { 133 assert(is(tok::verbatim_block_line)); 134 TextPtr1 = Text.data(); 135 TextLen1 = Text.size(); 136 } 137 138 /// Returns the name of verbatim line command. 
139 StringRef getVerbatimLineName() const LLVM_READONLY { 140 assert(is(tok::verbatim_line_name)); 141 return StringRef(TextPtr1, TextLen1); 142 } 143 144 void setVerbatimLineName(StringRef Name) { 145 assert(is(tok::verbatim_line_name)); 146 TextPtr1 = Name.data(); 147 TextLen1 = Name.size(); 148 } 149 150 StringRef getVerbatimLineText() const LLVM_READONLY { 151 assert(is(tok::verbatim_line_text)); 152 return StringRef(TextPtr1, TextLen1); 153 } 154 155 void setVerbatimLineText(StringRef Text) { 156 assert(is(tok::verbatim_line_text)); 157 TextPtr1 = Text.data(); 158 TextLen1 = Text.size(); 159 } 160 161 StringRef getHTMLTagStartName() const LLVM_READONLY { 162 assert(is(tok::html_start_tag)); 163 return StringRef(TextPtr1, TextLen1); 164 } 165 166 void setHTMLTagStartName(StringRef Name) { 167 assert(is(tok::html_start_tag)); 168 TextPtr1 = Name.data(); 169 TextLen1 = Name.size(); 170 } 171 172 StringRef getHTMLIdent() const LLVM_READONLY { 173 assert(is(tok::html_ident)); 174 return StringRef(TextPtr1, TextLen1); 175 } 176 177 void setHTMLIdent(StringRef Name) { 178 assert(is(tok::html_ident)); 179 TextPtr1 = Name.data(); 180 TextLen1 = Name.size(); 181 } 182 183 StringRef getHTMLQuotedString() const LLVM_READONLY { 184 assert(is(tok::html_quoted_string)); 185 return StringRef(TextPtr1, TextLen1); 186 } 187 188 void setHTMLQuotedString(StringRef Str) { 189 assert(is(tok::html_quoted_string)); 190 TextPtr1 = Str.data(); 191 TextLen1 = Str.size(); 192 } 193 194 StringRef getHTMLTagEndName() const LLVM_READONLY { 195 assert(is(tok::html_end_tag)); 196 return StringRef(TextPtr1, TextLen1); 197 } 198 199 void setHTMLTagEndName(StringRef Name) { 200 assert(is(tok::html_end_tag)); 201 TextPtr1 = Name.data(); 202 TextLen1 = Name.size(); 203 } 204 205 void dump(const Lexer &L, const SourceManager &SM) const; 206}; 207 208/// \brief Comment lexer. 
209class Lexer { 210private: 211 Lexer(const Lexer&); // DO NOT IMPLEMENT 212 void operator=(const Lexer&); // DO NOT IMPLEMENT 213 214 const char *const BufferStart; 215 const char *const BufferEnd; 216 SourceLocation FileLoc; 217 CommentOptions CommOpts; 218 219 const char *BufferPtr; 220 221 /// One past end pointer for the current comment. For BCPL comments points 222 /// to newline or BufferEnd, for C comments points to star in '*/'. 223 const char *CommentEnd; 224 225 enum LexerCommentState { 226 LCS_BeforeComment, 227 LCS_InsideBCPLComment, 228 LCS_InsideCComment, 229 LCS_BetweenComments 230 }; 231 232 /// Low-level lexer state, track if we are inside or outside of comment. 233 LexerCommentState CommentState; 234 235 enum LexerState { 236 /// Lexing normal comment text 237 LS_Normal, 238 239 /// Finished lexing verbatim block beginning command, will lex first body 240 /// line. 241 LS_VerbatimBlockFirstLine, 242 243 /// Lexing verbatim block body line-by-line, skipping line-starting 244 /// decorations. 245 LS_VerbatimBlockBody, 246 247 /// Finished lexing verbatim line beginning command, will lex text (one 248 /// line). 249 LS_VerbatimLineText, 250 251 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 252 LS_HTMLStartTag, 253 254 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 255 LS_HTMLEndTag 256 }; 257 258 /// Current lexing mode. 259 LexerState State; 260 261 /// A verbatim-like block command eats every character (except line starting 262 /// decorations) until matching end command is seen or comment end is hit. 263 struct VerbatimBlockCommand { 264 StringRef BeginName; 265 StringRef EndName; 266 }; 267 268 typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector; 269 270 /// Registered verbatim-like block commands. 271 VerbatimBlockCommandVector VerbatimBlockCommands; 272 273 /// If State is LS_VerbatimBlock, contains the the name of verbatim end 274 /// command, including command marker. 
275 SmallString<16> VerbatimBlockEndCommandName; 276 277 bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const; 278 279 /// A verbatim-like line command eats everything until a newline is seen or 280 /// comment end is hit. 281 struct VerbatimLineCommand { 282 StringRef Name; 283 }; 284 285 typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector; 286 287 /// Registered verbatim-like line commands. 288 VerbatimLineCommandVector VerbatimLineCommands; 289 290 bool isVerbatimLineCommand(StringRef Name) const; 291 292 void formTokenWithChars(Token &Result, const char *TokEnd, 293 tok::TokenKind Kind) { 294 const unsigned TokLen = TokEnd - BufferPtr; 295 Result.setLocation(getSourceLocation(BufferPtr)); 296 Result.setKind(Kind); 297 Result.setLength(TokLen); 298#ifndef NDEBUG 299 Result.TextPtr1 = "<UNSET>"; 300 Result.TextLen1 = 7; 301#endif 302 BufferPtr = TokEnd; 303 } 304 305 SourceLocation getSourceLocation(const char *Loc) const { 306 assert(Loc >= BufferStart && Loc <= BufferEnd && 307 "Location out of range for this buffer!"); 308 309 const unsigned CharNo = Loc - BufferStart; 310 return FileLoc.getLocWithOffset(CharNo); 311 } 312 313 /// Eat string matching regexp \code \s*\* \endcode. 314 void skipLineStartingDecorations(); 315 316 /// Lex stuff inside comments. CommentEnd should be set correctly. 
317 void lexCommentText(Token &T); 318 319 void setupAndLexVerbatimBlock(Token &T, 320 const char *TextBegin, 321 char Marker, StringRef EndName); 322 323 void lexVerbatimBlockFirstLine(Token &T); 324 325 void lexVerbatimBlockBody(Token &T); 326 327 void setupAndLexVerbatimLine(Token &T, const char *TextBegin); 328 329 void lexVerbatimLineText(Token &T); 330 331 void setupAndLexHTMLStartTag(Token &T); 332 333 void lexHTMLStartTag(Token &T); 334 335 void setupAndLexHTMLEndTag(Token &T); 336 337 void lexHTMLEndTag(Token &T); 338 339public: 340 Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, 341 const char *BufferStart, const char *BufferEnd); 342 343 void lex(Token &T); 344 345 StringRef getSpelling(const Token &Tok, 346 const SourceManager &SourceMgr, 347 bool *Invalid = NULL) const; 348 349 /// \brief Register a new verbatim block command. 350 void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName); 351 352 /// \brief Register a new verbatim line command. 353 void addVerbatimLineCommand(StringRef Name); 354}; 355 356/// Re-lexes a sequence of tok::text tokens. 357class TextTokenRetokenizer { 358 llvm::BumpPtrAllocator &Allocator; 359 static const unsigned MaxTokens = 16; 360 SmallVector<Token, MaxTokens> Toks; 361 362 struct Position { 363 unsigned CurToken; 364 const char *BufferStart; 365 const char *BufferEnd; 366 const char *BufferPtr; 367 SourceLocation BufferStartLoc; 368 }; 369 370 /// Current position in Toks. 371 Position Pos; 372 373 bool isEnd() const { 374 return Pos.CurToken >= Toks.size(); 375 } 376 377 /// Sets up the buffer pointers to point to current token. 
378 void setupBuffer() { 379 assert(Pos.CurToken < Toks.size()); 380 const Token &Tok = Toks[Pos.CurToken]; 381 382 Pos.BufferStart = Tok.getText().begin(); 383 Pos.BufferEnd = Tok.getText().end(); 384 Pos.BufferPtr = Pos.BufferStart; 385 Pos.BufferStartLoc = Tok.getLocation(); 386 } 387 388 SourceLocation getSourceLocation() const { 389 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 390 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 391 } 392 393 char peek() const { 394 assert(!isEnd()); 395 assert(Pos.BufferPtr != Pos.BufferEnd); 396 return *Pos.BufferPtr; 397 } 398 399 void consumeChar() { 400 assert(!isEnd()); 401 assert(Pos.BufferPtr != Pos.BufferEnd); 402 Pos.BufferPtr++; 403 if (Pos.BufferPtr == Pos.BufferEnd) { 404 Pos.CurToken++; 405 if (Pos.CurToken < Toks.size()) 406 setupBuffer(); 407 } 408 } 409 410 static bool isWhitespace(char C) { 411 return C == ' ' || C == '\n' || C == '\r' || 412 C == '\t' || C == '\f' || C == '\v'; 413 } 414 415 void consumeWhitespace() { 416 while (!isEnd()) { 417 if (isWhitespace(peek())) 418 consumeChar(); 419 else 420 break; 421 } 422 } 423 424 void formTokenWithChars(Token &Result, 425 SourceLocation Loc, 426 const char *TokBegin, 427 unsigned TokLength, 428 StringRef Text) { 429 Result.setLocation(Loc); 430 Result.setKind(tok::text); 431 Result.setLength(TokLength); 432#ifndef NDEBUG 433 Result.TextPtr1 = "<UNSET>"; 434 Result.TextLen1 = 7; 435#endif 436 Result.setText(Text); 437 } 438 439public: 440 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator): 441 Allocator(Allocator) { 442 Pos.CurToken = 0; 443 } 444 445 /// Add a token. 446 /// Returns true on success, false if it seems like we have enough tokens. 447 bool addToken(const Token &Tok) { 448 assert(Tok.is(tok::text)); 449 if (Toks.size() >= MaxTokens) 450 return false; 451 452 Toks.push_back(Tok); 453 if (Toks.size() == 1) 454 setupBuffer(); 455 return true; 456 } 457 458 /// Extract a word -- sequence of non-whitespace characters. 
459 bool lexWord(Token &Tok) { 460 if (isEnd()) 461 return false; 462 463 Position SavedPos = Pos; 464 465 consumeWhitespace(); 466 SmallString<32> WordText; 467 const char *WordBegin = Pos.BufferPtr; 468 SourceLocation Loc = getSourceLocation(); 469 while (!isEnd()) { 470 const char C = peek(); 471 if (!isWhitespace(C)) { 472 WordText.push_back(C); 473 consumeChar(); 474 } else 475 break; 476 } 477 const unsigned Length = WordText.size(); 478 if (Length == 0) { 479 Pos = SavedPos; 480 return false; 481 } 482 483 char *TextPtr = Allocator.Allocate<char>(Length + 1); 484 485 memcpy(TextPtr, WordText.c_str(), Length + 1); 486 StringRef Text = StringRef(TextPtr, Length); 487 488 formTokenWithChars(Tok, Loc, WordBegin, 489 Pos.BufferPtr - WordBegin, Text); 490 return true; 491 } 492 493 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 494 if (isEnd()) 495 return false; 496 497 Position SavedPos = Pos; 498 499 consumeWhitespace(); 500 SmallString<32> WordText; 501 const char *WordBegin = Pos.BufferPtr; 502 SourceLocation Loc = getSourceLocation(); 503 bool Error = false; 504 if (!isEnd()) { 505 const char C = peek(); 506 if (C == OpenDelim) { 507 WordText.push_back(C); 508 consumeChar(); 509 } else 510 Error = true; 511 } 512 char C = '\0'; 513 while (!Error && !isEnd()) { 514 C = peek(); 515 WordText.push_back(C); 516 consumeChar(); 517 if (C == CloseDelim) 518 break; 519 } 520 if (!Error && C != CloseDelim) 521 Error = true; 522 523 if (Error) { 524 Pos = SavedPos; 525 return false; 526 } 527 528 const unsigned Length = WordText.size(); 529 char *TextPtr = Allocator.Allocate<char>(Length + 1); 530 531 memcpy(TextPtr, WordText.c_str(), Length + 1); 532 StringRef Text = StringRef(TextPtr, Length); 533 534 formTokenWithChars(Tok, Loc, WordBegin, 535 Pos.BufferPtr - WordBegin, Text); 536 return true; 537 } 538 539 /// Return a text token. Useful to take tokens back. 
540 bool lexText(Token &Tok) { 541 if (isEnd()) 542 return false; 543 544 if (Pos.BufferPtr != Pos.BufferStart) 545 formTokenWithChars(Tok, getSourceLocation(), 546 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 547 StringRef(Pos.BufferPtr, 548 Pos.BufferEnd - Pos.BufferPtr)); 549 else 550 Tok = Toks[Pos.CurToken]; 551 552 Pos.CurToken++; 553 if (Pos.CurToken < Toks.size()) 554 setupBuffer(); 555 return true; 556 } 557}; 558 559} // end namespace comments 560} // end namespace clang 561 562#endif 563 564