CommentParser.cpp revision c4b0f9b851ca59e61b802d58792ea3600fd9a9d4
1//===--- CommentParser.cpp - Doxygen comment parser -----------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9 10#include "clang/AST/CommentParser.h" 11#include "clang/AST/CommentSema.h" 12#include "clang/AST/CommentDiagnostic.h" 13#include "clang/Basic/SourceManager.h" 14#include "llvm/Support/ErrorHandling.h" 15 16namespace clang { 17namespace comments { 18 19/// Re-lexes a sequence of tok::text tokens. 20class TextTokenRetokenizer { 21 llvm::BumpPtrAllocator &Allocator; 22 static const unsigned MaxTokens = 16; 23 SmallVector<Token, MaxTokens> Toks; 24 25 struct Position { 26 unsigned CurToken; 27 const char *BufferStart; 28 const char *BufferEnd; 29 const char *BufferPtr; 30 SourceLocation BufferStartLoc; 31 }; 32 33 /// Current position in Toks. 34 Position Pos; 35 36 bool isEnd() const { 37 return Pos.CurToken >= Toks.size(); 38 } 39 40 /// Sets up the buffer pointers to point to current token. 41 void setupBuffer() { 42 assert(Pos.CurToken < Toks.size()); 43 const Token &Tok = Toks[Pos.CurToken]; 44 45 Pos.BufferStart = Tok.getText().begin(); 46 Pos.BufferEnd = Tok.getText().end(); 47 Pos.BufferPtr = Pos.BufferStart; 48 Pos.BufferStartLoc = Tok.getLocation(); 49 } 50 51 SourceLocation getSourceLocation() const { 52 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 53 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 54 } 55 56 char peek() const { 57 assert(!isEnd()); 58 assert(Pos.BufferPtr != Pos.BufferEnd); 59 return *Pos.BufferPtr; 60 } 61 62 void consumeChar() { 63 assert(!isEnd()); 64 assert(Pos.BufferPtr != Pos.BufferEnd); 65 Pos.BufferPtr++; 66 if (Pos.BufferPtr == Pos.BufferEnd) { 67 Pos.CurToken++; 68 if (Pos.CurToken < Toks.size()) 69 setupBuffer(); 70 } 71 } 72 73 static bool isWhitespace(char C) { 74 return C == ' ' || C == '\n' || C == '\r' || 75 C == '\t' || C == '\f' || C == '\v'; 76 } 77 78 void consumeWhitespace() { 79 while (!isEnd()) { 80 if (isWhitespace(peek())) 81 consumeChar(); 82 else 83 break; 84 } 85 } 86 87 void formTokenWithChars(Token &Result, 88 SourceLocation Loc, 89 const char *TokBegin, 90 unsigned TokLength, 91 StringRef Text) { 92 Result.setLocation(Loc); 93 Result.setKind(tok::text); 94 Result.setLength(TokLength); 95#ifndef NDEBUG 96 Result.TextPtr1 = "<UNSET>"; 97 Result.TextLen1 = 7; 98#endif 99 Result.setText(Text); 100 } 101 102public: 103 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator): 104 Allocator(Allocator) { 105 Pos.CurToken = 0; 106 } 107 108 /// Add a token. 109 /// Returns true on success, false if it seems like we have enough tokens. 110 bool addToken(const Token &Tok) { 111 assert(Tok.is(tok::text)); 112 if (Toks.size() >= MaxTokens) 113 return false; 114 115 Toks.push_back(Tok); 116 if (Toks.size() == 1) 117 setupBuffer(); 118 return true; 119 } 120 121 /// Extract a word -- sequence of non-whitespace characters. 122 bool lexWord(Token &Tok) { 123 if (isEnd()) 124 return false; 125 126 Position SavedPos = Pos; 127 128 consumeWhitespace(); 129 SmallString<32> WordText; 130 const char *WordBegin = Pos.BufferPtr; 131 SourceLocation Loc = getSourceLocation(); 132 while (!isEnd()) { 133 const char C = peek(); 134 if (!isWhitespace(C)) { 135 WordText.push_back(C); 136 consumeChar(); 137 } else 138 break; 139 } 140 const unsigned Length = WordText.size(); 141 if (Length == 0) { 142 Pos = SavedPos; 143 return false; 144 } 145 146 char *TextPtr = Allocator.Allocate<char>(Length + 1); 147 148 memcpy(TextPtr, WordText.c_str(), Length + 1); 149 StringRef Text = StringRef(TextPtr, Length); 150 151 formTokenWithChars(Tok, Loc, WordBegin, 152 Pos.BufferPtr - WordBegin, Text); 153 return true; 154 } 155 156 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 157 if (isEnd()) 158 return false; 159 160 Position SavedPos = Pos; 161 162 consumeWhitespace(); 163 SmallString<32> WordText; 164 const char *WordBegin = Pos.BufferPtr; 165 SourceLocation Loc = getSourceLocation(); 166 bool Error = false; 167 if (!isEnd()) { 168 const char C = peek(); 169 if (C == OpenDelim) { 170 WordText.push_back(C); 171 consumeChar(); 172 } else 173 Error = true; 174 } 175 char C = '\0'; 176 while (!Error && !isEnd()) { 177 C = peek(); 178 WordText.push_back(C); 179 consumeChar(); 180 if (C == CloseDelim) 181 break; 182 } 183 if (!Error && C != CloseDelim) 184 Error = true; 185 186 if (Error) { 187 Pos = SavedPos; 188 return false; 189 } 190 191 const unsigned Length = WordText.size(); 192 char *TextPtr = Allocator.Allocate<char>(Length + 1); 193 194 memcpy(TextPtr, WordText.c_str(), Length + 1); 195 StringRef Text = StringRef(TextPtr, Length); 196 197 formTokenWithChars(Tok, Loc, WordBegin, 198 Pos.BufferPtr - WordBegin, Text); 199 return true; 200 } 201 202 /// Return a text token. Useful to take tokens back. 203 bool lexText(Token &Tok) { 204 if (isEnd()) 205 return false; 206 207 if (Pos.BufferPtr != Pos.BufferStart) 208 formTokenWithChars(Tok, getSourceLocation(), 209 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 210 StringRef(Pos.BufferPtr, 211 Pos.BufferEnd - Pos.BufferPtr)); 212 else 213 Tok = Toks[Pos.CurToken]; 214 215 Pos.CurToken++; 216 if (Pos.CurToken < Toks.size()) 217 setupBuffer(); 218 return true; 219 } 220}; 221 222Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator, 223 const SourceManager &SourceMgr, DiagnosticsEngine &Diags): 224 L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) { 225 consumeToken(); 226} 227 228ParamCommandComment *Parser::parseParamCommandArgs( 229 ParamCommandComment *PC, 230 TextTokenRetokenizer &Retokenizer) { 231 Token Arg; 232 // Check if argument looks like direction specification: [dir] 233 // e.g., [in], [out], [in,out] 234 if (Retokenizer.lexDelimitedSeq(Arg, '[', ']')) 235 PC = S.actOnParamCommandDirectionArg(PC, 236 Arg.getLocation(), 237 Arg.getEndLocation(), 238 Arg.getText()); 239 240 if (Retokenizer.lexWord(Arg)) 241 PC = S.actOnParamCommandParamNameArg(PC, 242 Arg.getLocation(), 243 Arg.getEndLocation(), 244 Arg.getText()); 245 246 return PC; 247} 248 249BlockCommandComment *Parser::parseBlockCommandArgs( 250 BlockCommandComment *BC, 251 TextTokenRetokenizer &Retokenizer, 252 unsigned NumArgs) { 253 typedef BlockCommandComment::Argument Argument; 254 Argument *Args = 255 new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs]; 256 unsigned ParsedArgs = 0; 257 Token Arg; 258 while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) { 259 Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(), 260 Arg.getEndLocation()), 261 Arg.getText()); 262 ParsedArgs++; 263 } 264 265 return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs)); 266} 267 268BlockCommandComment *Parser::parseBlockCommand() { 269 assert(Tok.is(tok::command)); 270 271 ParamCommandComment *PC; 272 BlockCommandComment *BC; 273 bool IsParam = false; 274 unsigned NumArgs = 0; 275 if (S.isParamCommand(Tok.getCommandName())) { 276 IsParam = true; 277 PC = S.actOnParamCommandStart(Tok.getLocation(), 278 Tok.getEndLocation(), 279 Tok.getCommandName()); 280 } else { 281 NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName()); 282 BC = S.actOnBlockCommandStart(Tok.getLocation(), 283 Tok.getEndLocation(), 284 Tok.getCommandName()); 285 } 286 consumeToken(); 287 288 if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) { 289 // Block command ahead. We can't nest block commands, so pretend that this 290 // command has an empty argument. 291 ParagraphComment *PC = S.actOnParagraphComment( 292 ArrayRef<InlineContentComment *>()); 293 return S.actOnBlockCommandFinish(BC, PC); 294 } 295 296 if (IsParam || NumArgs > 0) { 297 // In order to parse command arguments we need to retokenize a few 298 // following text tokens. 299 TextTokenRetokenizer Retokenizer(Allocator); 300 while (Tok.is(tok::text)) { 301 if (Retokenizer.addToken(Tok)) 302 consumeToken(); 303 } 304 305 if (IsParam) 306 PC = parseParamCommandArgs(PC, Retokenizer); 307 else 308 BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs); 309 310 // Put back tokens we didn't use. 311 SmallVector<Token, 16> TextToks; 312 Token Text; 313 while (Retokenizer.lexText(Text)) { 314 TextToks.push_back(Text); 315 } 316 putBack(TextToks); 317 } 318 319 BlockContentComment *Block = parseParagraphOrBlockCommand(); 320 // Since we have checked for a block command, we should have parsed a 321 // paragraph. 322 if (IsParam) 323 return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block)); 324 else 325 return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block)); 326} 327 328InlineCommandComment *Parser::parseInlineCommand() { 329 assert(Tok.is(tok::command)); 330 331 const Token CommandTok = Tok; 332 consumeToken(); 333 334 TextTokenRetokenizer Retokenizer(Allocator); 335 while (Tok.is(tok::text)) { 336 if (Retokenizer.addToken(Tok)) 337 consumeToken(); 338 } 339 340 Token ArgTok; 341 bool ArgTokValid = Retokenizer.lexWord(ArgTok); 342 343 InlineCommandComment *IC; 344 if (ArgTokValid) { 345 IC = S.actOnInlineCommand(CommandTok.getLocation(), 346 CommandTok.getEndLocation(), 347 CommandTok.getCommandName(), 348 ArgTok.getLocation(), 349 ArgTok.getEndLocation(), 350 ArgTok.getText()); 351 } else { 352 IC = S.actOnInlineCommand(CommandTok.getLocation(), 353 CommandTok.getEndLocation(), 354 CommandTok.getCommandName()); 355 } 356 357 Token Text; 358 while (Retokenizer.lexText(Text)) 359 putBack(Text); 360 361 return IC; 362} 363 364HTMLStartTagComment *Parser::parseHTMLStartTag() { 365 assert(Tok.is(tok::html_start_tag)); 366 HTMLStartTagComment *HST = 367 S.actOnHTMLStartTagStart(Tok.getLocation(), 368 Tok.getHTMLTagStartName()); 369 consumeToken(); 370 371 SmallVector<HTMLStartTagComment::Attribute, 2> Attrs; 372 while (true) { 373 switch (Tok.getKind()) { 374 case tok::html_ident: { 375 Token Ident = Tok; 376 consumeToken(); 377 if (Tok.isNot(tok::html_equals)) { 378 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 379 Ident.getHTMLIdent())); 380 continue; 381 } 382 Token Equals = Tok; 383 consumeToken(); 384 if (Tok.isNot(tok::html_quoted_string)) { 385 Diag(Tok.getLocation(), 386 diag::warn_doc_html_start_tag_expected_quoted_string) 387 << SourceRange(Equals.getLocation()); 388 Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(), 389 Ident.getHTMLIdent())); 390 while (Tok.is(tok::html_equals) || 391 Tok.is(tok::html_quoted_string)) 392 consumeToken(); 393 continue; 394 } 395 Attrs.push_back(HTMLStartTagComment::Attribute( 396 Ident.getLocation(), 397 Ident.getHTMLIdent(), 398 Equals.getLocation(), 399 SourceRange(Tok.getLocation(), 400 Tok.getEndLocation()), 401 Tok.getHTMLQuotedString())); 402 consumeToken(); 403 continue; 404 } 405 406 case tok::html_greater: 407 HST = S.actOnHTMLStartTagFinish(HST, 408 copyArray(llvm::makeArrayRef(Attrs)), 409 Tok.getLocation(), 410 /* IsSelfClosing = */ false); 411 consumeToken(); 412 return HST; 413 414 case tok::html_slash_greater: 415 HST = S.actOnHTMLStartTagFinish(HST, 416 copyArray(llvm::makeArrayRef(Attrs)), 417 Tok.getLocation(), 418 /* IsSelfClosing = */ true); 419 consumeToken(); 420 return HST; 421 422 case tok::html_equals: 423 case tok::html_quoted_string: 424 Diag(Tok.getLocation(), 425 diag::warn_doc_html_start_tag_expected_ident_or_greater); 426 while (Tok.is(tok::html_equals) || 427 Tok.is(tok::html_quoted_string)) 428 consumeToken(); 429 if (Tok.is(tok::html_ident) || 430 Tok.is(tok::html_greater) || 431 Tok.is(tok::html_slash_greater)) 432 continue; 433 434 return S.actOnHTMLStartTagFinish(HST, 435 copyArray(llvm::makeArrayRef(Attrs)), 436 SourceLocation(), 437 /* IsSelfClosing = */ false); 438 439 default: 440 // Not a token from an HTML start tag. Thus HTML tag prematurely ended. 441 HST = S.actOnHTMLStartTagFinish(HST, 442 copyArray(llvm::makeArrayRef(Attrs)), 443 SourceLocation(), 444 /* IsSelfClosing = */ false); 445 bool StartLineInvalid; 446 const unsigned StartLine = SourceMgr.getPresumedLineNumber( 447 HST->getLocation(), 448 &StartLineInvalid); 449 bool EndLineInvalid; 450 const unsigned EndLine = SourceMgr.getPresumedLineNumber( 451 Tok.getLocation(), 452 &EndLineInvalid); 453 if (StartLineInvalid || EndLineInvalid || StartLine == EndLine) 454 Diag(Tok.getLocation(), 455 diag::warn_doc_html_start_tag_expected_ident_or_greater) 456 << HST->getSourceRange(); 457 else { 458 Diag(Tok.getLocation(), 459 diag::warn_doc_html_start_tag_expected_ident_or_greater); 460 Diag(HST->getLocation(), diag::note_doc_html_tag_started_here) 461 << HST->getSourceRange(); 462 } 463 return HST; 464 } 465 } 466} 467 468HTMLEndTagComment *Parser::parseHTMLEndTag() { 469 assert(Tok.is(tok::html_end_tag)); 470 Token TokEndTag = Tok; 471 consumeToken(); 472 SourceLocation Loc; 473 if (Tok.is(tok::html_greater)) { 474 Loc = Tok.getLocation(); 475 consumeToken(); 476 } 477 478 return S.actOnHTMLEndTag(TokEndTag.getLocation(), 479 Loc, 480 TokEndTag.getHTMLTagEndName()); 481} 482 483BlockContentComment *Parser::parseParagraphOrBlockCommand() { 484 SmallVector<InlineContentComment *, 8> Content; 485 486 while (true) { 487 switch (Tok.getKind()) { 488 case tok::verbatim_block_begin: 489 case tok::verbatim_line_name: 490 case tok::eof: 491 assert(Content.size() != 0); 492 break; // Block content or EOF ahead, finish this parapgaph. 493 494 case tok::command: 495 if (S.isBlockCommand(Tok.getCommandName())) { 496 if (Content.size() == 0) 497 return parseBlockCommand(); 498 break; // Block command ahead, finish this parapgaph. 499 } 500 if (S.isInlineCommand(Tok.getCommandName())) { 501 Content.push_back(parseInlineCommand()); 502 continue; 503 } 504 505 // Not a block command, not an inline command ==> an unknown command. 506 Content.push_back(S.actOnUnknownCommand(Tok.getLocation(), 507 Tok.getEndLocation(), 508 Tok.getCommandName())); 509 consumeToken(); 510 continue; 511 512 case tok::newline: { 513 consumeToken(); 514 if (Tok.is(tok::newline) || Tok.is(tok::eof)) { 515 consumeToken(); 516 break; // Two newlines -- end of paragraph. 517 } 518 if (Content.size() > 0) 519 Content.back()->addTrailingNewline(); 520 continue; 521 } 522 523 // Don't deal with HTML tag soup now. 524 case tok::html_start_tag: 525 Content.push_back(parseHTMLStartTag()); 526 continue; 527 528 case tok::html_end_tag: 529 Content.push_back(parseHTMLEndTag()); 530 continue; 531 532 case tok::text: 533 Content.push_back(S.actOnText(Tok.getLocation(), 534 Tok.getEndLocation(), 535 Tok.getText())); 536 consumeToken(); 537 continue; 538 539 case tok::verbatim_block_line: 540 case tok::verbatim_block_end: 541 case tok::verbatim_line_text: 542 case tok::html_ident: 543 case tok::html_equals: 544 case tok::html_quoted_string: 545 case tok::html_greater: 546 case tok::html_slash_greater: 547 llvm_unreachable("should not see this token"); 548 } 549 break; 550 } 551 552 return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content))); 553} 554 555VerbatimBlockComment *Parser::parseVerbatimBlock() { 556 assert(Tok.is(tok::verbatim_block_begin)); 557 558 VerbatimBlockComment *VB = 559 S.actOnVerbatimBlockStart(Tok.getLocation(), 560 Tok.getVerbatimBlockName()); 561 consumeToken(); 562 563 // Don't create an empty line if verbatim opening command is followed 564 // by a newline. 565 if (Tok.is(tok::newline)) 566 consumeToken(); 567 568 SmallVector<VerbatimBlockLineComment *, 8> Lines; 569 while (Tok.is(tok::verbatim_block_line) || 570 Tok.is(tok::newline)) { 571 VerbatimBlockLineComment *Line; 572 if (Tok.is(tok::verbatim_block_line)) { 573 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), 574 Tok.getVerbatimBlockText()); 575 consumeToken(); 576 if (Tok.is(tok::newline)) { 577 consumeToken(); 578 } 579 } else { 580 // Empty line, just a tok::newline. 581 Line = S.actOnVerbatimBlockLine(Tok.getLocation(), ""); 582 consumeToken(); 583 } 584 Lines.push_back(Line); 585 } 586 587 if (Tok.is(tok::verbatim_block_end)) { 588 VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(), 589 Tok.getVerbatimBlockName(), 590 copyArray(llvm::makeArrayRef(Lines))); 591 consumeToken(); 592 } else { 593 // Unterminated \\verbatim block 594 VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "", 595 copyArray(llvm::makeArrayRef(Lines))); 596 } 597 598 return VB; 599} 600 601VerbatimLineComment *Parser::parseVerbatimLine() { 602 assert(Tok.is(tok::verbatim_line_name)); 603 604 Token NameTok = Tok; 605 consumeToken(); 606 607 SourceLocation TextBegin; 608 StringRef Text; 609 // Next token might not be a tok::verbatim_line_text if verbatim line 610 // starting command comes just before a newline or comment end. 611 if (Tok.is(tok::verbatim_line_text)) { 612 TextBegin = Tok.getLocation(); 613 Text = Tok.getVerbatimLineText(); 614 } else { 615 TextBegin = NameTok.getEndLocation(); 616 Text = ""; 617 } 618 619 VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(), 620 NameTok.getVerbatimLineName(), 621 TextBegin, 622 Text); 623 consumeToken(); 624 return VL; 625} 626 627BlockContentComment *Parser::parseBlockContent() { 628 switch (Tok.getKind()) { 629 case tok::text: 630 case tok::command: 631 case tok::html_start_tag: 632 case tok::html_end_tag: 633 return parseParagraphOrBlockCommand(); 634 635 case tok::verbatim_block_begin: 636 return parseVerbatimBlock(); 637 638 case tok::verbatim_line_name: 639 return parseVerbatimLine(); 640 641 case tok::eof: 642 case tok::newline: 643 case tok::verbatim_block_line: 644 case tok::verbatim_block_end: 645 case tok::verbatim_line_text: 646 case tok::html_ident: 647 case tok::html_equals: 648 case tok::html_quoted_string: 649 case tok::html_greater: 650 case tok::html_slash_greater: 651 llvm_unreachable("should not see this token"); 652 } 653 llvm_unreachable("bogus token kind"); 654} 655 656FullComment *Parser::parseFullComment() { 657 // Skip newlines at the beginning of the comment. 658 while (Tok.is(tok::newline)) 659 consumeToken(); 660 661 SmallVector<BlockContentComment *, 8> Blocks; 662 while (Tok.isNot(tok::eof)) { 663 Blocks.push_back(parseBlockContent()); 664 665 // Skip extra newlines after paragraph end. 666 while (Tok.is(tok::newline)) 667 consumeToken(); 668 } 669 return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks))); 670} 671 672} // end namespace comments 673} // end namespace clang 674