// CommentLexer.cpp revision 2d44d77fed3200e2eff289f55493317e90d3398c
1#include "clang/AST/CommentLexer.h" 2#include "llvm/ADT/StringSwitch.h" 3#include "llvm/Support/ErrorHandling.h" 4 5namespace clang { 6namespace comments { 7 8void Token::dump(const Lexer &L, const SourceManager &SM) const { 9 llvm::errs() << "comments::Token Kind=" << Kind << " "; 10 Loc.dump(SM); 11 llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 12} 13 14bool Lexer::isVerbatimBlockCommand(StringRef BeginName, 15 StringRef &EndName) const { 16 const char *Result = llvm::StringSwitch<const char *>(BeginName) 17 .Case("code", "endcode") 18 .Case("verbatim", "endverbatim") 19 .Case("htmlonly", "endhtmlonly") 20 .Case("latexonly", "endlatexonly") 21 .Case("xmlonly", "endxmlonly") 22 .Case("manonly", "endmanonly") 23 .Case("rtfonly", "endrtfonly") 24 25 .Case("dot", "enddot") 26 .Case("msc", "endmsc") 27 28 .Case("f$", "f$") // Inline LaTeX formula 29 .Case("f[", "f]") // Displayed LaTeX formula 30 .Case("f{", "f}") // LaTeX environment 31 32 .Default(NULL); 33 34 if (Result) { 35 EndName = Result; 36 return true; 37 } 38 39 for (VerbatimBlockCommandVector::const_iterator 40 I = VerbatimBlockCommands.begin(), 41 E = VerbatimBlockCommands.end(); 42 I != E; ++I) 43 if (I->BeginName == BeginName) { 44 EndName = I->EndName; 45 return true; 46 } 47 48 return false; 49} 50 51bool Lexer::isVerbatimLineCommand(StringRef Name) const { 52 bool Result = llvm::StringSwitch<bool>(Name) 53 .Case("fn", true) 54 .Case("var", true) 55 .Case("property", true) 56 .Case("typedef", true) 57 58 .Case("overload", true) 59 60 .Case("defgroup", true) 61 .Case("ingroup", true) 62 .Case("addtogroup", true) 63 .Case("weakgroup", true) 64 .Case("name", true) 65 66 .Case("section", true) 67 .Case("subsection", true) 68 .Case("subsubsection", true) 69 .Case("paragraph", true) 70 71 .Case("mainpage", true) 72 .Case("subpage", true) 73 .Case("ref", true) 74 75 .Default(false); 76 77 if (Result) 78 return true; 79 80 for (VerbatimLineCommandVector::const_iterator 81 I = 
VerbatimLineCommands.begin(), 82 E = VerbatimLineCommands.end(); 83 I != E; ++I) 84 if (I->Name == Name) 85 return true; 86 87 return false; 88} 89 90void Lexer::skipLineStartingDecorations() { 91 // This function should be called only for C comments 92 assert(CommentState == LCS_InsideCComment); 93 94 if (BufferPtr == CommentEnd) 95 return; 96 97 switch (*BufferPtr) { 98 case ' ': 99 case '\t': 100 case '\f': 101 case '\v': { 102 const char *NewBufferPtr = BufferPtr; 103 NewBufferPtr++; 104 if (NewBufferPtr == CommentEnd) 105 return; 106 107 char C = *NewBufferPtr; 108 while (C == ' ' || C == '\t' || C == '\f' || C == '\v') { 109 NewBufferPtr++; 110 if (NewBufferPtr == CommentEnd) 111 return; 112 C = *NewBufferPtr; 113 } 114 if (C == '*') 115 BufferPtr = NewBufferPtr + 1; 116 break; 117 } 118 case '*': 119 BufferPtr++; 120 break; 121 } 122} 123 124namespace { 125const char *findNewline(const char *BufferPtr, const char *BufferEnd) { 126 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 127 const char C = *BufferPtr; 128 if (C == '\n' || C == '\r') 129 return BufferPtr; 130 } 131 return BufferEnd; 132} 133 134const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 135 if (BufferPtr == BufferEnd) 136 return BufferPtr; 137 138 if (*BufferPtr == '\n') 139 BufferPtr++; 140 else { 141 assert(*BufferPtr == '\r'); 142 BufferPtr++; 143 if (BufferPtr != BufferEnd && *BufferPtr == '\n') 144 BufferPtr++; 145 } 146 return BufferPtr; 147} 148 149bool isHTMLIdentifierCharacter(char C) { 150 return (C >= 'a' && C <= 'z') || 151 (C >= 'A' && C <= 'Z') || 152 (C >= '0' && C <= '9'); 153} 154 155const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 156 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 157 if (!isHTMLIdentifierCharacter(*BufferPtr)) 158 return BufferPtr; 159 } 160 return BufferEnd; 161} 162 163/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 164/// string allowed. 
165/// 166/// Returns pointer to closing quote. 167const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 168{ 169 const char Quote = *BufferPtr; 170 assert(Quote == '\"' || Quote == '\''); 171 172 BufferPtr++; 173 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 174 const char C = *BufferPtr; 175 if (C == Quote && BufferPtr[-1] != '\\') 176 return BufferPtr; 177 } 178 return BufferEnd; 179} 180 181bool isHorizontalWhitespace(char C) { 182 return C == ' ' || C == '\t' || C == '\f' || C == '\v'; 183} 184 185bool isWhitespace(char C) { 186 return C == ' ' || C == '\n' || C == '\r' || 187 C == '\t' || C == '\f' || C == '\v'; 188} 189 190const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 191 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 192 if (!isWhitespace(*BufferPtr)) 193 return BufferPtr; 194 } 195 return BufferEnd; 196} 197 198bool isCommandNameCharacter(char C) { 199 return (C >= 'a' && C <= 'z') || 200 (C >= 'A' && C <= 'Z') || 201 (C >= '0' && C <= '9'); 202} 203 204const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 205 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 206 if (!isCommandNameCharacter(*BufferPtr)) 207 return BufferPtr; 208 } 209 return BufferEnd; 210} 211 212/// Return the one past end pointer for BCPL comments. 213/// Handles newlines escaped with backslash or trigraph for backslahs. 214const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 215 const char *CurPtr = BufferPtr; 216 while (CurPtr != BufferEnd) { 217 char C = *CurPtr; 218 while (C != '\n' && C != '\r') { 219 CurPtr++; 220 if (CurPtr == BufferEnd) 221 return BufferEnd; 222 C = *CurPtr; 223 } 224 // We found a newline, check if it is escaped. 225 const char *EscapePtr = CurPtr - 1; 226 while(isHorizontalWhitespace(*EscapePtr)) 227 EscapePtr--; 228 229 if (*EscapePtr == '\\' || 230 (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 231 EscapePtr[-1] == '?' 
&& EscapePtr[-2] == '?')) { 232 // We found an escaped newline. 233 CurPtr = skipNewline(CurPtr, BufferEnd); 234 } else 235 return CurPtr; // Not an escaped newline. 236 } 237 return BufferEnd; 238} 239 240/// Return the one past end pointer for C comments. 241/// Very dumb, does not handle escaped newlines or trigraphs. 242const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 243 for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 244 if (*BufferPtr == '*') { 245 assert(BufferPtr + 1 != BufferEnd); 246 if (*(BufferPtr + 1) == '/') 247 return BufferPtr; 248 } 249 } 250 llvm_unreachable("buffer end hit before '*/' was seen"); 251} 252} // unnamed namespace 253 254void Lexer::lexCommentText(Token &T) { 255 assert(CommentState == LCS_InsideBCPLComment || 256 CommentState == LCS_InsideCComment); 257 258 switch (State) { 259 case LS_Normal: 260 break; 261 case LS_VerbatimBlockFirstLine: 262 lexVerbatimBlockFirstLine(T); 263 return; 264 case LS_VerbatimBlockBody: 265 lexVerbatimBlockBody(T); 266 return; 267 case LS_HTMLOpenTag: 268 lexHTMLOpenTag(T); 269 return; 270 } 271 272 assert(State == LS_Normal); 273 274 const char *TokenPtr = BufferPtr; 275 assert(TokenPtr < CommentEnd); 276 while (TokenPtr != CommentEnd) { 277 switch(*TokenPtr) { 278 case '\\': 279 case '@': { 280 TokenPtr++; 281 if (TokenPtr == CommentEnd) { 282 formTokenWithChars(T, TokenPtr, tok::text); 283 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); 284 return; 285 } 286 char C = *TokenPtr; 287 switch (C) { 288 default: 289 break; 290 291 case '\\': case '@': case '&': case '$': 292 case '#': case '<': case '>': case '%': 293 case '\"': case '.': case ':': 294 // This is one of \\ \@ \& \$ etc escape sequences. 295 TokenPtr++; 296 if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 297 // This is the \:: escape sequence. 
298 TokenPtr++; 299 } 300 formTokenWithChars(T, TokenPtr, tok::text); 301 T.setText(StringRef(BufferPtr - (T.getLength() - 1), 302 T.getLength() - 1)); 303 return; 304 } 305 306 // Don't make zero-length commands. 307 if (!isCommandNameCharacter(*TokenPtr)) { 308 formTokenWithChars(T, TokenPtr, tok::text); 309 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); 310 return; 311 } 312 313 TokenPtr = skipCommandName(TokenPtr, CommentEnd); 314 unsigned Length = TokenPtr - (BufferPtr + 1); 315 316 // Hardcoded support for lexing LaTeX formula commands 317 // \f$ \f[ \f] \f{ \f} as a single command. 318 if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 319 C = *TokenPtr; 320 if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 321 TokenPtr++; 322 Length++; 323 } 324 } 325 326 const StringRef CommandName(BufferPtr + 1, Length); 327 StringRef EndName; 328 329 if (isVerbatimBlockCommand(CommandName, EndName)) { 330 setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName); 331 return; 332 } 333 if (isVerbatimLineCommand(CommandName)) { 334 lexVerbatimLine(T, TokenPtr); 335 return; 336 } 337 formTokenWithChars(T, TokenPtr, tok::command); 338 T.setCommandName(CommandName); 339 return; 340 } 341 342 case '<': { 343 TokenPtr++; 344 if (TokenPtr == CommentEnd) { 345 formTokenWithChars(T, TokenPtr, tok::text); 346 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); 347 return; 348 } 349 const char C = *TokenPtr; 350 if (isHTMLIdentifierCharacter(C)) 351 setupAndLexHTMLOpenTag(T); 352 else if (C == '/') 353 lexHTMLCloseTag(T); 354 return; 355 } 356 357 case '\n': 358 case '\r': 359 TokenPtr = skipNewline(TokenPtr, CommentEnd); 360 formTokenWithChars(T, TokenPtr, tok::newline); 361 362 if (CommentState == LCS_InsideCComment) 363 skipLineStartingDecorations(); 364 return; 365 366 default: { 367 while (true) { 368 TokenPtr++; 369 if (TokenPtr == CommentEnd) 370 break; 371 char C = *TokenPtr; 372 if(C == '\n' || C == '\r' || 373 C 
== '\\' || C == '@' || C == '<') 374 break; 375 } 376 formTokenWithChars(T, TokenPtr, tok::text); 377 T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); 378 return; 379 } 380 } 381 } 382} 383 384void Lexer::setupAndLexVerbatimBlock(Token &T, 385 const char *TextBegin, 386 char Marker, StringRef EndName) { 387 VerbatimBlockEndCommandName.clear(); 388 VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 389 VerbatimBlockEndCommandName.append(EndName); 390 391 formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 392 T.setVerbatimBlockName(StringRef(TextBegin - (T.getLength() - 1), 393 T.getLength() - 1)); 394 395 State = LS_VerbatimBlockFirstLine; 396} 397 398void Lexer::lexVerbatimBlockFirstLine(Token &T) { 399 assert(BufferPtr < CommentEnd); 400 401 // FIXME: It would be better to scan the text once, finding either the block 402 // end command or newline. 403 // 404 // Extract current line. 405 const char *Newline = findNewline(BufferPtr, CommentEnd); 406 StringRef Line(BufferPtr, Newline - BufferPtr); 407 408 // Look for end command in current line. 409 size_t Pos = Line.find(VerbatimBlockEndCommandName); 410 const char *NextLine; 411 if (Pos == StringRef::npos) { 412 // Current line is completely verbatim. 413 NextLine = skipNewline(Newline, CommentEnd); 414 } else if (Pos == 0) { 415 // Current line contains just an end command. 416 const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 417 formTokenWithChars(T, End, tok::verbatim_block_end); 418 T.setVerbatimBlockName(StringRef(End - (T.getLength() - 1), 419 T.getLength() - 1)); 420 State = LS_Normal; 421 return; 422 } else { 423 // There is some text, followed by end command. Extract text first. 
424 NextLine = BufferPtr + Pos; 425 } 426 427 formTokenWithChars(T, NextLine, tok::verbatim_block_line); 428 T.setVerbatimBlockText(StringRef(NextLine - T.getLength(), T.getLength())); 429 430 State = LS_VerbatimBlockBody; 431} 432 433void Lexer::lexVerbatimBlockBody(Token &T) { 434 assert(State == LS_VerbatimBlockBody); 435 436 if (CommentState == LCS_InsideCComment) 437 skipLineStartingDecorations(); 438 439 lexVerbatimBlockFirstLine(T); 440} 441 442void Lexer::lexVerbatimLine(Token &T, const char *TextBegin) { 443 // Extract current line. 444 const char *Newline = findNewline(BufferPtr, CommentEnd); 445 446 const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1); 447 const StringRef Text(TextBegin, Newline - TextBegin); 448 449 formTokenWithChars(T, Newline, tok::verbatim_line); 450 T.setVerbatimLineName(Name); 451 T.setVerbatimLineText(Text); 452} 453 454void Lexer::setupAndLexHTMLOpenTag(Token &T) { 455 assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1])); 456 const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 457 458 formTokenWithChars(T, TagNameEnd, tok::html_tag_open); 459 T.setHTMLTagOpenName(StringRef(TagNameEnd - (T.getLength() - 1), 460 T.getLength() - 1)); 461 462 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 463 464 if (BufferPtr != CommentEnd && *BufferPtr == '>') { 465 BufferPtr++; 466 return; 467 } 468 469 if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr)) 470 State = LS_HTMLOpenTag; 471} 472 473void Lexer::lexHTMLOpenTag(Token &T) { 474 assert(State == LS_HTMLOpenTag); 475 476 const char *TokenPtr = BufferPtr; 477 char C = *TokenPtr; 478 if (isHTMLIdentifierCharacter(C)) { 479 TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 480 formTokenWithChars(T, TokenPtr, tok::html_ident); 481 T.setHTMLIdent(StringRef(TokenPtr - T.getLength(), T.getLength())); 482 } else { 483 switch (C) { 484 case '=': 485 TokenPtr++; 486 formTokenWithChars(T, TokenPtr, tok::html_equals); 487 break; 
488 case '\"': 489 case '\'': { 490 const char *OpenQuote = TokenPtr; 491 TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 492 const char *ClosingQuote = TokenPtr; 493 if (TokenPtr != CommentEnd) // Skip closing quote. 494 TokenPtr++; 495 formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 496 T.setHTMLQuotedString(StringRef(OpenQuote + 1, 497 ClosingQuote - (OpenQuote + 1))); 498 break; 499 } 500 case '>': 501 TokenPtr++; 502 formTokenWithChars(T, TokenPtr, tok::html_greater); 503 break; 504 } 505 } 506 507 // Now look ahead and return to normal state if we don't see any HTML tokens 508 // ahead. 509 BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 510 if (BufferPtr == CommentEnd) { 511 State = LS_Normal; 512 return; 513 } 514 515 C = *BufferPtr; 516 if (!isHTMLIdentifierCharacter(C) && 517 C != '=' && C != '\"' && C != '\'' && C != '>') { 518 State = LS_Normal; 519 return; 520 } 521} 522 523void Lexer::lexHTMLCloseTag(Token &T) { 524 assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 525 526 const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 527 const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 528 529 const char *End = skipWhitespace(TagNameEnd, CommentEnd); 530 if (End != CommentEnd && *End == '>') 531 End++; 532 533 formTokenWithChars(T, End, tok::html_tag_close); 534 T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin)); 535} 536 537Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, 538 const char *BufferStart, const char *BufferEnd): 539 BufferStart(BufferStart), BufferEnd(BufferEnd), 540 FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), 541 CommentState(LCS_BeforeComment), State(LS_Normal) { 542} 543 544void Lexer::lex(Token &T) { 545again: 546 switch (CommentState) { 547 case LCS_BeforeComment: 548 if (BufferPtr == BufferEnd) { 549 formTokenWithChars(T, BufferPtr, tok::eof); 550 return; 551 } 552 553 assert(*BufferPtr == '/'); 554 BufferPtr++; // Skip 
first slash. 555 switch(*BufferPtr) { 556 case '/': { // BCPL comment. 557 BufferPtr++; // Skip second slash. 558 559 if (BufferPtr != BufferEnd) { 560 // Skip Doxygen magic marker, if it is present. 561 // It might be missing because of a typo //< or /*<, or because we 562 // merged this non-Doxygen comment into a bunch of Doxygen comments 563 // around it: /** ... */ /* ... */ /** ... */ 564 const char C = *BufferPtr; 565 if (C == '/' || C == '!') 566 BufferPtr++; 567 } 568 569 // Skip less-than symbol that marks trailing comments. 570 // Skip it even if the comment is not a Doxygen one, because //< and /*< 571 // are frequent typos. 572 if (BufferPtr != BufferEnd && *BufferPtr == '<') 573 BufferPtr++; 574 575 CommentState = LCS_InsideBCPLComment; 576 State = LS_Normal; 577 CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 578 goto again; 579 } 580 case '*': { // C comment. 581 BufferPtr++; // Skip star. 582 583 // Skip Doxygen magic marker. 584 const char C = *BufferPtr; 585 if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 586 BufferPtr++; 587 588 // Skip less-than symbol that marks trailing comments. 589 if (BufferPtr != BufferEnd && *BufferPtr == '<') 590 BufferPtr++; 591 592 CommentState = LCS_InsideCComment; 593 State = LS_Normal; 594 CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 595 goto again; 596 } 597 default: 598 llvm_unreachable("second character of comment should be '/' or '*'"); 599 } 600 601 case LCS_BetweenComments: { 602 // Consecutive comments are extracted only if there is only whitespace 603 // between them. So we can search for the start of the next comment. 604 const char *EndWhitespace = BufferPtr; 605 while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 606 EndWhitespace++; 607 608 // Turn any whitespace between comments (and there is only whitespace 609 // between them) into a newline. We have two newlines between comments 610 // in total (first one was synthesized after a comment). 
611 formTokenWithChars(T, EndWhitespace, tok::newline); 612 613 CommentState = LCS_BeforeComment; 614 break; 615 } 616 617 case LCS_InsideBCPLComment: 618 case LCS_InsideCComment: 619 if (BufferPtr != CommentEnd) { 620 lexCommentText(T); 621 break; 622 } else { 623 // Skip C comment closing sequence. 624 if (CommentState == LCS_InsideCComment) { 625 assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 626 BufferPtr += 2; 627 assert(BufferPtr <= BufferEnd); 628 629 // Synthenize newline just after the C comment, regardless if there is 630 // actually a newline. 631 formTokenWithChars(T, BufferPtr, tok::newline); 632 633 CommentState = LCS_BetweenComments; 634 break; 635 } else { 636 // Don't synthesized a newline after BCPL comment. 637 CommentState = LCS_BetweenComments; 638 goto again; 639 } 640 } 641 } 642} 643 644StringRef Lexer::getSpelling(const Token &Tok, 645 const SourceManager &SourceMgr, 646 bool *Invalid) const { 647 SourceLocation Loc = Tok.getLocation(); 648 std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 649 650 bool InvalidTemp = false; 651 StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 652 if (InvalidTemp) { 653 *Invalid = true; 654 return StringRef(); 655 } 656 657 const char *Begin = File.data() + LocInfo.second; 658 return StringRef(Begin, Tok.getLength()); 659} 660 661void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) { 662 VerbatimBlockCommand VBC; 663 VBC.BeginName = BeginName; 664 VBC.EndName = EndName; 665 VerbatimBlockCommands.push_back(VBC); 666} 667 668void Lexer::addVerbatimLineCommand(StringRef Name) { 669 VerbatimLineCommand VLC; 670 VLC.Name = Name; 671 VerbatimLineCommands.push_back(VLC); 672} 673 674} // end namespace comments 675} // end namespace clang 676 677