// CommentLexer.h revision 8d3ba23f2d9e6c87794d059412a0808c9cbacb25
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines lexer for structured comments and supporting token class. 11// 12//===----------------------------------------------------------------------===// 13 14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15#define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17#include "clang/Basic/SourceManager.h" 18#include "llvm/ADT/StringRef.h" 19#include "llvm/ADT/SmallString.h" 20#include "llvm/ADT/SmallVector.h" 21#include "llvm/Support/Allocator.h" 22#include "llvm/Support/raw_ostream.h" 23 24namespace clang { 25namespace comments { 26 27class Lexer; 28class TextTokenRetokenizer; 29 30namespace tok { 31enum TokenKind { 32 eof, 33 newline, 34 text, 35 command, 36 verbatim_block_begin, 37 verbatim_block_line, 38 verbatim_block_end, 39 verbatim_line_name, 40 verbatim_line_text, 41 html_tag_open, // <tag 42 html_ident, // attr 43 html_equals, // = 44 html_quoted_string, // "blah\"blah" or 'blah\'blah' 45 html_greater, // > 46 html_tag_close // </tag 47}; 48} // end namespace tok 49 50class CommentOptions { 51public: 52 bool Markdown; 53}; 54 55/// \brief Comment token. 56class Token { 57 friend class Lexer; 58 friend class TextTokenRetokenizer; 59 60 /// The location of the token. 61 SourceLocation Loc; 62 63 /// The actual kind of the token. 64 tok::TokenKind Kind; 65 66 /// Length of the token spelling in comment. Can be 0 for synthenized 67 /// tokens. 68 unsigned Length; 69 70 /// Contains text value associated with a token. 
71 const char *TextPtr1; 72 unsigned TextLen1; 73 74public: 75 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 76 void setLocation(SourceLocation SL) { Loc = SL; } 77 78 SourceLocation getEndLocation() const LLVM_READONLY { 79 if (Length == 0 || Length == 1) 80 return Loc; 81 return Loc.getLocWithOffset(Length - 1); 82 } 83 84 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 85 void setKind(tok::TokenKind K) { Kind = K; } 86 87 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 88 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 89 90 unsigned getLength() const LLVM_READONLY { return Length; } 91 void setLength(unsigned L) { Length = L; } 92 93 StringRef getText() const LLVM_READONLY { 94 assert(is(tok::text)); 95 return StringRef(TextPtr1, TextLen1); 96 } 97 98 void setText(StringRef Text) { 99 assert(is(tok::text)); 100 TextPtr1 = Text.data(); 101 TextLen1 = Text.size(); 102 } 103 104 StringRef getCommandName() const LLVM_READONLY { 105 assert(is(tok::command)); 106 return StringRef(TextPtr1, TextLen1); 107 } 108 109 void setCommandName(StringRef Name) { 110 assert(is(tok::command)); 111 TextPtr1 = Name.data(); 112 TextLen1 = Name.size(); 113 } 114 115 StringRef getVerbatimBlockName() const LLVM_READONLY { 116 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 117 return StringRef(TextPtr1, TextLen1); 118 } 119 120 void setVerbatimBlockName(StringRef Name) { 121 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 122 TextPtr1 = Name.data(); 123 TextLen1 = Name.size(); 124 } 125 126 StringRef getVerbatimBlockText() const LLVM_READONLY { 127 assert(is(tok::verbatim_block_line)); 128 return StringRef(TextPtr1, TextLen1); 129 } 130 131 void setVerbatimBlockText(StringRef Text) { 132 assert(is(tok::verbatim_block_line)); 133 TextPtr1 = Text.data(); 134 TextLen1 = Text.size(); 135 } 136 137 /// Returns the name of verbatim line command. 
138 StringRef getVerbatimLineName() const LLVM_READONLY { 139 assert(is(tok::verbatim_line_name)); 140 return StringRef(TextPtr1, TextLen1); 141 } 142 143 void setVerbatimLineName(StringRef Name) { 144 assert(is(tok::verbatim_line_name)); 145 TextPtr1 = Name.data(); 146 TextLen1 = Name.size(); 147 } 148 149 StringRef getVerbatimLineText() const LLVM_READONLY { 150 assert(is(tok::verbatim_line_text)); 151 return StringRef(TextPtr1, TextLen1); 152 } 153 154 void setVerbatimLineText(StringRef Text) { 155 assert(is(tok::verbatim_line_text)); 156 TextPtr1 = Text.data(); 157 TextLen1 = Text.size(); 158 } 159 160 StringRef getHTMLTagOpenName() const LLVM_READONLY { 161 assert(is(tok::html_tag_open)); 162 return StringRef(TextPtr1, TextLen1); 163 } 164 165 void setHTMLTagOpenName(StringRef Name) { 166 assert(is(tok::html_tag_open)); 167 TextPtr1 = Name.data(); 168 TextLen1 = Name.size(); 169 } 170 171 StringRef getHTMLIdent() const LLVM_READONLY { 172 assert(is(tok::html_ident)); 173 return StringRef(TextPtr1, TextLen1); 174 } 175 176 void setHTMLIdent(StringRef Name) { 177 assert(is(tok::html_ident)); 178 TextPtr1 = Name.data(); 179 TextLen1 = Name.size(); 180 } 181 182 StringRef getHTMLQuotedString() const LLVM_READONLY { 183 assert(is(tok::html_quoted_string)); 184 return StringRef(TextPtr1, TextLen1); 185 } 186 187 void setHTMLQuotedString(StringRef Str) { 188 assert(is(tok::html_quoted_string)); 189 TextPtr1 = Str.data(); 190 TextLen1 = Str.size(); 191 } 192 193 StringRef getHTMLTagCloseName() const LLVM_READONLY { 194 assert(is(tok::html_tag_close)); 195 return StringRef(TextPtr1, TextLen1); 196 } 197 198 void setHTMLTagCloseName(StringRef Name) { 199 assert(is(tok::html_tag_close)); 200 TextPtr1 = Name.data(); 201 TextLen1 = Name.size(); 202 } 203 204 void dump(const Lexer &L, const SourceManager &SM) const; 205}; 206 207/// \brief Comment lexer. 
208class Lexer { 209private: 210 Lexer(const Lexer&); // DO NOT IMPLEMENT 211 void operator=(const Lexer&); // DO NOT IMPLEMENT 212 213 const char *const BufferStart; 214 const char *const BufferEnd; 215 SourceLocation FileLoc; 216 CommentOptions CommOpts; 217 218 const char *BufferPtr; 219 220 /// One past end pointer for the current comment. For BCPL comments points 221 /// to newline or BufferEnd, for C comments points to star in '*/'. 222 const char *CommentEnd; 223 224 enum LexerCommentState { 225 LCS_BeforeComment, 226 LCS_InsideBCPLComment, 227 LCS_InsideCComment, 228 LCS_BetweenComments 229 }; 230 231 /// Low-level lexer state, track if we are inside or outside of comment. 232 LexerCommentState CommentState; 233 234 enum LexerState { 235 /// Lexing normal comment text 236 LS_Normal, 237 238 /// Finished lexing verbatim block beginning command, will lex first body 239 /// line. 240 LS_VerbatimBlockFirstLine, 241 242 /// Lexing verbatim block body line-by-line, skipping line-starting 243 /// decorations. 244 LS_VerbatimBlockBody, 245 246 /// Finished lexing verbatim line beginning command, will lex text (one 247 /// line). 248 LS_VerbatimLineText, 249 250 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 251 LS_HTMLOpenTag, 252 253 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 254 LS_HTMLCloseTag 255 }; 256 257 /// Current lexing mode. 258 LexerState State; 259 260 /// A verbatim-like block command eats every character (except line starting 261 /// decorations) until matching end command is seen or comment end is hit. 262 struct VerbatimBlockCommand { 263 StringRef BeginName; 264 StringRef EndName; 265 }; 266 267 typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector; 268 269 /// Registered verbatim-like block commands. 270 VerbatimBlockCommandVector VerbatimBlockCommands; 271 272 /// If State is LS_VerbatimBlock, contains the the name of verbatim end 273 /// command, including command marker. 
274 SmallString<16> VerbatimBlockEndCommandName; 275 276 bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const; 277 278 /// A verbatim-like line command eats everything until a newline is seen or 279 /// comment end is hit. 280 struct VerbatimLineCommand { 281 StringRef Name; 282 }; 283 284 typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector; 285 286 /// Registered verbatim-like line commands. 287 VerbatimLineCommandVector VerbatimLineCommands; 288 289 bool isVerbatimLineCommand(StringRef Name) const; 290 291 void formTokenWithChars(Token &Result, const char *TokEnd, 292 tok::TokenKind Kind) { 293 const unsigned TokLen = TokEnd - BufferPtr; 294 Result.setLocation(getSourceLocation(BufferPtr)); 295 Result.setKind(Kind); 296 Result.setLength(TokLen); 297#ifndef NDEBUG 298 Result.TextPtr1 = "<UNSET>"; 299 Result.TextLen1 = 7; 300#endif 301 BufferPtr = TokEnd; 302 } 303 304 SourceLocation getSourceLocation(const char *Loc) const { 305 assert(Loc >= BufferStart && Loc <= BufferEnd && 306 "Location out of range for this buffer!"); 307 308 const unsigned CharNo = Loc - BufferStart; 309 return FileLoc.getLocWithOffset(CharNo); 310 } 311 312 /// Eat string matching regexp \code \s*\* \endcode. 313 void skipLineStartingDecorations(); 314 315 /// Lex stuff inside comments. CommentEnd should be set correctly. 
316 void lexCommentText(Token &T); 317 318 void setupAndLexVerbatimBlock(Token &T, 319 const char *TextBegin, 320 char Marker, StringRef EndName); 321 322 void lexVerbatimBlockFirstLine(Token &T); 323 324 void lexVerbatimBlockBody(Token &T); 325 326 void setupAndLexVerbatimLine(Token &T, const char *TextBegin); 327 328 void lexVerbatimLineText(Token &T); 329 330 void setupAndLexHTMLOpenTag(Token &T); 331 332 void lexHTMLOpenTag(Token &T); 333 334 void setupAndLexHTMLCloseTag(Token &T); 335 336 void lexHTMLCloseTag(Token &T); 337 338public: 339 Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, 340 const char *BufferStart, const char *BufferEnd); 341 342 void lex(Token &T); 343 344 StringRef getSpelling(const Token &Tok, 345 const SourceManager &SourceMgr, 346 bool *Invalid = NULL) const; 347 348 /// \brief Register a new verbatim block command. 349 void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName); 350 351 /// \brief Register a new verbatim line command. 352 void addVerbatimLineCommand(StringRef Name); 353}; 354 355/// Re-lexes a sequence of tok::text tokens. 356class TextTokenRetokenizer { 357 llvm::BumpPtrAllocator &Allocator; 358 static const unsigned MaxTokens = 16; 359 SmallVector<Token, MaxTokens> Toks; 360 361 struct Position { 362 unsigned CurToken; 363 const char *BufferStart; 364 const char *BufferEnd; 365 const char *BufferPtr; 366 SourceLocation BufferStartLoc; 367 }; 368 369 /// Current position in Toks. 370 Position Pos; 371 372 bool isEnd() const { 373 return Pos.CurToken >= Toks.size(); 374 } 375 376 /// Sets up the buffer pointers to point to current token. 
377 void setupBuffer() { 378 assert(Pos.CurToken < Toks.size()); 379 const Token &Tok = Toks[Pos.CurToken]; 380 381 Pos.BufferStart = Tok.getText().begin(); 382 Pos.BufferEnd = Tok.getText().end(); 383 Pos.BufferPtr = Pos.BufferStart; 384 Pos.BufferStartLoc = Tok.getLocation(); 385 } 386 387 SourceLocation getSourceLocation() const { 388 const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart; 389 return Pos.BufferStartLoc.getLocWithOffset(CharNo); 390 } 391 392 char peek() const { 393 assert(!isEnd()); 394 assert(Pos.BufferPtr != Pos.BufferEnd); 395 return *Pos.BufferPtr; 396 } 397 398 void consumeChar() { 399 assert(!isEnd()); 400 assert(Pos.BufferPtr != Pos.BufferEnd); 401 Pos.BufferPtr++; 402 if (Pos.BufferPtr == Pos.BufferEnd) { 403 Pos.CurToken++; 404 if (Pos.CurToken < Toks.size()) 405 setupBuffer(); 406 } 407 } 408 409 static bool isWhitespace(char C) { 410 return C == ' ' || C == '\n' || C == '\r' || 411 C == '\t' || C == '\f' || C == '\v'; 412 } 413 414 void consumeWhitespace() { 415 while (!isEnd()) { 416 if (isWhitespace(peek())) 417 consumeChar(); 418 else 419 break; 420 } 421 } 422 423 void formTokenWithChars(Token &Result, 424 SourceLocation Loc, 425 const char *TokBegin, 426 unsigned TokLength, 427 StringRef Text) { 428 Result.setLocation(Loc); 429 Result.setKind(tok::text); 430 Result.setLength(TokLength); 431#ifndef NDEBUG 432 Result.TextPtr1 = "<UNSET>"; 433 Result.TextLen1 = 7; 434#endif 435 Result.setText(Text); 436 } 437 438public: 439 TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator): 440 Allocator(Allocator) { 441 Pos.CurToken = 0; 442 } 443 444 /// Add a token. 445 /// Returns true on success, false if it seems like we have enough tokens. 446 bool addToken(const Token &Tok) { 447 assert(Tok.is(tok::text)); 448 if (Toks.size() >= MaxTokens) 449 return false; 450 451 Toks.push_back(Tok); 452 if (Toks.size() == 1) 453 setupBuffer(); 454 return true; 455 } 456 457 /// Extract a word -- sequence of non-whitespace characters. 
458 bool lexWord(Token &Tok) { 459 if (isEnd()) 460 return false; 461 462 Position SavedPos = Pos; 463 464 consumeWhitespace(); 465 SmallString<32> WordText; 466 const char *WordBegin = Pos.BufferPtr; 467 SourceLocation Loc = getSourceLocation(); 468 while (!isEnd()) { 469 const char C = peek(); 470 if (!isWhitespace(C)) { 471 WordText.push_back(C); 472 consumeChar(); 473 } else 474 break; 475 } 476 const unsigned Length = WordText.size(); 477 if (Length == 0) { 478 Pos = SavedPos; 479 return false; 480 } 481 482 char *TextPtr = new (Allocator) char[Length + 1]; 483 484 memcpy(TextPtr, WordText.c_str(), Length + 1); 485 StringRef Text = StringRef(TextPtr, Length); 486 487 formTokenWithChars(Tok, Loc, WordBegin, 488 Pos.BufferPtr - WordBegin, Text); 489 return true; 490 } 491 492 bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) { 493 if (isEnd()) 494 return false; 495 496 Position SavedPos = Pos; 497 498 consumeWhitespace(); 499 SmallString<32> WordText; 500 const char *WordBegin = Pos.BufferPtr; 501 SourceLocation Loc = getSourceLocation(); 502 bool Error = false; 503 if (!isEnd()) { 504 const char C = peek(); 505 if (C == OpenDelim) { 506 WordText.push_back(C); 507 consumeChar(); 508 } else 509 Error = true; 510 } 511 char C; 512 while (!Error && !isEnd()) { 513 C = peek(); 514 WordText.push_back(C); 515 consumeChar(); 516 if (C == CloseDelim) 517 break; 518 } 519 if (!Error && C != CloseDelim) 520 Error = true; 521 522 if (Error) { 523 Pos = SavedPos; 524 return false; 525 } 526 527 const unsigned Length = WordText.size(); 528 char *TextPtr = new (Allocator) char[Length + 1]; 529 530 memcpy(TextPtr, WordText.c_str(), Length + 1); 531 StringRef Text = StringRef(TextPtr, Length); 532 533 formTokenWithChars(Tok, Loc, WordBegin, 534 Pos.BufferPtr - WordBegin, Text); 535 return true; 536 } 537 538 /// Return a text token. Useful to take tokens back. 
539 bool lexText(Token &Tok) { 540 if (isEnd()) 541 return false; 542 543 if (Pos.BufferPtr != Pos.BufferStart) 544 formTokenWithChars(Tok, getSourceLocation(), 545 Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr, 546 StringRef(Pos.BufferPtr, 547 Pos.BufferEnd - Pos.BufferPtr)); 548 else 549 Tok = Toks[Pos.CurToken]; 550 551 Pos.CurToken++; 552 if (Pos.CurToken < Toks.size()) 553 setupBuffer(); 554 return true; 555 } 556}; 557 558} // end namespace comments 559} // end namespace clang 560 561#endif 562 563