CommentLexer.h revision 477a9f58c1b197f315befd03b42a8a0b3a2f0ff9
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file defines lexer for structured comments and supporting token class. 11// 12//===----------------------------------------------------------------------===// 13 14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H 15#define LLVM_CLANG_AST_COMMENT_LEXER_H 16 17#include "clang/Basic/SourceManager.h" 18#include "llvm/ADT/StringRef.h" 19#include "llvm/ADT/SmallString.h" 20#include "llvm/ADT/SmallVector.h" 21#include "llvm/Support/Allocator.h" 22#include "llvm/Support/raw_ostream.h" 23 24namespace clang { 25namespace comments { 26 27class Lexer; 28class TextTokenRetokenizer; 29 30namespace tok { 31enum TokenKind { 32 eof, 33 newline, 34 text, 35 command, 36 verbatim_block_begin, 37 verbatim_block_line, 38 verbatim_block_end, 39 verbatim_line_name, 40 verbatim_line_text, 41 html_start_tag, // <tag 42 html_ident, // attr 43 html_equals, // = 44 html_quoted_string, // "blah\"blah" or 'blah\'blah' 45 html_greater, // > 46 html_slash_greater, // /> 47 html_end_tag // </tag 48}; 49} // end namespace tok 50 51class CommentOptions { 52public: 53 bool Markdown; 54}; 55 56/// \brief Comment token. 57class Token { 58 friend class Lexer; 59 friend class TextTokenRetokenizer; 60 61 /// The location of the token. 62 SourceLocation Loc; 63 64 /// The actual kind of the token. 65 tok::TokenKind Kind; 66 67 /// Length of the token spelling in comment. Can be 0 for synthenized 68 /// tokens. 69 unsigned Length; 70 71 /// Contains text value associated with a token. 72 const char *TextPtr1; 73 unsigned TextLen1; 74 75public: 76 SourceLocation getLocation() const LLVM_READONLY { return Loc; } 77 void setLocation(SourceLocation SL) { Loc = SL; } 78 79 SourceLocation getEndLocation() const LLVM_READONLY { 80 if (Length == 0 || Length == 1) 81 return Loc; 82 return Loc.getLocWithOffset(Length - 1); 83 } 84 85 tok::TokenKind getKind() const LLVM_READONLY { return Kind; } 86 void setKind(tok::TokenKind K) { Kind = K; } 87 88 bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } 89 bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } 90 91 unsigned getLength() const LLVM_READONLY { return Length; } 92 void setLength(unsigned L) { Length = L; } 93 94 StringRef getText() const LLVM_READONLY { 95 assert(is(tok::text)); 96 return StringRef(TextPtr1, TextLen1); 97 } 98 99 void setText(StringRef Text) { 100 assert(is(tok::text)); 101 TextPtr1 = Text.data(); 102 TextLen1 = Text.size(); 103 } 104 105 StringRef getCommandName() const LLVM_READONLY { 106 assert(is(tok::command)); 107 return StringRef(TextPtr1, TextLen1); 108 } 109 110 void setCommandName(StringRef Name) { 111 assert(is(tok::command)); 112 TextPtr1 = Name.data(); 113 TextLen1 = Name.size(); 114 } 115 116 StringRef getVerbatimBlockName() const LLVM_READONLY { 117 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 118 return StringRef(TextPtr1, TextLen1); 119 } 120 121 void setVerbatimBlockName(StringRef Name) { 122 assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); 123 TextPtr1 = Name.data(); 124 TextLen1 = Name.size(); 125 } 126 127 StringRef getVerbatimBlockText() const LLVM_READONLY { 128 assert(is(tok::verbatim_block_line)); 129 return StringRef(TextPtr1, TextLen1); 130 } 131 132 void setVerbatimBlockText(StringRef Text) { 133 assert(is(tok::verbatim_block_line)); 134 TextPtr1 = Text.data(); 135 TextLen1 = Text.size(); 136 } 137 138 /// Returns the name of verbatim line command. 139 StringRef getVerbatimLineName() const LLVM_READONLY { 140 assert(is(tok::verbatim_line_name)); 141 return StringRef(TextPtr1, TextLen1); 142 } 143 144 void setVerbatimLineName(StringRef Name) { 145 assert(is(tok::verbatim_line_name)); 146 TextPtr1 = Name.data(); 147 TextLen1 = Name.size(); 148 } 149 150 StringRef getVerbatimLineText() const LLVM_READONLY { 151 assert(is(tok::verbatim_line_text)); 152 return StringRef(TextPtr1, TextLen1); 153 } 154 155 void setVerbatimLineText(StringRef Text) { 156 assert(is(tok::verbatim_line_text)); 157 TextPtr1 = Text.data(); 158 TextLen1 = Text.size(); 159 } 160 161 StringRef getHTMLTagStartName() const LLVM_READONLY { 162 assert(is(tok::html_start_tag)); 163 return StringRef(TextPtr1, TextLen1); 164 } 165 166 void setHTMLTagStartName(StringRef Name) { 167 assert(is(tok::html_start_tag)); 168 TextPtr1 = Name.data(); 169 TextLen1 = Name.size(); 170 } 171 172 StringRef getHTMLIdent() const LLVM_READONLY { 173 assert(is(tok::html_ident)); 174 return StringRef(TextPtr1, TextLen1); 175 } 176 177 void setHTMLIdent(StringRef Name) { 178 assert(is(tok::html_ident)); 179 TextPtr1 = Name.data(); 180 TextLen1 = Name.size(); 181 } 182 183 StringRef getHTMLQuotedString() const LLVM_READONLY { 184 assert(is(tok::html_quoted_string)); 185 return StringRef(TextPtr1, TextLen1); 186 } 187 188 void setHTMLQuotedString(StringRef Str) { 189 assert(is(tok::html_quoted_string)); 190 TextPtr1 = Str.data(); 191 TextLen1 = Str.size(); 192 } 193 194 StringRef getHTMLTagEndName() const LLVM_READONLY { 195 assert(is(tok::html_end_tag)); 196 return StringRef(TextPtr1, TextLen1); 197 } 198 199 void setHTMLTagEndName(StringRef Name) { 200 assert(is(tok::html_end_tag)); 201 TextPtr1 = Name.data(); 202 TextLen1 = Name.size(); 203 } 204 205 void dump(const Lexer &L, const SourceManager &SM) const; 206}; 207 208/// \brief Comment lexer. 209class Lexer { 210private: 211 Lexer(const Lexer&); // DO NOT IMPLEMENT 212 void operator=(const Lexer&); // DO NOT IMPLEMENT 213 214 /// Allocator for strings that are semantic values of tokens and have to be 215 /// computed (for example, resolved decimal character references). 216 llvm::BumpPtrAllocator &Allocator; 217 218 const char *const BufferStart; 219 const char *const BufferEnd; 220 SourceLocation FileLoc; 221 CommentOptions CommOpts; 222 223 const char *BufferPtr; 224 225 /// One past end pointer for the current comment. For BCPL comments points 226 /// to newline or BufferEnd, for C comments points to star in '*/'. 227 const char *CommentEnd; 228 229 enum LexerCommentState { 230 LCS_BeforeComment, 231 LCS_InsideBCPLComment, 232 LCS_InsideCComment, 233 LCS_BetweenComments 234 }; 235 236 /// Low-level lexer state, track if we are inside or outside of comment. 237 LexerCommentState CommentState; 238 239 enum LexerState { 240 /// Lexing normal comment text 241 LS_Normal, 242 243 /// Finished lexing verbatim block beginning command, will lex first body 244 /// line. 245 LS_VerbatimBlockFirstLine, 246 247 /// Lexing verbatim block body line-by-line, skipping line-starting 248 /// decorations. 249 LS_VerbatimBlockBody, 250 251 /// Finished lexing verbatim line beginning command, will lex text (one 252 /// line). 253 LS_VerbatimLineText, 254 255 /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. 256 LS_HTMLStartTag, 257 258 /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. 259 LS_HTMLEndTag 260 }; 261 262 /// Current lexing mode. 263 LexerState State; 264 265 /// A verbatim-like block command eats every character (except line starting 266 /// decorations) until matching end command is seen or comment end is hit. 267 struct VerbatimBlockCommand { 268 StringRef BeginName; 269 StringRef EndName; 270 }; 271 272 typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector; 273 274 /// Registered verbatim-like block commands. 275 VerbatimBlockCommandVector VerbatimBlockCommands; 276 277 /// If State is LS_VerbatimBlock, contains the name of verbatim end 278 /// command, including command marker. 279 SmallString<16> VerbatimBlockEndCommandName; 280 281 bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const; 282 283 /// A verbatim-like line command eats everything until a newline is seen or 284 /// comment end is hit. 285 struct VerbatimLineCommand { 286 StringRef Name; 287 }; 288 289 typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector; 290 291 /// Registered verbatim-like line commands. 292 VerbatimLineCommandVector VerbatimLineCommands; 293 294 bool isVerbatimLineCommand(StringRef Name) const; 295 296 /// Given a character reference name (e.g., "lt"), return the character that 297 /// it stands for (e.g., "<"). 298 StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; 299 300 /// Given a Unicode codepoint as base-10 integer, return the character. 301 StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; 302 303 /// Given a Unicode codepoint as base-16 integer, return the character. 304 StringRef resolveHTMLHexCharacterReference(StringRef Name) const; 305 306 void formTokenWithChars(Token &Result, const char *TokEnd, 307 tok::TokenKind Kind) { 308 const unsigned TokLen = TokEnd - BufferPtr; 309 Result.setLocation(getSourceLocation(BufferPtr)); 310 Result.setKind(Kind); 311 Result.setLength(TokLen); 312#ifndef NDEBUG 313 Result.TextPtr1 = "<UNSET>"; 314 Result.TextLen1 = 7; 315#endif 316 BufferPtr = TokEnd; 317 } 318 319 void formTextToken(Token &Result, const char *TokEnd) { 320 StringRef Text(BufferPtr, TokEnd - BufferPtr); 321 formTokenWithChars(Result, TokEnd, tok::text); 322 Result.setText(Text); 323 } 324 325 SourceLocation getSourceLocation(const char *Loc) const { 326 assert(Loc >= BufferStart && Loc <= BufferEnd && 327 "Location out of range for this buffer!"); 328 329 const unsigned CharNo = Loc - BufferStart; 330 return FileLoc.getLocWithOffset(CharNo); 331 } 332 333 /// Eat string matching regexp \code \s*\* \endcode. 334 void skipLineStartingDecorations(); 335 336 /// Lex stuff inside comments. CommentEnd should be set correctly. 337 void lexCommentText(Token &T); 338 339 void setupAndLexVerbatimBlock(Token &T, 340 const char *TextBegin, 341 char Marker, StringRef EndName); 342 343 void lexVerbatimBlockFirstLine(Token &T); 344 345 void lexVerbatimBlockBody(Token &T); 346 347 void setupAndLexVerbatimLine(Token &T, const char *TextBegin); 348 349 void lexVerbatimLineText(Token &T); 350 351 void lexHTMLCharacterReference(Token &T); 352 353 void setupAndLexHTMLStartTag(Token &T); 354 355 void lexHTMLStartTag(Token &T); 356 357 void setupAndLexHTMLEndTag(Token &T); 358 359 void lexHTMLEndTag(Token &T); 360 361public: 362 Lexer(llvm::BumpPtrAllocator &Allocator, 363 SourceLocation FileLoc, const CommentOptions &CommOpts, 364 const char *BufferStart, const char *BufferEnd); 365 366 void lex(Token &T); 367 368 StringRef getSpelling(const Token &Tok, 369 const SourceManager &SourceMgr, 370 bool *Invalid = NULL) const; 371 372 /// \brief Register a new verbatim block command. 373 void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName); 374 375 /// \brief Register a new verbatim line command. 376 void addVerbatimLineCommand(StringRef Name); 377}; 378 379} // end namespace comments 380} // end namespace clang 381 382#endif 383 384