CommentLexer.h revision 814e219fc6d5faeb48e4fd5375843346f2d4a7a7
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
//  This file defines the lexer for structured comments and the supporting
//  token class.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15#define LLVM_CLANG_AST_COMMENT_LEXER_H
16
17#include "clang/Basic/SourceManager.h"
18#include "llvm/ADT/StringRef.h"
19#include "llvm/ADT/SmallString.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/Support/Allocator.h"
22#include "llvm/Support/raw_ostream.h"
23
24namespace clang {
25namespace comments {
26
27class Lexer;
28class TextTokenRetokenizer;
29
namespace tok {
/// Kinds of tokens produced by the comment lexer.
enum TokenKind {
  eof = 0,
  newline,
  text,
  command,

  // Tokens related to verbatim blocks.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,

  // Tokens related to verbatim lines.
  verbatim_line_name,
  verbatim_line_text,

  // Tokens related to HTML tags.

  /// <tag
  html_tag_open,
  /// attr
  html_ident,
  /// =
  html_equals,
  /// "blah\"blah" or 'blah\'blah'
  html_quoted_string,
  /// >
  html_greater,
  /// </tag
  html_tag_close
};
} // end namespace tok
49
/// \brief Options handed to the comment Lexer at construction.
class CommentOptions {
public:
  /// Start with every option switched off, so that a default-constructed
  /// object never holds an indeterminate value (the Lexer stores a copy of
  /// the options it is given).
  CommentOptions() : Markdown(false) {}

  /// NOTE(review): presumably enables Markdown-specific handling; nothing in
  /// this header reads the flag -- confirm against the lexer implementation.
  bool Markdown;
};
54
55/// \brief Comment token.
56class Token {
57  friend class Lexer;
58  friend class TextTokenRetokenizer;
59
60  /// The location of the token.
61  SourceLocation Loc;
62
63  /// The actual kind of the token.
64  tok::TokenKind Kind;
65
66  /// Length of the token spelling in comment.  Can be 0 for synthenized
67  /// tokens.
68  unsigned Length;
69
70  /// Contains text value associated with a token.
71  const char *TextPtr1;
72  unsigned TextLen1;
73
74public:
75  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
76  void setLocation(SourceLocation SL) { Loc = SL; }
77
78  SourceLocation getEndLocation() const LLVM_READONLY {
79    if (Length == 0 || Length == 1)
80      return Loc;
81    return Loc.getLocWithOffset(Length - 1);
82  }
83
84  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
85  void setKind(tok::TokenKind K) { Kind = K; }
86
87  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
88  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
89
90  unsigned getLength() const LLVM_READONLY { return Length; }
91  void setLength(unsigned L) { Length = L; }
92
93  StringRef getText() const LLVM_READONLY {
94    assert(is(tok::text));
95    return StringRef(TextPtr1, TextLen1);
96  }
97
98  void setText(StringRef Text) {
99    assert(is(tok::text));
100    TextPtr1 = Text.data();
101    TextLen1 = Text.size();
102  }
103
104  StringRef getCommandName() const LLVM_READONLY {
105    assert(is(tok::command));
106    return StringRef(TextPtr1, TextLen1);
107  }
108
109  void setCommandName(StringRef Name) {
110    assert(is(tok::command));
111    TextPtr1 = Name.data();
112    TextLen1 = Name.size();
113  }
114
115  StringRef getVerbatimBlockName() const LLVM_READONLY {
116    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
117    return StringRef(TextPtr1, TextLen1);
118  }
119
120  void setVerbatimBlockName(StringRef Name) {
121    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
122    TextPtr1 = Name.data();
123    TextLen1 = Name.size();
124  }
125
126  StringRef getVerbatimBlockText() const LLVM_READONLY {
127    assert(is(tok::verbatim_block_line));
128    return StringRef(TextPtr1, TextLen1);
129  }
130
131  void setVerbatimBlockText(StringRef Text) {
132    assert(is(tok::verbatim_block_line));
133    TextPtr1 = Text.data();
134    TextLen1 = Text.size();
135  }
136
137  /// Returns the name of verbatim line command.
138  StringRef getVerbatimLineName() const LLVM_READONLY {
139    assert(is(tok::verbatim_line_name));
140    return StringRef(TextPtr1, TextLen1);
141  }
142
143  void setVerbatimLineName(StringRef Name) {
144    assert(is(tok::verbatim_line_name));
145    TextPtr1 = Name.data();
146    TextLen1 = Name.size();
147  }
148
149  StringRef getVerbatimLineText() const LLVM_READONLY {
150    assert(is(tok::verbatim_line_text));
151    return StringRef(TextPtr1, TextLen1);
152  }
153
154  void setVerbatimLineText(StringRef Text) {
155    assert(is(tok::verbatim_line_text));
156    TextPtr1 = Text.data();
157    TextLen1 = Text.size();
158  }
159
160  StringRef getHTMLTagOpenName() const LLVM_READONLY {
161    assert(is(tok::html_tag_open));
162    return StringRef(TextPtr1, TextLen1);
163  }
164
165  void setHTMLTagOpenName(StringRef Name) {
166    assert(is(tok::html_tag_open));
167    TextPtr1 = Name.data();
168    TextLen1 = Name.size();
169  }
170
171  StringRef getHTMLIdent() const LLVM_READONLY {
172    assert(is(tok::html_ident));
173    return StringRef(TextPtr1, TextLen1);
174  }
175
176  void setHTMLIdent(StringRef Name) {
177    assert(is(tok::html_ident));
178    TextPtr1 = Name.data();
179    TextLen1 = Name.size();
180  }
181
182  StringRef getHTMLQuotedString() const LLVM_READONLY {
183    assert(is(tok::html_quoted_string));
184    return StringRef(TextPtr1, TextLen1);
185  }
186
187  void setHTMLQuotedString(StringRef Str) {
188    assert(is(tok::html_quoted_string));
189    TextPtr1 = Str.data();
190    TextLen1 = Str.size();
191  }
192
193  StringRef getHTMLTagCloseName() const LLVM_READONLY {
194    assert(is(tok::html_tag_close));
195    return StringRef(TextPtr1, TextLen1);
196  }
197
198  void setHTMLTagCloseName(StringRef Name) {
199    assert(is(tok::html_tag_close));
200    TextPtr1 = Name.data();
201    TextLen1 = Name.size();
202  }
203
204  void dump(const Lexer &L, const SourceManager &SM) const;
205};
206
207/// \brief Comment lexer.
208class Lexer {
209private:
210  Lexer(const Lexer&);          // DO NOT IMPLEMENT
211  void operator=(const Lexer&); // DO NOT IMPLEMENT
212
213  const char *const BufferStart;
214  const char *const BufferEnd;
215  SourceLocation FileLoc;
216  CommentOptions CommOpts;
217
218  const char *BufferPtr;
219
220  /// One past end pointer for the current comment.  For BCPL comments points
221  /// to newline or BufferEnd, for C comments points to star in '*/'.
222  const char *CommentEnd;
223
224  enum LexerCommentState {
225    LCS_BeforeComment,
226    LCS_InsideBCPLComment,
227    LCS_InsideCComment,
228    LCS_BetweenComments
229  };
230
231  /// Low-level lexer state, track if we are inside or outside of comment.
232  LexerCommentState CommentState;
233
234  enum LexerState {
235    /// Lexing normal comment text
236    LS_Normal,
237
238    /// Finished lexing verbatim block beginning command, will lex first body
239    /// line.
240    LS_VerbatimBlockFirstLine,
241
242    /// Lexing verbatim block body line-by-line, skipping line-starting
243    /// decorations.
244    LS_VerbatimBlockBody,
245
246    /// Finished lexing verbatim line beginning command, will lex text (one
247    /// line).
248    LS_VerbatimLineText,
249
250    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
251    LS_HTMLOpenTag,
252
253    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
254    LS_HTMLCloseTag
255  };
256
257  /// Current lexing mode.
258  LexerState State;
259
260  /// A verbatim-like block command eats every character (except line starting
261  /// decorations) until matching end command is seen or comment end is hit.
262  struct VerbatimBlockCommand {
263    StringRef BeginName;
264    StringRef EndName;
265  };
266
267  typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
268
269  /// Registered verbatim-like block commands.
270  VerbatimBlockCommandVector VerbatimBlockCommands;
271
272  /// If State is LS_VerbatimBlock, contains the the name of verbatim end
273  /// command, including command marker.
274  SmallString<16> VerbatimBlockEndCommandName;
275
276  bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
277
278  /// A verbatim-like line command eats everything until a newline is seen or
279  /// comment end is hit.
280  struct VerbatimLineCommand {
281    StringRef Name;
282  };
283
284  typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
285
286  /// Registered verbatim-like line commands.
287  VerbatimLineCommandVector VerbatimLineCommands;
288
289  bool isVerbatimLineCommand(StringRef Name) const;
290
291  void formTokenWithChars(Token &Result, const char *TokEnd,
292                          tok::TokenKind Kind) {
293    const unsigned TokLen = TokEnd - BufferPtr;
294    Result.setLocation(getSourceLocation(BufferPtr));
295    Result.setKind(Kind);
296    Result.setLength(TokLen);
297#ifndef NDEBUG
298    Result.TextPtr1 = "<UNSET>";
299    Result.TextLen1 = 7;
300#endif
301    BufferPtr = TokEnd;
302  }
303
304  SourceLocation getSourceLocation(const char *Loc) const {
305    assert(Loc >= BufferStart && Loc <= BufferEnd &&
306           "Location out of range for this buffer!");
307
308    const unsigned CharNo = Loc - BufferStart;
309    return FileLoc.getLocWithOffset(CharNo);
310  }
311
312  /// Eat string matching regexp \code \s*\* \endcode.
313  void skipLineStartingDecorations();
314
315  /// Lex stuff inside comments.  CommentEnd should be set correctly.
316  void lexCommentText(Token &T);
317
318  void setupAndLexVerbatimBlock(Token &T,
319                                const char *TextBegin,
320                                char Marker, StringRef EndName);
321
322  void lexVerbatimBlockFirstLine(Token &T);
323
324  void lexVerbatimBlockBody(Token &T);
325
326  void setupAndLexVerbatimLine(Token &T, const char *TextBegin);
327
328  void lexVerbatimLineText(Token &T);
329
330  void setupAndLexHTMLOpenTag(Token &T);
331
332  void lexHTMLOpenTag(Token &T);
333
334  void setupAndLexHTMLCloseTag(Token &T);
335
336  void lexHTMLCloseTag(Token &T);
337
338public:
339  Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
340        const char *BufferStart, const char *BufferEnd);
341
342  void lex(Token &T);
343
344  StringRef getSpelling(const Token &Tok,
345                        const SourceManager &SourceMgr,
346                        bool *Invalid = NULL) const;
347
348  /// \brief Register a new verbatim block command.
349  void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
350
351  /// \brief Register a new verbatim line command.
352  void addVerbatimLineCommand(StringRef Name);
353};
354
355/// Re-lexes a sequence of tok::text tokens.
356class TextTokenRetokenizer {
357  llvm::BumpPtrAllocator &Allocator;
358  static const unsigned MaxTokens = 16;
359  SmallVector<Token, MaxTokens> Toks;
360
361  struct Position {
362    unsigned CurToken;
363    const char *BufferStart;
364    const char *BufferEnd;
365    const char *BufferPtr;
366    SourceLocation BufferStartLoc;
367  };
368
369  /// Current position in Toks.
370  Position Pos;
371
372  bool isEnd() const {
373    return Pos.CurToken >= Toks.size();
374  }
375
376  /// Sets up the buffer pointers to point to current token.
377  void setupBuffer() {
378    assert(Pos.CurToken < Toks.size());
379    const Token &Tok = Toks[Pos.CurToken];
380
381    Pos.BufferStart = Tok.getText().begin();
382    Pos.BufferEnd = Tok.getText().end();
383    Pos.BufferPtr = Pos.BufferStart;
384    Pos.BufferStartLoc = Tok.getLocation();
385  }
386
387  SourceLocation getSourceLocation() const {
388    const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
389    return Pos.BufferStartLoc.getLocWithOffset(CharNo);
390  }
391
392  char peek() const {
393    assert(!isEnd());
394    assert(Pos.BufferPtr != Pos.BufferEnd);
395    return *Pos.BufferPtr;
396  }
397
398  void consumeChar() {
399    assert(!isEnd());
400    assert(Pos.BufferPtr != Pos.BufferEnd);
401    Pos.BufferPtr++;
402    if (Pos.BufferPtr == Pos.BufferEnd) {
403      Pos.CurToken++;
404      if (Pos.CurToken < Toks.size())
405        setupBuffer();
406    }
407  }
408
409  static bool isWhitespace(char C) {
410    return C == ' ' || C == '\n' || C == '\r' ||
411           C == '\t' || C == '\f' || C == '\v';
412  }
413
414  void consumeWhitespace() {
415    while (!isEnd()) {
416      if (isWhitespace(peek()))
417        consumeChar();
418      else
419        break;
420    }
421  }
422
423  void formTokenWithChars(Token &Result,
424                          SourceLocation Loc,
425                          const char *TokBegin,
426                          unsigned TokLength,
427                          StringRef Text) {
428    Result.setLocation(Loc);
429    Result.setKind(tok::text);
430    Result.setLength(TokLength);
431#ifndef NDEBUG
432    Result.TextPtr1 = "<UNSET>";
433    Result.TextLen1 = 7;
434#endif
435    Result.setText(Text);
436  }
437
438public:
439  TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator):
440      Allocator(Allocator) {
441    Pos.CurToken = 0;
442  }
443
444  /// Add a token.
445  /// Returns true on success, false if it seems like we have enough tokens.
446  bool addToken(const Token &Tok) {
447    assert(Tok.is(tok::text));
448    if (Toks.size() >= MaxTokens)
449      return false;
450
451    Toks.push_back(Tok);
452    if (Toks.size() == 1)
453      setupBuffer();
454    return true;
455  }
456
457  /// Extract a word -- sequence of non-whitespace characters.
458  bool lexWord(Token &Tok) {
459    if (isEnd())
460      return false;
461
462    Position SavedPos = Pos;
463
464    consumeWhitespace();
465    SmallString<32> WordText;
466    const char *WordBegin = Pos.BufferPtr;
467    SourceLocation Loc = getSourceLocation();
468    while (!isEnd()) {
469      const char C = peek();
470      if (!isWhitespace(C)) {
471        WordText.push_back(C);
472        consumeChar();
473      } else
474        break;
475    }
476    const unsigned Length = WordText.size();
477    if (Length == 0) {
478      Pos = SavedPos;
479      return false;
480    }
481
482    char *TextPtr = Allocator.Allocate<char>(Length + 1);
483
484    memcpy(TextPtr, WordText.c_str(), Length + 1);
485    StringRef Text = StringRef(TextPtr, Length);
486
487    formTokenWithChars(Tok, Loc, WordBegin,
488                       Pos.BufferPtr - WordBegin, Text);
489    return true;
490  }
491
492  bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
493    if (isEnd())
494      return false;
495
496    Position SavedPos = Pos;
497
498    consumeWhitespace();
499    SmallString<32> WordText;
500    const char *WordBegin = Pos.BufferPtr;
501    SourceLocation Loc = getSourceLocation();
502    bool Error = false;
503    if (!isEnd()) {
504      const char C = peek();
505      if (C == OpenDelim) {
506        WordText.push_back(C);
507        consumeChar();
508      } else
509        Error = true;
510    }
511    char C;
512    while (!Error && !isEnd()) {
513      C = peek();
514      WordText.push_back(C);
515      consumeChar();
516      if (C == CloseDelim)
517        break;
518    }
519    if (!Error && C != CloseDelim)
520      Error = true;
521
522    if (Error) {
523      Pos = SavedPos;
524      return false;
525    }
526
527    const unsigned Length = WordText.size();
528    char *TextPtr = Allocator.Allocate<char>(Length + 1);
529
530    memcpy(TextPtr, WordText.c_str(), Length + 1);
531    StringRef Text = StringRef(TextPtr, Length);
532
533    formTokenWithChars(Tok, Loc, WordBegin,
534                       Pos.BufferPtr - WordBegin, Text);
535    return true;
536  }
537
538  /// Return a text token.  Useful to take tokens back.
539  bool lexText(Token &Tok) {
540    if (isEnd())
541      return false;
542
543    if (Pos.BufferPtr != Pos.BufferStart)
544      formTokenWithChars(Tok, getSourceLocation(),
545                         Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
546                         StringRef(Pos.BufferPtr,
547                                   Pos.BufferEnd - Pos.BufferPtr));
548    else
549      Tok = Toks[Pos.CurToken];
550
551    Pos.CurToken++;
552    if (Pos.CurToken < Toks.size())
553      setupBuffer();
554    return true;
555  }
556};
557
558} // end namespace comments
559} // end namespace clang
560
561#endif
562
563