CommentLexer.h revision 3f38bf2d441fac379c427f86153fbb0cb41256c6
1//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file defines the lexer for structured comments and the supporting
10//  token class.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15#define LLVM_CLANG_AST_COMMENT_LEXER_H
16
17#include "clang/Basic/SourceManager.h"
18#include "llvm/ADT/StringRef.h"
19#include "llvm/ADT/SmallString.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/Support/Allocator.h"
22#include "llvm/Support/raw_ostream.h"
23
24namespace clang {
25namespace comments {
26
27class Lexer;
28class TextTokenRetokenizer;
29
30namespace tok {
31enum TokenKind {
32  eof,
33  newline,
34  text,
35  command,
36  verbatim_block_begin,
37  verbatim_block_line,
38  verbatim_block_end,
39  verbatim_line_name,
40  verbatim_line_text,
41  html_start_tag,     // <tag
42  html_ident,         // attr
43  html_equals,        // =
44  html_quoted_string, // "blah\"blah" or 'blah\'blah'
45  html_greater,       // >
46  html_slash_greater, // />
47  html_end_tag        // </tag
48};
49} // end namespace tok
50
51class CommentOptions {
52public:
53  bool Markdown;
54};
55
56/// \brief Comment token.
57class Token {
58  friend class Lexer;
59  friend class TextTokenRetokenizer;
60
61  /// The location of the token.
62  SourceLocation Loc;
63
64  /// The actual kind of the token.
65  tok::TokenKind Kind;
66
67  /// Length of the token spelling in comment.  Can be 0 for synthenized
68  /// tokens.
69  unsigned Length;
70
71  /// Contains text value associated with a token.
72  const char *TextPtr1;
73  unsigned TextLen1;
74
75public:
76  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
77  void setLocation(SourceLocation SL) { Loc = SL; }
78
79  SourceLocation getEndLocation() const LLVM_READONLY {
80    if (Length == 0 || Length == 1)
81      return Loc;
82    return Loc.getLocWithOffset(Length - 1);
83  }
84
85  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
86  void setKind(tok::TokenKind K) { Kind = K; }
87
88  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
89  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
90
91  unsigned getLength() const LLVM_READONLY { return Length; }
92  void setLength(unsigned L) { Length = L; }
93
94  StringRef getText() const LLVM_READONLY {
95    assert(is(tok::text));
96    return StringRef(TextPtr1, TextLen1);
97  }
98
99  void setText(StringRef Text) {
100    assert(is(tok::text));
101    TextPtr1 = Text.data();
102    TextLen1 = Text.size();
103  }
104
105  StringRef getCommandName() const LLVM_READONLY {
106    assert(is(tok::command));
107    return StringRef(TextPtr1, TextLen1);
108  }
109
110  void setCommandName(StringRef Name) {
111    assert(is(tok::command));
112    TextPtr1 = Name.data();
113    TextLen1 = Name.size();
114  }
115
116  StringRef getVerbatimBlockName() const LLVM_READONLY {
117    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
118    return StringRef(TextPtr1, TextLen1);
119  }
120
121  void setVerbatimBlockName(StringRef Name) {
122    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
123    TextPtr1 = Name.data();
124    TextLen1 = Name.size();
125  }
126
127  StringRef getVerbatimBlockText() const LLVM_READONLY {
128    assert(is(tok::verbatim_block_line));
129    return StringRef(TextPtr1, TextLen1);
130  }
131
132  void setVerbatimBlockText(StringRef Text) {
133    assert(is(tok::verbatim_block_line));
134    TextPtr1 = Text.data();
135    TextLen1 = Text.size();
136  }
137
138  /// Returns the name of verbatim line command.
139  StringRef getVerbatimLineName() const LLVM_READONLY {
140    assert(is(tok::verbatim_line_name));
141    return StringRef(TextPtr1, TextLen1);
142  }
143
144  void setVerbatimLineName(StringRef Name) {
145    assert(is(tok::verbatim_line_name));
146    TextPtr1 = Name.data();
147    TextLen1 = Name.size();
148  }
149
150  StringRef getVerbatimLineText() const LLVM_READONLY {
151    assert(is(tok::verbatim_line_text));
152    return StringRef(TextPtr1, TextLen1);
153  }
154
155  void setVerbatimLineText(StringRef Text) {
156    assert(is(tok::verbatim_line_text));
157    TextPtr1 = Text.data();
158    TextLen1 = Text.size();
159  }
160
161  StringRef getHTMLTagStartName() const LLVM_READONLY {
162    assert(is(tok::html_start_tag));
163    return StringRef(TextPtr1, TextLen1);
164  }
165
166  void setHTMLTagStartName(StringRef Name) {
167    assert(is(tok::html_start_tag));
168    TextPtr1 = Name.data();
169    TextLen1 = Name.size();
170  }
171
172  StringRef getHTMLIdent() const LLVM_READONLY {
173    assert(is(tok::html_ident));
174    return StringRef(TextPtr1, TextLen1);
175  }
176
177  void setHTMLIdent(StringRef Name) {
178    assert(is(tok::html_ident));
179    TextPtr1 = Name.data();
180    TextLen1 = Name.size();
181  }
182
183  StringRef getHTMLQuotedString() const LLVM_READONLY {
184    assert(is(tok::html_quoted_string));
185    return StringRef(TextPtr1, TextLen1);
186  }
187
188  void setHTMLQuotedString(StringRef Str) {
189    assert(is(tok::html_quoted_string));
190    TextPtr1 = Str.data();
191    TextLen1 = Str.size();
192  }
193
194  StringRef getHTMLTagEndName() const LLVM_READONLY {
195    assert(is(tok::html_end_tag));
196    return StringRef(TextPtr1, TextLen1);
197  }
198
199  void setHTMLTagEndName(StringRef Name) {
200    assert(is(tok::html_end_tag));
201    TextPtr1 = Name.data();
202    TextLen1 = Name.size();
203  }
204
205  void dump(const Lexer &L, const SourceManager &SM) const;
206};
207
208/// \brief Comment lexer.
209class Lexer {
210private:
211  Lexer(const Lexer&);          // DO NOT IMPLEMENT
212  void operator=(const Lexer&); // DO NOT IMPLEMENT
213
214  const char *const BufferStart;
215  const char *const BufferEnd;
216  SourceLocation FileLoc;
217  CommentOptions CommOpts;
218
219  const char *BufferPtr;
220
221  /// One past end pointer for the current comment.  For BCPL comments points
222  /// to newline or BufferEnd, for C comments points to star in '*/'.
223  const char *CommentEnd;
224
225  enum LexerCommentState {
226    LCS_BeforeComment,
227    LCS_InsideBCPLComment,
228    LCS_InsideCComment,
229    LCS_BetweenComments
230  };
231
232  /// Low-level lexer state, track if we are inside or outside of comment.
233  LexerCommentState CommentState;
234
235  enum LexerState {
236    /// Lexing normal comment text
237    LS_Normal,
238
239    /// Finished lexing verbatim block beginning command, will lex first body
240    /// line.
241    LS_VerbatimBlockFirstLine,
242
243    /// Lexing verbatim block body line-by-line, skipping line-starting
244    /// decorations.
245    LS_VerbatimBlockBody,
246
247    /// Finished lexing verbatim line beginning command, will lex text (one
248    /// line).
249    LS_VerbatimLineText,
250
251    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
252    LS_HTMLStartTag,
253
254    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
255    LS_HTMLEndTag
256  };
257
258  /// Current lexing mode.
259  LexerState State;
260
261  /// A verbatim-like block command eats every character (except line starting
262  /// decorations) until matching end command is seen or comment end is hit.
263  struct VerbatimBlockCommand {
264    StringRef BeginName;
265    StringRef EndName;
266  };
267
268  typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
269
270  /// Registered verbatim-like block commands.
271  VerbatimBlockCommandVector VerbatimBlockCommands;
272
273  /// If State is LS_VerbatimBlock, contains the the name of verbatim end
274  /// command, including command marker.
275  SmallString<16> VerbatimBlockEndCommandName;
276
277  bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
278
279  /// A verbatim-like line command eats everything until a newline is seen or
280  /// comment end is hit.
281  struct VerbatimLineCommand {
282    StringRef Name;
283  };
284
285  typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
286
287  /// Registered verbatim-like line commands.
288  VerbatimLineCommandVector VerbatimLineCommands;
289
290  bool isVerbatimLineCommand(StringRef Name) const;
291
292  void formTokenWithChars(Token &Result, const char *TokEnd,
293                          tok::TokenKind Kind) {
294    const unsigned TokLen = TokEnd - BufferPtr;
295    Result.setLocation(getSourceLocation(BufferPtr));
296    Result.setKind(Kind);
297    Result.setLength(TokLen);
298#ifndef NDEBUG
299    Result.TextPtr1 = "<UNSET>";
300    Result.TextLen1 = 7;
301#endif
302    BufferPtr = TokEnd;
303  }
304
305  SourceLocation getSourceLocation(const char *Loc) const {
306    assert(Loc >= BufferStart && Loc <= BufferEnd &&
307           "Location out of range for this buffer!");
308
309    const unsigned CharNo = Loc - BufferStart;
310    return FileLoc.getLocWithOffset(CharNo);
311  }
312
313  /// Eat string matching regexp \code \s*\* \endcode.
314  void skipLineStartingDecorations();
315
316  /// Lex stuff inside comments.  CommentEnd should be set correctly.
317  void lexCommentText(Token &T);
318
319  void setupAndLexVerbatimBlock(Token &T,
320                                const char *TextBegin,
321                                char Marker, StringRef EndName);
322
323  void lexVerbatimBlockFirstLine(Token &T);
324
325  void lexVerbatimBlockBody(Token &T);
326
327  void setupAndLexVerbatimLine(Token &T, const char *TextBegin);
328
329  void lexVerbatimLineText(Token &T);
330
331  void setupAndLexHTMLStartTag(Token &T);
332
333  void lexHTMLStartTag(Token &T);
334
335  void setupAndLexHTMLEndTag(Token &T);
336
337  void lexHTMLEndTag(Token &T);
338
339public:
340  Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
341        const char *BufferStart, const char *BufferEnd);
342
343  void lex(Token &T);
344
345  StringRef getSpelling(const Token &Tok,
346                        const SourceManager &SourceMgr,
347                        bool *Invalid = NULL) const;
348
349  /// \brief Register a new verbatim block command.
350  void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
351
352  /// \brief Register a new verbatim line command.
353  void addVerbatimLineCommand(StringRef Name);
354};
355
356/// Re-lexes a sequence of tok::text tokens.
357class TextTokenRetokenizer {
358  llvm::BumpPtrAllocator &Allocator;
359  static const unsigned MaxTokens = 16;
360  SmallVector<Token, MaxTokens> Toks;
361
362  struct Position {
363    unsigned CurToken;
364    const char *BufferStart;
365    const char *BufferEnd;
366    const char *BufferPtr;
367    SourceLocation BufferStartLoc;
368  };
369
370  /// Current position in Toks.
371  Position Pos;
372
373  bool isEnd() const {
374    return Pos.CurToken >= Toks.size();
375  }
376
377  /// Sets up the buffer pointers to point to current token.
378  void setupBuffer() {
379    assert(Pos.CurToken < Toks.size());
380    const Token &Tok = Toks[Pos.CurToken];
381
382    Pos.BufferStart = Tok.getText().begin();
383    Pos.BufferEnd = Tok.getText().end();
384    Pos.BufferPtr = Pos.BufferStart;
385    Pos.BufferStartLoc = Tok.getLocation();
386  }
387
388  SourceLocation getSourceLocation() const {
389    const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
390    return Pos.BufferStartLoc.getLocWithOffset(CharNo);
391  }
392
393  char peek() const {
394    assert(!isEnd());
395    assert(Pos.BufferPtr != Pos.BufferEnd);
396    return *Pos.BufferPtr;
397  }
398
399  void consumeChar() {
400    assert(!isEnd());
401    assert(Pos.BufferPtr != Pos.BufferEnd);
402    Pos.BufferPtr++;
403    if (Pos.BufferPtr == Pos.BufferEnd) {
404      Pos.CurToken++;
405      if (Pos.CurToken < Toks.size())
406        setupBuffer();
407    }
408  }
409
410  static bool isWhitespace(char C) {
411    return C == ' ' || C == '\n' || C == '\r' ||
412           C == '\t' || C == '\f' || C == '\v';
413  }
414
415  void consumeWhitespace() {
416    while (!isEnd()) {
417      if (isWhitespace(peek()))
418        consumeChar();
419      else
420        break;
421    }
422  }
423
424  void formTokenWithChars(Token &Result,
425                          SourceLocation Loc,
426                          const char *TokBegin,
427                          unsigned TokLength,
428                          StringRef Text) {
429    Result.setLocation(Loc);
430    Result.setKind(tok::text);
431    Result.setLength(TokLength);
432#ifndef NDEBUG
433    Result.TextPtr1 = "<UNSET>";
434    Result.TextLen1 = 7;
435#endif
436    Result.setText(Text);
437  }
438
439public:
440  TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator):
441      Allocator(Allocator) {
442    Pos.CurToken = 0;
443  }
444
445  /// Add a token.
446  /// Returns true on success, false if it seems like we have enough tokens.
447  bool addToken(const Token &Tok) {
448    assert(Tok.is(tok::text));
449    if (Toks.size() >= MaxTokens)
450      return false;
451
452    Toks.push_back(Tok);
453    if (Toks.size() == 1)
454      setupBuffer();
455    return true;
456  }
457
458  /// Extract a word -- sequence of non-whitespace characters.
459  bool lexWord(Token &Tok) {
460    if (isEnd())
461      return false;
462
463    Position SavedPos = Pos;
464
465    consumeWhitespace();
466    SmallString<32> WordText;
467    const char *WordBegin = Pos.BufferPtr;
468    SourceLocation Loc = getSourceLocation();
469    while (!isEnd()) {
470      const char C = peek();
471      if (!isWhitespace(C)) {
472        WordText.push_back(C);
473        consumeChar();
474      } else
475        break;
476    }
477    const unsigned Length = WordText.size();
478    if (Length == 0) {
479      Pos = SavedPos;
480      return false;
481    }
482
483    char *TextPtr = Allocator.Allocate<char>(Length + 1);
484
485    memcpy(TextPtr, WordText.c_str(), Length + 1);
486    StringRef Text = StringRef(TextPtr, Length);
487
488    formTokenWithChars(Tok, Loc, WordBegin,
489                       Pos.BufferPtr - WordBegin, Text);
490    return true;
491  }
492
493  bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
494    if (isEnd())
495      return false;
496
497    Position SavedPos = Pos;
498
499    consumeWhitespace();
500    SmallString<32> WordText;
501    const char *WordBegin = Pos.BufferPtr;
502    SourceLocation Loc = getSourceLocation();
503    bool Error = false;
504    if (!isEnd()) {
505      const char C = peek();
506      if (C == OpenDelim) {
507        WordText.push_back(C);
508        consumeChar();
509      } else
510        Error = true;
511    }
512    char C = '\0';
513    while (!Error && !isEnd()) {
514      C = peek();
515      WordText.push_back(C);
516      consumeChar();
517      if (C == CloseDelim)
518        break;
519    }
520    if (!Error && C != CloseDelim)
521      Error = true;
522
523    if (Error) {
524      Pos = SavedPos;
525      return false;
526    }
527
528    const unsigned Length = WordText.size();
529    char *TextPtr = Allocator.Allocate<char>(Length + 1);
530
531    memcpy(TextPtr, WordText.c_str(), Length + 1);
532    StringRef Text = StringRef(TextPtr, Length);
533
534    formTokenWithChars(Tok, Loc, WordBegin,
535                       Pos.BufferPtr - WordBegin, Text);
536    return true;
537  }
538
539  /// Return a text token.  Useful to take tokens back.
540  bool lexText(Token &Tok) {
541    if (isEnd())
542      return false;
543
544    if (Pos.BufferPtr != Pos.BufferStart)
545      formTokenWithChars(Tok, getSourceLocation(),
546                         Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
547                         StringRef(Pos.BufferPtr,
548                                   Pos.BufferEnd - Pos.BufferPtr));
549    else
550      Tok = Toks[Pos.CurToken];
551
552    Pos.CurToken++;
553    if (Pos.CurToken < Toks.size())
554      setupBuffer();
555    return true;
556  }
557};
558
559} // end namespace comments
560} // end namespace clang
561
562#endif
563
564