CommentLexer.h revision 651f13cea278ec967336033dd032faef0e9fc2ec
//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15#define LLVM_CLANG_AST_COMMENT_LEXER_H
16
17#include "clang/Basic/Diagnostic.h"
18#include "clang/Basic/SourceManager.h"
19#include "llvm/ADT/SmallString.h"
20#include "llvm/ADT/SmallVector.h"
21#include "llvm/ADT/StringRef.h"
22#include "llvm/Support/Allocator.h"
23#include "llvm/Support/raw_ostream.h"
24
25namespace clang {
26namespace comments {
27
// Forward declarations of types that this header only names by reference,
// by pointer, or in friend declarations before (or without) defining them.
class Lexer;
class TextTokenRetokenizer;
struct CommandInfo;
class CommandTraits;
32
namespace tok {
/// Kinds of tokens used by the comment lexer.
///
/// The kind of a token determines which payload accessors of Token may be
/// used on it (each accessor asserts the matching kind).
enum TokenKind {
  eof,
  newline,
  text,                 // Payload: text (Token::getText).
  unknown_command,      // Command that does not have an ID.
                        // Payload: command name.
  backslash_command,    // Command with an ID, that used backslash marker.
  at_command,           // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Payload: verbatim block ID.
  verbatim_block_line,  // Payload: text of the line.
  verbatim_block_end,   // Payload: verbatim block ID.
  verbatim_line_name,   // Payload: verbatim line ID.
  verbatim_line_text,   // Payload: text of the line.
  html_start_tag,       // <tag     Payload: tag name.
  html_ident,           // attr     Payload: identifier.
  html_equals,          // =
  html_quoted_string,   // "blah\"blah" or 'blah\'blah'   Payload: string.
  html_greater,         // >
  html_slash_greater,   // />
  html_end_tag          // </tag    Payload: tag name.
};
} // end namespace tok
55
56/// \brief Comment token.
57class Token {
58  friend class Lexer;
59  friend class TextTokenRetokenizer;
60
61  /// The location of the token.
62  SourceLocation Loc;
63
64  /// The actual kind of the token.
65  tok::TokenKind Kind;
66
67  /// Length of the token spelling in comment.  Can be 0 for synthenized
68  /// tokens.
69  unsigned Length;
70
71  /// Contains text value associated with a token.
72  const char *TextPtr;
73
74  /// Integer value associated with a token.
75  ///
76  /// If the token is a konwn command, contains command ID and TextPtr is
77  /// unused (command spelling can be found with CommandTraits).  Otherwise,
78  /// contains the length of the string that starts at TextPtr.
79  unsigned IntVal;
80
81public:
82  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
83  void setLocation(SourceLocation SL) { Loc = SL; }
84
85  SourceLocation getEndLocation() const LLVM_READONLY {
86    if (Length == 0 || Length == 1)
87      return Loc;
88    return Loc.getLocWithOffset(Length - 1);
89  }
90
91  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
92  void setKind(tok::TokenKind K) { Kind = K; }
93
94  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
95  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
96
97  unsigned getLength() const LLVM_READONLY { return Length; }
98  void setLength(unsigned L) { Length = L; }
99
100  StringRef getText() const LLVM_READONLY {
101    assert(is(tok::text));
102    return StringRef(TextPtr, IntVal);
103  }
104
105  void setText(StringRef Text) {
106    assert(is(tok::text));
107    TextPtr = Text.data();
108    IntVal = Text.size();
109  }
110
111  StringRef getUnknownCommandName() const LLVM_READONLY {
112    assert(is(tok::unknown_command));
113    return StringRef(TextPtr, IntVal);
114  }
115
116  void setUnknownCommandName(StringRef Name) {
117    assert(is(tok::unknown_command));
118    TextPtr = Name.data();
119    IntVal = Name.size();
120  }
121
122  unsigned getCommandID() const LLVM_READONLY {
123    assert(is(tok::backslash_command) || is(tok::at_command));
124    return IntVal;
125  }
126
127  void setCommandID(unsigned ID) {
128    assert(is(tok::backslash_command) || is(tok::at_command));
129    IntVal = ID;
130  }
131
132  unsigned getVerbatimBlockID() const LLVM_READONLY {
133    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
134    return IntVal;
135  }
136
137  void setVerbatimBlockID(unsigned ID) {
138    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
139    IntVal = ID;
140  }
141
142  StringRef getVerbatimBlockText() const LLVM_READONLY {
143    assert(is(tok::verbatim_block_line));
144    return StringRef(TextPtr, IntVal);
145  }
146
147  void setVerbatimBlockText(StringRef Text) {
148    assert(is(tok::verbatim_block_line));
149    TextPtr = Text.data();
150    IntVal = Text.size();
151  }
152
153  unsigned getVerbatimLineID() const LLVM_READONLY {
154    assert(is(tok::verbatim_line_name));
155    return IntVal;
156  }
157
158  void setVerbatimLineID(unsigned ID) {
159    assert(is(tok::verbatim_line_name));
160    IntVal = ID;
161  }
162
163  StringRef getVerbatimLineText() const LLVM_READONLY {
164    assert(is(tok::verbatim_line_text));
165    return StringRef(TextPtr, IntVal);
166  }
167
168  void setVerbatimLineText(StringRef Text) {
169    assert(is(tok::verbatim_line_text));
170    TextPtr = Text.data();
171    IntVal = Text.size();
172  }
173
174  StringRef getHTMLTagStartName() const LLVM_READONLY {
175    assert(is(tok::html_start_tag));
176    return StringRef(TextPtr, IntVal);
177  }
178
179  void setHTMLTagStartName(StringRef Name) {
180    assert(is(tok::html_start_tag));
181    TextPtr = Name.data();
182    IntVal = Name.size();
183  }
184
185  StringRef getHTMLIdent() const LLVM_READONLY {
186    assert(is(tok::html_ident));
187    return StringRef(TextPtr, IntVal);
188  }
189
190  void setHTMLIdent(StringRef Name) {
191    assert(is(tok::html_ident));
192    TextPtr = Name.data();
193    IntVal = Name.size();
194  }
195
196  StringRef getHTMLQuotedString() const LLVM_READONLY {
197    assert(is(tok::html_quoted_string));
198    return StringRef(TextPtr, IntVal);
199  }
200
201  void setHTMLQuotedString(StringRef Str) {
202    assert(is(tok::html_quoted_string));
203    TextPtr = Str.data();
204    IntVal = Str.size();
205  }
206
207  StringRef getHTMLTagEndName() const LLVM_READONLY {
208    assert(is(tok::html_end_tag));
209    return StringRef(TextPtr, IntVal);
210  }
211
212  void setHTMLTagEndName(StringRef Name) {
213    assert(is(tok::html_end_tag));
214    TextPtr = Name.data();
215    IntVal = Name.size();
216  }
217
218  void dump(const Lexer &L, const SourceManager &SM) const;
219};
220
221/// \brief Comment lexer.
222class Lexer {
223private:
224  Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
225  void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
226
227  /// Allocator for strings that are semantic values of tokens and have to be
228  /// computed (for example, resolved decimal character references).
229  llvm::BumpPtrAllocator &Allocator;
230
231  DiagnosticsEngine &Diags;
232
233  const CommandTraits &Traits;
234
235  const char *const BufferStart;
236  const char *const BufferEnd;
237  SourceLocation FileLoc;
238
239  const char *BufferPtr;
240
241  /// One past end pointer for the current comment.  For BCPL comments points
242  /// to newline or BufferEnd, for C comments points to star in '*/'.
243  const char *CommentEnd;
244
245  enum LexerCommentState {
246    LCS_BeforeComment,
247    LCS_InsideBCPLComment,
248    LCS_InsideCComment,
249    LCS_BetweenComments
250  };
251
252  /// Low-level lexer state, track if we are inside or outside of comment.
253  LexerCommentState CommentState;
254
255  enum LexerState {
256    /// Lexing normal comment text
257    LS_Normal,
258
259    /// Finished lexing verbatim block beginning command, will lex first body
260    /// line.
261    LS_VerbatimBlockFirstLine,
262
263    /// Lexing verbatim block body line-by-line, skipping line-starting
264    /// decorations.
265    LS_VerbatimBlockBody,
266
267    /// Finished lexing verbatim line beginning command, will lex text (one
268    /// line).
269    LS_VerbatimLineText,
270
271    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
272    LS_HTMLStartTag,
273
274    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
275    LS_HTMLEndTag
276  };
277
278  /// Current lexing mode.
279  LexerState State;
280
281  /// If State is LS_VerbatimBlock, contains the name of verbatim end
282  /// command, including command marker.
283  SmallString<16> VerbatimBlockEndCommandName;
284
285  /// Given a character reference name (e.g., "lt"), return the character that
286  /// it stands for (e.g., "<").
287  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
288
289  /// Given a Unicode codepoint as base-10 integer, return the character.
290  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
291
292  /// Given a Unicode codepoint as base-16 integer, return the character.
293  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
294
295  void formTokenWithChars(Token &Result, const char *TokEnd,
296                          tok::TokenKind Kind);
297
298  void formTextToken(Token &Result, const char *TokEnd) {
299    StringRef Text(BufferPtr, TokEnd - BufferPtr);
300    formTokenWithChars(Result, TokEnd, tok::text);
301    Result.setText(Text);
302  }
303
304  SourceLocation getSourceLocation(const char *Loc) const {
305    assert(Loc >= BufferStart && Loc <= BufferEnd &&
306           "Location out of range for this buffer!");
307
308    const unsigned CharNo = Loc - BufferStart;
309    return FileLoc.getLocWithOffset(CharNo);
310  }
311
312  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
313    return Diags.Report(Loc, DiagID);
314  }
315
316  /// Eat string matching regexp \code \s*\* \endcode.
317  void skipLineStartingDecorations();
318
319  /// Lex stuff inside comments.  CommentEnd should be set correctly.
320  void lexCommentText(Token &T);
321
322  void setupAndLexVerbatimBlock(Token &T,
323                                const char *TextBegin,
324                                char Marker, const CommandInfo *Info);
325
326  void lexVerbatimBlockFirstLine(Token &T);
327
328  void lexVerbatimBlockBody(Token &T);
329
330  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
331                               const CommandInfo *Info);
332
333  void lexVerbatimLineText(Token &T);
334
335  void lexHTMLCharacterReference(Token &T);
336
337  void setupAndLexHTMLStartTag(Token &T);
338
339  void lexHTMLStartTag(Token &T);
340
341  void setupAndLexHTMLEndTag(Token &T);
342
343  void lexHTMLEndTag(Token &T);
344
345public:
346  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
347        const CommandTraits &Traits,
348        SourceLocation FileLoc,
349        const char *BufferStart, const char *BufferEnd);
350
351  void lex(Token &T);
352
353  StringRef getSpelling(const Token &Tok,
354                        const SourceManager &SourceMgr,
355                        bool *Invalid = NULL) const;
356};
357
358} // end namespace comments
359} // end namespace clang
360
361#endif
362
363