//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
15#define LLVM_CLANG_AST_COMMENT_LEXER_H
16
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
23
24namespace clang {
25namespace comments {
26
// Forward declarations.  Token befriends Lexer and TextTokenRetokenizer, and
// this header only uses CommandInfo and CommandTraits through pointers and
// references, so complete types are not required here.
class Lexer;
class TextTokenRetokenizer;
struct CommandInfo;
class CommandTraits;
31
namespace tok {
/// Kinds of tokens produced by the comment lexer.
///
/// The trailing comments note which semantic value, if any, the Token
/// accessors expose for each kind.
enum TokenKind {
  eof,
  newline,
  text,                 // Carries text (Token::getText).
  unknown_command,      // Command that does not have an ID; carries its name.
  backslash_command,    // Command with an ID, that used backslash marker.
  at_command,           // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries the text of one verbatim block line.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries the verbatim line text.
  html_start_tag,       // <tag
  html_ident,           // attr
  html_equals,          // =
  html_quoted_string,   // "blah\"blah" or 'blah\'blah'
  html_greater,         // >
  html_slash_greater,   // />
  html_end_tag          // </tag
};
} // end namespace tok
54
55/// \brief Comment token.
56class Token {
57  friend class Lexer;
58  friend class TextTokenRetokenizer;
59
60  /// The location of the token.
61  SourceLocation Loc;
62
63  /// The actual kind of the token.
64  tok::TokenKind Kind;
65
66  /// Length of the token spelling in comment.  Can be 0 for synthenized
67  /// tokens.
68  unsigned Length;
69
70  /// Contains text value associated with a token.
71  const char *TextPtr;
72
73  /// Integer value associated with a token.
74  ///
75  /// If the token is a konwn command, contains command ID and TextPtr is
76  /// unused (command spelling can be found with CommandTraits).  Otherwise,
77  /// contains the length of the string that starts at TextPtr.
78  unsigned IntVal;
79
80public:
81  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
82  void setLocation(SourceLocation SL) { Loc = SL; }
83
84  SourceLocation getEndLocation() const LLVM_READONLY {
85    if (Length == 0 || Length == 1)
86      return Loc;
87    return Loc.getLocWithOffset(Length - 1);
88  }
89
90  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
91  void setKind(tok::TokenKind K) { Kind = K; }
92
93  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
94  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
95
96  unsigned getLength() const LLVM_READONLY { return Length; }
97  void setLength(unsigned L) { Length = L; }
98
99  StringRef getText() const LLVM_READONLY {
100    assert(is(tok::text));
101    return StringRef(TextPtr, IntVal);
102  }
103
104  void setText(StringRef Text) {
105    assert(is(tok::text));
106    TextPtr = Text.data();
107    IntVal = Text.size();
108  }
109
110  StringRef getUnknownCommandName() const LLVM_READONLY {
111    assert(is(tok::unknown_command));
112    return StringRef(TextPtr, IntVal);
113  }
114
115  void setUnknownCommandName(StringRef Name) {
116    assert(is(tok::unknown_command));
117    TextPtr = Name.data();
118    IntVal = Name.size();
119  }
120
121  unsigned getCommandID() const LLVM_READONLY {
122    assert(is(tok::backslash_command) || is(tok::at_command));
123    return IntVal;
124  }
125
126  void setCommandID(unsigned ID) {
127    assert(is(tok::backslash_command) || is(tok::at_command));
128    IntVal = ID;
129  }
130
131  unsigned getVerbatimBlockID() const LLVM_READONLY {
132    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
133    return IntVal;
134  }
135
136  void setVerbatimBlockID(unsigned ID) {
137    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
138    IntVal = ID;
139  }
140
141  StringRef getVerbatimBlockText() const LLVM_READONLY {
142    assert(is(tok::verbatim_block_line));
143    return StringRef(TextPtr, IntVal);
144  }
145
146  void setVerbatimBlockText(StringRef Text) {
147    assert(is(tok::verbatim_block_line));
148    TextPtr = Text.data();
149    IntVal = Text.size();
150  }
151
152  unsigned getVerbatimLineID() const LLVM_READONLY {
153    assert(is(tok::verbatim_line_name));
154    return IntVal;
155  }
156
157  void setVerbatimLineID(unsigned ID) {
158    assert(is(tok::verbatim_line_name));
159    IntVal = ID;
160  }
161
162  StringRef getVerbatimLineText() const LLVM_READONLY {
163    assert(is(tok::verbatim_line_text));
164    return StringRef(TextPtr, IntVal);
165  }
166
167  void setVerbatimLineText(StringRef Text) {
168    assert(is(tok::verbatim_line_text));
169    TextPtr = Text.data();
170    IntVal = Text.size();
171  }
172
173  StringRef getHTMLTagStartName() const LLVM_READONLY {
174    assert(is(tok::html_start_tag));
175    return StringRef(TextPtr, IntVal);
176  }
177
178  void setHTMLTagStartName(StringRef Name) {
179    assert(is(tok::html_start_tag));
180    TextPtr = Name.data();
181    IntVal = Name.size();
182  }
183
184  StringRef getHTMLIdent() const LLVM_READONLY {
185    assert(is(tok::html_ident));
186    return StringRef(TextPtr, IntVal);
187  }
188
189  void setHTMLIdent(StringRef Name) {
190    assert(is(tok::html_ident));
191    TextPtr = Name.data();
192    IntVal = Name.size();
193  }
194
195  StringRef getHTMLQuotedString() const LLVM_READONLY {
196    assert(is(tok::html_quoted_string));
197    return StringRef(TextPtr, IntVal);
198  }
199
200  void setHTMLQuotedString(StringRef Str) {
201    assert(is(tok::html_quoted_string));
202    TextPtr = Str.data();
203    IntVal = Str.size();
204  }
205
206  StringRef getHTMLTagEndName() const LLVM_READONLY {
207    assert(is(tok::html_end_tag));
208    return StringRef(TextPtr, IntVal);
209  }
210
211  void setHTMLTagEndName(StringRef Name) {
212    assert(is(tok::html_end_tag));
213    TextPtr = Name.data();
214    IntVal = Name.size();
215  }
216
217  void dump(const Lexer &L, const SourceManager &SM) const;
218};
219
220/// \brief Comment lexer.
221class Lexer {
222private:
223  Lexer(const Lexer &) LLVM_DELETED_FUNCTION;
224  void operator=(const Lexer &) LLVM_DELETED_FUNCTION;
225
226  /// Allocator for strings that are semantic values of tokens and have to be
227  /// computed (for example, resolved decimal character references).
228  llvm::BumpPtrAllocator &Allocator;
229
230  const CommandTraits &Traits;
231
232  const char *const BufferStart;
233  const char *const BufferEnd;
234  SourceLocation FileLoc;
235
236  const char *BufferPtr;
237
238  /// One past end pointer for the current comment.  For BCPL comments points
239  /// to newline or BufferEnd, for C comments points to star in '*/'.
240  const char *CommentEnd;
241
242  enum LexerCommentState {
243    LCS_BeforeComment,
244    LCS_InsideBCPLComment,
245    LCS_InsideCComment,
246    LCS_BetweenComments
247  };
248
249  /// Low-level lexer state, track if we are inside or outside of comment.
250  LexerCommentState CommentState;
251
252  enum LexerState {
253    /// Lexing normal comment text
254    LS_Normal,
255
256    /// Finished lexing verbatim block beginning command, will lex first body
257    /// line.
258    LS_VerbatimBlockFirstLine,
259
260    /// Lexing verbatim block body line-by-line, skipping line-starting
261    /// decorations.
262    LS_VerbatimBlockBody,
263
264    /// Finished lexing verbatim line beginning command, will lex text (one
265    /// line).
266    LS_VerbatimLineText,
267
268    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
269    LS_HTMLStartTag,
270
271    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
272    LS_HTMLEndTag
273  };
274
275  /// Current lexing mode.
276  LexerState State;
277
278  /// If State is LS_VerbatimBlock, contains the name of verbatim end
279  /// command, including command marker.
280  SmallString<16> VerbatimBlockEndCommandName;
281
282  /// Given a character reference name (e.g., "lt"), return the character that
283  /// it stands for (e.g., "<").
284  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
285
286  /// Given a Unicode codepoint as base-10 integer, return the character.
287  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
288
289  /// Given a Unicode codepoint as base-16 integer, return the character.
290  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
291
292  void formTokenWithChars(Token &Result, const char *TokEnd,
293                          tok::TokenKind Kind) {
294    const unsigned TokLen = TokEnd - BufferPtr;
295    Result.setLocation(getSourceLocation(BufferPtr));
296    Result.setKind(Kind);
297    Result.setLength(TokLen);
298#ifndef NDEBUG
299    Result.TextPtr = "<UNSET>";
300    Result.IntVal = 7;
301#endif
302    BufferPtr = TokEnd;
303  }
304
305  void formTextToken(Token &Result, const char *TokEnd) {
306    StringRef Text(BufferPtr, TokEnd - BufferPtr);
307    formTokenWithChars(Result, TokEnd, tok::text);
308    Result.setText(Text);
309  }
310
311  SourceLocation getSourceLocation(const char *Loc) const {
312    assert(Loc >= BufferStart && Loc <= BufferEnd &&
313           "Location out of range for this buffer!");
314
315    const unsigned CharNo = Loc - BufferStart;
316    return FileLoc.getLocWithOffset(CharNo);
317  }
318
319  /// Eat string matching regexp \code \s*\* \endcode.
320  void skipLineStartingDecorations();
321
322  /// Lex stuff inside comments.  CommentEnd should be set correctly.
323  void lexCommentText(Token &T);
324
325  void setupAndLexVerbatimBlock(Token &T,
326                                const char *TextBegin,
327                                char Marker, const CommandInfo *Info);
328
329  void lexVerbatimBlockFirstLine(Token &T);
330
331  void lexVerbatimBlockBody(Token &T);
332
333  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
334                               const CommandInfo *Info);
335
336  void lexVerbatimLineText(Token &T);
337
338  void lexHTMLCharacterReference(Token &T);
339
340  void setupAndLexHTMLStartTag(Token &T);
341
342  void lexHTMLStartTag(Token &T);
343
344  void setupAndLexHTMLEndTag(Token &T);
345
346  void lexHTMLEndTag(Token &T);
347
348public:
349  Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
350        SourceLocation FileLoc,
351        const char *BufferStart, const char *BufferEnd);
352
353  void lex(Token &T);
354
355  StringRef getSpelling(const Token &Tok,
356                        const SourceManager &SourceMgr,
357                        bool *Invalid = NULL) const;
358};
359
360} // end namespace comments
361} // end namespace clang
362
363#endif
364
365