CommentParser.cpp revision c4b0f9b851ca59e61b802d58792ea3600fd9a9d4
1//===--- CommentParser.cpp - Doxygen comment parser -----------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "clang/AST/CommentParser.h"
11#include "clang/AST/CommentSema.h"
12#include "clang/AST/CommentDiagnostic.h"
13#include "clang/Basic/SourceManager.h"
14#include "llvm/Support/ErrorHandling.h"
15
16namespace clang {
17namespace comments {
18
19/// Re-lexes a sequence of tok::text tokens.
20class TextTokenRetokenizer {
21  llvm::BumpPtrAllocator &Allocator;
22  static const unsigned MaxTokens = 16;
23  SmallVector<Token, MaxTokens> Toks;
24
25  struct Position {
26    unsigned CurToken;
27    const char *BufferStart;
28    const char *BufferEnd;
29    const char *BufferPtr;
30    SourceLocation BufferStartLoc;
31  };
32
33  /// Current position in Toks.
34  Position Pos;
35
36  bool isEnd() const {
37    return Pos.CurToken >= Toks.size();
38  }
39
40  /// Sets up the buffer pointers to point to current token.
41  void setupBuffer() {
42    assert(Pos.CurToken < Toks.size());
43    const Token &Tok = Toks[Pos.CurToken];
44
45    Pos.BufferStart = Tok.getText().begin();
46    Pos.BufferEnd = Tok.getText().end();
47    Pos.BufferPtr = Pos.BufferStart;
48    Pos.BufferStartLoc = Tok.getLocation();
49  }
50
51  SourceLocation getSourceLocation() const {
52    const unsigned CharNo = Pos.BufferPtr - Pos.BufferStart;
53    return Pos.BufferStartLoc.getLocWithOffset(CharNo);
54  }
55
56  char peek() const {
57    assert(!isEnd());
58    assert(Pos.BufferPtr != Pos.BufferEnd);
59    return *Pos.BufferPtr;
60  }
61
62  void consumeChar() {
63    assert(!isEnd());
64    assert(Pos.BufferPtr != Pos.BufferEnd);
65    Pos.BufferPtr++;
66    if (Pos.BufferPtr == Pos.BufferEnd) {
67      Pos.CurToken++;
68      if (Pos.CurToken < Toks.size())
69        setupBuffer();
70    }
71  }
72
73  static bool isWhitespace(char C) {
74    return C == ' ' || C == '\n' || C == '\r' ||
75           C == '\t' || C == '\f' || C == '\v';
76  }
77
78  void consumeWhitespace() {
79    while (!isEnd()) {
80      if (isWhitespace(peek()))
81        consumeChar();
82      else
83        break;
84    }
85  }
86
87  void formTokenWithChars(Token &Result,
88                          SourceLocation Loc,
89                          const char *TokBegin,
90                          unsigned TokLength,
91                          StringRef Text) {
92    Result.setLocation(Loc);
93    Result.setKind(tok::text);
94    Result.setLength(TokLength);
95#ifndef NDEBUG
96    Result.TextPtr1 = "<UNSET>";
97    Result.TextLen1 = 7;
98#endif
99    Result.setText(Text);
100  }
101
102public:
103  TextTokenRetokenizer(llvm::BumpPtrAllocator &Allocator):
104      Allocator(Allocator) {
105    Pos.CurToken = 0;
106  }
107
108  /// Add a token.
109  /// Returns true on success, false if it seems like we have enough tokens.
110  bool addToken(const Token &Tok) {
111    assert(Tok.is(tok::text));
112    if (Toks.size() >= MaxTokens)
113      return false;
114
115    Toks.push_back(Tok);
116    if (Toks.size() == 1)
117      setupBuffer();
118    return true;
119  }
120
121  /// Extract a word -- sequence of non-whitespace characters.
122  bool lexWord(Token &Tok) {
123    if (isEnd())
124      return false;
125
126    Position SavedPos = Pos;
127
128    consumeWhitespace();
129    SmallString<32> WordText;
130    const char *WordBegin = Pos.BufferPtr;
131    SourceLocation Loc = getSourceLocation();
132    while (!isEnd()) {
133      const char C = peek();
134      if (!isWhitespace(C)) {
135        WordText.push_back(C);
136        consumeChar();
137      } else
138        break;
139    }
140    const unsigned Length = WordText.size();
141    if (Length == 0) {
142      Pos = SavedPos;
143      return false;
144    }
145
146    char *TextPtr = Allocator.Allocate<char>(Length + 1);
147
148    memcpy(TextPtr, WordText.c_str(), Length + 1);
149    StringRef Text = StringRef(TextPtr, Length);
150
151    formTokenWithChars(Tok, Loc, WordBegin,
152                       Pos.BufferPtr - WordBegin, Text);
153    return true;
154  }
155
156  bool lexDelimitedSeq(Token &Tok, char OpenDelim, char CloseDelim) {
157    if (isEnd())
158      return false;
159
160    Position SavedPos = Pos;
161
162    consumeWhitespace();
163    SmallString<32> WordText;
164    const char *WordBegin = Pos.BufferPtr;
165    SourceLocation Loc = getSourceLocation();
166    bool Error = false;
167    if (!isEnd()) {
168      const char C = peek();
169      if (C == OpenDelim) {
170        WordText.push_back(C);
171        consumeChar();
172      } else
173        Error = true;
174    }
175    char C = '\0';
176    while (!Error && !isEnd()) {
177      C = peek();
178      WordText.push_back(C);
179      consumeChar();
180      if (C == CloseDelim)
181        break;
182    }
183    if (!Error && C != CloseDelim)
184      Error = true;
185
186    if (Error) {
187      Pos = SavedPos;
188      return false;
189    }
190
191    const unsigned Length = WordText.size();
192    char *TextPtr = Allocator.Allocate<char>(Length + 1);
193
194    memcpy(TextPtr, WordText.c_str(), Length + 1);
195    StringRef Text = StringRef(TextPtr, Length);
196
197    formTokenWithChars(Tok, Loc, WordBegin,
198                       Pos.BufferPtr - WordBegin, Text);
199    return true;
200  }
201
202  /// Return a text token.  Useful to take tokens back.
203  bool lexText(Token &Tok) {
204    if (isEnd())
205      return false;
206
207    if (Pos.BufferPtr != Pos.BufferStart)
208      formTokenWithChars(Tok, getSourceLocation(),
209                         Pos.BufferPtr, Pos.BufferEnd - Pos.BufferPtr,
210                         StringRef(Pos.BufferPtr,
211                                   Pos.BufferEnd - Pos.BufferPtr));
212    else
213      Tok = Toks[Pos.CurToken];
214
215    Pos.CurToken++;
216    if (Pos.CurToken < Toks.size())
217      setupBuffer();
218    return true;
219  }
220};
221
222Parser::Parser(Lexer &L, Sema &S, llvm::BumpPtrAllocator &Allocator,
223               const SourceManager &SourceMgr, DiagnosticsEngine &Diags):
224    L(L), S(S), Allocator(Allocator), SourceMgr(SourceMgr), Diags(Diags) {
225  consumeToken();
226}
227
228ParamCommandComment *Parser::parseParamCommandArgs(
229    ParamCommandComment *PC,
230    TextTokenRetokenizer &Retokenizer) {
231  Token Arg;
232  // Check if argument looks like direction specification: [dir]
233  // e.g., [in], [out], [in,out]
234  if (Retokenizer.lexDelimitedSeq(Arg, '[', ']'))
235    PC = S.actOnParamCommandDirectionArg(PC,
236                                         Arg.getLocation(),
237                                         Arg.getEndLocation(),
238                                         Arg.getText());
239
240  if (Retokenizer.lexWord(Arg))
241    PC = S.actOnParamCommandParamNameArg(PC,
242                                         Arg.getLocation(),
243                                         Arg.getEndLocation(),
244                                         Arg.getText());
245
246  return PC;
247}
248
249BlockCommandComment *Parser::parseBlockCommandArgs(
250    BlockCommandComment *BC,
251    TextTokenRetokenizer &Retokenizer,
252    unsigned NumArgs) {
253  typedef BlockCommandComment::Argument Argument;
254  Argument *Args =
255      new (Allocator.Allocate<Argument>(NumArgs)) Argument[NumArgs];
256  unsigned ParsedArgs = 0;
257  Token Arg;
258  while (ParsedArgs < NumArgs && Retokenizer.lexWord(Arg)) {
259    Args[ParsedArgs] = Argument(SourceRange(Arg.getLocation(),
260                                            Arg.getEndLocation()),
261                                Arg.getText());
262    ParsedArgs++;
263  }
264
265  return S.actOnBlockCommandArgs(BC, llvm::makeArrayRef(Args, ParsedArgs));
266}
267
268BlockCommandComment *Parser::parseBlockCommand() {
269  assert(Tok.is(tok::command));
270
271  ParamCommandComment *PC;
272  BlockCommandComment *BC;
273  bool IsParam = false;
274  unsigned NumArgs = 0;
275  if (S.isParamCommand(Tok.getCommandName())) {
276    IsParam = true;
277    PC = S.actOnParamCommandStart(Tok.getLocation(),
278                                  Tok.getEndLocation(),
279                                  Tok.getCommandName());
280  } else {
281    NumArgs = S.getBlockCommandNumArgs(Tok.getCommandName());
282    BC = S.actOnBlockCommandStart(Tok.getLocation(),
283                                  Tok.getEndLocation(),
284                                  Tok.getCommandName());
285  }
286  consumeToken();
287
288  if (Tok.is(tok::command) && S.isBlockCommand(Tok.getCommandName())) {
289    // Block command ahead.  We can't nest block commands, so pretend that this
290    // command has an empty argument.
291    ParagraphComment *PC = S.actOnParagraphComment(
292                                ArrayRef<InlineContentComment *>());
293    return S.actOnBlockCommandFinish(BC, PC);
294  }
295
296  if (IsParam || NumArgs > 0) {
297    // In order to parse command arguments we need to retokenize a few
298    // following text tokens.
299    TextTokenRetokenizer Retokenizer(Allocator);
300    while (Tok.is(tok::text)) {
301      if (Retokenizer.addToken(Tok))
302        consumeToken();
303    }
304
305    if (IsParam)
306      PC = parseParamCommandArgs(PC, Retokenizer);
307    else
308      BC = parseBlockCommandArgs(BC, Retokenizer, NumArgs);
309
310    // Put back tokens we didn't use.
311    SmallVector<Token, 16> TextToks;
312    Token Text;
313    while (Retokenizer.lexText(Text)) {
314      TextToks.push_back(Text);
315    }
316    putBack(TextToks);
317  }
318
319  BlockContentComment *Block = parseParagraphOrBlockCommand();
320  // Since we have checked for a block command, we should have parsed a
321  // paragraph.
322  if (IsParam)
323    return S.actOnParamCommandFinish(PC, cast<ParagraphComment>(Block));
324  else
325    return S.actOnBlockCommandFinish(BC, cast<ParagraphComment>(Block));
326}
327
328InlineCommandComment *Parser::parseInlineCommand() {
329  assert(Tok.is(tok::command));
330
331  const Token CommandTok = Tok;
332  consumeToken();
333
334  TextTokenRetokenizer Retokenizer(Allocator);
335  while (Tok.is(tok::text)) {
336    if (Retokenizer.addToken(Tok))
337      consumeToken();
338  }
339
340  Token ArgTok;
341  bool ArgTokValid = Retokenizer.lexWord(ArgTok);
342
343  InlineCommandComment *IC;
344  if (ArgTokValid) {
345    IC = S.actOnInlineCommand(CommandTok.getLocation(),
346                              CommandTok.getEndLocation(),
347                              CommandTok.getCommandName(),
348                              ArgTok.getLocation(),
349                              ArgTok.getEndLocation(),
350                              ArgTok.getText());
351  } else {
352    IC = S.actOnInlineCommand(CommandTok.getLocation(),
353                              CommandTok.getEndLocation(),
354                              CommandTok.getCommandName());
355  }
356
357  Token Text;
358  while (Retokenizer.lexText(Text))
359    putBack(Text);
360
361  return IC;
362}
363
364HTMLStartTagComment *Parser::parseHTMLStartTag() {
365  assert(Tok.is(tok::html_start_tag));
366  HTMLStartTagComment *HST =
367      S.actOnHTMLStartTagStart(Tok.getLocation(),
368                               Tok.getHTMLTagStartName());
369  consumeToken();
370
371  SmallVector<HTMLStartTagComment::Attribute, 2> Attrs;
372  while (true) {
373    switch (Tok.getKind()) {
374    case tok::html_ident: {
375      Token Ident = Tok;
376      consumeToken();
377      if (Tok.isNot(tok::html_equals)) {
378        Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
379                                                       Ident.getHTMLIdent()));
380        continue;
381      }
382      Token Equals = Tok;
383      consumeToken();
384      if (Tok.isNot(tok::html_quoted_string)) {
385        Diag(Tok.getLocation(),
386             diag::warn_doc_html_start_tag_expected_quoted_string)
387          << SourceRange(Equals.getLocation());
388        Attrs.push_back(HTMLStartTagComment::Attribute(Ident.getLocation(),
389                                                       Ident.getHTMLIdent()));
390        while (Tok.is(tok::html_equals) ||
391               Tok.is(tok::html_quoted_string))
392          consumeToken();
393        continue;
394      }
395      Attrs.push_back(HTMLStartTagComment::Attribute(
396                              Ident.getLocation(),
397                              Ident.getHTMLIdent(),
398                              Equals.getLocation(),
399                              SourceRange(Tok.getLocation(),
400                                          Tok.getEndLocation()),
401                              Tok.getHTMLQuotedString()));
402      consumeToken();
403      continue;
404    }
405
406    case tok::html_greater:
407      HST = S.actOnHTMLStartTagFinish(HST,
408                                      copyArray(llvm::makeArrayRef(Attrs)),
409                                      Tok.getLocation(),
410                                      /* IsSelfClosing = */ false);
411      consumeToken();
412      return HST;
413
414    case tok::html_slash_greater:
415      HST = S.actOnHTMLStartTagFinish(HST,
416                                      copyArray(llvm::makeArrayRef(Attrs)),
417                                      Tok.getLocation(),
418                                      /* IsSelfClosing = */ true);
419      consumeToken();
420      return HST;
421
422    case tok::html_equals:
423    case tok::html_quoted_string:
424      Diag(Tok.getLocation(),
425           diag::warn_doc_html_start_tag_expected_ident_or_greater);
426      while (Tok.is(tok::html_equals) ||
427             Tok.is(tok::html_quoted_string))
428        consumeToken();
429      if (Tok.is(tok::html_ident) ||
430          Tok.is(tok::html_greater) ||
431          Tok.is(tok::html_slash_greater))
432        continue;
433
434      return S.actOnHTMLStartTagFinish(HST,
435                                       copyArray(llvm::makeArrayRef(Attrs)),
436                                       SourceLocation(),
437                                       /* IsSelfClosing = */ false);
438
439    default:
440      // Not a token from an HTML start tag.  Thus HTML tag prematurely ended.
441      HST = S.actOnHTMLStartTagFinish(HST,
442                                      copyArray(llvm::makeArrayRef(Attrs)),
443                                      SourceLocation(),
444                                      /* IsSelfClosing = */ false);
445      bool StartLineInvalid;
446      const unsigned StartLine = SourceMgr.getPresumedLineNumber(
447                                                  HST->getLocation(),
448                                                  &StartLineInvalid);
449      bool EndLineInvalid;
450      const unsigned EndLine = SourceMgr.getPresumedLineNumber(
451                                                  Tok.getLocation(),
452                                                  &EndLineInvalid);
453      if (StartLineInvalid || EndLineInvalid || StartLine == EndLine)
454        Diag(Tok.getLocation(),
455             diag::warn_doc_html_start_tag_expected_ident_or_greater)
456          << HST->getSourceRange();
457      else {
458        Diag(Tok.getLocation(),
459             diag::warn_doc_html_start_tag_expected_ident_or_greater);
460        Diag(HST->getLocation(), diag::note_doc_html_tag_started_here)
461          << HST->getSourceRange();
462      }
463      return HST;
464    }
465  }
466}
467
468HTMLEndTagComment *Parser::parseHTMLEndTag() {
469  assert(Tok.is(tok::html_end_tag));
470  Token TokEndTag = Tok;
471  consumeToken();
472  SourceLocation Loc;
473  if (Tok.is(tok::html_greater)) {
474    Loc = Tok.getLocation();
475    consumeToken();
476  }
477
478  return S.actOnHTMLEndTag(TokEndTag.getLocation(),
479                           Loc,
480                           TokEndTag.getHTMLTagEndName());
481}
482
483BlockContentComment *Parser::parseParagraphOrBlockCommand() {
484  SmallVector<InlineContentComment *, 8> Content;
485
486  while (true) {
487    switch (Tok.getKind()) {
488    case tok::verbatim_block_begin:
489    case tok::verbatim_line_name:
490    case tok::eof:
491      assert(Content.size() != 0);
492      break; // Block content or EOF ahead, finish this parapgaph.
493
494    case tok::command:
495      if (S.isBlockCommand(Tok.getCommandName())) {
496        if (Content.size() == 0)
497          return parseBlockCommand();
498        break; // Block command ahead, finish this parapgaph.
499      }
500      if (S.isInlineCommand(Tok.getCommandName())) {
501        Content.push_back(parseInlineCommand());
502        continue;
503      }
504
505      // Not a block command, not an inline command ==> an unknown command.
506      Content.push_back(S.actOnUnknownCommand(Tok.getLocation(),
507                                              Tok.getEndLocation(),
508                                              Tok.getCommandName()));
509      consumeToken();
510      continue;
511
512    case tok::newline: {
513      consumeToken();
514      if (Tok.is(tok::newline) || Tok.is(tok::eof)) {
515        consumeToken();
516        break; // Two newlines -- end of paragraph.
517      }
518      if (Content.size() > 0)
519        Content.back()->addTrailingNewline();
520      continue;
521    }
522
523    // Don't deal with HTML tag soup now.
524    case tok::html_start_tag:
525      Content.push_back(parseHTMLStartTag());
526      continue;
527
528    case tok::html_end_tag:
529      Content.push_back(parseHTMLEndTag());
530      continue;
531
532    case tok::text:
533      Content.push_back(S.actOnText(Tok.getLocation(),
534                                    Tok.getEndLocation(),
535                                    Tok.getText()));
536      consumeToken();
537      continue;
538
539    case tok::verbatim_block_line:
540    case tok::verbatim_block_end:
541    case tok::verbatim_line_text:
542    case tok::html_ident:
543    case tok::html_equals:
544    case tok::html_quoted_string:
545    case tok::html_greater:
546    case tok::html_slash_greater:
547      llvm_unreachable("should not see this token");
548    }
549    break;
550  }
551
552  return S.actOnParagraphComment(copyArray(llvm::makeArrayRef(Content)));
553}
554
555VerbatimBlockComment *Parser::parseVerbatimBlock() {
556  assert(Tok.is(tok::verbatim_block_begin));
557
558  VerbatimBlockComment *VB =
559      S.actOnVerbatimBlockStart(Tok.getLocation(),
560                                Tok.getVerbatimBlockName());
561  consumeToken();
562
563  // Don't create an empty line if verbatim opening command is followed
564  // by a newline.
565  if (Tok.is(tok::newline))
566    consumeToken();
567
568  SmallVector<VerbatimBlockLineComment *, 8> Lines;
569  while (Tok.is(tok::verbatim_block_line) ||
570         Tok.is(tok::newline)) {
571    VerbatimBlockLineComment *Line;
572    if (Tok.is(tok::verbatim_block_line)) {
573      Line = S.actOnVerbatimBlockLine(Tok.getLocation(),
574                                      Tok.getVerbatimBlockText());
575      consumeToken();
576      if (Tok.is(tok::newline)) {
577        consumeToken();
578      }
579    } else {
580      // Empty line, just a tok::newline.
581      Line = S.actOnVerbatimBlockLine(Tok.getLocation(), "");
582      consumeToken();
583    }
584    Lines.push_back(Line);
585  }
586
587  if (Tok.is(tok::verbatim_block_end)) {
588    VB = S.actOnVerbatimBlockFinish(VB, Tok.getLocation(),
589                                    Tok.getVerbatimBlockName(),
590                                    copyArray(llvm::makeArrayRef(Lines)));
591    consumeToken();
592  } else {
593    // Unterminated \\verbatim block
594    VB = S.actOnVerbatimBlockFinish(VB, SourceLocation(), "",
595                                    copyArray(llvm::makeArrayRef(Lines)));
596  }
597
598  return VB;
599}
600
601VerbatimLineComment *Parser::parseVerbatimLine() {
602  assert(Tok.is(tok::verbatim_line_name));
603
604  Token NameTok = Tok;
605  consumeToken();
606
607  SourceLocation TextBegin;
608  StringRef Text;
609  // Next token might not be a tok::verbatim_line_text if verbatim line
610  // starting command comes just before a newline or comment end.
611  if (Tok.is(tok::verbatim_line_text)) {
612    TextBegin = Tok.getLocation();
613    Text = Tok.getVerbatimLineText();
614  } else {
615    TextBegin = NameTok.getEndLocation();
616    Text = "";
617  }
618
619  VerbatimLineComment *VL = S.actOnVerbatimLine(NameTok.getLocation(),
620                                                NameTok.getVerbatimLineName(),
621                                                TextBegin,
622                                                Text);
623  consumeToken();
624  return VL;
625}
626
627BlockContentComment *Parser::parseBlockContent() {
628  switch (Tok.getKind()) {
629  case tok::text:
630  case tok::command:
631  case tok::html_start_tag:
632  case tok::html_end_tag:
633    return parseParagraphOrBlockCommand();
634
635  case tok::verbatim_block_begin:
636    return parseVerbatimBlock();
637
638  case tok::verbatim_line_name:
639    return parseVerbatimLine();
640
641  case tok::eof:
642  case tok::newline:
643  case tok::verbatim_block_line:
644  case tok::verbatim_block_end:
645  case tok::verbatim_line_text:
646  case tok::html_ident:
647  case tok::html_equals:
648  case tok::html_quoted_string:
649  case tok::html_greater:
650  case tok::html_slash_greater:
651    llvm_unreachable("should not see this token");
652  }
653  llvm_unreachable("bogus token kind");
654}
655
656FullComment *Parser::parseFullComment() {
657  // Skip newlines at the beginning of the comment.
658  while (Tok.is(tok::newline))
659    consumeToken();
660
661  SmallVector<BlockContentComment *, 8> Blocks;
662  while (Tok.isNot(tok::eof)) {
663    Blocks.push_back(parseBlockContent());
664
665    // Skip extra newlines after paragraph end.
666    while (Tok.is(tok::newline))
667      consumeToken();
668  }
669  return S.actOnFullComment(copyArray(llvm::makeArrayRef(Blocks)));
670}
671
672} // end namespace comments
673} // end namespace clang
674