CommentLexer.cpp revision 2d44d77fed3200e2eff289f55493317e90d3398c
1#include "clang/AST/CommentLexer.h"
2#include "llvm/ADT/StringSwitch.h"
3#include "llvm/Support/ErrorHandling.h"
4
5namespace clang {
6namespace comments {
7
8void Token::dump(const Lexer &L, const SourceManager &SM) const {
9  llvm::errs() << "comments::Token Kind=" << Kind << " ";
10  Loc.dump(SM);
11  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
12}
13
14bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
15                                  StringRef &EndName) const {
16  const char *Result = llvm::StringSwitch<const char *>(BeginName)
17    .Case("code", "endcode")
18    .Case("verbatim", "endverbatim")
19    .Case("htmlonly", "endhtmlonly")
20    .Case("latexonly", "endlatexonly")
21    .Case("xmlonly", "endxmlonly")
22    .Case("manonly", "endmanonly")
23    .Case("rtfonly", "endrtfonly")
24
25    .Case("dot", "enddot")
26    .Case("msc", "endmsc")
27
28    .Case("f$", "f$") // Inline LaTeX formula
29    .Case("f[", "f]") // Displayed LaTeX formula
30    .Case("f{", "f}") // LaTeX environment
31
32    .Default(NULL);
33
34  if (Result) {
35    EndName = Result;
36    return true;
37  }
38
39  for (VerbatimBlockCommandVector::const_iterator
40           I = VerbatimBlockCommands.begin(),
41           E = VerbatimBlockCommands.end();
42       I != E; ++I)
43    if (I->BeginName == BeginName) {
44      EndName = I->EndName;
45      return true;
46    }
47
48  return false;
49}
50
51bool Lexer::isVerbatimLineCommand(StringRef Name) const {
52  bool Result = llvm::StringSwitch<bool>(Name)
53  .Case("fn", true)
54  .Case("var", true)
55  .Case("property", true)
56  .Case("typedef", true)
57
58  .Case("overload", true)
59
60  .Case("defgroup", true)
61  .Case("ingroup", true)
62  .Case("addtogroup", true)
63  .Case("weakgroup", true)
64  .Case("name", true)
65
66  .Case("section", true)
67  .Case("subsection", true)
68  .Case("subsubsection", true)
69  .Case("paragraph", true)
70
71  .Case("mainpage", true)
72  .Case("subpage", true)
73  .Case("ref", true)
74
75  .Default(false);
76
77  if (Result)
78    return true;
79
80  for (VerbatimLineCommandVector::const_iterator
81           I = VerbatimLineCommands.begin(),
82           E = VerbatimLineCommands.end();
83       I != E; ++I)
84    if (I->Name == Name)
85      return true;
86
87  return false;
88}
89
90void Lexer::skipLineStartingDecorations() {
91  // This function should be called only for C comments
92  assert(CommentState == LCS_InsideCComment);
93
94  if (BufferPtr == CommentEnd)
95    return;
96
97  switch (*BufferPtr) {
98  case ' ':
99  case '\t':
100  case '\f':
101  case '\v': {
102    const char *NewBufferPtr = BufferPtr;
103    NewBufferPtr++;
104    if (NewBufferPtr == CommentEnd)
105      return;
106
107    char C = *NewBufferPtr;
108    while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
109      NewBufferPtr++;
110      if (NewBufferPtr == CommentEnd)
111        return;
112      C = *NewBufferPtr;
113    }
114    if (C == '*')
115      BufferPtr = NewBufferPtr + 1;
116    break;
117  }
118  case '*':
119    BufferPtr++;
120    break;
121  }
122}
123
124namespace {
/// Return a pointer to the first '\n' or '\r' in [BufferPtr, BufferEnd), or
/// BufferEnd if the range contains neither.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  const char *Cur = BufferPtr;
  while (Cur != BufferEnd && *Cur != '\n' && *Cur != '\r')
    ++Cur;
  return Cur;
}
133
/// Skip one line terminator ("\n", "\r" or "\r\n") at BufferPtr and return
/// the pointer just past it.  Returns BufferPtr unchanged when the range is
/// empty.  The caller is expected to pass a pointer to a newline character.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
148
/// Return true if \p C may appear in an HTML identifier: an ASCII letter or
/// an ASCII digit.
bool isHTMLIdentifierCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
154
155const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
156  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
157    if (!isHTMLIdentifierCharacter(*BufferPtr))
158      return BufferPtr;
159  }
160  return BufferEnd;
161}
162
/// Skip an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point to the opening quote.  A quote character directly preceded by a
/// backslash is treated as escaped and does not end the string.
///
/// Returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Delimiter && *(Cur - 1) != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
180
/// Return true if \p C is whitespace that does not end a line: space, tab,
/// form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
184
/// Return true if \p C is any whitespace character: space, tab, form feed,
/// vertical tab, line feed or carriage return.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
189
190const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
191  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
192    if (!isWhitespace(*BufferPtr))
193      return BufferPtr;
194  }
195  return BufferEnd;
196}
197
/// Return true if \p C may appear in a command name: an ASCII letter or an
/// ASCII digit.
bool isCommandNameCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
203
204const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
205  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
206    if (!isCommandNameCharacter(*BufferPtr))
207      return BufferPtr;
208  }
209  return BufferEnd;
210}
211
212/// Return the one past end pointer for BCPL comments.
213/// Handles newlines escaped with backslash or trigraph for backslahs.
214const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
215  const char *CurPtr = BufferPtr;
216  while (CurPtr != BufferEnd) {
217    char C = *CurPtr;
218    while (C != '\n' && C != '\r') {
219      CurPtr++;
220      if (CurPtr == BufferEnd)
221        return BufferEnd;
222      C = *CurPtr;
223    }
224    // We found a newline, check if it is escaped.
225    const char *EscapePtr = CurPtr - 1;
226    while(isHorizontalWhitespace(*EscapePtr))
227      EscapePtr--;
228
229    if (*EscapePtr == '\\' ||
230        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
231         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
232      // We found an escaped newline.
233      CurPtr = skipNewline(CurPtr, BufferEnd);
234    } else
235      return CurPtr; // Not an escaped newline.
236  }
237  return BufferEnd;
238}
239
240/// Return the one past end pointer for C comments.
241/// Very dumb, does not handle escaped newlines or trigraphs.
242const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
243  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
244    if (*BufferPtr == '*') {
245      assert(BufferPtr + 1 != BufferEnd);
246      if (*(BufferPtr + 1) == '/')
247        return BufferPtr;
248    }
249  }
250  llvm_unreachable("buffer end hit before '*/' was seen");
251}
252} // unnamed namespace
253
254void Lexer::lexCommentText(Token &T) {
255  assert(CommentState == LCS_InsideBCPLComment ||
256         CommentState == LCS_InsideCComment);
257
258  switch (State) {
259  case LS_Normal:
260    break;
261  case LS_VerbatimBlockFirstLine:
262    lexVerbatimBlockFirstLine(T);
263    return;
264  case LS_VerbatimBlockBody:
265    lexVerbatimBlockBody(T);
266    return;
267  case LS_HTMLOpenTag:
268    lexHTMLOpenTag(T);
269    return;
270  }
271
272  assert(State == LS_Normal);
273
274  const char *TokenPtr = BufferPtr;
275  assert(TokenPtr < CommentEnd);
276  while (TokenPtr != CommentEnd) {
277    switch(*TokenPtr) {
278      case '\\':
279      case '@': {
280        TokenPtr++;
281        if (TokenPtr == CommentEnd) {
282          formTokenWithChars(T, TokenPtr, tok::text);
283          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
284          return;
285        }
286        char C = *TokenPtr;
287        switch (C) {
288        default:
289          break;
290
291        case '\\': case '@': case '&': case '$':
292        case '#':  case '<': case '>': case '%':
293        case '\"': case '.': case ':':
294          // This is one of \\ \@ \& \$ etc escape sequences.
295          TokenPtr++;
296          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
297            // This is the \:: escape sequence.
298            TokenPtr++;
299          }
300          formTokenWithChars(T, TokenPtr, tok::text);
301          T.setText(StringRef(BufferPtr - (T.getLength() - 1),
302                              T.getLength() - 1));
303          return;
304        }
305
306        // Don't make zero-length commands.
307        if (!isCommandNameCharacter(*TokenPtr)) {
308          formTokenWithChars(T, TokenPtr, tok::text);
309          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
310          return;
311        }
312
313        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
314        unsigned Length = TokenPtr - (BufferPtr + 1);
315
316        // Hardcoded support for lexing LaTeX formula commands
317        // \f$ \f[ \f] \f{ \f} as a single command.
318        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
319          C = *TokenPtr;
320          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
321            TokenPtr++;
322            Length++;
323          }
324        }
325
326        const StringRef CommandName(BufferPtr + 1, Length);
327        StringRef EndName;
328
329        if (isVerbatimBlockCommand(CommandName, EndName)) {
330          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
331          return;
332        }
333        if (isVerbatimLineCommand(CommandName)) {
334          lexVerbatimLine(T, TokenPtr);
335          return;
336        }
337        formTokenWithChars(T, TokenPtr, tok::command);
338        T.setCommandName(CommandName);
339        return;
340      }
341
342      case '<': {
343        TokenPtr++;
344        if (TokenPtr == CommentEnd) {
345          formTokenWithChars(T, TokenPtr, tok::text);
346          T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
347          return;
348        }
349        const char C = *TokenPtr;
350        if (isHTMLIdentifierCharacter(C))
351          setupAndLexHTMLOpenTag(T);
352        else if (C == '/')
353          lexHTMLCloseTag(T);
354        return;
355      }
356
357      case '\n':
358      case '\r':
359        TokenPtr = skipNewline(TokenPtr, CommentEnd);
360        formTokenWithChars(T, TokenPtr, tok::newline);
361
362        if (CommentState == LCS_InsideCComment)
363          skipLineStartingDecorations();
364        return;
365
366      default: {
367        while (true) {
368          TokenPtr++;
369          if (TokenPtr == CommentEnd)
370            break;
371          char C = *TokenPtr;
372          if(C == '\n' || C == '\r' ||
373             C == '\\' || C == '@' || C == '<')
374            break;
375        }
376        formTokenWithChars(T, TokenPtr, tok::text);
377        T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
378        return;
379      }
380    }
381  }
382}
383
384void Lexer::setupAndLexVerbatimBlock(Token &T,
385                                     const char *TextBegin,
386                                     char Marker, StringRef EndName) {
387  VerbatimBlockEndCommandName.clear();
388  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
389  VerbatimBlockEndCommandName.append(EndName);
390
391  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
392  T.setVerbatimBlockName(StringRef(TextBegin - (T.getLength() - 1),
393                                   T.getLength() - 1));
394
395  State = LS_VerbatimBlockFirstLine;
396}
397
398void Lexer::lexVerbatimBlockFirstLine(Token &T) {
399  assert(BufferPtr < CommentEnd);
400
401  // FIXME: It would be better to scan the text once, finding either the block
402  // end command or newline.
403  //
404  // Extract current line.
405  const char *Newline = findNewline(BufferPtr, CommentEnd);
406  StringRef Line(BufferPtr, Newline - BufferPtr);
407
408  // Look for end command in current line.
409  size_t Pos = Line.find(VerbatimBlockEndCommandName);
410  const char *NextLine;
411  if (Pos == StringRef::npos) {
412    // Current line is completely verbatim.
413    NextLine = skipNewline(Newline, CommentEnd);
414  } else if (Pos == 0) {
415    // Current line contains just an end command.
416    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
417    formTokenWithChars(T, End, tok::verbatim_block_end);
418    T.setVerbatimBlockName(StringRef(End - (T.getLength() - 1),
419                                     T.getLength() - 1));
420    State = LS_Normal;
421    return;
422  } else {
423    // There is some text, followed by end command.  Extract text first.
424    NextLine = BufferPtr + Pos;
425  }
426
427  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
428  T.setVerbatimBlockText(StringRef(NextLine - T.getLength(), T.getLength()));
429
430  State = LS_VerbatimBlockBody;
431}
432
433void Lexer::lexVerbatimBlockBody(Token &T) {
434  assert(State == LS_VerbatimBlockBody);
435
436  if (CommentState == LCS_InsideCComment)
437    skipLineStartingDecorations();
438
439  lexVerbatimBlockFirstLine(T);
440}
441
442void Lexer::lexVerbatimLine(Token &T, const char *TextBegin) {
443  // Extract current line.
444  const char *Newline = findNewline(BufferPtr, CommentEnd);
445
446  const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
447  const StringRef Text(TextBegin, Newline - TextBegin);
448
449  formTokenWithChars(T, Newline, tok::verbatim_line);
450  T.setVerbatimLineName(Name);
451  T.setVerbatimLineText(Text);
452}
453
454void Lexer::setupAndLexHTMLOpenTag(Token &T) {
455  assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1]));
456  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
457
458  formTokenWithChars(T, TagNameEnd, tok::html_tag_open);
459  T.setHTMLTagOpenName(StringRef(TagNameEnd - (T.getLength() - 1),
460                                 T.getLength() - 1));
461
462  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
463
464  if (BufferPtr != CommentEnd && *BufferPtr == '>') {
465    BufferPtr++;
466    return;
467  }
468
469  if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr))
470    State = LS_HTMLOpenTag;
471}
472
473void Lexer::lexHTMLOpenTag(Token &T) {
474  assert(State == LS_HTMLOpenTag);
475
476  const char *TokenPtr = BufferPtr;
477  char C = *TokenPtr;
478  if (isHTMLIdentifierCharacter(C)) {
479    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
480    formTokenWithChars(T, TokenPtr, tok::html_ident);
481    T.setHTMLIdent(StringRef(TokenPtr - T.getLength(), T.getLength()));
482  } else {
483    switch (C) {
484    case '=':
485      TokenPtr++;
486      formTokenWithChars(T, TokenPtr, tok::html_equals);
487      break;
488    case '\"':
489    case '\'': {
490      const char *OpenQuote = TokenPtr;
491      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
492      const char *ClosingQuote = TokenPtr;
493      if (TokenPtr != CommentEnd) // Skip closing quote.
494        TokenPtr++;
495      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
496      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
497                                      ClosingQuote - (OpenQuote + 1)));
498      break;
499    }
500    case '>':
501      TokenPtr++;
502      formTokenWithChars(T, TokenPtr, tok::html_greater);
503      break;
504    }
505  }
506
507  // Now look ahead and return to normal state if we don't see any HTML tokens
508  // ahead.
509  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
510  if (BufferPtr == CommentEnd) {
511    State = LS_Normal;
512    return;
513  }
514
515  C = *BufferPtr;
516  if (!isHTMLIdentifierCharacter(C) &&
517      C != '=' && C != '\"' && C != '\'' && C != '>') {
518    State = LS_Normal;
519    return;
520  }
521}
522
523void Lexer::lexHTMLCloseTag(Token &T) {
524  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
525
526  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
527  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
528
529  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
530  if (End != CommentEnd && *End == '>')
531    End++;
532
533  formTokenWithChars(T, End, tok::html_tag_close);
534  T.setHTMLTagCloseName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
535}
536
537Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
538             const char *BufferStart, const char *BufferEnd):
539    BufferStart(BufferStart), BufferEnd(BufferEnd),
540    FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
541    CommentState(LCS_BeforeComment), State(LS_Normal) {
542}
543
544void Lexer::lex(Token &T) {
545again:
546  switch (CommentState) {
547  case LCS_BeforeComment:
548    if (BufferPtr == BufferEnd) {
549      formTokenWithChars(T, BufferPtr, tok::eof);
550      return;
551    }
552
553    assert(*BufferPtr == '/');
554    BufferPtr++; // Skip first slash.
555    switch(*BufferPtr) {
556    case '/': { // BCPL comment.
557      BufferPtr++; // Skip second slash.
558
559      if (BufferPtr != BufferEnd) {
560        // Skip Doxygen magic marker, if it is present.
561        // It might be missing because of a typo //< or /*<, or because we
562        // merged this non-Doxygen comment into a bunch of Doxygen comments
563        // around it: /** ... */ /* ... */ /** ... */
564        const char C = *BufferPtr;
565        if (C == '/' || C == '!')
566          BufferPtr++;
567      }
568
569      // Skip less-than symbol that marks trailing comments.
570      // Skip it even if the comment is not a Doxygen one, because //< and /*<
571      // are frequent typos.
572      if (BufferPtr != BufferEnd && *BufferPtr == '<')
573        BufferPtr++;
574
575      CommentState = LCS_InsideBCPLComment;
576      State = LS_Normal;
577      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
578      goto again;
579    }
580    case '*': { // C comment.
581      BufferPtr++; // Skip star.
582
583      // Skip Doxygen magic marker.
584      const char C = *BufferPtr;
585      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
586        BufferPtr++;
587
588      // Skip less-than symbol that marks trailing comments.
589      if (BufferPtr != BufferEnd && *BufferPtr == '<')
590        BufferPtr++;
591
592      CommentState = LCS_InsideCComment;
593      State = LS_Normal;
594      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
595      goto again;
596    }
597    default:
598      llvm_unreachable("second character of comment should be '/' or '*'");
599    }
600
601  case LCS_BetweenComments: {
602    // Consecutive comments are extracted only if there is only whitespace
603    // between them.  So we can search for the start of the next comment.
604    const char *EndWhitespace = BufferPtr;
605    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
606      EndWhitespace++;
607
608    // Turn any whitespace between comments (and there is only whitespace
609    // between them) into a newline.  We have two newlines between comments
610    // in total (first one was synthesized after a comment).
611    formTokenWithChars(T, EndWhitespace, tok::newline);
612
613    CommentState = LCS_BeforeComment;
614    break;
615  }
616
617  case LCS_InsideBCPLComment:
618  case LCS_InsideCComment:
619    if (BufferPtr != CommentEnd) {
620      lexCommentText(T);
621      break;
622    } else {
623      // Skip C comment closing sequence.
624      if (CommentState == LCS_InsideCComment) {
625        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
626        BufferPtr += 2;
627        assert(BufferPtr <= BufferEnd);
628
629        // Synthenize newline just after the C comment, regardless if there is
630        // actually a newline.
631        formTokenWithChars(T, BufferPtr, tok::newline);
632
633        CommentState = LCS_BetweenComments;
634        break;
635      } else {
636        // Don't synthesized a newline after BCPL comment.
637        CommentState = LCS_BetweenComments;
638        goto again;
639      }
640    }
641  }
642}
643
644StringRef Lexer::getSpelling(const Token &Tok,
645                             const SourceManager &SourceMgr,
646                             bool *Invalid) const {
647  SourceLocation Loc = Tok.getLocation();
648  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
649
650  bool InvalidTemp = false;
651  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
652  if (InvalidTemp) {
653    *Invalid = true;
654    return StringRef();
655  }
656
657  const char *Begin = File.data() + LocInfo.second;
658  return StringRef(Begin, Tok.getLength());
659}
660
661void Lexer::addVerbatimBlockCommand(StringRef BeginName, StringRef EndName) {
662  VerbatimBlockCommand VBC;
663  VBC.BeginName = BeginName;
664  VBC.EndName = EndName;
665  VerbatimBlockCommands.push_back(VBC);
666}
667
668void Lexer::addVerbatimLineCommand(StringRef Name) {
669  VerbatimLineCommand VLC;
670  VLC.Name = Name;
671  VerbatimLineCommands.push_back(VLC);
672}
673
674} // end namespace comments
675} // end namespace clang
676
677