1#include "clang/AST/CommentLexer.h"
2#include "clang/AST/CommentCommandTraits.h"
3#include "clang/AST/CommentDiagnostic.h"
4#include "clang/Basic/CharInfo.h"
5#include "llvm/ADT/StringExtras.h"
6#include "llvm/ADT/StringSwitch.h"
7#include "llvm/Support/ConvertUTF.h"
8#include "llvm/Support/ErrorHandling.h"
9
10namespace clang {
11namespace comments {
12
13void Token::dump(const Lexer &L, const SourceManager &SM) const {
14  llvm::errs() << "comments::Token Kind=" << Kind << " ";
15  Loc.dump(SM);
16  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17}
18
19static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20  return isLetter(C);
21}
22
23static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24  return isDigit(C);
25}
26
27static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28  return isHexDigit(C);
29}
30
31static inline StringRef convertCodePointToUTF8(
32                                      llvm::BumpPtrAllocator &Allocator,
33                                      unsigned CodePoint) {
34  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35  char *ResolvedPtr = Resolved;
36  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37    return StringRef(Resolved, ResolvedPtr - Resolved);
38  else
39    return StringRef();
40}
41
42namespace {
43
44#include "clang/AST/CommentHTMLTags.inc"
45#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46
47} // unnamed namespace
48
49StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50  // Fast path, first check a few most widely used named character references.
51  return llvm::StringSwitch<StringRef>(Name)
52      .Case("amp", "&")
53      .Case("lt", "<")
54      .Case("gt", ">")
55      .Case("quot", "\"")
56      .Case("apos", "\'")
57      // Slow path.
58      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59}
60
61StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62  unsigned CodePoint = 0;
63  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65    CodePoint *= 10;
66    CodePoint += Name[i] - '0';
67  }
68  return convertCodePointToUTF8(Allocator, CodePoint);
69}
70
71StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72  unsigned CodePoint = 0;
73  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74    CodePoint *= 16;
75    const char C = Name[i];
76    assert(isHTMLHexCharacterReferenceCharacter(C));
77    CodePoint += llvm::hexDigitValue(C);
78  }
79  return convertCodePointToUTF8(Allocator, CodePoint);
80}
81
82void Lexer::skipLineStartingDecorations() {
83  // This function should be called only for C comments
84  assert(CommentState == LCS_InsideCComment);
85
86  if (BufferPtr == CommentEnd)
87    return;
88
89  switch (*BufferPtr) {
90  case ' ':
91  case '\t':
92  case '\f':
93  case '\v': {
94    const char *NewBufferPtr = BufferPtr;
95    NewBufferPtr++;
96    if (NewBufferPtr == CommentEnd)
97      return;
98
99    char C = *NewBufferPtr;
100    while (isHorizontalWhitespace(C)) {
101      NewBufferPtr++;
102      if (NewBufferPtr == CommentEnd)
103        return;
104      C = *NewBufferPtr;
105    }
106    if (C == '*')
107      BufferPtr = NewBufferPtr + 1;
108    break;
109  }
110  case '*':
111    BufferPtr++;
112    break;
113  }
114}
115
116namespace {
117/// Returns pointer to the first newline character in the string.
118const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120    if (isVerticalWhitespace(*BufferPtr))
121      return BufferPtr;
122  }
123  return BufferEnd;
124}
125
/// Skip a single line terminator ("\n", "\r" or "\r\n") at \p BufferPtr.
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'.
///
/// \returns a pointer just past the terminator.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  assert(First == '\r');
  // Treat "\r\n" as one terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
140
141const char *skipNamedCharacterReference(const char *BufferPtr,
142                                        const char *BufferEnd) {
143  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145      return BufferPtr;
146  }
147  return BufferEnd;
148}
149
150const char *skipDecimalCharacterReference(const char *BufferPtr,
151                                          const char *BufferEnd) {
152  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154      return BufferPtr;
155  }
156  return BufferEnd;
157}
158
159const char *skipHexCharacterReference(const char *BufferPtr,
160                                          const char *BufferEnd) {
161  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163      return BufferPtr;
164  }
165  return BufferEnd;
166}
167
168bool isHTMLIdentifierStartingCharacter(char C) {
169  return isLetter(C);
170}
171
172bool isHTMLIdentifierCharacter(char C) {
173  return isAlphanumeric(C);
174}
175
176const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178    if (!isHTMLIdentifierCharacter(*BufferPtr))
179      return BufferPtr;
180  }
181  return BufferEnd;
182}
183
/// Skip an HTML string quoted in single or double quotes.  \p BufferPtr must
/// point at the opening quote.  A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = *BufferPtr;
  assert(Delimiter == '\"' || Delimiter == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool IsDelimiter = *Cur == Delimiter;
    if (IsDelimiter && *(Cur - 1) != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
201
202const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204    if (!isWhitespace(*BufferPtr))
205      return BufferPtr;
206  }
207  return BufferEnd;
208}
209
210bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212}
213
214bool isCommandNameStartCharacter(char C) {
215  return isLetter(C);
216}
217
218bool isCommandNameCharacter(char C) {
219  return isAlphanumeric(C);
220}
221
222const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224    if (!isCommandNameCharacter(*BufferPtr))
225      return BufferPtr;
226  }
227  return BufferEnd;
228}
229
230/// Return the one past end pointer for BCPL comments.
231/// Handles newlines escaped with backslash or trigraph for backslahs.
232const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233  const char *CurPtr = BufferPtr;
234  while (CurPtr != BufferEnd) {
235    while (!isVerticalWhitespace(*CurPtr)) {
236      CurPtr++;
237      if (CurPtr == BufferEnd)
238        return BufferEnd;
239    }
240    // We found a newline, check if it is escaped.
241    const char *EscapePtr = CurPtr - 1;
242    while(isHorizontalWhitespace(*EscapePtr))
243      EscapePtr--;
244
245    if (*EscapePtr == '\\' ||
246        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248      // We found an escaped newline.
249      CurPtr = skipNewline(CurPtr, BufferEnd);
250    } else
251      return CurPtr; // Not an escaped newline.
252  }
253  return BufferEnd;
254}
255
256/// Return the one past end pointer for C comments.
257/// Very dumb, does not handle escaped newlines or trigraphs.
258const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260    if (*BufferPtr == '*') {
261      assert(BufferPtr + 1 != BufferEnd);
262      if (*(BufferPtr + 1) == '/')
263        return BufferPtr;
264    }
265  }
266  llvm_unreachable("buffer end hit before '*/' was seen");
267}
268
269} // unnamed namespace
270
271void Lexer::lexCommentText(Token &T) {
272  assert(CommentState == LCS_InsideBCPLComment ||
273         CommentState == LCS_InsideCComment);
274
275  switch (State) {
276  case LS_Normal:
277    break;
278  case LS_VerbatimBlockFirstLine:
279    lexVerbatimBlockFirstLine(T);
280    return;
281  case LS_VerbatimBlockBody:
282    lexVerbatimBlockBody(T);
283    return;
284  case LS_VerbatimLineText:
285    lexVerbatimLineText(T);
286    return;
287  case LS_HTMLStartTag:
288    lexHTMLStartTag(T);
289    return;
290  case LS_HTMLEndTag:
291    lexHTMLEndTag(T);
292    return;
293  }
294
295  assert(State == LS_Normal);
296
297  const char *TokenPtr = BufferPtr;
298  assert(TokenPtr < CommentEnd);
299  while (TokenPtr != CommentEnd) {
300    switch(*TokenPtr) {
301      case '\\':
302      case '@': {
303        // Commands that start with a backslash and commands that start with
304        // 'at' have equivalent semantics.  But we keep information about the
305        // exact syntax in AST for comments.
306        tok::TokenKind CommandKind =
307            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
308        TokenPtr++;
309        if (TokenPtr == CommentEnd) {
310          formTextToken(T, TokenPtr);
311          return;
312        }
313        char C = *TokenPtr;
314        switch (C) {
315        default:
316          break;
317
318        case '\\': case '@': case '&': case '$':
319        case '#':  case '<': case '>': case '%':
320        case '\"': case '.': case ':':
321          // This is one of \\ \@ \& \$ etc escape sequences.
322          TokenPtr++;
323          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
324            // This is the \:: escape sequence.
325            TokenPtr++;
326          }
327          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
328          formTokenWithChars(T, TokenPtr, tok::text);
329          T.setText(UnescapedText);
330          return;
331        }
332
333        // Don't make zero-length commands.
334        if (!isCommandNameStartCharacter(*TokenPtr)) {
335          formTextToken(T, TokenPtr);
336          return;
337        }
338
339        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
340        unsigned Length = TokenPtr - (BufferPtr + 1);
341
342        // Hardcoded support for lexing LaTeX formula commands
343        // \f$ \f[ \f] \f{ \f} as a single command.
344        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
345          C = *TokenPtr;
346          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
347            TokenPtr++;
348            Length++;
349          }
350        }
351
352        const StringRef CommandName(BufferPtr + 1, Length);
353
354        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
355        if (!Info) {
356          formTokenWithChars(T, TokenPtr, tok::unknown_command);
357          T.setUnknownCommandName(CommandName);
358          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
359            StringRef CorrectedName = Info->Name;
360            SourceRange CommandRange(T.getLocation().getLocWithOffset(1),
361                                     T.getEndLocation());
362            Diag(T.getLocation(), diag::warn_correct_comment_command_name)
363              << CommandName << CorrectedName
364              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
365          } else {
366            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
367            return;
368          }
369        }
370        if (Info->IsVerbatimBlockCommand) {
371          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
372          return;
373        }
374        if (Info->IsVerbatimLineCommand) {
375          setupAndLexVerbatimLine(T, TokenPtr, Info);
376          return;
377        }
378        formTokenWithChars(T, TokenPtr, CommandKind);
379        T.setCommandID(Info->getID());
380        return;
381      }
382
383      case '&':
384        lexHTMLCharacterReference(T);
385        return;
386
387      case '<': {
388        TokenPtr++;
389        if (TokenPtr == CommentEnd) {
390          formTextToken(T, TokenPtr);
391          return;
392        }
393        const char C = *TokenPtr;
394        if (isHTMLIdentifierStartingCharacter(C))
395          setupAndLexHTMLStartTag(T);
396        else if (C == '/')
397          setupAndLexHTMLEndTag(T);
398        else
399          formTextToken(T, TokenPtr);
400
401        return;
402      }
403
404      case '\n':
405      case '\r':
406        TokenPtr = skipNewline(TokenPtr, CommentEnd);
407        formTokenWithChars(T, TokenPtr, tok::newline);
408
409        if (CommentState == LCS_InsideCComment)
410          skipLineStartingDecorations();
411        return;
412
413      default: {
414        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
415                         find_first_of("\n\r\\@&<");
416        if (End != StringRef::npos)
417          TokenPtr += End;
418        else
419          TokenPtr = CommentEnd;
420        formTextToken(T, TokenPtr);
421        return;
422      }
423    }
424  }
425}
426
427void Lexer::setupAndLexVerbatimBlock(Token &T,
428                                     const char *TextBegin,
429                                     char Marker, const CommandInfo *Info) {
430  assert(Info->IsVerbatimBlockCommand);
431
432  VerbatimBlockEndCommandName.clear();
433  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
434  VerbatimBlockEndCommandName.append(Info->EndCommandName);
435
436  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
437  T.setVerbatimBlockID(Info->getID());
438
439  // If there is a newline following the verbatim opening command, skip the
440  // newline so that we don't create an tok::verbatim_block_line with empty
441  // text content.
442  if (BufferPtr != CommentEnd &&
443      isVerticalWhitespace(*BufferPtr)) {
444    BufferPtr = skipNewline(BufferPtr, CommentEnd);
445    State = LS_VerbatimBlockBody;
446    return;
447  }
448
449  State = LS_VerbatimBlockFirstLine;
450}
451
452void Lexer::lexVerbatimBlockFirstLine(Token &T) {
453again:
454  assert(BufferPtr < CommentEnd);
455
456  // FIXME: It would be better to scan the text once, finding either the block
457  // end command or newline.
458  //
459  // Extract current line.
460  const char *Newline = findNewline(BufferPtr, CommentEnd);
461  StringRef Line(BufferPtr, Newline - BufferPtr);
462
463  // Look for end command in current line.
464  size_t Pos = Line.find(VerbatimBlockEndCommandName);
465  const char *TextEnd;
466  const char *NextLine;
467  if (Pos == StringRef::npos) {
468    // Current line is completely verbatim.
469    TextEnd = Newline;
470    NextLine = skipNewline(Newline, CommentEnd);
471  } else if (Pos == 0) {
472    // Current line contains just an end command.
473    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
474    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
475    formTokenWithChars(T, End, tok::verbatim_block_end);
476    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
477    State = LS_Normal;
478    return;
479  } else {
480    // There is some text, followed by end command.  Extract text first.
481    TextEnd = BufferPtr + Pos;
482    NextLine = TextEnd;
483    // If there is only whitespace before end command, skip whitespace.
484    if (isWhitespace(BufferPtr, TextEnd)) {
485      BufferPtr = TextEnd;
486      goto again;
487    }
488  }
489
490  StringRef Text(BufferPtr, TextEnd - BufferPtr);
491  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
492  T.setVerbatimBlockText(Text);
493
494  State = LS_VerbatimBlockBody;
495}
496
497void Lexer::lexVerbatimBlockBody(Token &T) {
498  assert(State == LS_VerbatimBlockBody);
499
500  if (CommentState == LCS_InsideCComment)
501    skipLineStartingDecorations();
502
503  lexVerbatimBlockFirstLine(T);
504}
505
506void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
507                                    const CommandInfo *Info) {
508  assert(Info->IsVerbatimLineCommand);
509  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
510  T.setVerbatimLineID(Info->getID());
511
512  State = LS_VerbatimLineText;
513}
514
515void Lexer::lexVerbatimLineText(Token &T) {
516  assert(State == LS_VerbatimLineText);
517
518  // Extract current line.
519  const char *Newline = findNewline(BufferPtr, CommentEnd);
520  const StringRef Text(BufferPtr, Newline - BufferPtr);
521  formTokenWithChars(T, Newline, tok::verbatim_line_text);
522  T.setVerbatimLineText(Text);
523
524  State = LS_Normal;
525}
526
527void Lexer::lexHTMLCharacterReference(Token &T) {
528  const char *TokenPtr = BufferPtr;
529  assert(*TokenPtr == '&');
530  TokenPtr++;
531  if (TokenPtr == CommentEnd) {
532    formTextToken(T, TokenPtr);
533    return;
534  }
535  const char *NamePtr;
536  bool isNamed = false;
537  bool isDecimal = false;
538  char C = *TokenPtr;
539  if (isHTMLNamedCharacterReferenceCharacter(C)) {
540    NamePtr = TokenPtr;
541    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
542    isNamed = true;
543  } else if (C == '#') {
544    TokenPtr++;
545    if (TokenPtr == CommentEnd) {
546      formTextToken(T, TokenPtr);
547      return;
548    }
549    C = *TokenPtr;
550    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
551      NamePtr = TokenPtr;
552      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
553      isDecimal = true;
554    } else if (C == 'x' || C == 'X') {
555      TokenPtr++;
556      NamePtr = TokenPtr;
557      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
558    } else {
559      formTextToken(T, TokenPtr);
560      return;
561    }
562  } else {
563    formTextToken(T, TokenPtr);
564    return;
565  }
566  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
567      *TokenPtr != ';') {
568    formTextToken(T, TokenPtr);
569    return;
570  }
571  StringRef Name(NamePtr, TokenPtr - NamePtr);
572  TokenPtr++; // Skip semicolon.
573  StringRef Resolved;
574  if (isNamed)
575    Resolved = resolveHTMLNamedCharacterReference(Name);
576  else if (isDecimal)
577    Resolved = resolveHTMLDecimalCharacterReference(Name);
578  else
579    Resolved = resolveHTMLHexCharacterReference(Name);
580
581  if (Resolved.empty()) {
582    formTextToken(T, TokenPtr);
583    return;
584  }
585  formTokenWithChars(T, TokenPtr, tok::text);
586  T.setText(Resolved);
587  return;
588}
589
590void Lexer::setupAndLexHTMLStartTag(Token &T) {
591  assert(BufferPtr[0] == '<' &&
592         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
593  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
594  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
595  if (!isHTMLTagName(Name)) {
596    formTextToken(T, TagNameEnd);
597    return;
598  }
599
600  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
601  T.setHTMLTagStartName(Name);
602
603  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
604
605  const char C = *BufferPtr;
606  if (BufferPtr != CommentEnd &&
607      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
608    State = LS_HTMLStartTag;
609}
610
611void Lexer::lexHTMLStartTag(Token &T) {
612  assert(State == LS_HTMLStartTag);
613
614  const char *TokenPtr = BufferPtr;
615  char C = *TokenPtr;
616  if (isHTMLIdentifierCharacter(C)) {
617    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
618    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
619    formTokenWithChars(T, TokenPtr, tok::html_ident);
620    T.setHTMLIdent(Ident);
621  } else {
622    switch (C) {
623    case '=':
624      TokenPtr++;
625      formTokenWithChars(T, TokenPtr, tok::html_equals);
626      break;
627    case '\"':
628    case '\'': {
629      const char *OpenQuote = TokenPtr;
630      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
631      const char *ClosingQuote = TokenPtr;
632      if (TokenPtr != CommentEnd) // Skip closing quote.
633        TokenPtr++;
634      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
635      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
636                                      ClosingQuote - (OpenQuote + 1)));
637      break;
638    }
639    case '>':
640      TokenPtr++;
641      formTokenWithChars(T, TokenPtr, tok::html_greater);
642      State = LS_Normal;
643      return;
644    case '/':
645      TokenPtr++;
646      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
647        TokenPtr++;
648        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
649      } else
650        formTextToken(T, TokenPtr);
651
652      State = LS_Normal;
653      return;
654    }
655  }
656
657  // Now look ahead and return to normal state if we don't see any HTML tokens
658  // ahead.
659  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
660  if (BufferPtr == CommentEnd) {
661    State = LS_Normal;
662    return;
663  }
664
665  C = *BufferPtr;
666  if (!isHTMLIdentifierStartingCharacter(C) &&
667      C != '=' && C != '\"' && C != '\'' && C != '>') {
668    State = LS_Normal;
669    return;
670  }
671}
672
673void Lexer::setupAndLexHTMLEndTag(Token &T) {
674  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
675
676  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
677  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
678  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
679  if (!isHTMLTagName(Name)) {
680    formTextToken(T, TagNameEnd);
681    return;
682  }
683
684  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
685
686  formTokenWithChars(T, End, tok::html_end_tag);
687  T.setHTMLTagEndName(Name);
688
689  if (BufferPtr != CommentEnd && *BufferPtr == '>')
690    State = LS_HTMLEndTag;
691}
692
693void Lexer::lexHTMLEndTag(Token &T) {
694  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
695
696  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
697  State = LS_Normal;
698}
699
700Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
701             const CommandTraits &Traits,
702             SourceLocation FileLoc,
703             const char *BufferStart, const char *BufferEnd):
704    Allocator(Allocator), Diags(Diags), Traits(Traits),
705    BufferStart(BufferStart), BufferEnd(BufferEnd),
706    FileLoc(FileLoc), BufferPtr(BufferStart),
707    CommentState(LCS_BeforeComment), State(LS_Normal) {
708}
709
710void Lexer::lex(Token &T) {
711again:
712  switch (CommentState) {
713  case LCS_BeforeComment:
714    if (BufferPtr == BufferEnd) {
715      formTokenWithChars(T, BufferPtr, tok::eof);
716      return;
717    }
718
719    assert(*BufferPtr == '/');
720    BufferPtr++; // Skip first slash.
721    switch(*BufferPtr) {
722    case '/': { // BCPL comment.
723      BufferPtr++; // Skip second slash.
724
725      if (BufferPtr != BufferEnd) {
726        // Skip Doxygen magic marker, if it is present.
727        // It might be missing because of a typo //< or /*<, or because we
728        // merged this non-Doxygen comment into a bunch of Doxygen comments
729        // around it: /** ... */ /* ... */ /** ... */
730        const char C = *BufferPtr;
731        if (C == '/' || C == '!')
732          BufferPtr++;
733      }
734
735      // Skip less-than symbol that marks trailing comments.
736      // Skip it even if the comment is not a Doxygen one, because //< and /*<
737      // are frequent typos.
738      if (BufferPtr != BufferEnd && *BufferPtr == '<')
739        BufferPtr++;
740
741      CommentState = LCS_InsideBCPLComment;
742      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
743        State = LS_Normal;
744      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
745      goto again;
746    }
747    case '*': { // C comment.
748      BufferPtr++; // Skip star.
749
750      // Skip Doxygen magic marker.
751      const char C = *BufferPtr;
752      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
753        BufferPtr++;
754
755      // Skip less-than symbol that marks trailing comments.
756      if (BufferPtr != BufferEnd && *BufferPtr == '<')
757        BufferPtr++;
758
759      CommentState = LCS_InsideCComment;
760      State = LS_Normal;
761      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
762      goto again;
763    }
764    default:
765      llvm_unreachable("second character of comment should be '/' or '*'");
766    }
767
768  case LCS_BetweenComments: {
769    // Consecutive comments are extracted only if there is only whitespace
770    // between them.  So we can search for the start of the next comment.
771    const char *EndWhitespace = BufferPtr;
772    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
773      EndWhitespace++;
774
775    // Turn any whitespace between comments (and there is only whitespace
776    // between them -- guaranteed by comment extraction) into a newline.  We
777    // have two newlines between C comments in total (first one was synthesized
778    // after a comment).
779    formTokenWithChars(T, EndWhitespace, tok::newline);
780
781    CommentState = LCS_BeforeComment;
782    break;
783  }
784
785  case LCS_InsideBCPLComment:
786  case LCS_InsideCComment:
787    if (BufferPtr != CommentEnd) {
788      lexCommentText(T);
789      break;
790    } else {
791      // Skip C comment closing sequence.
792      if (CommentState == LCS_InsideCComment) {
793        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
794        BufferPtr += 2;
795        assert(BufferPtr <= BufferEnd);
796
797        // Synthenize newline just after the C comment, regardless if there is
798        // actually a newline.
799        formTokenWithChars(T, BufferPtr, tok::newline);
800
801        CommentState = LCS_BetweenComments;
802        break;
803      } else {
804        // Don't synthesized a newline after BCPL comment.
805        CommentState = LCS_BetweenComments;
806        goto again;
807      }
808    }
809  }
810}
811
812StringRef Lexer::getSpelling(const Token &Tok,
813                             const SourceManager &SourceMgr,
814                             bool *Invalid) const {
815  SourceLocation Loc = Tok.getLocation();
816  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
817
818  bool InvalidTemp = false;
819  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
820  if (InvalidTemp) {
821    *Invalid = true;
822    return StringRef();
823  }
824
825  const char *Begin = File.data() + LocInfo.second;
826  return StringRef(Begin, Tok.getLength());
827}
828
829} // end namespace comments
830} // end namespace clang
831
832