1//===--- CommentLexer.cpp -------------------------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "clang/AST/CommentLexer.h"
11#include "clang/AST/CommentCommandTraits.h"
12#include "clang/AST/CommentDiagnostic.h"
13#include "clang/Basic/CharInfo.h"
14#include "llvm/ADT/StringExtras.h"
15#include "llvm/ADT/StringSwitch.h"
16#include "llvm/Support/ConvertUTF.h"
17#include "llvm/Support/ErrorHandling.h"
18
19namespace clang {
20namespace comments {
21
22void Token::dump(const Lexer &L, const SourceManager &SM) const {
23  llvm::errs() << "comments::Token Kind=" << Kind << " ";
24  Loc.dump(SM);
25  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
26}
27
28static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
29  return isLetter(C);
30}
31
32static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
33  return isDigit(C);
34}
35
36static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
37  return isHexDigit(C);
38}
39
40static inline StringRef convertCodePointToUTF8(
41                                      llvm::BumpPtrAllocator &Allocator,
42                                      unsigned CodePoint) {
43  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
44  char *ResolvedPtr = Resolved;
45  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
46    return StringRef(Resolved, ResolvedPtr - Resolved);
47  else
48    return StringRef();
49}
50
51namespace {
52
53#include "clang/AST/CommentHTMLTags.inc"
54#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
55
56} // end anonymous namespace
57
58StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
59  // Fast path, first check a few most widely used named character references.
60  return llvm::StringSwitch<StringRef>(Name)
61      .Case("amp", "&")
62      .Case("lt", "<")
63      .Case("gt", ">")
64      .Case("quot", "\"")
65      .Case("apos", "\'")
66      // Slow path.
67      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
68}
69
70StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
71  unsigned CodePoint = 0;
72  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
73    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
74    CodePoint *= 10;
75    CodePoint += Name[i] - '0';
76  }
77  return convertCodePointToUTF8(Allocator, CodePoint);
78}
79
80StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
81  unsigned CodePoint = 0;
82  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
83    CodePoint *= 16;
84    const char C = Name[i];
85    assert(isHTMLHexCharacterReferenceCharacter(C));
86    CodePoint += llvm::hexDigitValue(C);
87  }
88  return convertCodePointToUTF8(Allocator, CodePoint);
89}
90
91void Lexer::skipLineStartingDecorations() {
92  // This function should be called only for C comments
93  assert(CommentState == LCS_InsideCComment);
94
95  if (BufferPtr == CommentEnd)
96    return;
97
98  switch (*BufferPtr) {
99  case ' ':
100  case '\t':
101  case '\f':
102  case '\v': {
103    const char *NewBufferPtr = BufferPtr;
104    NewBufferPtr++;
105    if (NewBufferPtr == CommentEnd)
106      return;
107
108    char C = *NewBufferPtr;
109    while (isHorizontalWhitespace(C)) {
110      NewBufferPtr++;
111      if (NewBufferPtr == CommentEnd)
112        return;
113      C = *NewBufferPtr;
114    }
115    if (C == '*')
116      BufferPtr = NewBufferPtr + 1;
117    break;
118  }
119  case '*':
120    BufferPtr++;
121    break;
122  }
123}
124
125namespace {
126/// Returns pointer to the first newline character in the string.
127const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
128  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
129    if (isVerticalWhitespace(*BufferPtr))
130      return BufferPtr;
131  }
132  return BufferEnd;
133}
134
/// Skips one line break ("\n", "\r" or "\r\n") located at \p BufferPtr.
///
/// \pre BufferPtr == BufferEnd, or *BufferPtr is '\n' or '\r'.
/// \returns the position just after the line break, or \p BufferPtr itself
/// when the range is empty.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr;
  const char *Next = BufferPtr + 1;
  if (First == '\n')
    return Next;

  assert(First == '\r');
  // A carriage return directly followed by a line feed is one line break.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
149
150const char *skipNamedCharacterReference(const char *BufferPtr,
151                                        const char *BufferEnd) {
152  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
154      return BufferPtr;
155  }
156  return BufferEnd;
157}
158
159const char *skipDecimalCharacterReference(const char *BufferPtr,
160                                          const char *BufferEnd) {
161  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
163      return BufferPtr;
164  }
165  return BufferEnd;
166}
167
168const char *skipHexCharacterReference(const char *BufferPtr,
169                                      const char *BufferEnd) {
170  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
171    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
172      return BufferPtr;
173  }
174  return BufferEnd;
175}
176
177bool isHTMLIdentifierStartingCharacter(char C) {
178  return isLetter(C);
179}
180
181bool isHTMLIdentifierCharacter(char C) {
182  return isAlphanumeric(C);
183}
184
185const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
186  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
187    if (!isHTMLIdentifierCharacter(*BufferPtr))
188      return BufferPtr;
189  }
190  return BufferEnd;
191}
192
/// Skips an HTML string quoted in single or double quotes.  A quote character
/// that directly follows a backslash does not terminate the string.
///
/// \pre *BufferPtr is the opening quote character.
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Delimiter = BufferPtr[0];
  assert(Delimiter == '\"' || Delimiter == '\'');

  // Start right after the opening quote, so Cur[-1] is always readable.
  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    const bool PrecededByBackslash = Cur[-1] == '\\';
    if (*Cur == Delimiter && !PrecededByBackslash)
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
210
211const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
212  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
213    if (!isWhitespace(*BufferPtr))
214      return BufferPtr;
215  }
216  return BufferEnd;
217}
218
219bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
220  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
221}
222
223bool isCommandNameStartCharacter(char C) {
224  return isLetter(C);
225}
226
227bool isCommandNameCharacter(char C) {
228  return isAlphanumeric(C);
229}
230
231const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
232  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
233    if (!isCommandNameCharacter(*BufferPtr))
234      return BufferPtr;
235  }
236  return BufferEnd;
237}
238
239/// Return the one past end pointer for BCPL comments.
240/// Handles newlines escaped with backslash or trigraph for backslahs.
241const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
242  const char *CurPtr = BufferPtr;
243  while (CurPtr != BufferEnd) {
244    while (!isVerticalWhitespace(*CurPtr)) {
245      CurPtr++;
246      if (CurPtr == BufferEnd)
247        return BufferEnd;
248    }
249    // We found a newline, check if it is escaped.
250    const char *EscapePtr = CurPtr - 1;
251    while(isHorizontalWhitespace(*EscapePtr))
252      EscapePtr--;
253
254    if (*EscapePtr == '\\' ||
255        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
256         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
257      // We found an escaped newline.
258      CurPtr = skipNewline(CurPtr, BufferEnd);
259    } else
260      return CurPtr; // Not an escaped newline.
261  }
262  return BufferEnd;
263}
264
265/// Return the one past end pointer for C comments.
266/// Very dumb, does not handle escaped newlines or trigraphs.
267const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
268  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
269    if (*BufferPtr == '*') {
270      assert(BufferPtr + 1 != BufferEnd);
271      if (*(BufferPtr + 1) == '/')
272        return BufferPtr;
273    }
274  }
275  llvm_unreachable("buffer end hit before '*/' was seen");
276}
277
278} // end anonymous namespace
279
280void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
281                               tok::TokenKind Kind) {
282  const unsigned TokLen = TokEnd - BufferPtr;
283  Result.setLocation(getSourceLocation(BufferPtr));
284  Result.setKind(Kind);
285  Result.setLength(TokLen);
286#ifndef NDEBUG
287  Result.TextPtr = "<UNSET>";
288  Result.IntVal = 7;
289#endif
290  BufferPtr = TokEnd;
291}
292
293void Lexer::lexCommentText(Token &T) {
294  assert(CommentState == LCS_InsideBCPLComment ||
295         CommentState == LCS_InsideCComment);
296
297  switch (State) {
298  case LS_Normal:
299    break;
300  case LS_VerbatimBlockFirstLine:
301    lexVerbatimBlockFirstLine(T);
302    return;
303  case LS_VerbatimBlockBody:
304    lexVerbatimBlockBody(T);
305    return;
306  case LS_VerbatimLineText:
307    lexVerbatimLineText(T);
308    return;
309  case LS_HTMLStartTag:
310    lexHTMLStartTag(T);
311    return;
312  case LS_HTMLEndTag:
313    lexHTMLEndTag(T);
314    return;
315  }
316
317  assert(State == LS_Normal);
318
319  const char *TokenPtr = BufferPtr;
320  assert(TokenPtr < CommentEnd);
321  while (TokenPtr != CommentEnd) {
322    switch(*TokenPtr) {
323      case '\\':
324      case '@': {
325        // Commands that start with a backslash and commands that start with
326        // 'at' have equivalent semantics.  But we keep information about the
327        // exact syntax in AST for comments.
328        tok::TokenKind CommandKind =
329            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
330        TokenPtr++;
331        if (TokenPtr == CommentEnd) {
332          formTextToken(T, TokenPtr);
333          return;
334        }
335        char C = *TokenPtr;
336        switch (C) {
337        default:
338          break;
339
340        case '\\': case '@': case '&': case '$':
341        case '#':  case '<': case '>': case '%':
342        case '\"': case '.': case ':':
343          // This is one of \\ \@ \& \$ etc escape sequences.
344          TokenPtr++;
345          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
346            // This is the \:: escape sequence.
347            TokenPtr++;
348          }
349          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
350          formTokenWithChars(T, TokenPtr, tok::text);
351          T.setText(UnescapedText);
352          return;
353        }
354
355        // Don't make zero-length commands.
356        if (!isCommandNameStartCharacter(*TokenPtr)) {
357          formTextToken(T, TokenPtr);
358          return;
359        }
360
361        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
362        unsigned Length = TokenPtr - (BufferPtr + 1);
363
364        // Hardcoded support for lexing LaTeX formula commands
365        // \f$ \f[ \f] \f{ \f} as a single command.
366        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
367          C = *TokenPtr;
368          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
369            TokenPtr++;
370            Length++;
371          }
372        }
373
374        StringRef CommandName(BufferPtr + 1, Length);
375
376        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
377        if (!Info) {
378          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
379            StringRef CorrectedName = Info->Name;
380            SourceLocation Loc = getSourceLocation(BufferPtr);
381            SourceRange CommandRange(Loc.getLocWithOffset(1),
382                                     getSourceLocation(TokenPtr));
383            Diag(Loc, diag::warn_correct_comment_command_name)
384              << CommandName << CorrectedName
385              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
386          } else {
387            formTokenWithChars(T, TokenPtr, tok::unknown_command);
388            T.setUnknownCommandName(CommandName);
389            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
390            return;
391          }
392        }
393        if (Info->IsVerbatimBlockCommand) {
394          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
395          return;
396        }
397        if (Info->IsVerbatimLineCommand) {
398          setupAndLexVerbatimLine(T, TokenPtr, Info);
399          return;
400        }
401        formTokenWithChars(T, TokenPtr, CommandKind);
402        T.setCommandID(Info->getID());
403        return;
404      }
405
406      case '&':
407        lexHTMLCharacterReference(T);
408        return;
409
410      case '<': {
411        TokenPtr++;
412        if (TokenPtr == CommentEnd) {
413          formTextToken(T, TokenPtr);
414          return;
415        }
416        const char C = *TokenPtr;
417        if (isHTMLIdentifierStartingCharacter(C))
418          setupAndLexHTMLStartTag(T);
419        else if (C == '/')
420          setupAndLexHTMLEndTag(T);
421        else
422          formTextToken(T, TokenPtr);
423        return;
424      }
425
426      case '\n':
427      case '\r':
428        TokenPtr = skipNewline(TokenPtr, CommentEnd);
429        formTokenWithChars(T, TokenPtr, tok::newline);
430
431        if (CommentState == LCS_InsideCComment)
432          skipLineStartingDecorations();
433        return;
434
435      default: {
436        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
437                         find_first_of("\n\r\\@&<");
438        if (End != StringRef::npos)
439          TokenPtr += End;
440        else
441          TokenPtr = CommentEnd;
442        formTextToken(T, TokenPtr);
443        return;
444      }
445    }
446  }
447}
448
449void Lexer::setupAndLexVerbatimBlock(Token &T,
450                                     const char *TextBegin,
451                                     char Marker, const CommandInfo *Info) {
452  assert(Info->IsVerbatimBlockCommand);
453
454  VerbatimBlockEndCommandName.clear();
455  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
456  VerbatimBlockEndCommandName.append(Info->EndCommandName);
457
458  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
459  T.setVerbatimBlockID(Info->getID());
460
461  // If there is a newline following the verbatim opening command, skip the
462  // newline so that we don't create an tok::verbatim_block_line with empty
463  // text content.
464  if (BufferPtr != CommentEnd &&
465      isVerticalWhitespace(*BufferPtr)) {
466    BufferPtr = skipNewline(BufferPtr, CommentEnd);
467    State = LS_VerbatimBlockBody;
468    return;
469  }
470
471  State = LS_VerbatimBlockFirstLine;
472}
473
474void Lexer::lexVerbatimBlockFirstLine(Token &T) {
475again:
476  assert(BufferPtr < CommentEnd);
477
478  // FIXME: It would be better to scan the text once, finding either the block
479  // end command or newline.
480  //
481  // Extract current line.
482  const char *Newline = findNewline(BufferPtr, CommentEnd);
483  StringRef Line(BufferPtr, Newline - BufferPtr);
484
485  // Look for end command in current line.
486  size_t Pos = Line.find(VerbatimBlockEndCommandName);
487  const char *TextEnd;
488  const char *NextLine;
489  if (Pos == StringRef::npos) {
490    // Current line is completely verbatim.
491    TextEnd = Newline;
492    NextLine = skipNewline(Newline, CommentEnd);
493  } else if (Pos == 0) {
494    // Current line contains just an end command.
495    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
496    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
497    formTokenWithChars(T, End, tok::verbatim_block_end);
498    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
499    State = LS_Normal;
500    return;
501  } else {
502    // There is some text, followed by end command.  Extract text first.
503    TextEnd = BufferPtr + Pos;
504    NextLine = TextEnd;
505    // If there is only whitespace before end command, skip whitespace.
506    if (isWhitespace(BufferPtr, TextEnd)) {
507      BufferPtr = TextEnd;
508      goto again;
509    }
510  }
511
512  StringRef Text(BufferPtr, TextEnd - BufferPtr);
513  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
514  T.setVerbatimBlockText(Text);
515
516  State = LS_VerbatimBlockBody;
517}
518
519void Lexer::lexVerbatimBlockBody(Token &T) {
520  assert(State == LS_VerbatimBlockBody);
521
522  if (CommentState == LCS_InsideCComment)
523    skipLineStartingDecorations();
524
525  if (BufferPtr == CommentEnd) {
526    formTokenWithChars(T, BufferPtr, tok::verbatim_block_line);
527    T.setVerbatimBlockText("");
528    return;
529  }
530
531  lexVerbatimBlockFirstLine(T);
532}
533
534void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
535                                    const CommandInfo *Info) {
536  assert(Info->IsVerbatimLineCommand);
537  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
538  T.setVerbatimLineID(Info->getID());
539
540  State = LS_VerbatimLineText;
541}
542
543void Lexer::lexVerbatimLineText(Token &T) {
544  assert(State == LS_VerbatimLineText);
545
546  // Extract current line.
547  const char *Newline = findNewline(BufferPtr, CommentEnd);
548  StringRef Text(BufferPtr, Newline - BufferPtr);
549  formTokenWithChars(T, Newline, tok::verbatim_line_text);
550  T.setVerbatimLineText(Text);
551
552  State = LS_Normal;
553}
554
555void Lexer::lexHTMLCharacterReference(Token &T) {
556  const char *TokenPtr = BufferPtr;
557  assert(*TokenPtr == '&');
558  TokenPtr++;
559  if (TokenPtr == CommentEnd) {
560    formTextToken(T, TokenPtr);
561    return;
562  }
563  const char *NamePtr;
564  bool isNamed = false;
565  bool isDecimal = false;
566  char C = *TokenPtr;
567  if (isHTMLNamedCharacterReferenceCharacter(C)) {
568    NamePtr = TokenPtr;
569    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
570    isNamed = true;
571  } else if (C == '#') {
572    TokenPtr++;
573    if (TokenPtr == CommentEnd) {
574      formTextToken(T, TokenPtr);
575      return;
576    }
577    C = *TokenPtr;
578    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
579      NamePtr = TokenPtr;
580      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
581      isDecimal = true;
582    } else if (C == 'x' || C == 'X') {
583      TokenPtr++;
584      NamePtr = TokenPtr;
585      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
586    } else {
587      formTextToken(T, TokenPtr);
588      return;
589    }
590  } else {
591    formTextToken(T, TokenPtr);
592    return;
593  }
594  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
595      *TokenPtr != ';') {
596    formTextToken(T, TokenPtr);
597    return;
598  }
599  StringRef Name(NamePtr, TokenPtr - NamePtr);
600  TokenPtr++; // Skip semicolon.
601  StringRef Resolved;
602  if (isNamed)
603    Resolved = resolveHTMLNamedCharacterReference(Name);
604  else if (isDecimal)
605    Resolved = resolveHTMLDecimalCharacterReference(Name);
606  else
607    Resolved = resolveHTMLHexCharacterReference(Name);
608
609  if (Resolved.empty()) {
610    formTextToken(T, TokenPtr);
611    return;
612  }
613  formTokenWithChars(T, TokenPtr, tok::text);
614  T.setText(Resolved);
615}
616
617void Lexer::setupAndLexHTMLStartTag(Token &T) {
618  assert(BufferPtr[0] == '<' &&
619         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
620  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
621  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
622  if (!isHTMLTagName(Name)) {
623    formTextToken(T, TagNameEnd);
624    return;
625  }
626
627  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
628  T.setHTMLTagStartName(Name);
629
630  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
631
632  const char C = *BufferPtr;
633  if (BufferPtr != CommentEnd &&
634      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
635    State = LS_HTMLStartTag;
636}
637
638void Lexer::lexHTMLStartTag(Token &T) {
639  assert(State == LS_HTMLStartTag);
640
641  const char *TokenPtr = BufferPtr;
642  char C = *TokenPtr;
643  if (isHTMLIdentifierCharacter(C)) {
644    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
645    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
646    formTokenWithChars(T, TokenPtr, tok::html_ident);
647    T.setHTMLIdent(Ident);
648  } else {
649    switch (C) {
650    case '=':
651      TokenPtr++;
652      formTokenWithChars(T, TokenPtr, tok::html_equals);
653      break;
654    case '\"':
655    case '\'': {
656      const char *OpenQuote = TokenPtr;
657      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
658      const char *ClosingQuote = TokenPtr;
659      if (TokenPtr != CommentEnd) // Skip closing quote.
660        TokenPtr++;
661      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
662      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
663                                      ClosingQuote - (OpenQuote + 1)));
664      break;
665    }
666    case '>':
667      TokenPtr++;
668      formTokenWithChars(T, TokenPtr, tok::html_greater);
669      State = LS_Normal;
670      return;
671    case '/':
672      TokenPtr++;
673      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
674        TokenPtr++;
675        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
676      } else
677        formTextToken(T, TokenPtr);
678
679      State = LS_Normal;
680      return;
681    }
682  }
683
684  // Now look ahead and return to normal state if we don't see any HTML tokens
685  // ahead.
686  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
687  if (BufferPtr == CommentEnd) {
688    State = LS_Normal;
689    return;
690  }
691
692  C = *BufferPtr;
693  if (!isHTMLIdentifierStartingCharacter(C) &&
694      C != '=' && C != '\"' && C != '\'' && C != '>') {
695    State = LS_Normal;
696    return;
697  }
698}
699
700void Lexer::setupAndLexHTMLEndTag(Token &T) {
701  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
702
703  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
704  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
705  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
706  if (!isHTMLTagName(Name)) {
707    formTextToken(T, TagNameEnd);
708    return;
709  }
710
711  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
712
713  formTokenWithChars(T, End, tok::html_end_tag);
714  T.setHTMLTagEndName(Name);
715
716  if (BufferPtr != CommentEnd && *BufferPtr == '>')
717    State = LS_HTMLEndTag;
718}
719
720void Lexer::lexHTMLEndTag(Token &T) {
721  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
722
723  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
724  State = LS_Normal;
725}
726
727Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
728             const CommandTraits &Traits,
729             SourceLocation FileLoc,
730             const char *BufferStart, const char *BufferEnd):
731    Allocator(Allocator), Diags(Diags), Traits(Traits),
732    BufferStart(BufferStart), BufferEnd(BufferEnd),
733    FileLoc(FileLoc), BufferPtr(BufferStart),
734    CommentState(LCS_BeforeComment), State(LS_Normal) {
735}
736
737void Lexer::lex(Token &T) {
738again:
739  switch (CommentState) {
740  case LCS_BeforeComment:
741    if (BufferPtr == BufferEnd) {
742      formTokenWithChars(T, BufferPtr, tok::eof);
743      return;
744    }
745
746    assert(*BufferPtr == '/');
747    BufferPtr++; // Skip first slash.
748    switch(*BufferPtr) {
749    case '/': { // BCPL comment.
750      BufferPtr++; // Skip second slash.
751
752      if (BufferPtr != BufferEnd) {
753        // Skip Doxygen magic marker, if it is present.
754        // It might be missing because of a typo //< or /*<, or because we
755        // merged this non-Doxygen comment into a bunch of Doxygen comments
756        // around it: /** ... */ /* ... */ /** ... */
757        const char C = *BufferPtr;
758        if (C == '/' || C == '!')
759          BufferPtr++;
760      }
761
762      // Skip less-than symbol that marks trailing comments.
763      // Skip it even if the comment is not a Doxygen one, because //< and /*<
764      // are frequent typos.
765      if (BufferPtr != BufferEnd && *BufferPtr == '<')
766        BufferPtr++;
767
768      CommentState = LCS_InsideBCPLComment;
769      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
770        State = LS_Normal;
771      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
772      goto again;
773    }
774    case '*': { // C comment.
775      BufferPtr++; // Skip star.
776
777      // Skip Doxygen magic marker.
778      const char C = *BufferPtr;
779      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
780        BufferPtr++;
781
782      // Skip less-than symbol that marks trailing comments.
783      if (BufferPtr != BufferEnd && *BufferPtr == '<')
784        BufferPtr++;
785
786      CommentState = LCS_InsideCComment;
787      State = LS_Normal;
788      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
789      goto again;
790    }
791    default:
792      llvm_unreachable("second character of comment should be '/' or '*'");
793    }
794
795  case LCS_BetweenComments: {
796    // Consecutive comments are extracted only if there is only whitespace
797    // between them.  So we can search for the start of the next comment.
798    const char *EndWhitespace = BufferPtr;
799    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
800      EndWhitespace++;
801
802    // Turn any whitespace between comments (and there is only whitespace
803    // between them -- guaranteed by comment extraction) into a newline.  We
804    // have two newlines between C comments in total (first one was synthesized
805    // after a comment).
806    formTokenWithChars(T, EndWhitespace, tok::newline);
807
808    CommentState = LCS_BeforeComment;
809    break;
810  }
811
812  case LCS_InsideBCPLComment:
813  case LCS_InsideCComment:
814    if (BufferPtr != CommentEnd) {
815      lexCommentText(T);
816      break;
817    } else {
818      // Skip C comment closing sequence.
819      if (CommentState == LCS_InsideCComment) {
820        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
821        BufferPtr += 2;
822        assert(BufferPtr <= BufferEnd);
823
824        // Synthenize newline just after the C comment, regardless if there is
825        // actually a newline.
826        formTokenWithChars(T, BufferPtr, tok::newline);
827
828        CommentState = LCS_BetweenComments;
829        break;
830      } else {
831        // Don't synthesized a newline after BCPL comment.
832        CommentState = LCS_BetweenComments;
833        goto again;
834      }
835    }
836  }
837}
838
839StringRef Lexer::getSpelling(const Token &Tok,
840                             const SourceManager &SourceMgr,
841                             bool *Invalid) const {
842  SourceLocation Loc = Tok.getLocation();
843  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
844
845  bool InvalidTemp = false;
846  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
847  if (InvalidTemp) {
848    *Invalid = true;
849    return StringRef();
850  }
851
852  const char *Begin = File.data() + LocInfo.second;
853  return StringRef(Begin, Tok.getLength());
854}
855
856} // end namespace comments
857} // end namespace clang
858