CommentLexer.cpp revision ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2
12d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "clang/AST/CommentLexer.h"
2ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian#include "clang/Lex/LexDiagnostic.h"
3aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko#include "clang/AST/CommentCommandTraits.h"
4bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko#include "clang/Basic/CharInfo.h"
5c934dfe950a14fe447aa14a7dae25d00ee87c8bbDmitri Gribenko#include "llvm/ADT/StringExtras.h"
62d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/ADT/StringSwitch.h"
7cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko#include "llvm/Support/ConvertUTF.h"
82d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/Support/ErrorHandling.h"
92d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace clang {
112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace comments {
122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Token::dump(const Lexer &L, const SourceManager &SM) const {
142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm::errs() << "comments::Token Kind=" << Kind << " ";
152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  Loc.dump(SM);
162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
190ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isLetter(C);
21477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
22477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
230ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isDigit(C);
25477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
26477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
270ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isHexDigit(C);
29477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
30834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko
310ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline StringRef convertCodePointToUTF8(
320ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko                                      llvm::BumpPtrAllocator &Allocator,
330ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko                                      unsigned CodePoint) {
34658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian  char *ResolvedPtr = Resolved;
36cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko  if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian    return StringRef(Resolved, ResolvedPtr - Resolved);
38658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian  else
39658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian    return StringRef();
40658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian}
415bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko
420ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkonamespace {
430ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko
445bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLTags.inc"
455bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
465bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko
475bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko} // unnamed namespace
48658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian
49477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
505bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  // Fast path, first check a few most widely used named character references.
51477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return llvm::StringSwitch<StringRef>(Name)
52477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("amp", "&")
53477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("lt", "<")
54477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("gt", ">")
55477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("quot", "\"")
56477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("apos", "\'")
575bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko      // Slow path.
585bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko      .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian}
60477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
61477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  unsigned CodePoint = 0;
63477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    CodePoint *= 10;
66477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    CodePoint += Name[i] - '0';
67477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
685bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  return convertCodePointToUTF8(Allocator, CodePoint);
695bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko}
70477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
715bd1e5ba000023910ad986a16dd16d7ca914750aDmitri GribenkoStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
725bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  unsigned CodePoint = 0;
735bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
745bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko    CodePoint *= 16;
755bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko    const char C = Name[i];
765bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko    assert(isHTMLHexCharacterReferenceCharacter(C));
775bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko    CodePoint += llvm::hexDigitValue(C);
785bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  }
795bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  return convertCodePointToUTF8(Allocator, CodePoint);
80477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
81477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::skipLineStartingDecorations() {
832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // This function should be called only for C comments
842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(CommentState == LCS_InsideCComment);
852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (BufferPtr == CommentEnd)
872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (*BufferPtr) {
902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case ' ':
912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\t':
922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\f':
932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\v': {
942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *NewBufferPtr = BufferPtr;
952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    NewBufferPtr++;
962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (NewBufferPtr == CommentEnd)
972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return;
982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    char C = *NewBufferPtr;
100bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    while (isHorizontalWhitespace(C)) {
1012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      NewBufferPtr++;
1022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (NewBufferPtr == CommentEnd)
1032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
1042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      C = *NewBufferPtr;
1052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
1062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (C == '*')
1072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr = NewBufferPtr + 1;
1082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
1092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '*':
1112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferPtr++;
1122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
1132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
1162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace {
1178d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko/// Returns pointer to the first newline character in the string.
1182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findNewline(const char *BufferPtr, const char *BufferEnd) {
1192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    if (isVerticalWhitespace(*BufferPtr))
1212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
1222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
1242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Skip one newline sequence starting at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// \p BufferPtr must either equal \p BufferEnd (in which case it is returned
/// unchanged) or point at '\n' or '\r'; the latter is asserted.
///
/// \returns a pointer to the first character after the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  if (*BufferPtr == '\n')
    return BufferPtr + 1;

  assert(*BufferPtr == '\r');
  ++BufferPtr;
  // Treat "\r\n" as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
1402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
141477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipNamedCharacterReference(const char *BufferPtr,
142477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                        const char *BufferEnd) {
143477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
146477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
147477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
148477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
149477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
150477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipDecimalCharacterReference(const char *BufferPtr,
151477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                          const char *BufferEnd) {
152477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
155477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
156477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
157477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
158477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
159477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipHexCharacterReference(const char *BufferPtr,
160477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                          const char *BufferEnd) {
161477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
164477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
165477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
166477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
167477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
168a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenkobool isHTMLIdentifierStartingCharacter(char C) {
169bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isLetter(C);
170a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko}
171a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko
1722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHTMLIdentifierCharacter(char C) {
173bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isAlphanumeric(C);
1742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
1762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
1772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isHTMLIdentifierCharacter(*BufferPtr))
1792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
1802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
1822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Skip HTML string quoted in single or double quotes.  Escaping quotes inside
/// string allowed: a quote character directly preceded by a backslash does not
/// terminate the string.
///
/// \p BufferPtr must point at the opening quote (asserted), so it must not
/// equal \p BufferEnd.
///
/// Returns pointer to closing quote, or \p BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Start after the opening quote; looking one character back is therefore
  // always within the string.
  for (++BufferPtr; BufferPtr != BufferEnd; ++BufferPtr) {
    const bool IsQuote = *BufferPtr == Quote;
    const bool IsEscaped = BufferPtr[-1] == '\\';
    if (IsQuote && !IsEscaped)
      return BufferPtr;
  }
  return BufferEnd;
}
2012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
2032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isWhitespace(*BufferPtr))
2052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
2062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
21064da4e55c111f4733135e1780216609569767351Dmitri Gribenkobool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
21164da4e55c111f4733135e1780216609569767351Dmitri Gribenko  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
21264da4e55c111f4733135e1780216609569767351Dmitri Gribenko}
21364da4e55c111f4733135e1780216609569767351Dmitri Gribenko
2148c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenkobool isCommandNameStartCharacter(char C) {
215bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isLetter(C);
2168c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko}
2178c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko
2182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isCommandNameCharacter(char C) {
219bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  return isAlphanumeric(C);
2202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
2232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isCommandNameCharacter(*BufferPtr))
2252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
2262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for BCPL comments.
2312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Handles newlines escaped with backslash or trigraph for backslahs.
2322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *CurPtr = BufferPtr;
2342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  while (CurPtr != BufferEnd) {
235bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    while (!isVerticalWhitespace(*CurPtr)) {
2362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CurPtr++;
2372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (CurPtr == BufferEnd)
2382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return BufferEnd;
2392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
2402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // We found a newline, check if it is escaped.
2412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *EscapePtr = CurPtr - 1;
2422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while(isHorizontalWhitespace(*EscapePtr))
2432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      EscapePtr--;
2442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (*EscapePtr == '\\' ||
2462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
2472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
2482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // We found an escaped newline.
2492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CurPtr = skipNewline(CurPtr, BufferEnd);
2502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    } else
2512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return CurPtr; // Not an escaped newline.
2522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for C comments.
2572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Very dumb, does not handle escaped newlines or trigraphs.
2582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (*BufferPtr == '*') {
2612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      assert(BufferPtr + 1 != BufferEnd);
2622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (*(BufferPtr + 1) == '/')
2632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return BufferPtr;
2642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
2652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm_unreachable("buffer end hit before '*/' was seen");
2672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // unnamed namespace
2692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexCommentText(Token &T) {
2712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(CommentState == LCS_InsideBCPLComment ||
2722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko         CommentState == LCS_InsideCComment);
2732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (State) {
2752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_Normal:
2762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
2772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_VerbatimBlockFirstLine:
2782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    lexVerbatimBlockFirstLine(T);
2792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
2802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_VerbatimBlockBody:
2812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    lexVerbatimBlockBody(T);
2822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
283962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  case LS_VerbatimLineText:
284962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko    lexVerbatimLineText(T);
285962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko    return;
2863f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  case LS_HTMLStartTag:
2873f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    lexHTMLStartTag(T);
2882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
2893f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  case LS_HTMLEndTag:
2903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    lexHTMLEndTag(T);
2918d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    return;
2922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(State == LS_Normal);
2952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TokenPtr = BufferPtr;
2972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(TokenPtr < CommentEnd);
2982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  while (TokenPtr != CommentEnd) {
2992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch(*TokenPtr) {
3002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\\':
3012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '@': {
302808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko        // Commands that start with a backslash and commands that start with
303808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko        // 'at' have equivalent semantics.  But we keep information about the
304808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko        // exact syntax in AST for comments.
305808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko        tok::TokenKind CommandKind =
306808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
3072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
3082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (TokenPtr == CommentEnd) {
309477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        char C = *TokenPtr;
3132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        switch (C) {
3142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        default:
3152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          break;
3162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '\\': case '@': case '&': case '$':
3182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '#':  case '<': case '>': case '%':
3192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '\"': case '.': case ':':
3202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          // This is one of \\ \@ \& \$ etc escape sequences.
3212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          TokenPtr++;
3222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
3232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            // This is the \:: escape sequence.
3242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            TokenPtr++;
3252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          }
326f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
3272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          formTokenWithChars(T, TokenPtr, tok::text);
328f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko          T.setText(UnescapedText);
3292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Don't make zero-length commands.
3338c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko        if (!isCommandNameStartCharacter(*TokenPtr)) {
334477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
3392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        unsigned Length = TokenPtr - (BufferPtr + 1);
3402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Hardcoded support for lexing LaTeX formula commands
3422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // \f$ \f[ \f] \f{ \f} as a single command.
3432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
3442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          C = *TokenPtr;
3452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
3462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            TokenPtr++;
3472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            Length++;
3482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          }
3492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const StringRef CommandName(BufferPtr + 1, Length);
3522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
353e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
354e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        if (!Info) {
355e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko          formTokenWithChars(T, TokenPtr, tok::unknown_command);
356e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko          T.setUnknownCommandName(CommandName);
357ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian          Diag(T.getLocation(),
358ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian               diag::warn_unknown_comment_command_name);
3592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
361e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        if (Info->IsVerbatimBlockCommand) {
362e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
363e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko          return;
364e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        }
365e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        if (Info->IsVerbatimLineCommand) {
366e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko          setupAndLexVerbatimLine(T, TokenPtr, Info);
3672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
369808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko        formTokenWithChars(T, TokenPtr, CommandKind);
370e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko        T.setCommandID(Info->getID());
3712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
3722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
3732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
374477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      case '&':
375477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        lexHTMLCharacterReference(T);
376477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        return;
377477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
3782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '<': {
3792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
3802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (TokenPtr == CommentEnd) {
381477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const char C = *TokenPtr;
385a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko        if (isHTMLIdentifierStartingCharacter(C))
3863f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko          setupAndLexHTMLStartTag(T);
3872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        else if (C == '/')
3883f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko          setupAndLexHTMLEndTag(T);
389477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        else
390477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
391477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
3922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
3932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
3942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\n':
3962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\r':
3972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr = skipNewline(TokenPtr, CommentEnd);
3982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        formTokenWithChars(T, TokenPtr, tok::newline);
3992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (CommentState == LCS_InsideCComment)
4012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          skipLineStartingDecorations();
4022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
4032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      default: {
405aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
406aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko                         find_first_of("\n\r\\@&<");
407aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko        if (End != StringRef::npos)
408aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko          TokenPtr += End;
409aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko        else
410aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko          TokenPtr = CommentEnd;
411477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        formTextToken(T, TokenPtr);
4122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
4132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
4142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
4152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
4162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::setupAndLexVerbatimBlock(Token &T,
4192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                                     const char *TextBegin,
420e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko                                     char Marker, const CommandInfo *Info) {
421e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko  assert(Info->IsVerbatimBlockCommand);
422e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko
4232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  VerbatimBlockEndCommandName.clear();
4242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
425e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko  VerbatimBlockEndCommandName.append(Info->EndCommandName);
4262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
428e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko  T.setVerbatimBlockID(Info->getID());
4292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4308d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // If there is a newline following the verbatim opening command, skip the
4318d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // newline so that we don't create an tok::verbatim_block_line with empty
4328d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // text content.
433bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko  if (BufferPtr != CommentEnd &&
434bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko      isVerticalWhitespace(*BufferPtr)) {
435bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    BufferPtr = skipNewline(BufferPtr, CommentEnd);
436bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    State = LS_VerbatimBlockBody;
437bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko    return;
4388d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  }
4398d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
4402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  State = LS_VerbatimBlockFirstLine;
4412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockFirstLine(Token &T) {
44464da4e55c111f4733135e1780216609569767351Dmitri Gribenkoagain:
4452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(BufferPtr < CommentEnd);
4462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // FIXME: It would be better to scan the text once, finding either the block
4482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // end command or newline.
4492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  //
4502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Extract current line.
4512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *Newline = findNewline(BufferPtr, CommentEnd);
4522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  StringRef Line(BufferPtr, Newline - BufferPtr);
4532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Look for end command in current line.
4552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  size_t Pos = Line.find(VerbatimBlockEndCommandName);
4568d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  const char *TextEnd;
4572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *NextLine;
4582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (Pos == StringRef::npos) {
4592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Current line is completely verbatim.
4608d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    TextEnd = Newline;
4612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    NextLine = skipNewline(Newline, CommentEnd);
4622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else if (Pos == 0) {
4632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Current line contains just an end command.
4642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
465f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
4662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, End, tok::verbatim_block_end);
467e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko    T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
4682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
4692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
4702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else {
4712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // There is some text, followed by end command.  Extract text first.
4728d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    TextEnd = BufferPtr + Pos;
4738d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    NextLine = TextEnd;
47464da4e55c111f4733135e1780216609569767351Dmitri Gribenko    // If there is only whitespace before end command, skip whitespace.
47564da4e55c111f4733135e1780216609569767351Dmitri Gribenko    if (isWhitespace(BufferPtr, TextEnd)) {
47664da4e55c111f4733135e1780216609569767351Dmitri Gribenko      BufferPtr = TextEnd;
47764da4e55c111f4733135e1780216609569767351Dmitri Gribenko      goto again;
47864da4e55c111f4733135e1780216609569767351Dmitri Gribenko    }
4792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
4802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4818d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  StringRef Text(BufferPtr, TextEnd - BufferPtr);
4822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
483f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  T.setVerbatimBlockText(Text);
4842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  State = LS_VerbatimBlockBody;
4862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockBody(Token &T) {
4892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(State == LS_VerbatimBlockBody);
4902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (CommentState == LCS_InsideCComment)
4922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    skipLineStartingDecorations();
4932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  lexVerbatimBlockFirstLine(T);
4952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
497e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenkovoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
498e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko                                    const CommandInfo *Info) {
499e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko  assert(Info->IsVerbatimLineCommand);
500962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
501e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko  T.setVerbatimLineID(Info->getID());
502962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
503962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  State = LS_VerbatimLineText;
504962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko}
505962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
506962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::lexVerbatimLineText(Token &T) {
507962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  assert(State == LS_VerbatimLineText);
508962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
509962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  // Extract current line.
510962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  const char *Newline = findNewline(BufferPtr, CommentEnd);
511962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  const StringRef Text(BufferPtr, Newline - BufferPtr);
512962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  formTokenWithChars(T, Newline, tok::verbatim_line_text);
5132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  T.setVerbatimLineText(Text);
514962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
515962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  State = LS_Normal;
5162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
5172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
518477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkovoid Lexer::lexHTMLCharacterReference(Token &T) {
519477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  const char *TokenPtr = BufferPtr;
520477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  assert(*TokenPtr == '&');
521477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  TokenPtr++;
522477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (TokenPtr == CommentEnd) {
523477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
524477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
525477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
526477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  const char *NamePtr;
527477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  bool isNamed = false;
528477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  bool isDecimal = false;
529477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char C = *TokenPtr;
530477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (isHTMLNamedCharacterReferenceCharacter(C)) {
531477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    NamePtr = TokenPtr;
532477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
533477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    isNamed = true;
534477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  } else if (C == '#') {
535477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    TokenPtr++;
536477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (TokenPtr == CommentEnd) {
537477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      formTextToken(T, TokenPtr);
538477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return;
539477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    }
540477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    C = *TokenPtr;
541477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
542477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      NamePtr = TokenPtr;
543477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
544477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      isDecimal = true;
545477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    } else if (C == 'x' || C == 'X') {
546477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr++;
547477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      NamePtr = TokenPtr;
548477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
549477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    } else {
550477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      formTextToken(T, TokenPtr);
551477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return;
552477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    }
553477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  } else {
554477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
555477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
556477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
557477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
558477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      *TokenPtr != ';') {
559477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
560477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
561477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
562477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  StringRef Name(NamePtr, TokenPtr - NamePtr);
563477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  TokenPtr++; // Skip semicolon.
564477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  StringRef Resolved;
5655bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko  if (isNamed)
566477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLNamedCharacterReference(Name);
567477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else if (isDecimal)
568477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLDecimalCharacterReference(Name);
569477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else
570477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLHexCharacterReference(Name);
571477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
572477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (Resolved.empty()) {
573477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
574477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
575477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
576477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  formTokenWithChars(T, TokenPtr, tok::text);
577477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  T.setText(Resolved);
578477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return;
579477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
580477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
5813f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLStartTag(Token &T) {
582a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  assert(BufferPtr[0] == '<' &&
583a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
5842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
585f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
586834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  if (!isHTMLTagName(Name)) {
587834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko    formTextToken(T, TagNameEnd);
588834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko    return;
589834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  }
590834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko
5913f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
5923f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  T.setHTMLTagStartName(Name);
5932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
5942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
5952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
596a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  const char C = *BufferPtr;
597a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  if (BufferPtr != CommentEnd &&
598a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
5993f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    State = LS_HTMLStartTag;
6002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6023f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLStartTag(Token &T) {
6033f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  assert(State == LS_HTMLStartTag);
6042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TokenPtr = BufferPtr;
6062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  char C = *TokenPtr;
6072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (isHTMLIdentifierCharacter(C)) {
6082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
609f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
6102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, TokenPtr, tok::html_ident);
611f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    T.setHTMLIdent(Ident);
6122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else {
6132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch (C) {
6142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '=':
6152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr++;
6162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_equals);
6172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
6182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '\"':
6192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '\'': {
6202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char *OpenQuote = TokenPtr;
6212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
6222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char *ClosingQuote = TokenPtr;
6232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (TokenPtr != CommentEnd) // Skip closing quote.
6242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
6252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
6262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
6272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                                      ClosingQuote - (OpenQuote + 1)));
6282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
6292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
6302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '>':
6312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr++;
6322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_greater);
633a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko      State = LS_Normal;
634a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko      return;
635a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko    case '/':
636a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      TokenPtr++;
637a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
638a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko        TokenPtr++;
639a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
640477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      } else
641477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        formTextToken(T, TokenPtr);
642477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
643a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      State = LS_Normal;
644a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      return;
6452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
6462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Now look ahead and return to normal state if we don't see any HTML tokens
6492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // ahead.
6502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
6512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (BufferPtr == CommentEnd) {
6522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
6532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
6542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  C = *BufferPtr;
657a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  if (!isHTMLIdentifierStartingCharacter(C) &&
6582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      C != '=' && C != '\"' && C != '\'' && C != '>') {
6592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
6602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
6612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6643f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLEndTag(Token &T) {
6652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
6662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
6682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
669834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
670834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  if (!isHTMLTagName(Name)) {
671834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko    formTextToken(T, TagNameEnd);
672834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko    return;
673834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  }
6742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
6762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6773f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  formTokenWithChars(T, End, tok::html_end_tag);
678834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko  T.setHTMLTagEndName(Name);
6798d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6808d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  if (BufferPtr != CommentEnd && *BufferPtr == '>')
6813f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    State = LS_HTMLEndTag;
6828d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko}
6838d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6843f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLEndTag(Token &T) {
6858d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
6868d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6878d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
6888d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  State = LS_Normal;
6892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
691ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz JahanianLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
692ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian             const CommandTraits &Traits,
693af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko             SourceLocation FileLoc,
6942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko             const char *BufferStart, const char *BufferEnd):
695ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian    Allocator(Allocator), Diags(Diags), Traits(Traits),
6962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferStart(BufferStart), BufferEnd(BufferEnd),
697af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko    FileLoc(FileLoc), BufferPtr(BufferStart),
6982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    CommentState(LCS_BeforeComment), State(LS_Normal) {
6992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
7002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lex(Token &T) {
7022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoagain:
7032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (CommentState) {
7042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_BeforeComment:
7052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (BufferPtr == BufferEnd) {
7062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, BufferPtr, tok::eof);
7072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return;
7082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    assert(*BufferPtr == '/');
7112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferPtr++; // Skip first slash.
7122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch(*BufferPtr) {
7132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '/': { // BCPL comment.
7142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr++; // Skip second slash.
7152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd) {
7172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Skip Doxygen magic marker, if it is present.
7182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // It might be missing because of a typo //< or /*<, or because we
7192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // merged this non-Doxygen comment into a bunch of Doxygen comments
7202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // around it: /** ... */ /* ... */ /** ... */
7212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const char C = *BufferPtr;
7222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (C == '/' || C == '!')
7232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          BufferPtr++;
7242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
7252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip less-than symbol that marks trailing comments.
7272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip it even if the comment is not a Doxygen one, because //< and /*<
7282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // are frequent typos.
7292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd && *BufferPtr == '<')
7302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentState = LCS_InsideBCPLComment;
7338d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
7348d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko        State = LS_Normal;
7352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
7362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      goto again;
7372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '*': { // C comment.
7392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr++; // Skip star.
7402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip Doxygen magic marker.
7422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char C = *BufferPtr;
7432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
7442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip less-than symbol that marks trailing comments.
7472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd && *BufferPtr == '<')
7482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentState = LCS_InsideCComment;
7512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      State = LS_Normal;
7522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
7532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      goto again;
7542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    default:
7562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      llvm_unreachable("second character of comment should be '/' or '*'");
7572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_BetweenComments: {
7602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Consecutive comments are extracted only if there is only whitespace
7612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // between them.  So we can search for the start of the next comment.
7622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *EndWhitespace = BufferPtr;
7632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
7642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      EndWhitespace++;
7652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Turn any whitespace between comments (and there is only whitespace
767a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // between them -- guaranteed by comment extraction) into a newline.  We
768a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // have two newlines between C comments in total (first one was synthesized
769a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // after a comment).
7702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, EndWhitespace, tok::newline);
7712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    CommentState = LCS_BeforeComment;
7732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
7742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
7752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_InsideBCPLComment:
7772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_InsideCComment:
7782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (BufferPtr != CommentEnd) {
7792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      lexCommentText(T);
7802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
7812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    } else {
7822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip C comment closing sequence.
7832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (CommentState == LCS_InsideCComment) {
7842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
7852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr += 2;
7862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        assert(BufferPtr <= BufferEnd);
7872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Synthesize a newline just after the C comment, regardless of whether
7892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // there is actually a newline.
7902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        formTokenWithChars(T, BufferPtr, tok::newline);
7912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        CommentState = LCS_BetweenComments;
7932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        break;
7942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      } else {
7952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Don't synthesize a newline after a BCPL comment.
7962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        CommentState = LCS_BetweenComments;
7972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        goto again;
7982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
7992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
8002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
8012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
8022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8032d44d77fed3200e2eff289f55493317e90d3398cDmitri GribenkoStringRef Lexer::getSpelling(const Token &Tok,
8042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                             const SourceManager &SourceMgr,
8052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                             bool *Invalid) const {
8062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  SourceLocation Loc = Tok.getLocation();
8072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
8082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  bool InvalidTemp = false;
8102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
8112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (InvalidTemp) {
8122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    *Invalid = true;
8132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return StringRef();
8142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
8152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *Begin = File.data() + LocInfo.second;
8172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return StringRef(Begin, Tok.getLength());
8182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
8192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace comments
8212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace clang
8222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
823