CommentLexer.cpp revision aa58081902ad31927df02e8537d972eabe29d6df
12d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "clang/AST/CommentLexer.h"
2aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko#include "clang/AST/CommentCommandTraits.h"
3477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko#include "clang/Basic/ConvertUTF.h"
42d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/ADT/StringSwitch.h"
52d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/Support/ErrorHandling.h"
62d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
72d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace clang {
82d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace comments {
92d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Token::dump(const Lexer &L, const SourceManager &SM) const {
112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm::errs() << "comments::Token Kind=" << Kind << " ";
122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  Loc.dump(SM);
132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
namespace {
/// Returns true if \p C is an ASCII letter, i.e. a character accepted in the
/// name of an HTML named character reference.
bool isHTMLNamedCharacterReferenceCharacter(char C) {
  if (C >= 'a' && C <= 'z')
    return true;
  return C >= 'A' && C <= 'Z';
}

/// Returns true if \p C is an ASCII decimal digit, i.e. a character accepted
/// in an HTML decimal character reference.
bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return '0' <= C && C <= '9';
}

/// Returns true if \p C is an ASCII hexadecimal digit (either case), i.e. a
/// character accepted in an HTML hexadecimal character reference.
bool isHTMLHexCharacterReferenceCharacter(char C) {
  switch (C) {
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
    return true;
  default:
    return false;
  }
}
} // unnamed namespace
32477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
33477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
34477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return llvm::StringSwitch<StringRef>(Name)
35477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("amp", "&")
36477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("lt", "<")
37477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("gt", ">")
38477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("quot", "\"")
39477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Case("apos", "\'")
40477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      .Default("");
41477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
42477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
43477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
44477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  unsigned CodePoint = 0;
45477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
46477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
47477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    CodePoint *= 10;
48477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    CodePoint += Name[i] - '0';
49477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
50477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
51477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
52477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char *ResolvedPtr = Resolved;
53477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
54477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return StringRef(Resolved, ResolvedPtr - Resolved);
55477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else
56477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return StringRef();
57477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
58477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
59477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
60477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  unsigned CodePoint = 0;
61477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for (unsigned i = 0, e = Name.size(); i != e; ++i) {
62477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    CodePoint *= 16;
63477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    const char C = Name[i];
64477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    assert(isHTMLHexCharacterReferenceCharacter(C));
65477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (C >= '0' && C <= '9')
66477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      CodePoint += Name[i] - '0';
67477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    else if (C >= 'a' && C <= 'f')
68477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      CodePoint += Name[i] - 'a' + 10;
69477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    else
70477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      CodePoint += Name[i] - 'A' + 10;
71477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
72477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
73477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
74477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char *ResolvedPtr = Resolved;
75477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
76477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return StringRef(Resolved, ResolvedPtr - Resolved);
77477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else
78477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return StringRef();
79477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
80477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::skipLineStartingDecorations() {
822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // This function should be called only for C comments
832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(CommentState == LCS_InsideCComment);
842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (BufferPtr == CommentEnd)
862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (*BufferPtr) {
892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case ' ':
902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\t':
912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\f':
922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '\v': {
932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *NewBufferPtr = BufferPtr;
942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    NewBufferPtr++;
952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (NewBufferPtr == CommentEnd)
962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return;
972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    char C = *NewBufferPtr;
992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
1002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      NewBufferPtr++;
1012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (NewBufferPtr == CommentEnd)
1022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
1032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      C = *NewBufferPtr;
1042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
1052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (C == '*')
1062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr = NewBufferPtr + 1;
1072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
1082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case '*':
1102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferPtr++;
1112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
1122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
1152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace {
/// Returns pointer to the first newline character ('\n' or '\r') in
/// [BufferPtr, BufferEnd), or BufferEnd if the range contains none.
const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
  while (BufferPtr != BufferEnd && *BufferPtr != '\n' && *BufferPtr != '\r')
    ++BufferPtr;
  return BufferPtr;
}
1252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Skips one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
///
/// Unless the range is empty, \p BufferPtr must point at '\n' or '\r'
/// (asserted).
///
/// \returns pointer just past the terminator, or \p BufferPtr unchanged when
///          BufferPtr == BufferEnd.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char *Next = BufferPtr + 1;
  if (*BufferPtr == '\n')
    return Next;

  assert(*BufferPtr == '\r');
  // Treat "\r\n" as a single terminator.
  if (Next != BufferEnd && *Next == '\n')
    ++Next;
  return Next;
}
1402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
141477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipNamedCharacterReference(const char *BufferPtr,
142477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                        const char *BufferEnd) {
143477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
146477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
147477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
148477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
149477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
150477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipDecimalCharacterReference(const char *BufferPtr,
151477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                          const char *BufferEnd) {
152477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
155477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
156477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
157477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
158477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
159477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipHexCharacterReference(const char *BufferPtr,
160477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko                                          const char *BufferEnd) {
161477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return BufferPtr;
164477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
165477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return BufferEnd;
166477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
167477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
/// Returns true if \p C is an ASCII letter.
bool isHTMLIdentifierStartingCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  return IsLower || IsUpper;
}
172a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko
/// Returns true if \p C is an ASCII letter or an ASCII decimal digit.
bool isHTMLIdentifierCharacter(char C) {
  if (C >= '0' && C <= '9')
    return true;
  if (C >= 'A' && C <= 'Z')
    return true;
  return C >= 'a' && C <= 'z';
}
1782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
1792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
1802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
1812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isHTMLIdentifierCharacter(*BufferPtr))
1822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
1832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
1842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
1852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
1862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Skip HTML string quoted in single or double quotes.  A quote character that
/// directly follows a backslash does not end the string.
///
/// \p BufferPtr must point at the opening quote (asserted).
///
/// Returns pointer to closing quote, or BufferEnd if the string is not
/// terminated inside the range.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  // Character seen just before the one under the cursor; starts out as the
  // opening quote itself.
  char Previous = Quote;
  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    if (*Cursor == Quote && Previous != '\\')
      return Cursor;
    Previous = *Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
2042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Returns true if \p C is a space, horizontal tab, form feed or vertical tab.
bool isHorizontalWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
2082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
/// Returns true if \p C is a space, newline, carriage return, horizontal tab,
/// form feed or vertical tab.
bool isWhitespace(char C) {
  switch (C) {
  case ' ':
  case '\n':
  case '\r':
  case '\t':
  case '\f':
  case '\v':
    return true;
  default:
    return false;
  }
}
2132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
2152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isWhitespace(*BufferPtr))
2172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
2182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
22264da4e55c111f4733135e1780216609569767351Dmitri Gribenkobool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
22364da4e55c111f4733135e1780216609569767351Dmitri Gribenko  return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
22464da4e55c111f4733135e1780216609569767351Dmitri Gribenko}
22564da4e55c111f4733135e1780216609569767351Dmitri Gribenko
/// Returns true if \p C is an ASCII letter or an ASCII decimal digit.
bool isCommandNameCharacter(char C) {
  const bool IsLower = C >= 'a' && C <= 'z';
  const bool IsUpper = C >= 'A' && C <= 'Z';
  const bool IsDigit = C >= '0' && C <= '9';
  return IsLower || IsUpper || IsDigit;
}
2312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
2332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (!isCommandNameCharacter(*BufferPtr))
2352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return BufferPtr;
2362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for BCPL comments.
2412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Handles newlines escaped with backslash or trigraph for backslahs.
2422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *CurPtr = BufferPtr;
2442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  while (CurPtr != BufferEnd) {
2452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    char C = *CurPtr;
2462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while (C != '\n' && C != '\r') {
2472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CurPtr++;
2482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (CurPtr == BufferEnd)
2492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return BufferEnd;
2502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      C = *CurPtr;
2512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
2522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // We found a newline, check if it is escaped.
2532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *EscapePtr = CurPtr - 1;
2542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while(isHorizontalWhitespace(*EscapePtr))
2552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      EscapePtr--;
2562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (*EscapePtr == '\\' ||
2582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
2592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko         EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
2602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // We found an escaped newline.
2612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CurPtr = skipNewline(CurPtr, BufferEnd);
2622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    } else
2632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return CurPtr; // Not an escaped newline.
2642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return BufferEnd;
2662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for C comments.
2692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Very dumb, does not handle escaped newlines or trigraphs.
2702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
2712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
2722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (*BufferPtr == '*') {
2732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      assert(BufferPtr + 1 != BufferEnd);
2742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (*(BufferPtr + 1) == '/')
2752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return BufferPtr;
2762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
2772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
2782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  llvm_unreachable("buffer end hit before '*/' was seen");
2792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
2802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // unnamed namespace
2812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexCommentText(Token &T) {
2832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(CommentState == LCS_InsideBCPLComment ||
2842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko         CommentState == LCS_InsideCComment);
2852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
2862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (State) {
2872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_Normal:
2882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
2892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_VerbatimBlockFirstLine:
2902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    lexVerbatimBlockFirstLine(T);
2912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
2922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LS_VerbatimBlockBody:
2932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    lexVerbatimBlockBody(T);
2942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
295962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  case LS_VerbatimLineText:
296962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko    lexVerbatimLineText(T);
297962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko    return;
2983f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  case LS_HTMLStartTag:
2993f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    lexHTMLStartTag(T);
3002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
3013f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  case LS_HTMLEndTag:
3023f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    lexHTMLEndTag(T);
3038d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    return;
3042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
3052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(State == LS_Normal);
3072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TokenPtr = BufferPtr;
3092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(TokenPtr < CommentEnd);
3102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  while (TokenPtr != CommentEnd) {
3112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch(*TokenPtr) {
3122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\\':
3132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '@': {
3142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
3152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (TokenPtr == CommentEnd) {
316477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        char C = *TokenPtr;
3202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        switch (C) {
3212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        default:
3222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          break;
3232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '\\': case '@': case '&': case '$':
3252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '#':  case '<': case '>': case '%':
3262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        case '\"': case '.': case ':':
3272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          // This is one of \\ \@ \& \$ etc escape sequences.
3282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          TokenPtr++;
3292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
3302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            // This is the \:: escape sequence.
3312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            TokenPtr++;
3322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          }
333f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
3342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          formTokenWithChars(T, TokenPtr, tok::text);
335f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko          T.setText(UnescapedText);
3362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Don't make zero-length commands.
3402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (!isCommandNameCharacter(*TokenPtr)) {
341477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
3462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        unsigned Length = TokenPtr - (BufferPtr + 1);
3472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Hardcoded support for lexing LaTeX formula commands
3492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // \f$ \f[ \f] \f{ \f} as a single command.
3502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
3512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          C = *TokenPtr;
3522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
3532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            TokenPtr++;
3542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            Length++;
3552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          }
3562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const StringRef CommandName(BufferPtr + 1, Length);
3592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        StringRef EndName;
3602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
361aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko        if (Traits.isVerbatimBlockCommand(CommandName, EndName)) {
3622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
3632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
365aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko        if (Traits.isVerbatimLineCommand(CommandName)) {
366962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko          setupAndLexVerbatimLine(T, TokenPtr);
3672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        formTokenWithChars(T, TokenPtr, tok::command);
3702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        T.setCommandName(CommandName);
3712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
3722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
3732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
374477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      case '&':
375477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        lexHTMLCharacterReference(T);
376477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        return;
377477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
3782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '<': {
3792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
3802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (TokenPtr == CommentEnd) {
381477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
3822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          return;
3832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
3842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const char C = *TokenPtr;
385a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko        if (isHTMLIdentifierStartingCharacter(C))
3863f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko          setupAndLexHTMLStartTag(T);
3872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        else if (C == '/')
3883f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko          setupAndLexHTMLEndTag(T);
389477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        else
390477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko          formTextToken(T, TokenPtr);
391477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
3922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
3932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
3942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
3952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\n':
3962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      case '\r':
3972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr = skipNewline(TokenPtr, CommentEnd);
3982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        formTokenWithChars(T, TokenPtr, tok::newline);
3992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (CommentState == LCS_InsideCComment)
4012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          skipLineStartingDecorations();
4022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
4032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      default: {
4052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        while (true) {
4062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          TokenPtr++;
4072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if (TokenPtr == CommentEnd)
4082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            break;
409a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko          const char C = *TokenPtr;
4102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          if(C == '\n' || C == '\r' ||
411477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko             C == '\\' || C == '@' || C == '&' || C == '<')
4122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko            break;
4132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        }
414477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        formTextToken(T, TokenPtr);
4152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        return;
4162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
4172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
4182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
4192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::setupAndLexVerbatimBlock(Token &T,
4222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                                     const char *TextBegin,
4232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                                     char Marker, StringRef EndName) {
4242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  VerbatimBlockEndCommandName.clear();
4252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
4262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  VerbatimBlockEndCommandName.append(EndName);
4272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
428f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1));
4292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
430f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  T.setVerbatimBlockName(Name);
4312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4328d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // If there is a newline following the verbatim opening command, skip the
4338d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // newline so that we don't create an tok::verbatim_block_line with empty
4348d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  // text content.
4358d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  if (BufferPtr != CommentEnd) {
4368d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    const char C = *BufferPtr;
4378d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    if (C == '\n' || C == '\r') {
4388d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko      BufferPtr = skipNewline(BufferPtr, CommentEnd);
4398d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko      State = LS_VerbatimBlockBody;
4408d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko      return;
4418d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    }
4428d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  }
4438d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
4442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  State = LS_VerbatimBlockFirstLine;
4452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockFirstLine(Token &T) {
44864da4e55c111f4733135e1780216609569767351Dmitri Gribenkoagain:
4492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(BufferPtr < CommentEnd);
4502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // FIXME: It would be better to scan the text once, finding either the block
4522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // end command or newline.
4532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  //
4542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Extract current line.
4552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *Newline = findNewline(BufferPtr, CommentEnd);
4562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  StringRef Line(BufferPtr, Newline - BufferPtr);
4572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Look for end command in current line.
4592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  size_t Pos = Line.find(VerbatimBlockEndCommandName);
4608d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  const char *TextEnd;
4612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *NextLine;
4622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (Pos == StringRef::npos) {
4632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Current line is completely verbatim.
4648d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    TextEnd = Newline;
4652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    NextLine = skipNewline(Newline, CommentEnd);
4662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else if (Pos == 0) {
4672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Current line contains just an end command.
4682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
469f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
4702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, End, tok::verbatim_block_end);
471f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    T.setVerbatimBlockName(Name);
4722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
4732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
4742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else {
4752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // There is some text, followed by end command.  Extract text first.
4768d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    TextEnd = BufferPtr + Pos;
4778d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko    NextLine = TextEnd;
47864da4e55c111f4733135e1780216609569767351Dmitri Gribenko    // If there is only whitespace before end command, skip whitespace.
47964da4e55c111f4733135e1780216609569767351Dmitri Gribenko    if (isWhitespace(BufferPtr, TextEnd)) {
48064da4e55c111f4733135e1780216609569767351Dmitri Gribenko      BufferPtr = TextEnd;
48164da4e55c111f4733135e1780216609569767351Dmitri Gribenko      goto again;
48264da4e55c111f4733135e1780216609569767351Dmitri Gribenko    }
4832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
4842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4858d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  StringRef Text(BufferPtr, TextEnd - BufferPtr);
4862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  formTokenWithChars(T, NextLine, tok::verbatim_block_line);
487f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  T.setVerbatimBlockText(Text);
4882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  State = LS_VerbatimBlockBody;
4902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
4912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockBody(Token &T) {
4932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(State == LS_VerbatimBlockBody);
4942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (CommentState == LCS_InsideCComment)
4962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    skipLineStartingDecorations();
4972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
4982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  lexVerbatimBlockFirstLine(T);
4992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
5002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
501962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) {
5022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1);
503962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
5042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  T.setVerbatimLineName(Name);
505962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
506962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  State = LS_VerbatimLineText;
507962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko}
508962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
509962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::lexVerbatimLineText(Token &T) {
510962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  assert(State == LS_VerbatimLineText);
511962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
512962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  // Extract current line.
513962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  const char *Newline = findNewline(BufferPtr, CommentEnd);
514962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  const StringRef Text(BufferPtr, Newline - BufferPtr);
515962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  formTokenWithChars(T, Newline, tok::verbatim_line_text);
5162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  T.setVerbatimLineText(Text);
517962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko
518962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko  State = LS_Normal;
5192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
5202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
521477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkovoid Lexer::lexHTMLCharacterReference(Token &T) {
522477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  const char *TokenPtr = BufferPtr;
523477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  assert(*TokenPtr == '&');
524477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  TokenPtr++;
525477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (TokenPtr == CommentEnd) {
526477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
527477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
528477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
529477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  const char *NamePtr;
530477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  bool isNamed = false;
531477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  bool isDecimal = false;
532477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  char C = *TokenPtr;
533477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (isHTMLNamedCharacterReferenceCharacter(C)) {
534477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    NamePtr = TokenPtr;
535477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
536477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    isNamed = true;
537477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  } else if (C == '#') {
538477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    TokenPtr++;
539477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (TokenPtr == CommentEnd) {
540477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      formTextToken(T, TokenPtr);
541477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return;
542477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    }
543477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    C = *TokenPtr;
544477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    if (isHTMLDecimalCharacterReferenceCharacter(C)) {
545477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      NamePtr = TokenPtr;
546477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
547477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      isDecimal = true;
548477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    } else if (C == 'x' || C == 'X') {
549477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr++;
550477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      NamePtr = TokenPtr;
551477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
552477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    } else {
553477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      formTextToken(T, TokenPtr);
554477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      return;
555477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    }
556477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  } else {
557477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
558477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
559477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
560477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
561477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      *TokenPtr != ';') {
562477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
563477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
564477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
565477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  StringRef Name(NamePtr, TokenPtr - NamePtr);
566477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  TokenPtr++; // Skip semicolon.
567477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  StringRef Resolved;
568477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (isNamed)
569477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLNamedCharacterReference(Name);
570477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else if (isDecimal)
571477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLDecimalCharacterReference(Name);
572477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  else
573477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    Resolved = resolveHTMLHexCharacterReference(Name);
574477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
575477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  if (Resolved.empty()) {
576477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    formTextToken(T, TokenPtr);
577477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko    return;
578477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  }
579477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  formTokenWithChars(T, TokenPtr, tok::text);
580477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  T.setText(Resolved);
581477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko  return;
582477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko}
583477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
5843f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLStartTag(Token &T) {
585a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  assert(BufferPtr[0] == '<' &&
586a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko         isHTMLIdentifierStartingCharacter(BufferPtr[1]));
5872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
5882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
589f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko  StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
5903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
5913f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  T.setHTMLTagStartName(Name);
5922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
5932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
5942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
595a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  const char C = *BufferPtr;
596a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  if (BufferPtr != CommentEnd &&
597a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
5983f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    State = LS_HTMLStartTag;
5992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6013f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLStartTag(Token &T) {
6023f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  assert(State == LS_HTMLStartTag);
6032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TokenPtr = BufferPtr;
6052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  char C = *TokenPtr;
6062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (isHTMLIdentifierCharacter(C)) {
6072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
608f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
6092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, TokenPtr, tok::html_ident);
610f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko    T.setHTMLIdent(Ident);
6112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  } else {
6122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch (C) {
6132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '=':
6142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr++;
6152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_equals);
6162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
6172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '\"':
6182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '\'': {
6192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char *OpenQuote = TokenPtr;
6202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
6212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char *ClosingQuote = TokenPtr;
6222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (TokenPtr != CommentEnd) // Skip closing quote.
6232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        TokenPtr++;
6242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
6252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      T.setHTMLQuotedString(StringRef(OpenQuote + 1,
6262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                                      ClosingQuote - (OpenQuote + 1)));
6272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
6282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
6292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '>':
6302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      TokenPtr++;
6312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, TokenPtr, tok::html_greater);
632a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko      State = LS_Normal;
633a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko      return;
634a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko    case '/':
635a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      TokenPtr++;
636a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      if (TokenPtr != CommentEnd && *TokenPtr == '>') {
637a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko        TokenPtr++;
638a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko        formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
639477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko      } else
640477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko        formTextToken(T, TokenPtr);
641477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko
642a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      State = LS_Normal;
643a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko      return;
6442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
6452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // Now look ahead and return to normal state if we don't see any HTML tokens
6482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  // ahead.
6492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
6502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (BufferPtr == CommentEnd) {
6512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
6522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
6532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  C = *BufferPtr;
656a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko  if (!isHTMLIdentifierStartingCharacter(C) &&
6572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      C != '=' && C != '\"' && C != '\'' && C != '>') {
6582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    State = LS_Normal;
6592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return;
6602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
6612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6633f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLEndTag(Token &T) {
6642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
6652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
6672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
6682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *End = skipWhitespace(TagNameEnd, CommentEnd);
6702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6713f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  formTokenWithChars(T, End, tok::html_end_tag);
6723f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko  T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin));
6738d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6748d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  if (BufferPtr != CommentEnd && *BufferPtr == '>')
6753f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko    State = LS_HTMLEndTag;
6768d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko}
6778d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6783f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLEndTag(Token &T) {
6798d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  assert(BufferPtr != CommentEnd && *BufferPtr == '>');
6808d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko
6818d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
6828d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko  State = LS_Normal;
6832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
685aa58081902ad31927df02e8537d972eabe29d6dfDmitri GribenkoLexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits,
686477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko             SourceLocation FileLoc, const CommentOptions &CommOpts,
6872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko             const char *BufferStart, const char *BufferEnd):
688aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko    Allocator(Allocator), Traits(Traits),
6892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferStart(BufferStart), BufferEnd(BufferEnd),
6902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart),
6912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    CommentState(LCS_BeforeComment), State(LS_Normal) {
6922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
6932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
6942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lex(Token &T) {
6952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoagain:
6962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  switch (CommentState) {
6972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_BeforeComment:
6982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (BufferPtr == BufferEnd) {
6992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      formTokenWithChars(T, BufferPtr, tok::eof);
7002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      return;
7012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    assert(*BufferPtr == '/');
7042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    BufferPtr++; // Skip first slash.
7052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    switch(*BufferPtr) {
7062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '/': { // BCPL comment.
7072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr++; // Skip second slash.
7082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd) {
7102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Skip Doxygen magic marker, if it is present.
7112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // It might be missing because of a typo //< or /*<, or because we
7122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // merged this non-Doxygen comment into a bunch of Doxygen comments
7132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // around it: /** ... */ /* ... */ /** ... */
7142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        const char C = *BufferPtr;
7152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        if (C == '/' || C == '!')
7162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko          BufferPtr++;
7172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
7182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip less-than symbol that marks trailing comments.
7202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip it even if the comment is not a Doxygen one, because //< and /*<
7212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // are frequent typos.
7222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd && *BufferPtr == '<')
7232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentState = LCS_InsideBCPLComment;
7268d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
7278d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko        State = LS_Normal;
7282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
7292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      goto again;
7302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    case '*': { // C comment.
7322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      BufferPtr++; // Skip star.
7332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip Doxygen magic marker.
7352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      const char C = *BufferPtr;
7362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
7372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip less-than symbol that marks trailing comments.
7402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (BufferPtr != BufferEnd && *BufferPtr == '<')
7412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr++;
7422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentState = LCS_InsideCComment;
7442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      State = LS_Normal;
7452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
7462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      goto again;
7472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    default:
7492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      llvm_unreachable("second character of comment should be '/' or '*'");
7502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_BetweenComments: {
7532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Consecutive comments are extracted only if there is only whitespace
7542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // between them.  So we can search for the start of the next comment.
7552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    const char *EndWhitespace = BufferPtr;
7562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
7572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      EndWhitespace++;
7582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    // Turn any whitespace between comments (and there is only whitespace
760a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // between them -- guaranteed by comment extraction) into a newline.  We
761a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // have two newlines between C comments in total (first one was synthesized
762a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko    // after a comment).
7632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    formTokenWithChars(T, EndWhitespace, tok::newline);
7642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    CommentState = LCS_BeforeComment;
7662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    break;
7672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
7682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_InsideBCPLComment:
7702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  case LCS_InsideCComment:
7712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    if (BufferPtr != CommentEnd) {
7722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      lexCommentText(T);
7732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      break;
7742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    } else {
7752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      // Skip C comment closing sequence.
7762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      if (CommentState == LCS_InsideCComment) {
7772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
7782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        BufferPtr += 2;
7792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        assert(BufferPtr <= BufferEnd);
7802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Synthesize a newline just after the C comment, regardless of whether
7822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // there is actually a newline.
7832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        formTokenWithChars(T, BufferPtr, tok::newline);
7842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        CommentState = LCS_BetweenComments;
7862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        break;
7872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      } else {
7882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        // Don't synthesize a newline after a BCPL comment.
7892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        CommentState = LCS_BetweenComments;
7902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko        goto again;
7912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko      }
7922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    }
7932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
7942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
7952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
7962d44d77fed3200e2eff289f55493317e90d3398cDmitri GribenkoStringRef Lexer::getSpelling(const Token &Tok,
7972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                             const SourceManager &SourceMgr,
7982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko                             bool *Invalid) const {
7992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  SourceLocation Loc = Tok.getLocation();
8002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
8012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  bool InvalidTemp = false;
8032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
8042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  if (InvalidTemp) {
8052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    *Invalid = true;
8062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko    return StringRef();
8072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  }
8082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  const char *Begin = File.data() + LocInfo.second;
8102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko  return StringRef(Begin, Tok.getLength());
8112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko}
8122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
8132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace comments
8142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace clang
8152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko
816