// CommentLexer.cpp revision aa58081902ad31927df02e8537d972eabe29d6df
12d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "clang/AST/CommentLexer.h" 2aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko#include "clang/AST/CommentCommandTraits.h" 3477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko#include "clang/Basic/ConvertUTF.h" 42d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/ADT/StringSwitch.h" 52d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/Support/ErrorHandling.h" 62d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 72d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace clang { 82d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace comments { 92d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Token::dump(const Lexer &L, const SourceManager &SM) const { 112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << "comments::Token Kind=" << Kind << " "; 122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Loc.dump(SM); 132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 16477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkonamespace { 17477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLNamedCharacterReferenceCharacter(char C) { 18477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return (C >= 'a' && C <= 'z') || 19477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'A' && C <= 'Z'); 20477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 21477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 22477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLDecimalCharacterReferenceCharacter(char C) { 23477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return C >= '0' && C <= '9'; 
24477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 25477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 26477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLHexCharacterReferenceCharacter(char C) { 27477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return (C >= '0' && C <= '9') || 28477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'a' && C <= 'f') || 29477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'A' && C <= 'F'); 30477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 31477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} // unnamed namespace 32477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 33477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 34477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return llvm::StringSwitch<StringRef>(Name) 35477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("amp", "&") 36477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("lt", "<") 37477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("gt", ">") 38477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("quot", "\"") 39477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("apos", "\'") 40477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Default(""); 41477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 42477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 43477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 44477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko unsigned CodePoint = 0; 45477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 46477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 47477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint 
*= 10; 48477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - '0'; 49477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 50477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 51477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 52477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char *ResolvedPtr = Resolved; 53477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 54477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return StringRef(Resolved, ResolvedPtr - Resolved); 55477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 56477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return StringRef(); 57477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 58477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 59477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 60477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko unsigned CodePoint = 0; 61477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 62477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint *= 16; 63477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char C = Name[i]; 64477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(isHTMLHexCharacterReferenceCharacter(C)); 65477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (C >= '0' && C <= '9') 66477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - '0'; 67477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else if (C >= 'a' && C <= 'f') 68477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - 'a' + 10; 69477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 70477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - 'A' 
+ 10; 71477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 72477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 73477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 74477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char *ResolvedPtr = Resolved; 75477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 76477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return StringRef(Resolved, ResolvedPtr - Resolved); 77477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 78477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return StringRef(); 79477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 80477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::skipLineStartingDecorations() { 822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This function should be called only for C comments 832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideCComment); 842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) 862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (*BufferPtr) { 892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case ' ': 902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\t': 912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\f': 922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\v': { 932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NewBufferPtr = BufferPtr; 942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if 
(NewBufferPtr == CommentEnd) 962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *NewBufferPtr; 992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (C == ' ' || C == '\t' || C == '\f' || C == '\v') { 1002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 1012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (NewBufferPtr == CommentEnd) 1022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 1032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *NewBufferPtr; 1042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '*') 1062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = NewBufferPtr + 1; 1072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': 1102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace { 1168d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko/// Returns pointer to the first newline character in the string. 
1172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 1182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 1202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '\n' || C == '\r') 1212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 1272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) 1282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '\n') 1312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else { 1332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '\r'); 1342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '\n') 1362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 141477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char 
*skipNamedCharacterReference(const char *BufferPtr, 142477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 143477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 146477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 147477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 148477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 149477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 150477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipDecimalCharacterReference(const char *BufferPtr, 151477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 152477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 155477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 156477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 157477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 158477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 159477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipHexCharacterReference(const char *BufferPtr, 160477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 161477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 
164477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 165477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 166477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 167477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 168a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenkobool isHTMLIdentifierStartingCharacter(char C) { 169a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko return (C >= 'a' && C <= 'z') || 170a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko (C >= 'A' && C <= 'Z'); 171a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko} 172a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko 1732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHTMLIdentifierCharacter(char C) { 1742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return (C >= 'a' && C <= 'z') || 1752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= 'A' && C <= 'Z') || 1762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= '0' && C <= '9'); 1772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 1802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isHTMLIdentifierCharacter(*BufferPtr)) 1822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Skip HTML string quoted in single or double quotes. 
Escaping quotes inside 1882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// string allowed. 1892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// 1902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Returns pointer to closing quote. 1912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 1922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko{ 1932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char Quote = *BufferPtr; 1942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(Quote == '\"' || Quote == '\''); 1952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 1992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == Quote && BufferPtr[-1] != '\\') 2002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHorizontalWhitespace(char C) { 2062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return C == ' ' || C == '\t' || C == '\f' || C == '\v'; 2072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isWhitespace(char C) { 2102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return C == ' ' || C == '\n' || C == '\r' || 2112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C == '\t' || C == 
'\f' || C == '\v'; 2122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 2152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isWhitespace(*BufferPtr)) 2172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 22264da4e55c111f4733135e1780216609569767351Dmitri Gribenkobool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 22364da4e55c111f4733135e1780216609569767351Dmitri Gribenko return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 22464da4e55c111f4733135e1780216609569767351Dmitri Gribenko} 22564da4e55c111f4733135e1780216609569767351Dmitri Gribenko 2262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isCommandNameCharacter(char C) { 2272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return (C >= 'a' && C <= 'z') || 2282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= 'A' && C <= 'Z') || 2292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= '0' && C <= '9'); 2302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 2332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isCommandNameCharacter(*BufferPtr)) 
2352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for BCPL comments. 2412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Handles newlines escaped with backslash or trigraph for backslahs. 2422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *CurPtr = BufferPtr; 2442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (CurPtr != BufferEnd) { 2452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *CurPtr; 2462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (C != '\n' && C != '\r') { 2472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr++; 2482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CurPtr == BufferEnd) 2492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *CurPtr; 2512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found a newline, check if it is escaped. 
2532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EscapePtr = CurPtr - 1; 2542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(isHorizontalWhitespace(*EscapePtr)) 2552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr--; 2562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*EscapePtr == '\\' || 2582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 2592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 2602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found an escaped newline. 2612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr = skipNewline(CurPtr, BufferEnd); 2622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else 2632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return CurPtr; // Not an escaped newline. 2642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for C comments. 2692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Very dumb, does not handle escaped newlines or trigraphs. 
2702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '*') { 2732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr + 1 != BufferEnd); 2742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*(BufferPtr + 1) == '/') 2752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("buffer end hit before '*/' was seen"); 2792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // unnamed namespace 2812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexCommentText(Token &T) { 2832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideBCPLComment || 2842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState == LCS_InsideCComment); 2852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (State) { 2872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_Normal: 2882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 2892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockFirstLine: 2902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 2912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 2922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockBody: 2932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 
lexVerbatimBlockBody(T); 2942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 295962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko case LS_VerbatimLineText: 296962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko lexVerbatimLineText(T); 297962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko return; 2983f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLStartTag: 2993f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLStartTag(T); 3002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3013f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLEndTag: 3023f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLEndTag(T); 3038d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko return; 3042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_Normal); 3072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 3092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(TokenPtr < CommentEnd); 3102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (TokenPtr != CommentEnd) { 3112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*TokenPtr) { 3122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': 3132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '@': { 3142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 316477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 
3202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) { 3212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 3222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 3232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': case '@': case '&': case '$': 3252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '#': case '<': case '>': case '%': 3262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': case '.': case ':': 3272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is one of \\ \@ \& \$ etc escape sequences. 3282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 3302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is the \:: escape sequence. 3312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 333f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 3342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 335f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setText(UnescapedText); 3362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't make zero-length commands. 
3402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isCommandNameCharacter(*TokenPtr)) { 341477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipCommandName(TokenPtr, CommentEnd); 3462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko unsigned Length = TokenPtr - (BufferPtr + 1); 3472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Hardcoded support for lexing LaTeX formula commands 3492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // \f$ \f[ \f] \f{ \f} as a single command. 3502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 3512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *TokenPtr; 3522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 3532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Length++; 3552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const StringRef CommandName(BufferPtr + 1, Length); 3592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef EndName; 3602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 361aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko if (Traits.isVerbatimBlockCommand(CommandName, EndName)) { 3622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName); 
3632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 365aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko if (Traits.isVerbatimLineCommand(CommandName)) { 366962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko setupAndLexVerbatimLine(T, TokenPtr); 3672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::command); 3702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setCommandName(CommandName); 3712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 374477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko case '&': 375477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko lexHTMLCharacterReference(T); 376477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 377477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 3782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '<': { 3792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 381477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *TokenPtr; 385a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (isHTMLIdentifierStartingCharacter(C)) 3863f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko setupAndLexHTMLStartTag(T); 3872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else if (C == '/') 3883f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko 
setupAndLexHTMLEndTag(T); 389477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 390477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 391477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 3922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\n': 3962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\r': 3972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipNewline(TokenPtr, CommentEnd); 3982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::newline); 3992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 4012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 4022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: { 4052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (true) { 4062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 4072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) 4082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 409a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko const char C = *TokenPtr; 4102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if(C == '\n' || C == '\r' || 411477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko C == '\\' || C == '@' || C == '&' || C == '<') 4122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 4132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 414477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, 
TokenPtr); 4152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::setupAndLexVerbatimBlock(Token &T, 4222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TextBegin, 4232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char Marker, StringRef EndName) { 4242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.clear(); 4252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 4262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.append(EndName); 4272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 428f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, TextBegin - (BufferPtr + 1)); 4292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 430f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setVerbatimBlockName(Name); 4312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4328d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // If there is a newline following the verbatim opening command, skip the 4338d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // newline so that we don't create an tok::verbatim_block_line with empty 4348d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // text content. 
4358d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (BufferPtr != CommentEnd) { 4368d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko const char C = *BufferPtr; 4378d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (C == '\n' || C == '\r') { 4388d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko BufferPtr = skipNewline(BufferPtr, CommentEnd); 4398d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_VerbatimBlockBody; 4408d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko return; 4418d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko } 4428d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko } 4438d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 4442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockFirstLine; 4452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 44864da4e55c111f4733135e1780216609569767351Dmitri Gribenkoagain: 4492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr < CommentEnd); 4502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // FIXME: It would be better to scan the text once, finding either the block 4522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // end command or newline. 4532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // 4542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Extract current line. 4552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 4562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef Line(BufferPtr, Newline - BufferPtr); 4572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Look for end command in current line. 
4592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko size_t Pos = Line.find(VerbatimBlockEndCommandName); 4608d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko const char *TextEnd; 4612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NextLine; 4622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Pos == StringRef::npos) { 4632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line is completely verbatim. 4648d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = Newline; 4652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NextLine = skipNewline(Newline, CommentEnd); 4662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else if (Pos == 0) { 4672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line contains just an end command. 4682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 469f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 4702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, End, tok::verbatim_block_end); 471f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setVerbatimBlockName(Name); 4722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 4732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 4752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // There is some text, followed by end command. Extract text first. 4768d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = BufferPtr + Pos; 4778d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko NextLine = TextEnd; 47864da4e55c111f4733135e1780216609569767351Dmitri Gribenko // If there is only whitespace before end command, skip whitespace. 
47964da4e55c111f4733135e1780216609569767351Dmitri Gribenko if (isWhitespace(BufferPtr, TextEnd)) { 48064da4e55c111f4733135e1780216609569767351Dmitri Gribenko BufferPtr = TextEnd; 48164da4e55c111f4733135e1780216609569767351Dmitri Gribenko goto again; 48264da4e55c111f4733135e1780216609569767351Dmitri Gribenko } 4832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4858d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko StringRef Text(BufferPtr, TextEnd - BufferPtr); 4862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, NextLine, tok::verbatim_block_line); 487f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setVerbatimBlockText(Text); 4882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockBody; 4902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockBody(Token &T) { 4932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_VerbatimBlockBody); 4942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 4962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 4972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 4992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 501962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin) { 5022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const StringRef Name(BufferPtr + 1, TextBegin - BufferPtr - 1); 
503962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 5042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setVerbatimLineName(Name); 505962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 506962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_VerbatimLineText; 507962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko} 508962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 509962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::lexVerbatimLineText(Token &T) { 510962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko assert(State == LS_VerbatimLineText); 511962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 512962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko // Extract current line. 513962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 514962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const StringRef Text(BufferPtr, Newline - BufferPtr); 515962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, Newline, tok::verbatim_line_text); 5162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setVerbatimLineText(Text); 517962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 518962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_Normal; 5192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 521477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkovoid Lexer::lexHTMLCharacterReference(Token &T) { 522477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *TokenPtr = BufferPtr; 523477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(*TokenPtr == '&'); 524477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 525477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 526477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri 
Gribenko formTextToken(T, TokenPtr); 527477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 528477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 529477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *NamePtr; 530477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isNamed = false; 531477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isDecimal = false; 532477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char C = *TokenPtr; 533477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLNamedCharacterReferenceCharacter(C)) { 534477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 535477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 536477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isNamed = true; 537477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == '#') { 538477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 539477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 540477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 541477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 542477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 543477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko C = *TokenPtr; 544477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLDecimalCharacterReferenceCharacter(C)) { 545477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 546477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 547477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isDecimal = true; 548477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == 'x' || C == 'X') { 549477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 
550477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 551477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 552477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 553477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 554477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 555477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 556477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 557477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 558477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 559477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 560477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 561477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko *TokenPtr != ';') { 562477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 563477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 564477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 565477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Name(NamePtr, TokenPtr - NamePtr); 566477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; // Skip semicolon. 
567477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Resolved; 568477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isNamed) 569477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLNamedCharacterReference(Name); 570477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else if (isDecimal) 571477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLDecimalCharacterReference(Name); 572477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 573477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLHexCharacterReference(Name); 574477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 575477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (Resolved.empty()) { 576477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 577477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 578477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 579477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 580477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko T.setText(Resolved); 581477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 582477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 583477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 5843f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLStartTag(Token &T) { 585a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko assert(BufferPtr[0] == '<' && 586a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko isHTMLIdentifierStartingCharacter(BufferPtr[1])); 5872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 5882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 589f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 
5903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 5913f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko T.setHTMLTagStartName(Name); 5922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 5942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 595a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko const char C = *BufferPtr; 596a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (BufferPtr != CommentEnd && 597a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 5983f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLStartTag; 5992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6013f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLStartTag(Token &T) { 6023f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko assert(State == LS_HTMLStartTag); 6032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 6052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 6062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (isHTMLIdentifierCharacter(C)) { 6072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 608f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 6092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_ident); 610f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setHTMLIdent(Ident); 6112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 6122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) 
{ 6132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '=': 6142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_equals); 6162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': 6182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\'': { 6192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *OpenQuote = TokenPtr; 6202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 6212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *ClosingQuote = TokenPtr; 6222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr != CommentEnd) // Skip closing quote. 6232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 6252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setHTMLQuotedString(StringRef(OpenQuote + 1, 6262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko ClosingQuote - (OpenQuote + 1))); 6272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '>': 6302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_greater); 632a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko State = LS_Normal; 633a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko return; 634a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko case '/': 635a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 636a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko if (TokenPtr != 
CommentEnd && *TokenPtr == '>') { 637a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 638a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 639477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else 640477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 641477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 642a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko State = LS_Normal; 643a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko return; 6442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Now look ahead and return to normal state if we don't see any HTML tokens 6482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // ahead. 6492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 6502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) { 6512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *BufferPtr; 656a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (!isHTMLIdentifierStartingCharacter(C) && 6572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C != '=' && C != '\"' && C != '\'' && C != '>') { 6582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 
6622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6633f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLEndTag(Token &T) { 6642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 6652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 6672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 6682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = skipWhitespace(TagNameEnd, CommentEnd); 6702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6713f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, End, tok::html_end_tag); 6723f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin)); 6738d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6748d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (BufferPtr != CommentEnd && *BufferPtr == '>') 6753f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLEndTag; 6768d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko} 6778d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6783f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLEndTag(Token &T) { 6798d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 6808d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6818d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 6828d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 6832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 
6842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 685aa58081902ad31927df02e8537d972eabe29d6dfDmitri GribenkoLexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, 686477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko SourceLocation FileLoc, const CommentOptions &CommOpts, 6872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *BufferStart, const char *BufferEnd): 688aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko Allocator(Allocator), Traits(Traits), 6892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferStart(BufferStart), BufferEnd(BufferEnd), 6902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), 6912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState(LCS_BeforeComment), State(LS_Normal) { 6922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lex(Token &T) { 6952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoagain: 6962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (CommentState) { 6972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BeforeComment: 6982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) { 6992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::eof); 7002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 7012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '/'); 7042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip first slash. 
7052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*BufferPtr) { 7062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '/': { // BCPL comment. 7072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip second slash. 7082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd) { 7102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker, if it is present. 7112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // It might be missing because of a typo //< or /*<, or because we 7122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // merged this non-Doxygen comment into a bunch of Doxygen comments 7132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // around it: /** ... */ /* ... */ /** ... */ 7142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '/' || C == '!') 7162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 7202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip it even if the comment is not a Doxygen one, because //< and /*< 7212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // are frequent typos. 
7222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideBCPLComment; 7268d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 7278d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 7282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 7292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': { // C comment. 7322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip star. 7332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker. 7352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 7372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 
7402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideCComment; 7442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 7452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 7462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 7492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("second character of comment should be '/' or '*'"); 7502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BetweenComments: { 7532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Consecutive comments are extracted only if there is only whitespace 7542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // between them. So we can search for the start of the next comment. 7552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EndWhitespace = BufferPtr; 7562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 7572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EndWhitespace++; 7582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Turn any whitespace between comments (and there is only whitespace 760a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // between them -- guaranteed by comment extraction) into a newline. 
We 761a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // have two newlines between C comments in total (first one was synthesized 762a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // after a comment). 7632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, EndWhitespace, tok::newline); 7642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BeforeComment; 7662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideBCPLComment: 7702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideCComment: 7712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != CommentEnd) { 7722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexCommentText(T); 7732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 7752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip C comment closing sequence. 7762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) { 7772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 7782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr += 2; 7792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr <= BufferEnd); 7802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Synthenize newline just after the C comment, regardless if there is 7822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // actually a newline. 
7832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::newline); 7842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 7862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 7882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't synthesized a newline after BCPL comment. 7892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 7902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 7952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7962d44d77fed3200e2eff289f55493317e90d3398cDmitri GribenkoStringRef Lexer::getSpelling(const Token &Tok, 7972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const SourceManager &SourceMgr, 7982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool *Invalid) const { 7992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko SourceLocation Loc = Tok.getLocation(); 8002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 8012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool InvalidTemp = false; 8032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 8042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (InvalidTemp) { 8052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko *Invalid = true; 8062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(); 
8072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Begin = File.data() + LocInfo.second; 8102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(Begin, Tok.getLength()); 8112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 8122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace comments 8142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace clang 8152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 816