CommentLexer.cpp revision abbfa671539c74b5bec66a64964de984c908cdfa
12d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "clang/AST/CommentLexer.h" 2aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko#include "clang/AST/CommentCommandTraits.h" 3efa78d163214fd9e909ab2bf6911edfbc7a2b9dfFariborz Jahanian#include "clang/AST/CommentDiagnostic.h" 4bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko#include "clang/Basic/CharInfo.h" 5c934dfe950a14fe447aa14a7dae25d00ee87c8bbDmitri Gribenko#include "llvm/ADT/StringExtras.h" 62d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/ADT/StringSwitch.h" 7cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko#include "llvm/Support/ConvertUTF.h" 82d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/Support/ErrorHandling.h" 92d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace clang { 112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace comments { 122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Token::dump(const Lexer &L, const SourceManager &SM) const { 142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << "comments::Token Kind=" << Kind << " "; 152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Loc.dump(SM); 162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 190ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLNamedCharacterReferenceCharacter(char C) { 20bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isLetter(C); 21477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 22477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 230ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLDecimalCharacterReferenceCharacter(char C) { 24bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isDigit(C); 25477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 26477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 270ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline bool isHTMLHexCharacterReferenceCharacter(char C) { 28bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isHexDigit(C); 29477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 30834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko 310ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkostatic inline StringRef convertCodePointToUTF8( 320ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko llvm::BumpPtrAllocator &Allocator, 330ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko unsigned CodePoint) { 34658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 35658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian char *ResolvedPtr = Resolved; 36cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 37658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian return StringRef(Resolved, ResolvedPtr - Resolved); 38658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian else 39658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian return StringRef(); 40658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian} 415bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko 420ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenkonamespace { 430ff4f8bf47c924b4b01d989a53432a95471a068dDmitri Gribenko 445bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLTags.inc" 455bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 465bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko 475bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko} // unnamed namespace 48658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian 49477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 505bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko // Fast path, first check a few most widely used named character references. 51477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return llvm::StringSwitch<StringRef>(Name) 52477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("amp", "&") 53477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("lt", "<") 54477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("gt", ">") 55477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("quot", "\"") 56477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("apos", "\'") 575bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko // Slow path. 585bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 59658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian} 60477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 61477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 62477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko unsigned CodePoint = 0; 63477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 64477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 65477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint *= 10; 66477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - '0'; 67477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 685bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko return convertCodePointToUTF8(Allocator, CodePoint); 695bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko} 70477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 715bd1e5ba000023910ad986a16dd16d7ca914750aDmitri GribenkoStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 725bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko unsigned CodePoint = 0; 735bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 745bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko CodePoint *= 16; 755bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko const char C = Name[i]; 765bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko assert(isHTMLHexCharacterReferenceCharacter(C)); 775bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko CodePoint += llvm::hexDigitValue(C); 785bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko } 795bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko return convertCodePointToUTF8(Allocator, CodePoint); 80477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 81477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::skipLineStartingDecorations() { 832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This function should be called only for C comments 842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideCComment); 852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) 872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (*BufferPtr) { 902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case ' ': 912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\t': 922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\f': 932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\v': { 942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NewBufferPtr = BufferPtr; 952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (NewBufferPtr == CommentEnd) 972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *NewBufferPtr; 100bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko while (isHorizontalWhitespace(C)) { 1012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 1022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (NewBufferPtr == CommentEnd) 1032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 1042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *NewBufferPtr; 1052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '*') 1072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = NewBufferPtr + 1; 1082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': 1112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace { 1178d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko/// Returns pointer to the first newline character in the string. 1182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 1192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 120bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko if (isVerticalWhitespace(*BufferPtr)) 1212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 1272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) 1282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '\n') 1312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else { 1332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '\r'); 1342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '\n') 1362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 141477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipNamedCharacterReference(const char *BufferPtr, 142477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 143477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 146477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 147477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 148477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 149477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 150477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipDecimalCharacterReference(const char *BufferPtr, 151477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 152477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 155477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 156477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 157477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 158477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 159477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipHexCharacterReference(const char *BufferPtr, 160477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 161477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 164477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 165477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 166477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 167477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 168a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenkobool isHTMLIdentifierStartingCharacter(char C) { 169bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isLetter(C); 170a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko} 171a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko 1722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHTMLIdentifierCharacter(char C) { 173bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isAlphanumeric(C); 1742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 1772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isHTMLIdentifierCharacter(*BufferPtr)) 1792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 1852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// string allowed. 1862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// 1872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Returns pointer to closing quote. 1882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 1892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko{ 1902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char Quote = *BufferPtr; 1912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(Quote == '\"' || Quote == '\''); 1922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 1962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == Quote && BufferPtr[-1] != '\\') 1972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 2032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isWhitespace(*BufferPtr)) 2052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 21064da4e55c111f4733135e1780216609569767351Dmitri Gribenkobool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 21164da4e55c111f4733135e1780216609569767351Dmitri Gribenko return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 21264da4e55c111f4733135e1780216609569767351Dmitri Gribenko} 21364da4e55c111f4733135e1780216609569767351Dmitri Gribenko 2148c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenkobool isCommandNameStartCharacter(char C) { 215bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isLetter(C); 2168c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko} 2178c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko 2182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isCommandNameCharacter(char C) { 219bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return isAlphanumeric(C); 2202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 2232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isCommandNameCharacter(*BufferPtr)) 2252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for BCPL comments. 2312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Handles newlines escaped with backslash or trigraph for backslahs. 2322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *CurPtr = BufferPtr; 2342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (CurPtr != BufferEnd) { 235bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko while (!isVerticalWhitespace(*CurPtr)) { 2362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr++; 2372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CurPtr == BufferEnd) 2382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found a newline, check if it is escaped. 2412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EscapePtr = CurPtr - 1; 2422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(isHorizontalWhitespace(*EscapePtr)) 2432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr--; 2442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*EscapePtr == '\\' || 2462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 2472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 2482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found an escaped newline. 2492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr = skipNewline(CurPtr, BufferEnd); 2502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else 2512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return CurPtr; // Not an escaped newline. 2522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for C comments. 2572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Very dumb, does not handle escaped newlines or trigraphs. 2582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '*') { 2612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr + 1 != BufferEnd); 2622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*(BufferPtr + 1) == '/') 2632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("buffer end hit before '*/' was seen"); 2672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2680089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian 2692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // unnamed namespace 2702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexCommentText(Token &T) { 2722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideBCPLComment || 2732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState == LCS_InsideCComment); 2742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (State) { 2762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_Normal: 2772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 2782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockFirstLine: 2792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 2802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 2812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockBody: 2822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockBody(T); 2832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 284962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko case LS_VerbatimLineText: 285962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko lexVerbatimLineText(T); 286962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko return; 2873f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLStartTag: 2883f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLStartTag(T); 2892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 2903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLEndTag: 2913f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLEndTag(T); 2928d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko return; 2932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_Normal); 2962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 2982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(TokenPtr < CommentEnd); 2992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (TokenPtr != CommentEnd) { 3002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*TokenPtr) { 3012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': 3022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '@': { 303808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko // Commands that start with a backslash and commands that start with 304808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko // 'at' have equivalent semantics. But we keep information about the 305808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko // exact syntax in AST for comments. 306808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko tok::TokenKind CommandKind = 307808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko (*TokenPtr == '@') ? tok::at_command : tok::backslash_command; 3082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 310477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 3142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) { 3152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 3162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 3172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': case '@': case '&': case '$': 3192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '#': case '<': case '>': case '%': 3202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': case '.': case ':': 3212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is one of \\ \@ \& \$ etc escape sequences. 3222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 3242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is the \:: escape sequence. 3252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 327f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 3282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 329f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setText(UnescapedText); 3302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't make zero-length commands. 3348c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko if (!isCommandNameStartCharacter(*TokenPtr)) { 335477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipCommandName(TokenPtr, CommentEnd); 3402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko unsigned Length = TokenPtr - (BufferPtr + 1); 3412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Hardcoded support for lexing LaTeX formula commands 3432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // \f$ \f[ \f] \f{ \f} as a single command. 3442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 3452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *TokenPtr; 3462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 3472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Length++; 3492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const StringRef CommandName(BufferPtr + 1, Length); 3532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 354e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 355e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (!Info) { 356e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko formTokenWithChars(T, TokenPtr, tok::unknown_command); 357e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setUnknownCommandName(CommandName); 358abbfa671539c74b5bec66a64964de984c908cdfaFariborz Jahanian if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) { 3590089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian StringRef CorrectedName = Info->Name; 3600089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian SourceRange CommandRange(T.getLocation().getLocWithOffset(1), 3610089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian T.getEndLocation()); 3620089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian Diag(T.getLocation(), diag::warn_correct_comment_command_name) 3630089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian << CommandName << CorrectedName 3640089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian << FixItHint::CreateReplacement(CommandRange, CorrectedName); 3650089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian } else { 3660089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian Diag(T.getLocation(), diag::warn_unknown_comment_command_name); 3670089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian return; 3680089bc4ddee6bb309ad25f4c7ad4b7ffe5df4512Fariborz Jahanian } 3692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 370e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (Info->IsVerbatimBlockCommand) { 371e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 372e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko return; 373e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko } 374e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (Info->IsVerbatimLineCommand) { 375e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko setupAndLexVerbatimLine(T, TokenPtr, Info); 3762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 378808383d2d6d58a7c7db85f8c7618fb74d821309fDmitri Gribenko formTokenWithChars(T, TokenPtr, CommandKind); 379e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setCommandID(Info->getID()); 3802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 383477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko case '&': 384477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko lexHTMLCharacterReference(T); 385477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 386477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 3872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '<': { 3882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 390477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *TokenPtr; 394a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (isHTMLIdentifierStartingCharacter(C)) 3953f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko setupAndLexHTMLStartTag(T); 3962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else if (C == '/') 3973f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko setupAndLexHTMLEndTag(T); 398477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 399477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 400477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 4012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\n': 4052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\r': 4062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipNewline(TokenPtr, CommentEnd); 4072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::newline); 4082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 4102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 4112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: { 414aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 415aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko find_first_of("\n\r\\@&<"); 416aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko if (End != StringRef::npos) 417aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko TokenPtr += End; 418aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko else 419aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko TokenPtr = CommentEnd; 420477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 4212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::setupAndLexVerbatimBlock(Token &T, 4282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TextBegin, 429e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko char Marker, const CommandInfo *Info) { 430e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko assert(Info->IsVerbatimBlockCommand); 431e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko 4322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.clear(); 4332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 434e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko VerbatimBlockEndCommandName.append(Info->EndCommandName); 4352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 437e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimBlockID(Info->getID()); 4382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4398d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // If there is a newline following the verbatim opening command, skip the 4408d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // newline so that we don't create an tok::verbatim_block_line with empty 4418d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // text content. 442bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko if (BufferPtr != CommentEnd && 443bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko isVerticalWhitespace(*BufferPtr)) { 444bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko BufferPtr = skipNewline(BufferPtr, CommentEnd); 445bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko State = LS_VerbatimBlockBody; 446bf8814478fddfa611911bdbd6a53a6614938cc63Dmitri Gribenko return; 4478d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko } 4488d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 4492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockFirstLine; 4502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 45364da4e55c111f4733135e1780216609569767351Dmitri Gribenkoagain: 4542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr < CommentEnd); 4552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // FIXME: It would be better to scan the text once, finding either the block 4572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // end command or newline. 4582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // 4592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Extract current line. 4602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 4612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef Line(BufferPtr, Newline - BufferPtr); 4622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Look for end command in current line. 4642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko size_t Pos = Line.find(VerbatimBlockEndCommandName); 4658d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko const char *TextEnd; 4662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NextLine; 4672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Pos == StringRef::npos) { 4682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line is completely verbatim. 4698d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = Newline; 4702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NextLine = skipNewline(Newline, CommentEnd); 4712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else if (Pos == 0) { 4722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line contains just an end command. 4732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 474f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 4752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, End, tok::verbatim_block_end); 476e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 4772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 4782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 4802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // There is some text, followed by end command. Extract text first. 4818d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = BufferPtr + Pos; 4828d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko NextLine = TextEnd; 48364da4e55c111f4733135e1780216609569767351Dmitri Gribenko // If there is only whitespace before end command, skip whitespace. 48464da4e55c111f4733135e1780216609569767351Dmitri Gribenko if (isWhitespace(BufferPtr, TextEnd)) { 48564da4e55c111f4733135e1780216609569767351Dmitri Gribenko BufferPtr = TextEnd; 48664da4e55c111f4733135e1780216609569767351Dmitri Gribenko goto again; 48764da4e55c111f4733135e1780216609569767351Dmitri Gribenko } 4882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4908d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko StringRef Text(BufferPtr, TextEnd - BufferPtr); 4912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, NextLine, tok::verbatim_block_line); 492f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setVerbatimBlockText(Text); 4932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockBody; 4952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockBody(Token &T) { 4982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_VerbatimBlockBody); 4992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 5012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 5022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 5042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 506e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenkovoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 507e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko const CommandInfo *Info) { 508e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko assert(Info->IsVerbatimLineCommand); 509962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 510e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimLineID(Info->getID()); 511962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 512962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_VerbatimLineText; 513962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko} 514962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 515962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::lexVerbatimLineText(Token &T) { 516962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko assert(State == LS_VerbatimLineText); 517962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 518962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko // Extract current line. 519962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 520962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const StringRef Text(BufferPtr, Newline - BufferPtr); 521962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, Newline, tok::verbatim_line_text); 5222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setVerbatimLineText(Text); 523962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 524962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_Normal; 5252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 527477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkovoid Lexer::lexHTMLCharacterReference(Token &T) { 528477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *TokenPtr = BufferPtr; 529477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(*TokenPtr == '&'); 530477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 531477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 532477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 533477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 534477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 535477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *NamePtr; 536477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isNamed = false; 537477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isDecimal = false; 538477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char C = *TokenPtr; 539477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLNamedCharacterReferenceCharacter(C)) { 540477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 541477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 542477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isNamed = true; 543477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == '#') { 544477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 545477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 546477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 547477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 548477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 549477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko C = *TokenPtr; 550477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLDecimalCharacterReferenceCharacter(C)) { 551477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 552477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 553477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isDecimal = true; 554477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == 'x' || C == 'X') { 555477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 556477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 557477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 558477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 559477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 560477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 561477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 562477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 563477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 564477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 565477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 566477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 567477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko *TokenPtr != ';') { 568477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 569477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 570477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 571477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Name(NamePtr, TokenPtr - NamePtr); 572477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; // Skip semicolon. 573477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Resolved; 5745bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko if (isNamed) 575477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLNamedCharacterReference(Name); 576477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else if (isDecimal) 577477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLDecimalCharacterReference(Name); 578477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 579477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLHexCharacterReference(Name); 580477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 581477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (Resolved.empty()) { 582477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 583477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 584477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 585477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 586477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko T.setText(Resolved); 587477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 588477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 589477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 5903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLStartTag(Token &T) { 591a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko assert(BufferPtr[0] == '<' && 592a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko isHTMLIdentifierStartingCharacter(BufferPtr[1])); 5932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 594f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 595834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko if (!isHTMLTagName(Name)) { 596834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko formTextToken(T, TagNameEnd); 597834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko return; 598834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko } 599834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko 6003f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 6013f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko T.setHTMLTagStartName(Name); 6022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 6042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 605a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko const char C = *BufferPtr; 606a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (BufferPtr != CommentEnd && 607a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 6083f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLStartTag; 6092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6113f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLStartTag(Token &T) { 6123f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko assert(State == LS_HTMLStartTag); 6132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 6152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 6162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (isHTMLIdentifierCharacter(C)) { 6172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 618f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 6192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_ident); 620f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setHTMLIdent(Ident); 6212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 6222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) { 6232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '=': 6242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_equals); 6262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': 6282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\'': { 6292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *OpenQuote = TokenPtr; 6302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 6312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *ClosingQuote = TokenPtr; 6322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr != CommentEnd) // Skip closing quote. 6332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 6352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setHTMLQuotedString(StringRef(OpenQuote + 1, 6362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko ClosingQuote - (OpenQuote + 1))); 6372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '>': 6402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_greater); 642a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko State = LS_Normal; 643a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko return; 644a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko case '/': 645a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 646a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko if (TokenPtr != CommentEnd && *TokenPtr == '>') { 647a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 648a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 649477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else 650477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 651477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 652a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko State = LS_Normal; 653a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko return; 6542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Now look ahead and return to normal state if we don't see any HTML tokens 6582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // ahead. 6592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 6602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) { 6612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *BufferPtr; 666a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (!isHTMLIdentifierStartingCharacter(C) && 6672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C != '=' && C != '\"' && C != '\'' && C != '>') { 6682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6733f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLEndTag(Token &T) { 6742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 6752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 6772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 678834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 679834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko if (!isHTMLTagName(Name)) { 680834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko formTextToken(T, TagNameEnd); 681834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko return; 682834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko } 6832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = skipWhitespace(TagNameEnd, CommentEnd); 6852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6863f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, End, tok::html_end_tag); 687834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko T.setHTMLTagEndName(Name); 6888d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6898d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (BufferPtr != CommentEnd && *BufferPtr == '>') 6903f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLEndTag; 6918d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko} 6928d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6933f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLEndTag(Token &T) { 6948d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 6958d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6968d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 6978d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 6982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 700ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz JahanianLexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, 701ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian const CommandTraits &Traits, 702af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko SourceLocation FileLoc, 7032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *BufferStart, const char *BufferEnd): 704ad6fd9f93ce0d328397e8d57ef7117ced24fc8e2Fariborz Jahanian Allocator(Allocator), Diags(Diags), Traits(Traits), 7052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferStart(BufferStart), BufferEnd(BufferEnd), 706af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko FileLoc(FileLoc), BufferPtr(BufferStart), 7072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState(LCS_BeforeComment), State(LS_Normal) { 7082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 7092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lex(Token &T) { 7112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoagain: 7122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (CommentState) { 7132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BeforeComment: 7142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) { 7152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::eof); 7162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 7172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '/'); 7202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip first slash. 7212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*BufferPtr) { 7222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '/': { // BCPL comment. 7232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip second slash. 7242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd) { 7262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker, if it is present. 7272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // It might be missing because of a typo //< or /*<, or because we 7282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // merged this non-Doxygen comment into a bunch of Doxygen comments 7292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // around it: /** ... */ /* ... */ /** ... */ 7302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '/' || C == '!') 7322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 7362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip it even if the comment is not a Doxygen one, because //< and /*< 7372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // are frequent typos. 7382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideBCPLComment; 7428d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 7438d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 7442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 7452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': { // C comment. 7482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip star. 7492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker. 7512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 7532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 7562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideCComment; 7602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 7612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 7622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 7652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("second character of comment should be '/' or '*'"); 7662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BetweenComments: { 7692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Consecutive comments are extracted only if there is only whitespace 7702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // between them. So we can search for the start of the next comment. 7712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EndWhitespace = BufferPtr; 7722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 7732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EndWhitespace++; 7742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Turn any whitespace between comments (and there is only whitespace 776a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // between them -- guaranteed by comment extraction) into a newline. We 777a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // have two newlines between C comments in total (first one was synthesized 778a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // after a comment). 7792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, EndWhitespace, tok::newline); 7802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BeforeComment; 7822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideBCPLComment: 7862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideCComment: 7872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != CommentEnd) { 7882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexCommentText(T); 7892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 7912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip C comment closing sequence. 7922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) { 7932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 7942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr += 2; 7952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr <= BufferEnd); 7962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Synthenize newline just after the C comment, regardless if there is 7982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // actually a newline. 7992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::newline); 8002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 8022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 8032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 8042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't synthesized a newline after BCPL comment. 8052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 8062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 8072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 8112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8122d44d77fed3200e2eff289f55493317e90d3398cDmitri GribenkoStringRef Lexer::getSpelling(const Token &Tok, 8132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const SourceManager &SourceMgr, 8142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool *Invalid) const { 8152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko SourceLocation Loc = Tok.getLocation(); 8162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 8172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool InvalidTemp = false; 8192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 8202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (InvalidTemp) { 8212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko *Invalid = true; 8222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(); 8232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Begin = File.data() + LocInfo.second; 8262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(Begin, Tok.getLength()); 8272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 8282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace comments 8302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace clang 8312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 832