CommentLexer.cpp revision 5bd1e5ba000023910ad986a16dd16d7ca914750a
12d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "clang/AST/CommentLexer.h" 2aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko#include "clang/AST/CommentCommandTraits.h" 3c934dfe950a14fe447aa14a7dae25d00ee87c8bbDmitri Gribenko#include "llvm/ADT/StringExtras.h" 42d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/ADT/StringSwitch.h" 5cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko#include "llvm/Support/ConvertUTF.h" 62d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko#include "llvm/Support/ErrorHandling.h" 72d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 82d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace clang { 92d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace comments { 102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Token::dump(const Lexer &L, const SourceManager &SM) const { 122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << "comments::Token Kind=" << Kind << " "; 132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Loc.dump(SM); 142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; 152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 17477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkonamespace { 18477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLNamedCharacterReferenceCharacter(char C) { 19477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return (C >= 'a' && C <= 'z') || 20477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'A' && C <= 'Z'); 21477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 22477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 23477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLDecimalCharacterReferenceCharacter(char C) { 24477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return C >= '0' && C <= '9'; 25477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 26477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 27477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkobool isHTMLHexCharacterReferenceCharacter(char C) { 28477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return (C >= '0' && C <= '9') || 29477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'a' && C <= 'f') || 30477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko (C >= 'A' && C <= 'F'); 31477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 32834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko 335bd1e5ba000023910ad986a16dd16d7ca914750aDmitri GribenkoStringRef convertCodePointToUTF8(llvm::BumpPtrAllocator &Allocator, 345bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko unsigned CodePoint) { 35658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); 36658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian char *ResolvedPtr = Resolved; 37cb5620c9b213f4bd323912159fdddda35e258a14Dmitri Gribenko if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) 38658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian return StringRef(Resolved, ResolvedPtr - Resolved); 39658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian else 40658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian return StringRef(); 41658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian} 425bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko 435bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLTags.inc" 445bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko#include "clang/AST/CommentHTMLNamedCharacterReferences.inc" 455bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko 465bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko} // unnamed namespace 47658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian 48477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { 495bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko // Fast path, first check a few most widely used named character references. 50477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return llvm::StringSwitch<StringRef>(Name) 51477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("amp", "&") 52477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("lt", "<") 53477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("gt", ">") 54477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("quot", "\"") 55477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko .Case("apos", "\'") 565bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko // Slow path. 575bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko .Default(translateHTMLNamedCharacterReferenceToUTF8(Name)); 58658a115c8e0d5bddf607a13d2ce13cd306ef2389Fariborz Jahanian} 59477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 60477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri GribenkoStringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { 61477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko unsigned CodePoint = 0; 62477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 63477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); 64477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint *= 10; 65477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko CodePoint += Name[i] - '0'; 66477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 675bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko return convertCodePointToUTF8(Allocator, CodePoint); 685bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko} 69477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 705bd1e5ba000023910ad986a16dd16d7ca914750aDmitri GribenkoStringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { 715bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko unsigned CodePoint = 0; 725bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko for (unsigned i = 0, e = Name.size(); i != e; ++i) { 735bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko CodePoint *= 16; 745bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko const char C = Name[i]; 755bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko assert(isHTMLHexCharacterReferenceCharacter(C)); 765bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko CodePoint += llvm::hexDigitValue(C); 775bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko } 785bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko return convertCodePointToUTF8(Allocator, CodePoint); 79477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 80477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::skipLineStartingDecorations() { 822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This function should be called only for C comments 832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideCComment); 842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) 862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (*BufferPtr) { 892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case ' ': 902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\t': 912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\f': 922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\v': { 932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NewBufferPtr = BufferPtr; 942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (NewBufferPtr == CommentEnd) 962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *NewBufferPtr; 992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (C == ' ' || C == '\t' || C == '\f' || C == '\v') { 1002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NewBufferPtr++; 1012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (NewBufferPtr == CommentEnd) 1022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 1032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *NewBufferPtr; 1042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '*') 1062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = NewBufferPtr + 1; 1072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': 1102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 1122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkonamespace { 1168d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko/// Returns pointer to the first newline character in the string. 1172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findNewline(const char *BufferPtr, const char *BufferEnd) { 1182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 1202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '\n' || C == '\r') 1212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipNewline(const char *BufferPtr, const char *BufferEnd) { 1272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) 1282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '\n') 1312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else { 1332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '\r'); 1342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '\n') 1362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 141477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipNamedCharacterReference(const char *BufferPtr, 142477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 143477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 144477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) 145477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 146477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 147477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 148477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 149477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 150477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipDecimalCharacterReference(const char *BufferPtr, 151477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 152477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 153477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) 154477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 155477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 156477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 157477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 158477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 159477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkoconst char *skipHexCharacterReference(const char *BufferPtr, 160477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *BufferEnd) { 161477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 162477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) 163477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferPtr; 164477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 165477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return BufferEnd; 166477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 167477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 168a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenkobool isHTMLIdentifierStartingCharacter(char C) { 169a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko return (C >= 'a' && C <= 'z') || 170a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko (C >= 'A' && C <= 'Z'); 171a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko} 172a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko 1732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHTMLIdentifierCharacter(char C) { 1742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return (C >= 'a' && C <= 'z') || 1752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= 'A' && C <= 'Z') || 1762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= '0' && C <= '9'); 1772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { 1802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isHTMLIdentifierCharacter(*BufferPtr)) 1822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 1832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 1842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 1852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 1862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Skip HTML string quoted in single or double quotes. Escaping quotes inside 1882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// string allowed. 1892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// 1902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Returns pointer to closing quote. 1912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) 1922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko{ 1932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char Quote = *BufferPtr; 1942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(Quote == '\"' || Quote == '\''); 1952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 1962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 1972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 1982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 1992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == Quote && BufferPtr[-1] != '\\') 2002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isHorizontalWhitespace(char C) { 2062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return C == ' ' || C == '\t' || C == '\f' || C == '\v'; 2072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isWhitespace(char C) { 2102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return C == ' ' || C == '\n' || C == '\r' || 2112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C == '\t' || C == '\f' || C == '\v'; 2122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { 2152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isWhitespace(*BufferPtr)) 2172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 22264da4e55c111f4733135e1780216609569767351Dmitri Gribenkobool isWhitespace(const char *BufferPtr, const char *BufferEnd) { 22364da4e55c111f4733135e1780216609569767351Dmitri Gribenko return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd; 22464da4e55c111f4733135e1780216609569767351Dmitri Gribenko} 22564da4e55c111f4733135e1780216609569767351Dmitri Gribenko 2268c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenkobool isCommandNameStartCharacter(char C) { 2278c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko return (C >= 'a' && C <= 'z') || 2288c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko (C >= 'A' && C <= 'Z'); 2298c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko} 2308c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko 2312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkobool isCommandNameCharacter(char C) { 2322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return (C >= 'a' && C <= 'z') || 2332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= 'A' && C <= 'Z') || 2342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (C >= '0' && C <= '9'); 2352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { 2382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (!isCommandNameCharacter(*BufferPtr)) 2402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2452d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for BCPL comments. 2462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Handles newlines escaped with backslash or trigraph for backslahs. 2472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *CurPtr = BufferPtr; 2492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (CurPtr != BufferEnd) { 2502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *CurPtr; 2512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (C != '\n' && C != '\r') { 2522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr++; 2532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CurPtr == BufferEnd) 2542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *CurPtr; 2562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found a newline, check if it is escaped. 2582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EscapePtr = CurPtr - 1; 2592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(isHorizontalWhitespace(*EscapePtr)) 2602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr--; 2612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*EscapePtr == '\\' || 2632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && 2642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { 2652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // We found an escaped newline. 2662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CurPtr = skipNewline(CurPtr, BufferEnd); 2672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else 2682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return CurPtr; // Not an escaped newline. 2692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferEnd; 2712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Return the one past end pointer for C comments. 2742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko/// Very dumb, does not handle escaped newlines or trigraphs. 2752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoconst char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { 2762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko for ( ; BufferPtr != BufferEnd; ++BufferPtr) { 2772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*BufferPtr == '*') { 2782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr + 1 != BufferEnd); 2792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (*(BufferPtr + 1) == '/') 2802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return BufferPtr; 2812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 2832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("buffer end hit before '*/' was seen"); 2842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 2852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // unnamed namespace 2862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexCommentText(Token &T) { 2882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(CommentState == LCS_InsideBCPLComment || 2892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState == LCS_InsideCComment); 2902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 2912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (State) { 2922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_Normal: 2932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 2942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockFirstLine: 2952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 2962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 2972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LS_VerbatimBlockBody: 2982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockBody(T); 2992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 300962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko case LS_VerbatimLineText: 301962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko lexVerbatimLineText(T); 302962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko return; 3033f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLStartTag: 3043f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLStartTag(T); 3052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3063f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko case LS_HTMLEndTag: 3073f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko lexHTMLEndTag(T); 3088d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko return; 3092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_Normal); 3122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 3142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(TokenPtr < CommentEnd); 3152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while (TokenPtr != CommentEnd) { 3162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*TokenPtr) { 3172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': 3182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '@': { 3192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 321477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 3252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) { 3262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 3272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 3282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\\': case '@': case '&': case '$': 3302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '#': case '<': case '>': case '%': 3312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': case '.': case ':': 3322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is one of \\ \@ \& \$ etc escape sequences. 3332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { 3352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // This is the \:: escape sequence. 3362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 338f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1)); 3392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 340f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setText(UnescapedText); 3412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't make zero-length commands. 3458c05da3fd8db98af482826ba059ab1ad6d58010fDmitri Gribenko if (!isCommandNameStartCharacter(*TokenPtr)) { 346477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipCommandName(TokenPtr, CommentEnd); 3512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko unsigned Length = TokenPtr - (BufferPtr + 1); 3522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Hardcoded support for lexing LaTeX formula commands 3542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // \f$ \f[ \f] \f{ \f} as a single command. 3552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { 3562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *TokenPtr; 3572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { 3582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko Length++; 3602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 3632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const StringRef CommandName(BufferPtr + 1, Length); 3642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 365e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName); 366e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (!Info) { 367e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko formTokenWithChars(T, TokenPtr, tok::unknown_command); 368e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setUnknownCommandName(CommandName); 3692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 371e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (Info->IsVerbatimBlockCommand) { 372e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info); 373e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko return; 374e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko } 375e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko if (Info->IsVerbatimLineCommand) { 376e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko setupAndLexVerbatimLine(T, TokenPtr, Info); 3772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::command); 380e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setCommandID(Info->getID()); 3812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 384477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko case '&': 385477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko lexHTMLCharacterReference(T); 386477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 387477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 3882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '<': { 3892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 3902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr == CommentEnd) { 391477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 3922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 3932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 3942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *TokenPtr; 395a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (isHTMLIdentifierStartingCharacter(C)) 3963f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko setupAndLexHTMLStartTag(T); 3972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko else if (C == '/') 3983f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko setupAndLexHTMLEndTag(T); 399477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 400477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 401477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 4022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\n': 4062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\r': 4072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipNewline(TokenPtr, CommentEnd); 4082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::newline); 4092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 4112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 4122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: { 415aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr). 416aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko find_first_of("\n\r\\@&<"); 417aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko if (End != StringRef::npos) 418aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko TokenPtr += End; 419aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko else 420aa7dbafc3539868ce271cb336444ec544260905aDmitri Gribenko TokenPtr = CommentEnd; 421477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 4222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::setupAndLexVerbatimBlock(Token &T, 4292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TextBegin, 430e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko char Marker, const CommandInfo *Info) { 431e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko assert(Info->IsVerbatimBlockCommand); 432e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko 4332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.clear(); 4342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@"); 435e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko VerbatimBlockEndCommandName.append(Info->EndCommandName); 4362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_block_begin); 438e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimBlockID(Info->getID()); 4392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4408d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // If there is a newline following the verbatim opening command, skip the 4418d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // newline so that we don't create an tok::verbatim_block_line with empty 4428d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko // text content. 4438d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (BufferPtr != CommentEnd) { 4448d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko const char C = *BufferPtr; 4458d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (C == '\n' || C == '\r') { 4468d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko BufferPtr = skipNewline(BufferPtr, CommentEnd); 4478d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_VerbatimBlockBody; 4488d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko return; 4498d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko } 4508d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko } 4518d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 4522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockFirstLine; 4532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockFirstLine(Token &T) { 45664da4e55c111f4733135e1780216609569767351Dmitri Gribenkoagain: 4572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr < CommentEnd); 4582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // FIXME: It would be better to scan the text once, finding either the block 4602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // end command or newline. 4612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // 4622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Extract current line. 4632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 4642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef Line(BufferPtr, Newline - BufferPtr); 4652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Look for end command in current line. 4672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko size_t Pos = Line.find(VerbatimBlockEndCommandName); 4688d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko const char *TextEnd; 4692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *NextLine; 4702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (Pos == StringRef::npos) { 4712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line is completely verbatim. 4728d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = Newline; 4732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko NextLine = skipNewline(Newline, CommentEnd); 4742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else if (Pos == 0) { 4752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Current line contains just an end command. 4762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = BufferPtr + VerbatimBlockEndCommandName.size(); 477f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, End - (BufferPtr + 1)); 4782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, End, tok::verbatim_block_end); 479e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID()); 4802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 4812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 4822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 4832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // There is some text, followed by end command. Extract text first. 4848d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko TextEnd = BufferPtr + Pos; 4858d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko NextLine = TextEnd; 48664da4e55c111f4733135e1780216609569767351Dmitri Gribenko // If there is only whitespace before end command, skip whitespace. 48764da4e55c111f4733135e1780216609569767351Dmitri Gribenko if (isWhitespace(BufferPtr, TextEnd)) { 48864da4e55c111f4733135e1780216609569767351Dmitri Gribenko BufferPtr = TextEnd; 48964da4e55c111f4733135e1780216609569767351Dmitri Gribenko goto again; 49064da4e55c111f4733135e1780216609569767351Dmitri Gribenko } 4912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 4922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4938d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko StringRef Text(BufferPtr, TextEnd - BufferPtr); 4942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, NextLine, tok::verbatim_block_line); 495f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setVerbatimBlockText(Text); 4962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 4972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_VerbatimBlockBody; 4982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 4992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lexVerbatimBlockBody(Token &T) { 5012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(State == LS_VerbatimBlockBody); 5022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) 5042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko skipLineStartingDecorations(); 5052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 5062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexVerbatimBlockFirstLine(T); 5072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 509e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenkovoid Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin, 510e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko const CommandInfo *Info) { 511e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko assert(Info->IsVerbatimLineCommand); 512962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, TextBegin, tok::verbatim_line_name); 513e4330a302ac20b41b9800267ebd4b5b01f8553f8Dmitri Gribenko T.setVerbatimLineID(Info->getID()); 514962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 515962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_VerbatimLineText; 516962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko} 517962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 518962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenkovoid Lexer::lexVerbatimLineText(Token &T) { 519962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko assert(State == LS_VerbatimLineText); 520962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 521962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko // Extract current line. 522962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const char *Newline = findNewline(BufferPtr, CommentEnd); 523962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko const StringRef Text(BufferPtr, Newline - BufferPtr); 524962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko formTokenWithChars(T, Newline, tok::verbatim_line_text); 5252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setVerbatimLineText(Text); 526962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko 527962668d2c192dd02f75b8ec3628a89964bfb738bDmitri Gribenko State = LS_Normal; 5282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 5292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 530477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenkovoid Lexer::lexHTMLCharacterReference(Token &T) { 531477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *TokenPtr = BufferPtr; 532477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko assert(*TokenPtr == '&'); 533477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 534477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 535477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 536477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 537477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 538477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko const char *NamePtr; 539477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isNamed = false; 540477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko bool isDecimal = false; 541477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko char C = *TokenPtr; 542477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLNamedCharacterReferenceCharacter(C)) { 543477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 544477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); 545477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isNamed = true; 546477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == '#') { 547477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 548477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (TokenPtr == CommentEnd) { 549477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 550477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 551477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 552477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko C = *TokenPtr; 553477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (isHTMLDecimalCharacterReferenceCharacter(C)) { 554477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 555477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); 556477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko isDecimal = true; 557477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else if (C == 'x' || C == 'X') { 558477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; 559477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko NamePtr = TokenPtr; 560477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); 561477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 562477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 563477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 564477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 565477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else { 566477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 567477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 568477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 569477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (NamePtr == TokenPtr || TokenPtr == CommentEnd || 570477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko *TokenPtr != ';') { 571477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 572477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 573477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 574477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Name(NamePtr, TokenPtr - NamePtr); 575477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko TokenPtr++; // Skip semicolon. 576477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko StringRef Resolved; 5775bd1e5ba000023910ad986a16dd16d7ca914750aDmitri Gribenko if (isNamed) 578477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLNamedCharacterReference(Name); 579477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else if (isDecimal) 580477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLDecimalCharacterReference(Name); 581477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko else 582477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko Resolved = resolveHTMLHexCharacterReference(Name); 583477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 584477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko if (Resolved.empty()) { 585477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 586477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 587477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } 588477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTokenWithChars(T, TokenPtr, tok::text); 589477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko T.setText(Resolved); 590477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko return; 591477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko} 592477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 5933f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLStartTag(Token &T) { 594a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko assert(BufferPtr[0] == '<' && 595a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko isHTMLIdentifierStartingCharacter(BufferPtr[1])); 5962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); 597f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); 598834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko if (!isHTMLTagName(Name)) { 599834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko formTextToken(T, TagNameEnd); 600834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko return; 601834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko } 602834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko 6033f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, TagNameEnd, tok::html_start_tag); 6043f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko T.setHTMLTagStartName(Name); 6052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 6072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 608a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko const char C = *BufferPtr; 609a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (BufferPtr != CommentEnd && 610a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C))) 6113f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLStartTag; 6122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6143f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLStartTag(Token &T) { 6153f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko assert(State == LS_HTMLStartTag); 6162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TokenPtr = BufferPtr; 6182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko char C = *TokenPtr; 6192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (isHTMLIdentifierCharacter(C)) { 6202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd); 621f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko StringRef Ident(BufferPtr, TokenPtr - BufferPtr); 6222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_ident); 623f5e0aeac8a510ba1fd4c83391978cffd31e5ac69Dmitri Gribenko T.setHTMLIdent(Ident); 6242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 6252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (C) { 6262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '=': 6272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_equals); 6292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\"': 6312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '\'': { 6322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *OpenQuote = TokenPtr; 6332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd); 6342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *ClosingQuote = TokenPtr; 6352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (TokenPtr != CommentEnd) // Skip closing quote. 6362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_quoted_string); 6382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko T.setHTMLQuotedString(StringRef(OpenQuote + 1, 6392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko ClosingQuote - (OpenQuote + 1))); 6402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 6412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '>': 6432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko TokenPtr++; 6442d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_greater); 645a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko State = LS_Normal; 646a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko return; 647a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko case '/': 648a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 649a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko if (TokenPtr != CommentEnd && *TokenPtr == '>') { 650a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko TokenPtr++; 651a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko formTokenWithChars(T, TokenPtr, tok::html_slash_greater); 652477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko } else 653477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko formTextToken(T, TokenPtr); 654477a9f58c1b197f315befd03b42a8a0b3a2f0ff9Dmitri Gribenko 655a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko State = LS_Normal; 656a5ef44ff5d93a3be6ca67782828157a71894cf0cDmitri Gribenko return; 6572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Now look ahead and return to normal state if we don't see any HTML tokens 6612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // ahead. 6622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr = skipWhitespace(BufferPtr, CommentEnd); 6632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == CommentEnd) { 6642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C = *BufferPtr; 669a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko if (!isHTMLIdentifierStartingCharacter(C) && 6702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko C != '=' && C != '\"' && C != '\'' && C != '>') { 6712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 6722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 6732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 6742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 6752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6763f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::setupAndLexHTMLEndTag(Token &T) { 6772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '<' && BufferPtr[1] == '/'); 6782d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6792d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); 6802d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); 681834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); 682834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko if (!isHTMLTagName(Name)) { 683834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko formTextToken(T, TagNameEnd); 684834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko return; 685834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko } 6862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *End = skipWhitespace(TagNameEnd, CommentEnd); 6882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 6893f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko formTokenWithChars(T, End, tok::html_end_tag); 690834a5bd311b4a32f89937ca5b6dd2b4111891859Dmitri Gribenko T.setHTMLTagEndName(Name); 6918d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6928d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (BufferPtr != CommentEnd && *BufferPtr == '>') 6933f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenko State = LS_HTMLEndTag; 6948d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko} 6958d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6963f38bf2d441fac379c427f86153fbb0cb41256c6Dmitri Gribenkovoid Lexer::lexHTMLEndTag(Token &T) { 6978d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko assert(BufferPtr != CommentEnd && *BufferPtr == '>'); 6988d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko 6998d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko formTokenWithChars(T, BufferPtr + 1, tok::html_greater); 7008d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 7012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 7022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 703aa58081902ad31927df02e8537d972eabe29d6dfDmitri GribenkoLexer::Lexer(llvm::BumpPtrAllocator &Allocator, const CommandTraits &Traits, 704af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko SourceLocation FileLoc, 7052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *BufferStart, const char *BufferEnd): 706aa58081902ad31927df02e8537d972eabe29d6dfDmitri Gribenko Allocator(Allocator), Traits(Traits), 7072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferStart(BufferStart), BufferEnd(BufferEnd), 708af503a6f218cbef8704609812668360b0cbd0b60Dmitri Gribenko FileLoc(FileLoc), BufferPtr(BufferStart), 7092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState(LCS_BeforeComment), State(LS_Normal) { 7102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 7112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkovoid Lexer::lex(Token &T) { 7132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenkoagain: 7142d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch (CommentState) { 7152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BeforeComment: 7162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr == BufferEnd) { 7172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::eof); 7182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return; 7192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(*BufferPtr == '/'); 7222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip first slash. 7232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko switch(*BufferPtr) { 7242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '/': { // BCPL comment. 7252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip second slash. 7262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd) { 7282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker, if it is present. 7292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // It might be missing because of a typo //< or /*<, or because we 7302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // merged this non-Doxygen comment into a bunch of Doxygen comments 7312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // around it: /** ... */ /* ... */ /** ... */ 7322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (C == '/' || C == '!') 7342d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7352d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7362d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7372d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 7382d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip it even if the comment is not a Doxygen one, because //< and /*< 7392d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // are frequent typos. 7402d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7412d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7422d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7432d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideBCPLComment; 7448d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine) 7458d3ba23f2d9e6c87794d059412a0808c9cbacb25Dmitri Gribenko State = LS_Normal; 7462d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd); 7472d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7482d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7492d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case '*': { // C comment. 7502d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; // Skip star. 7512d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7522d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip Doxygen magic marker. 7532d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char C = *BufferPtr; 7542d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!') 7552d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7562d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7572d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip less-than symbol that marks trailing comments. 7582d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != BufferEnd && *BufferPtr == '<') 7592d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr++; 7602d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7612d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_InsideCComment; 7622d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko State = LS_Normal; 7632d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentEnd = findCCommentEnd(BufferPtr, BufferEnd); 7642d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 7652d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7662d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko default: 7672d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko llvm_unreachable("second character of comment should be '/' or '*'"); 7682d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7692d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7702d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_BetweenComments: { 7712d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Consecutive comments are extracted only if there is only whitespace 7722d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // between them. So we can search for the start of the next comment. 7732d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *EndWhitespace = BufferPtr; 7742d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko while(EndWhitespace != BufferEnd && *EndWhitespace != '/') 7752d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko EndWhitespace++; 7762d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7772d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Turn any whitespace between comments (and there is only whitespace 778a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // between them -- guaranteed by comment extraction) into a newline. We 779a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // have two newlines between C comments in total (first one was synthesized 780a99ec107ba6b5abaf27c6cc9318e65689163f2a1Dmitri Gribenko // after a comment). 7812d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, EndWhitespace, tok::newline); 7822d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7832d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BeforeComment; 7842d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7852d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 7862d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7872d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideBCPLComment: 7882d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko case LCS_InsideCComment: 7892d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (BufferPtr != CommentEnd) { 7902d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko lexCommentText(T); 7912d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 7922d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 7932d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Skip C comment closing sequence. 7942d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (CommentState == LCS_InsideCComment) { 7952d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr[0] == '*' && BufferPtr[1] == '/'); 7962d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko BufferPtr += 2; 7972d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko assert(BufferPtr <= BufferEnd); 7982d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 7992d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Synthenize newline just after the C comment, regardless if there is 8002d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // actually a newline. 8012d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko formTokenWithChars(T, BufferPtr, tok::newline); 8022d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8032d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 8042d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko break; 8052d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } else { 8062d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko // Don't synthesized a newline after BCPL comment. 8072d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko CommentState = LCS_BetweenComments; 8082d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko goto again; 8092d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8102d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8112d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8122d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 8132d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8142d44d77fed3200e2eff289f55493317e90d3398cDmitri GribenkoStringRef Lexer::getSpelling(const Token &Tok, 8152d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const SourceManager &SourceMgr, 8162d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool *Invalid) const { 8172d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko SourceLocation Loc = Tok.getLocation(); 8182d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc); 8192d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8202d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko bool InvalidTemp = false; 8212d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp); 8222d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko if (InvalidTemp) { 8232d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko *Invalid = true; 8242d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(); 8252d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko } 8262d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8272d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko const char *Begin = File.data() + LocInfo.second; 8282d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko return StringRef(Begin, Tok.getLength()); 8292d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} 8302d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 8312d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace comments 8322d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko} // end namespace clang 8332d44d77fed3200e2eff289f55493317e90d3398cDmitri Gribenko 834