// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE 208403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 218403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 228403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 238403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 248403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 258403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 268403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 278403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel// POSSIBILITY OF SUCH DAMAGE. 288403881c365ab36b721ccc4500af1b3a5bd25870mikesamuel 295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compackage org.owasp.html; 305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.ImmutableSet; 325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.Lists; 335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.LinkedList; 345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.NoSuchElementException; 355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.Set; 365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 371bfae835221847e7791625e2baa98a60eb3cfa8amikesamuelimport javax.annotation.concurrent.NotThreadSafe; 381bfae835221847e7791625e2baa98a60eb3cfa8amikesamuel 395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/** 
404e867904c8295537803c1c8a076e130df5674b58mikesamuel * A flexible lexer for HTML. 415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This is hairy code, but it is outside the TCB for the HTML sanitizer. 425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 434e867904c8295537803c1c8a076e130df5674b58mikesamuel * @author Mike Samuel <mikesamuel@gmail.com> 445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 451bfae835221847e7791625e2baa98a60eb3cfa8amikesamuel@NotThreadSafe 464e867904c8295537803c1c8a076e130df5674b58mikesamuelfinal class HtmlLexer extends AbstractTokenStream { 475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private final String input; 485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private final HtmlInputSplitter splitter; 495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private State state = State.OUTSIDE_TAG; 505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com public HtmlLexer(String input) { 525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.input = input; 535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.splitter = new HtmlInputSplitter(input); 545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 574e867904c8295537803c1c8a076e130df5674b58mikesamuel * Normalize case of names that are not name-spaced. This lower-cases HTML 584e867904c8295537803c1c8a076e130df5674b58mikesamuel * element and attribute names, but not ones for embedded SVG or MATHML. 
595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com static String canonicalName(String elementOrAttribName) { 615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return elementOrAttribName.indexOf(':') >= 0 625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName); 635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 666d8c2e9241d042a3e0bff40dac4c388966ad060cmikesamuel * An FSM that lets us reclassify text tokens inside tags as attribute 675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * names/values 685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private static enum State { 705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com OUTSIDE_TAG, 715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com IN_TAG, 725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com SAW_NAME, 735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com SAW_EQ, 745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ; 755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Makes sure that this.token contains a token if one is available. 795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This may require fetching and combining multiple tokens from the underlying 805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * splitter. 
815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com @Override 834e867904c8295537803c1c8a076e130df5674b58mikesamuel protected HtmlToken produce() { 845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken token = readToken(); 855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (token == null) { return null; } 865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (token.type) { 885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Keep track of whether we're inside a tag or not. 905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGBEGIN: 915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.IN_TAG; 925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGEND: 945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) { 955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Distinguish <input type=checkbox checked=> from 965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // <input type=checkbox checked> 975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com pushbackToken(token); 985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.IN_TAG; 995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return HtmlToken.instance( 1005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token.start, token.start, HtmlTokenType.ATTRVALUE); 1015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 
1035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.OUTSIDE_TAG; 1045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Drop ignorable tokens by zeroing out the one received and recursing 1075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case IGNORABLE: 1085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return produce(); 1095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // collapse adjacent text nodes if we're outside a tag, or otherwise, 1115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Recognize attribute names and values. 1125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 1135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (state) { 1145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case OUTSIDE_TAG: 1155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTokenType.TEXT == token.type 1165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || HtmlTokenType.UNESCAPED == token.type) { 1175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token = collapseSubsequent(token); 1185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case IN_TAG: 1215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTokenType.TEXT == token.type 1225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && !token.tokenInContextMatches(input, "=")) { 1235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Reclassify as attribute name 1245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 
token = HtmlInputSplitter.reclassify( 1255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token, HtmlTokenType.ATTRNAME); 1265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SAW_NAME; 1275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SAW_NAME: 1305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTokenType.TEXT == token.type) { 1315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (token.tokenInContextMatches(input, "=")) { 1325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SAW_EQ; 1335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Skip the '=' token 1345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return produce(); 1355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 1365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Reclassify as attribute name 1375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token = HtmlInputSplitter.reclassify( 1385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token, HtmlTokenType.ATTRNAME); 1395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 1415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.IN_TAG; 1425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SAW_EQ: 1455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTokenType.TEXT == token.type 1465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || HtmlTokenType.QSTRING == token.type) { 
1475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTokenType.TEXT == token.type) { 1485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Collapse adjacent text nodes to properly handle 1495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // <a onclick=this.clicked=true> 1505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // <a title=foo bar> 1515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token = collapseAttributeName(token); 1525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Reclassify as value 1545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token = HtmlInputSplitter.reclassify( 1555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token, HtmlTokenType.ATTRVALUE); 1565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.IN_TAG; 1575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return token; 1645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 1675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Collapses all the following tokens of the same type into this.token. 
1685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 1694e867904c8295537803c1c8a076e130df5674b58mikesamuel private HtmlToken collapseSubsequent(HtmlToken token) { 1705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken collapsed = token; 1715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com for (HtmlToken next; 1725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com (next= peekToken(0)) != null && next.type == token.type; 1735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com readToken()) { 1745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com collapsed = join(collapsed, next); 1755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return collapsed; 1775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 1794e867904c8295537803c1c8a076e130df5674b58mikesamuel private HtmlToken collapseAttributeName(HtmlToken token) { 1805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // We want to collapse tokens into the value that are not parts of an 1815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // attribute value. We should include any space or text adjacent to the 1825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // value, but should stop at any of the following constructions: 1835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // space end-of-file e.g. name=foo_ 1845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // space valueless-attrib-name e.g. name=foo checked 1855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // space tag-end e.g. name=foo /> 1865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // space text space? '=' e.g. 
name=foo bar= 1875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int nToMerge = 0; 1885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com for (HtmlToken t; (t = peekToken(nToMerge)) != null;) { 1895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (t.type == HtmlTokenType.IGNORABLE) { 1905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken tok = peekToken(nToMerge + 1); 1915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (tok == null) { break; } 1925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (tok.type != HtmlTokenType.TEXT) { break; } 1935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (isValuelessAttribute(input.substring(tok.start, tok.end))) { 1945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 1955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 1965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken eq = peekToken(nToMerge + 2); 1975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (eq != null && eq.type == HtmlTokenType.IGNORABLE) { 1985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com eq = peekToken(nToMerge + 3); 1995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (eq == null || eq.tokenInContextMatches(input, "=")) { 2015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 2025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if (t.type != HtmlTokenType.TEXT) { 2045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 2055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++nToMerge; 2075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 
2085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (nToMerge == 0) { return token; } 2095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int end = token.end; 2115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com do { 2125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com end = readToken().end; 2135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } while (--nToMerge > 0); 2145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT); 2165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private static HtmlToken join(HtmlToken a, HtmlToken b) { 2195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return HtmlToken.instance(a.start, b.end, a.type); 2205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList(); 2234e867904c8295537803c1c8a076e130df5674b58mikesamuel private HtmlToken readToken() { 2245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (!lookahead.isEmpty()) { 2255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return lookahead.remove(); 2265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if (splitter.hasNext()) { 2275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return splitter.next(); 2285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 2295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return null; 
2305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2334e867904c8295537803c1c8a076e130df5674b58mikesamuel private HtmlToken peekToken(int i) { 2345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (lookahead.size() <= i && splitter.hasNext()) { 2355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com lookahead.add(splitter.next()); 2365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return lookahead.size() > i ? lookahead.get(i) : null; 2385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private void pushbackToken(HtmlToken token) { 2415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com lookahead.addFirst(token); 2425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** Can the attribute appear in HTML without a value. 
*/ 2455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private static boolean isValuelessAttribute(String attribName) { 2465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com boolean valueless = VALUELESS_ATTRIB_NAMES.contains( 2475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com Strings.toLowerCase(attribName)); 2485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return valueless; 2495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // From http://issues.apache.org/jira/browse/XALANC-519 2525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of( 2535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com "checked", "compact", "declare", "defer", "disabled", 2545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com "ismap", "multiple", "nohref", "noresize", "noshade", 2555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com "nowrap", "readonly", "selected"); 2565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com} 2575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/** 2595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * A token stream that breaks a character stream into <tt> 2605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt> 2615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * tokens. The matching of attribute names and values is done in a later step. 
2625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 2635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comfinal class HtmlInputSplitter extends AbstractTokenStream { 2645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** The source of HTML character data. */ 2655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private final String input; 2665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** An offset into input. */ 2675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private int offset; 2685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** True iff the current character is inside a tag. */ 2695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private boolean inTag; 2705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 2715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * True if inside a script, xmp, listing, or similar tag whose content does 2725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * not follow the normal escaping rules. 2735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 2745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private boolean inEscapeExemptBlock; 2755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 2775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Null or the name of the close tag required to end the current escape exempt 2785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * block. 2795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Preformatted tags include <script>, <xmp>, etc. that may 2805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * contain unescaped HTML input. 
2815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 2825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private String escapeExemptTagName = null; 2835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private HtmlTextEscapingMode textEscapingMode; 2855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com public HtmlInputSplitter(String input) { 2875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.input = input; 2885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 2895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 2915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Make sure that there is a token ready to yield in this.token. 2925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 2935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com @Override 2945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com protected HtmlToken produce() { 2955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken token = parseToken(); 2965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (null == token) { return null; } 2975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 2985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Handle escape-exempt blocks. 2995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // The parse() method is only dimly aware of escape-excempt blocks, so 3005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // here we detect the beginning and ends of escape exempt blocks, and 3015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // reclassify as UNESCAPED, any tokens that appear in the middle. 
3025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (inEscapeExemptBlock) { 3035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (token.type != HtmlTokenType.SERVERCODE) { 3045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // classify RCDATA as text since it can contain entities 3055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token = reclassify( 3065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA 3075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ? HtmlTokenType.TEXT 3085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com : HtmlTokenType.UNESCAPED)); 3095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 3115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (token.type) { 3125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGBEGIN: 3135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com { 3145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com String canonTagName = canonicalName( 3155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com token.start + 1, token.end); 3165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (HtmlTextEscapingMode.isTagFollowedByLiteralContent( 3175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com canonTagName)) { 3185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.escapeExemptTagName = canonTagName; 3195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.textEscapingMode = HtmlTextEscapingMode.getModeForTag( 3205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com canonTagName); 3215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 
3235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGEND: 3255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.inEscapeExemptBlock = null != this.escapeExemptTagName; 3265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 3275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 3285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 3295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return token; 3325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 3355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * States for a state machine for optimistically identifying tags and other 3365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * html/xml/phpish structures. 
3375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 3385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private static enum State { 3395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com TAGNAME, 3405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com SLASH, 3415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com BANG, 3425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com BANG_DASH, 3435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com COMMENT, 3445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com COMMENT_DASH, 3455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com COMMENT_DASH_DASH, 3465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com DIRECTIVE, 3475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com DONE, 348d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel BOGUS_COMMENT, 3495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com SERVER_CODE, 3505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com SERVER_CODE_PCT, 3515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // From HTML 5 section 8.1.2.6 3535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // The text in CDATA and RCDATA elements must not contain any 3555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // occurrences of the string "</" followed by characters that 3565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // case-insensitively match the tag name of the element followed 3575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), 3585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE, 
3595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless 3605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // that string is part of an escaping text span. 3615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // An escaping text span is a span of text (in CDATA and RCDATA 3635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // elements) and character entity references (in RCDATA elements) 3645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // that starts with an escaping text span start that is not itself 3655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // in an escaping text span, and ends at the next escaping text 3665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // span end. 3675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // An escaping text span start is a part of text that consists of 3695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // the four character sequence "<!--". 3705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // An escaping text span end is a part of text that consists of 3725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // the three character sequence "-->". 3735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // An escaping text span start may share its U+002D HYPHEN-MINUS characters 3755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // with its corresponding escaping text span end. 3765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com UNESCAPED_LT_BANG, // <! 
3775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com UNESCAPED_LT_BANG_DASH, // <!- 3785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ESCAPING_TEXT_SPAN, // Inside an escaping text span 3795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ESCAPING_TEXT_SPAN_DASH, // Seen - inside an escaping text span 3805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ESCAPING_TEXT_SPAN_DASH_DASH, // Seen -- inside an escaping text span 3815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ; 3825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 3835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 3845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private HtmlToken lastNonIgnorable = null; 3855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com /** 3865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * Breaks the character stream into tokens. 3875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This method returns a stream of tokens such that each token starts where 3885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * the last token ended. 3895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 3905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * <p>This property is useful as it allows fetch to collapse and reclassify 3915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * ranges of tokens based on state that is easy to maintain there. 3925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 3935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * <p>Later passes are responsible for throwing away useless tokens. 
3945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 3955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private HtmlToken parseToken() { 3965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int start = offset; 3975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int limit = input.length(); 3985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (start == limit) { return null; } 3995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 4005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int end = start + 1; 4015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlTokenType type; 4025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 4035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com char ch = input.charAt(start); 4045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (inTag) { 4055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 4065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TAGEND; 4075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com inTag = false; 4085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('/' == ch) { 4095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (end != limit && '>' == input.charAt(end)) { 4105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TAGEND; 4115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com inTag = false; 4125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 4145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 4155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('=' == ch) { 
4175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 4185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('"' == ch || '\'' == ch) { 4195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.QSTRING; 4205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com int delim = ch; 4215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com for (; end < limit; ++end) { 4225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (input.charAt(end) == delim) { 4235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if (!Character.isWhitespace(ch)) { 4285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 4295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com for (; end < limit; ++end) { 4305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ch = input.charAt(end); 4315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // End a text chunk before /> 4325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ((lastNonIgnorable == null 4335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || !lastNonIgnorable.tokenInContextMatches(input, "=")) 4345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && '/' == ch && end + 1 < limit 4355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && '>' == input.charAt(end + 1)) { 4365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('>' == ch || '=' == ch 4385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || 
Character.isWhitespace(ch)) { 4395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('"' == ch || '\'' == ch) { 4415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (end + 1 < limit) { 4425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com char ch2 = input.charAt(end + 1); 4435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (ch2 >= 0 && Character.isWhitespace(ch2) 4445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || ch2 == '>' || ch2 == '/') { 4455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 4525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // We skip whitespace tokens inside tag bodies. 
4535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.IGNORABLE; 4545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (end < limit && Character.isWhitespace(input.charAt(end))) { 4555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 4595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (ch == '<') { 4605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (end == limit) { 4615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 4625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 4635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ch = input.charAt(end); 4645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = null; 4655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com State state = null; 4665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (ch) { 4675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case '/': // close tag? 
4685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SLASH; 4695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case '!': // Comment or declaration 4725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (!this.inEscapeExemptBlock) { 4735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.BANG; 4745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if (HtmlTextEscapingMode.allowsEscapingTextSpan( 4755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com escapeExemptTagName)) { 4765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Directives, and cdata suppressed in escape 4775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // exempt mode as they could obscure the close of the 4785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // escape exempty block, but comments are similar to escaping 4795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // text spans, and are significant in all CDATA and RCDATA 4805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // blocks except those inside <xmp> tags. 4815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // See "Escaping text spans" in section 8.1.2.6 of HTML5. 
4825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions 4835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.UNESCAPED_LT_BANG; 4845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case '?': 4885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (!this.inEscapeExemptBlock) { 489d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel state = State.BOGUS_COMMENT; 4905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 4915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case '%': 4945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SERVER_CODE; 4955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 4965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 4975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 4985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (isIdentStart(ch) && !this.inEscapeExemptBlock) { 4995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.TAGNAME; 5005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 5015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('<' == ch) { 5025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 5035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 5055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 
5065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (null != state) { 5095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com charloop: 5105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (end < limit) { 5115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ch = input.charAt(end); 5125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (state) { 5135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGNAME: 5145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (Character.isWhitespace(ch) 5155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com || '>' == ch || '/' == ch || '<' == ch) { 5165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // End processing of an escape exempt block when we see 5175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // a corresponding end tag. 
5185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (this.inEscapeExemptBlock 5195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && '/' == input.charAt(start + 1) 5205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT 5215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com && canonicalName(start + 2, end) 5225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com .equals(escapeExemptTagName)) { 5235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.inEscapeExemptBlock = false; 5245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.escapeExemptTagName = null; 5255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.textEscapingMode = null; 5265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TAGBEGIN; 5285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // Don't process content as attributes if we're inside 5295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // an escape exempt block. 
5305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com inTag = !this.inEscapeExemptBlock; 5315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 5325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break charloop; 5335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SLASH: 5365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (Character.isLetter(ch)) { 5375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.TAGNAME; 5385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('<' == ch) { 5405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 5415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 5435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break charloop; 5455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case BANG: 5485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 5495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.BANG_DASH; 5505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DIRECTIVE; 5525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case 
BANG_DASH: 5555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 5565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.COMMENT; 5575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DIRECTIVE; 5595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT: 5625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 5635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.COMMENT_DASH; 5645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT_DASH: 5675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = ('-' == ch) 5685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ? 
State.COMMENT_DASH_DASH 5695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com : State.COMMENT_DASH; 5705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT_DASH_DASH: 5725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 5735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 5745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.COMMENT; 5755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('-' == ch) { 5765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.COMMENT_DASH_DASH; 5775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 5785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.COMMENT_DASH; 5795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case DIRECTIVE: 5825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 5835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.DIRECTIVE; 5845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 5855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 587d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel case BOGUS_COMMENT: 5885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 589d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel type = HtmlTokenType.QMARKMETA; 5905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 5915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 
5935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SERVER_CODE: 5945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('%' == ch) { 5955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SERVER_CODE_PCT; 5965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 5975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 5985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SERVER_CODE_PCT: 5995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 6005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.SERVERCODE; 6015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 6025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('%' != ch) { 6035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.SERVER_CODE; 6045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case UNESCAPED_LT_BANG: 6075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 6085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.UNESCAPED_LT_BANG_DASH; 6095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 6105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 6115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 6125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case UNESCAPED_LT_BANG_DASH: 6155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 
6165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // According to HTML 5 section 8.1.2.6 6175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 6185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // An escaping text span start may share its 6195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // U+002D HYPHEN-MINUS characters with its 6205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com // corresponding escaping text span end. 6215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 6225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 6235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 6245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 6255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case ESCAPING_TEXT_SPAN: 6285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 6295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.ESCAPING_TEXT_SPAN_DASH; 6305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case ESCAPING_TEXT_SPAN_DASH: 6335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('-' == ch) { 6345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 6355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 6365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.ESCAPING_TEXT_SPAN; 6375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 
6385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case ESCAPING_TEXT_SPAN_DASH_DASH: 6405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if ('>' == ch) { 6415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 6425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.DONE; 6435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else if ('-' != ch) { 6445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com state = State.ESCAPING_TEXT_SPAN; 6455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case DONE: 6485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com throw new AssertionError( 6495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com "Unexpectedly DONE while lexing HTML token stream"); 6505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com ++end; 6525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (State.DONE == state) { break; } 6535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (end == limit) { 6555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com switch (state) { 6565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case DONE: 6575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 658d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel case BOGUS_COMMENT: 659d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel type = HtmlTokenType.QMARKMETA; 660d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel break; 6615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT: 
6625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT_DASH: 6635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case COMMENT_DASH_DASH: 6645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.COMMENT; 6655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case DIRECTIVE: 6675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SERVER_CODE: 6685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case SERVER_CODE_PCT: 6695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.SERVERCODE; 6705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com case TAGNAME: 6725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TAGBEGIN; 6735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com default: 6755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 6765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com break; 6775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } else { 6825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = null; 6835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (null == type) { 6865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com while (end < limit && 
'<' != input.charAt(end)) { ++end; } 6875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com type = HtmlTokenType.TEXT; 6885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 6905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com offset = end; 6915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken result = HtmlToken.instance(start, end, type); 6925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; } 6935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return result; 6945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 6965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private String canonicalName(int start, int end) { 6975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return HtmlLexer.canonicalName(input.substring(start, end)); 6985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 6995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 700be666032a113a8af92bc557add8e83579cf0ef5cmikesamuel private static boolean isIdentStart(char ch) { 7015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); 7025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 7035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 7045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) { 7055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return HtmlToken.instance(token.start, token.end, type); 7065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 7075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com} 
7085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 7095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 7105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/** 7115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * A TokenStream that lazily fetches one token at a time. 7125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * 7136d8c2e9241d042a3e0bff40dac4c388966ad060cmikesamuel * @author Mike Samuel <mikesamuel@gmail.com> 7145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */ 7155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comabstract class AbstractTokenStream implements TokenStream { 7165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com private HtmlToken tok; 7175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 7184e867904c8295537803c1c8a076e130df5674b58mikesamuel public final boolean hasNext() { 7195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (tok == null) { tok = produce(); } 7205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return tok != null; 7215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 7225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 7234e867904c8295537803c1c8a076e130df5674b58mikesamuel public HtmlToken next() { 7245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (this.tok == null) { this.tok = produce(); } 7255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com HtmlToken t = this.tok; 7265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com if (t == null) { throw new NoSuchElementException(); } 7275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com this.tok = null; 7285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com return t; 7295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com } 7305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com 
7314e867904c8295537803c1c8a076e130df5674b58mikesamuel protected abstract HtmlToken produce(); 7325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com} 733