// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.compackage org.owasp.html;
305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.ImmutableSet;
325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport com.google.common.collect.Lists;
335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.LinkedList;
345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.NoSuchElementException;
355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comimport java.util.Set;
365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
371bfae835221847e7791625e2baa98a60eb3cfa8amikesamuelimport javax.annotation.concurrent.NotThreadSafe;
381bfae835221847e7791625e2baa98a60eb3cfa8amikesamuel
395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/**
404e867904c8295537803c1c8a076e130df5674b58mikesamuel * A flexible lexer for HTML.
415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * This is hairy code, but it is outside the TCB for the HTML sanitizer.
425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com *
434e867904c8295537803c1c8a076e130df5674b58mikesamuel * @author Mike Samuel <mikesamuel@gmail.com>
445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */
451bfae835221847e7791625e2baa98a60eb3cfa8amikesamuel@NotThreadSafe
464e867904c8295537803c1c8a076e130df5674b58mikesamuelfinal class HtmlLexer extends AbstractTokenStream {
475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private final String input;
485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private final HtmlInputSplitter splitter;
495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private State state = State.OUTSIDE_TAG;
505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  public HtmlLexer(String input) {
525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    this.input = input;
535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    this.splitter = new HtmlInputSplitter(input);
545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
574e867904c8295537803c1c8a076e130df5674b58mikesamuel   * Normalize case of names that are not name-spaced.  This lower-cases HTML
584e867904c8295537803c1c8a076e130df5674b58mikesamuel   * element and attribute names, but not ones for embedded SVG or MATHML.
595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  static String canonicalName(String elementOrAttribName) {
615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return elementOrAttribName.indexOf(':') >= 0
625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
666d8c2e9241d042a3e0bff40dac4c388966ad060cmikesamuel   * An FSM that lets us reclassify text tokens inside tags as attribute
675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * names/values
685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private static enum State {
705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    OUTSIDE_TAG,
715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    IN_TAG,
725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    SAW_NAME,
735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    SAW_EQ,
745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    ;
755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Makes sure that this.token contains a token if one is available.
795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This may require fetching and combining multiple tokens from the underlying
805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * splitter.
815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @Override
834e867904c8295537803c1c8a076e130df5674b58mikesamuel  protected HtmlToken produce() {
845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlToken token = readToken();
855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (token == null) { return null; }
865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    switch (token.type) {
885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      // Keep track of whether we're inside a tag or not.
905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      case TAGBEGIN:
915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        state = State.IN_TAG;
925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        break;
935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      case TAGEND:
945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // Distinguish <input type=checkbox checked=> from
965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // <input type=checkbox checked>
975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          pushbackToken(token);
985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          state = State.IN_TAG;
995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          return HtmlToken.instance(
1005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              token.start, token.start, HtmlTokenType.ATTRVALUE);
1015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
1025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        state = State.OUTSIDE_TAG;
1045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        break;
1055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      // Drop ignorable tokens by zeroing out the one received and recursing
1075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      case IGNORABLE:
1085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        return produce();
1095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      // collapse adjacent text nodes if we're outside a tag, or otherwise,
1115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      // Recognize attribute names and values.
1125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      default:
1135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        switch (state) {
1145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          case OUTSIDE_TAG:
1155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (HtmlTokenType.TEXT == token.type
1165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                || HtmlTokenType.UNESCAPED == token.type) {
1175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              token = collapseSubsequent(token);
1185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
1205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          case IN_TAG:
1215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (HtmlTokenType.TEXT == token.type
1225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                && !token.tokenInContextMatches(input, "=")) {
1235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              // Reclassify as attribute name
1245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              token = HtmlInputSplitter.reclassify(
1255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  token, HtmlTokenType.ATTRNAME);
1265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              state = State.SAW_NAME;
1275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
1295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          case SAW_NAME:
1305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (HtmlTokenType.TEXT == token.type) {
1315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (token.tokenInContextMatches(input, "=")) {
1325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                state = State.SAW_EQ;
1335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // Skip the '=' token
1345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                return produce();
1355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              } else {
1365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // Reclassify as attribute name
1375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                token = HtmlInputSplitter.reclassify(
1385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    token, HtmlTokenType.ATTRNAME);
1395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
1405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            } else {
1415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              state = State.IN_TAG;
1425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
1445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          case SAW_EQ:
1455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (HtmlTokenType.TEXT == token.type
1465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                || HtmlTokenType.QSTRING == token.type) {
1475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (HtmlTokenType.TEXT == token.type) {
1485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // Collapse adjacent text nodes to properly handle
1495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                //   <a onclick=this.clicked=true>
1505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                //   <a title=foo bar>
1515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                token = collapseAttributeName(token);
1525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
1535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              // Reclassify as value
1545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              token = HtmlInputSplitter.reclassify(
1555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  token, HtmlTokenType.ATTRVALUE);
1565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              state = State.IN_TAG;
1575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
1585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
1595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
1605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        break;
1615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
1625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return token;
1645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
1655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
1675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Collapses all the following tokens of the same type into this.token.
1685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
1694e867904c8295537803c1c8a076e130df5674b58mikesamuel  private HtmlToken collapseSubsequent(HtmlToken token) {
1705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlToken collapsed = token;
1715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    for (HtmlToken next;
1725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com         (next= peekToken(0)) != null && next.type == token.type;
1735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com         readToken()) {
1745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      collapsed = join(collapsed, next);
1755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
1765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return collapsed;
1775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
1785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
1794e867904c8295537803c1c8a076e130df5674b58mikesamuel  private HtmlToken collapseAttributeName(HtmlToken token) {
1805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // We want to collapse tokens into the value that are not parts of an
1815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // attribute value.  We should include any space or text adjacent to the
1825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // value, but should stop at any of the following constructions:
1835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    //   space end-of-file              e.g. name=foo_
1845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    //   space valueless-attrib-name    e.g. name=foo checked
1855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    //   space tag-end                  e.g. name=foo />
1865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    //   space text space? '='          e.g. name=foo bar=
1875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int nToMerge = 0;
1885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
1895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if (t.type == HtmlTokenType.IGNORABLE) {
1905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        HtmlToken tok = peekToken(nToMerge + 1);
1915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (tok == null) { break; }
1925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (tok.type != HtmlTokenType.TEXT) { break; }
1935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
1945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
1955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
1965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        HtmlToken eq = peekToken(nToMerge + 2);
1975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
1985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          eq = peekToken(nToMerge + 3);
1995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
2005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (eq == null || eq.tokenInContextMatches(input, "=")) {
2015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
2025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
2035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else if (t.type != HtmlTokenType.TEXT) {
2045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        break;
2055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
2065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      ++nToMerge;
2075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
2085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (nToMerge == 0) { return token; }
2095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int end = token.end;
2115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    do {
2125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      end = readToken().end;
2135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } while (--nToMerge > 0);
2145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
2165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private static HtmlToken join(HtmlToken a, HtmlToken b) {
2195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return HtmlToken.instance(a.start, b.end, a.type);
2205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
2234e867904c8295537803c1c8a076e130df5674b58mikesamuel  private HtmlToken readToken() {
2245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (!lookahead.isEmpty()) {
2255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      return lookahead.remove();
2265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } else if (splitter.hasNext()) {
2275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      return splitter.next();
2285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } else {
2295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      return null;
2305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
2315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2334e867904c8295537803c1c8a076e130df5674b58mikesamuel  private HtmlToken peekToken(int i) {
2345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    while (lookahead.size() <= i && splitter.hasNext()) {
2355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      lookahead.add(splitter.next());
2365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
2375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return lookahead.size() > i ? lookahead.get(i) : null;
2385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private void pushbackToken(HtmlToken token) {
2415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    lookahead.addFirst(token);
2425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /** Can the attribute appear in HTML without a value. */
2455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private static boolean isValuelessAttribute(String attribName) {
2465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
2475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        Strings.toLowerCase(attribName));
2485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return valueless;
2495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  // From http://issues.apache.org/jira/browse/XALANC-519
2525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
2535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      "checked", "compact", "declare", "defer", "disabled",
2545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      "ismap", "multiple", "nohref", "noresize", "noshade",
2555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      "nowrap", "readonly", "selected");
2565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com}
2575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
/**
 * A token stream that breaks a character stream into <tt>
 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA}</tt>
 * tokens.  The matching of attribute names and values is done in a later step.
 */
2635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comfinal class HtmlInputSplitter extends AbstractTokenStream {
2645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /** The source of HTML character data. */
2655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private final String input;
2665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /** An offset into input. */
2675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private int offset;
2685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /** True iff the current character is inside a tag. */
2695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private boolean inTag;
2705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
2715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * True if inside a script, xmp, listing, or similar tag whose content does
2725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * not follow the normal escaping rules.
2735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
2745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private boolean inEscapeExemptBlock;
2755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
2775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Null or the name of the close tag required to end the current escape exempt
2785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * block.
2795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
2805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * contain unescaped HTML input.
2815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
2825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private String escapeExemptTagName = null;
2835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private HtmlTextEscapingMode textEscapingMode;
2855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  public HtmlInputSplitter(String input) {
2875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    this.input = input;
2885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
2895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
2915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Make sure that there is a token ready to yield in this.token.
2925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
2935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  @Override
2945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  protected HtmlToken produce() {
2955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlToken token = parseToken();
2965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (null == token) { return null; }
2975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
2985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // Handle escape-exempt blocks.
2995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // The parse() method is only dimly aware of escape-excempt blocks, so
3005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // here we detect the beginning and ends of escape exempt blocks, and
3015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // reclassify as UNESCAPED, any tokens that appear in the middle.
3025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (inEscapeExemptBlock) {
3035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if (token.type != HtmlTokenType.SERVERCODE) {
3045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        // classify RCDATA as text since it can contain entities
3055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        token = reclassify(
3065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
3075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    ? HtmlTokenType.TEXT
3085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    : HtmlTokenType.UNESCAPED));
3095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
3105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } else {
3115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      switch (token.type) {
3125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TAGBEGIN:
3135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          {
3145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            String canonTagName = canonicalName(
3155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                token.start + 1, token.end);
3165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
3175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    canonTagName)) {
3185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              this.escapeExemptTagName = canonTagName;
3195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
3205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  canonTagName);
3215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
3225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
3235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
3245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        case TAGEND:
3255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          this.inEscapeExemptBlock = null != this.escapeExemptTagName;
3265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
3275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        default:
3285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          break;
3295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
3305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
3315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return token;
3325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
3335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
3355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * States for a state machine for optimistically identifying tags and other
3365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * html/xml/phpish structures.
3375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
3385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private static enum State {
3395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    TAGNAME,
3405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    SLASH,
3415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    BANG,
3425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    BANG_DASH,
3435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    COMMENT,
3445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    COMMENT_DASH,
3455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    COMMENT_DASH_DASH,
3465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    DIRECTIVE,
3475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    DONE,
348d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel    BOGUS_COMMENT,
3495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    SERVER_CODE,
3505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    SERVER_CODE_PCT,
3515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // From HTML 5 section 8.1.2.6
3535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // The text in CDATA and RCDATA elements must not contain any
3555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // occurrences of the string "</" followed by characters that
3565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // case-insensitively match the tag name of the element followed
3575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
3585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
3595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
3605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // that string is part of an escaping text span.
3615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // An escaping text span is a span of text (in CDATA and RCDATA
3635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // elements) and character entity references (in RCDATA elements)
3645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // that starts with an escaping text span start that is not itself
3655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // in an escaping text span, and ends at the next escaping text
3665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // span end.
3675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // An escaping text span start is a part of text that consists of
3695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // the four character sequence "<!--".
3705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // An escaping text span end is a part of text that consists of
3725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // the three character sequence "-->".
3735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
3745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // An escaping text span start may share its U+002D HYPHEN-MINUS characters
3755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    // with its corresponding escaping text span end.
3765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    UNESCAPED_LT_BANG,             // <!
3775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    UNESCAPED_LT_BANG_DASH,        // <!-
3785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    ESCAPING_TEXT_SPAN,            // Inside an escaping text span
3795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
3805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
3815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    ;
3825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
3835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
  /**
   * Consulted by {@code parseToken()} while it scans an unquoted text chunk
   * inside a tag: when this is null or does not match {@code "="}, a
   * {@code />} sequence ends the chunk.
   * NOTE(review): presumably the most recent token that was not of type
   * IGNORABLE; it is assigned outside the part of the file reviewed here,
   * so confirm against the code that sets it.
   */
  private HtmlToken lastNonIgnorable = null;
3855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  /**
3865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * Breaks the character stream into tokens.
3875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * This method returns a stream of tokens such that each token starts where
3885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * the last token ended.
3895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
3905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>This property is useful as it allows fetch to collapse and reclassify
3915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * ranges of tokens based on state that is easy to maintain there.
3925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   *
3935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   * <p>Later passes are responsible for throwing away useless tokens.
3945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com   */
3955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private HtmlToken parseToken() {
3965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int start = offset;
3975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int limit = input.length();
3985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (start == limit) { return null; }
3995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
4005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    int end = start + 1;
4015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlTokenType type;
4025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
4035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    char ch = input.charAt(start);
4045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (inTag) {
4055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if ('>' == ch) {
4065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = HtmlTokenType.TAGEND;
4075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        inTag = false;
4085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else if ('/' == ch) {
4095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (end != limit && '>' == input.charAt(end)) {
4105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          type = HtmlTokenType.TAGEND;
4115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          inTag = false;
4125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          ++end;
4135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        } else {
4145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          type = HtmlTokenType.TEXT;
4155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
4165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else if ('=' == ch) {
4175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = HtmlTokenType.TEXT;
4185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else if ('"' == ch || '\'' == ch) {
4195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = HtmlTokenType.QSTRING;
4205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        int delim = ch;
4215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        for (; end < limit; ++end) {
4225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          if (input.charAt(end) == delim) {
4235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            ++end;
4245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
4255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
4265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
4275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else if (!Character.isWhitespace(ch)) {
4285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = HtmlTokenType.TEXT;
4295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        for (; end < limit; ++end) {
4305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          ch = input.charAt(end);
4315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          // End a text chunk before />
4325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          if ((lastNonIgnorable == null
4335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com               || !lastNonIgnorable.tokenInContextMatches(input, "="))
4345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              && '/' == ch && end + 1 < limit
4355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              && '>' == input.charAt(end + 1)) {
4365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
4375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          } else if ('>' == ch || '=' == ch
4385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                     || Character.isWhitespace(ch)) {
4395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            break;
4405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          } else if ('"' == ch || '\'' == ch) {
4415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (end + 1 < limit) {
4425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              char ch2 = input.charAt(end + 1);
4435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (ch2 >= 0 && Character.isWhitespace(ch2)
4445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  || ch2 == '>' || ch2 == '/') {
4455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                ++end;
4465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                break;
4475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
4485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
4495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
4505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
4515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else {
4525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        // We skip whitespace tokens inside tag bodies.
4535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = HtmlTokenType.IGNORABLE;
4545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        while (end < limit && Character.isWhitespace(input.charAt(end))) {
4555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          ++end;
4565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
4575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
4585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    } else {
4595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      if (ch == '<') {
4605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        if (end == limit) {
4615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          type = HtmlTokenType.TEXT;
4625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        } else {
4635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          ch = input.charAt(end);
4645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          type = null;
4655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          State state = null;
4665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          switch (ch) {
4675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            case '/':  // close tag?
4685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              state = State.SLASH;
4695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ++end;
4705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              break;
4715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            case '!':  // Comment or declaration
4725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (!this.inEscapeExemptBlock) {
4735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                state = State.BANG;
4745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
4755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                             escapeExemptTagName)) {
                // Directives and CDATA sections are suppressed in escape
                // exempt mode as they could obscure the close of the
                // escape exempt block, but comments are similar to escaping
4795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // text spans, and are significant in all CDATA and RCDATA
4805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // blocks except those inside <xmp> tags.
4815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // See "Escaping text spans" in section 8.1.2.6 of HTML5.
4825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
4835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                state = State.UNESCAPED_LT_BANG;
4845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
4855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ++end;
4865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              break;
4875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            case '?':
4885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (!this.inEscapeExemptBlock) {
489d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                state = State.BOGUS_COMMENT;
4905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
4915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ++end;
4925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              break;
4935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            case '%':
4945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              state = State.SERVER_CODE;
4955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ++end;
4965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              break;
4975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            default:
4985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
4995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                state = State.TAGNAME;
5005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                ++end;
5015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              } else if ('<' == ch) {
5025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                type = HtmlTokenType.TEXT;
5035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              } else {
5045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                ++end;
5055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
5065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              break;
5075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
5085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          if (null != state) {
5095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            charloop:
5105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            while (end < limit) {
5115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ch = input.charAt(end);
5125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              switch (state) {
5135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case TAGNAME:
5145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if (Character.isWhitespace(ch)
5155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      || '>' == ch || '/' == ch || '<' == ch) {
5165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // End processing of an escape exempt block when we see
5175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // a corresponding end tag.
5185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    if (this.inEscapeExemptBlock
5195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                        && '/' == input.charAt(start + 1)
5205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                        && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
5215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                        && canonicalName(start + 2, end)
5225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                            .equals(escapeExemptTagName)) {
5235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      this.inEscapeExemptBlock = false;
5245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      this.escapeExemptTagName = null;
5255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      this.textEscapingMode = null;
5265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    }
5275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.TAGBEGIN;
5285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // Don't process content as attributes if we're inside
5295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // an escape exempt block.
5305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    inTag = !this.inEscapeExemptBlock;
5315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
5325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    break charloop;
5335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case SLASH:
5365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if (Character.isLetter(ch)) {
5375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.TAGNAME;
5385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
5395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    if ('<' == ch) {
5405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      type = HtmlTokenType.TEXT;
5415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    } else {
5425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      ++end;
5435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    }
5445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    break charloop;
5455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case BANG:
5485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
5495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.BANG_DASH;
5505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
5515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DIRECTIVE;
5525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case BANG_DASH:
5555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
5565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.COMMENT;
5575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
5585c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DIRECTIVE;
5595c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5605c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT:
5625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
5635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.COMMENT_DASH;
5645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT_DASH:
5675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  state = ('-' == ch)
5685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      ? State.COMMENT_DASH_DASH
5695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      : State.COMMENT_DASH;
5705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT_DASH_DASH:
5725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('>' == ch) {
5735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
5745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.COMMENT;
5755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else if ('-' == ch) {
5765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.COMMENT_DASH_DASH;
5775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
5785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.COMMENT_DASH;
5795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case DIRECTIVE:
5825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('>' == ch) {
5835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.DIRECTIVE;
5845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
5855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
587d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                case BOGUS_COMMENT:
5885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('>' == ch) {
589d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                    type = HtmlTokenType.QMARKMETA;
5905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
5915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case SERVER_CODE:
5945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('%' == ch) {
5955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.SERVER_CODE_PCT;
5965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
5975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
5985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case SERVER_CODE_PCT:
5995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('>' == ch) {
6005c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.SERVERCODE;
6015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
6025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else if ('%' != ch) {
6035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.SERVER_CODE;
6045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case UNESCAPED_LT_BANG:
6075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
6085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.UNESCAPED_LT_BANG_DASH;
6095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
6105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.TEXT;
6115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
6125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6135c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case UNESCAPED_LT_BANG_DASH:
6155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
6165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // According to HTML 5 section 8.1.2.6
6175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
6185c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // An escaping text span start may share its
6195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // U+002D HYPHEN-MINUS characters with its
6205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    // corresponding escaping text span end.
6215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
6225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
6235c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.TEXT;
6245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
6255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ESCAPING_TEXT_SPAN:
6285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
6295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.ESCAPING_TEXT_SPAN_DASH;
6305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6315c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ESCAPING_TEXT_SPAN_DASH:
6335c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('-' == ch) {
6345c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
6355c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else {
6365c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.ESCAPING_TEXT_SPAN;
6375c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6385c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6395c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case ESCAPING_TEXT_SPAN_DASH_DASH:
6405c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  if ('>' == ch) {
6415c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    type = HtmlTokenType.TEXT;
6425c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.DONE;
6435c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  } else if ('-' != ch) {
6445c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                    state = State.ESCAPING_TEXT_SPAN;
6455c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  }
6465c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6475c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case DONE:
6485c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  throw new AssertionError(
6495c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                      "Unexpectedly DONE while lexing HTML token stream");
6505c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
6515c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              ++end;
6525c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              if (State.DONE == state) { break; }
6535c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
6545c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            if (end == limit) {
6555c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              switch (state) {
6565c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case DONE:
6575c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
658d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                case BOGUS_COMMENT:
659d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                  type = HtmlTokenType.QMARKMETA;
660d78e82dfc7da9c1e4ad8e4199bc375089a799c85mikesamuel                  break;
6615c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT:
6625c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT_DASH:
6635c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case COMMENT_DASH_DASH:
6645c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  type = HtmlTokenType.COMMENT;
6655c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6665c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case DIRECTIVE:
6675c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case SERVER_CODE:
6685c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case SERVER_CODE_PCT:
6695c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  type = HtmlTokenType.SERVERCODE;
6705c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6715c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                case TAGNAME:
6725c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  type = HtmlTokenType.TAGBEGIN;
6735c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6745c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                default:
6755c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  type = HtmlTokenType.TEXT;
6765c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com                  break;
6775c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com              }
6785c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com            }
6795c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com          }
6805c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        }
6815c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      } else {
6825c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com        type = null;
6835c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      }
6845c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
6855c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (null == type) {
6865c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      while (end < limit && '<' != input.charAt(end)) { ++end; }
6875c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com      type = HtmlTokenType.TEXT;
6885c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    }
6895c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
6905c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    offset = end;
6915c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlToken result = HtmlToken.instance(start, end, type);
6925c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
6935c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return result;
6945c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
6955c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
6965c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private String canonicalName(int start, int end) {
6975c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return HtmlLexer.canonicalName(input.substring(start, end));
6985c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
6995c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
700be666032a113a8af92bc557add8e83579cf0ef5cmikesamuel  private static boolean isIdentStart(char ch) {
7015c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
7025c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
7035c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7045c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
7055c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return HtmlToken.instance(token.start, token.end, type);
7065c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
7075c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com}
7085c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7095c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7105c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com/**
7115c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com * A TokenStream that lazily fetches one token at a time.
7125c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com *
7136d8c2e9241d042a3e0bff40dac4c388966ad060cmikesamuel * @author Mike Samuel <mikesamuel@gmail.com>
7145c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com */
7155c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.comabstract class AbstractTokenStream implements TokenStream {
7165c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  private HtmlToken tok;
7175c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7184e867904c8295537803c1c8a076e130df5674b58mikesamuel  public final boolean hasNext() {
7195c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (tok == null) { tok = produce(); }
7205c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return tok != null;
7215c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
7225c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7234e867904c8295537803c1c8a076e130df5674b58mikesamuel  public HtmlToken next() {
7245c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (this.tok == null) { this.tok = produce(); }
7255c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    HtmlToken t = this.tok;
7265c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    if (t == null) { throw new NoSuchElementException(); }
7275c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    this.tok = null;
7285c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com    return t;
7295c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com  }
7305c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com
7314e867904c8295537803c1c8a076e130df5674b58mikesamuel  protected abstract HtmlToken produce();
7325c702c12be71d8070da9287cc4a044617dd726a7manico.james@gmail.com}
733