// Copyright (c) 2011, Mike Samuel
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
//
// Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// Neither the name of the OWASP nor the names of its contributors may
// be used to endorse or promote products derived from this software
// without specific prior written permission.
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
// FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
// COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
// BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
// LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
// ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.

package org.owasp.html;

import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import java.util.LinkedList;
import java.util.NoSuchElementException;
import java.util.Set;

import javax.annotation.concurrent.NotThreadSafe;

39/**
40 * A flexible lexer for HTML.
41 * This is hairy code, but it is outside the TCB for the HTML sanitizer.
42 *
43 * @author Mike Samuel <mikesamuel@gmail.com>
44 */
45@NotThreadSafe
46final class HtmlLexer extends AbstractTokenStream {
47  private final String input;
48  private final HtmlInputSplitter splitter;
49  private State state = State.OUTSIDE_TAG;
50
51  public HtmlLexer(String input) {
52    this.input = input;
53    this.splitter = new HtmlInputSplitter(input);
54  }
55
56  /**
57   * Normalize case of names that are not name-spaced.  This lower-cases HTML
58   * element and attribute names, but not ones for embedded SVG or MATHML.
59   */
60  static String canonicalName(String elementOrAttribName) {
61    return elementOrAttribName.indexOf(':') >= 0
62        ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
63  }
64
65  /**
66   * An FSM that lets us reclassify text tokens inside tags as attribute
67   * names/values
68   */
69  private static enum State {
70    OUTSIDE_TAG,
71    IN_TAG,
72    SAW_NAME,
73    SAW_EQ,
74    ;
75  }
76
77  /**
78   * Makes sure that this.token contains a token if one is available.
79   * This may require fetching and combining multiple tokens from the underlying
80   * splitter.
81   */
82  @Override
83  protected HtmlToken produce() {
84    HtmlToken token = readToken();
85    if (token == null) { return null; }
86
87    switch (token.type) {
88
89      // Keep track of whether we're inside a tag or not.
90      case TAGBEGIN:
91        state = State.IN_TAG;
92        break;
93      case TAGEND:
94        if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
95          // Distinguish <input type=checkbox checked=> from
96          // <input type=checkbox checked>
97          pushbackToken(token);
98          state = State.IN_TAG;
99          return HtmlToken.instance(
100              token.start, token.start, HtmlTokenType.ATTRVALUE);
101        }
102
103        state = State.OUTSIDE_TAG;
104        break;
105
106      // Drop ignorable tokens by zeroing out the one received and recursing
107      case IGNORABLE:
108        return produce();
109
110      // collapse adjacent text nodes if we're outside a tag, or otherwise,
111      // Recognize attribute names and values.
112      default:
113        switch (state) {
114          case OUTSIDE_TAG:
115            if (HtmlTokenType.TEXT == token.type
116                || HtmlTokenType.UNESCAPED == token.type) {
117              token = collapseSubsequent(token);
118            }
119            break;
120          case IN_TAG:
121            if (HtmlTokenType.TEXT == token.type
122                && !token.tokenInContextMatches(input, "=")) {
123              // Reclassify as attribute name
124              token = HtmlInputSplitter.reclassify(
125                  token, HtmlTokenType.ATTRNAME);
126              state = State.SAW_NAME;
127            }
128            break;
129          case SAW_NAME:
130            if (HtmlTokenType.TEXT == token.type) {
131              if (token.tokenInContextMatches(input, "=")) {
132                state = State.SAW_EQ;
133                // Skip the '=' token
134                return produce();
135              } else {
136                // Reclassify as attribute name
137                token = HtmlInputSplitter.reclassify(
138                    token, HtmlTokenType.ATTRNAME);
139              }
140            } else {
141              state = State.IN_TAG;
142            }
143            break;
144          case SAW_EQ:
145            if (HtmlTokenType.TEXT == token.type
146                || HtmlTokenType.QSTRING == token.type) {
147              if (HtmlTokenType.TEXT == token.type) {
148                // Collapse adjacent text nodes to properly handle
149                //   <a onclick=this.clicked=true>
150                //   <a title=foo bar>
151                token = collapseAttributeName(token);
152              }
153              // Reclassify as value
154              token = HtmlInputSplitter.reclassify(
155                  token, HtmlTokenType.ATTRVALUE);
156              state = State.IN_TAG;
157            }
158            break;
159        }
160        break;
161    }
162
163    return token;
164  }
165
166  /**
167   * Collapses all the following tokens of the same type into this.token.
168   */
169  private HtmlToken collapseSubsequent(HtmlToken token) {
170    HtmlToken collapsed = token;
171    for (HtmlToken next;
172         (next= peekToken(0)) != null && next.type == token.type;
173         readToken()) {
174      collapsed = join(collapsed, next);
175    }
176    return collapsed;
177  }
178
179  private HtmlToken collapseAttributeName(HtmlToken token) {
180    // We want to collapse tokens into the value that are not parts of an
181    // attribute value.  We should include any space or text adjacent to the
182    // value, but should stop at any of the following constructions:
183    //   space end-of-file              e.g. name=foo_
184    //   space valueless-attrib-name    e.g. name=foo checked
185    //   space tag-end                  e.g. name=foo />
186    //   space text space? '='          e.g. name=foo bar=
187    int nToMerge = 0;
188    for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
189      if (t.type == HtmlTokenType.IGNORABLE) {
190        HtmlToken tok = peekToken(nToMerge + 1);
191        if (tok == null) { break; }
192        if (tok.type != HtmlTokenType.TEXT) { break; }
193        if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
194          break;
195        }
196        HtmlToken eq = peekToken(nToMerge + 2);
197        if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
198          eq = peekToken(nToMerge + 3);
199        }
200        if (eq == null || eq.tokenInContextMatches(input, "=")) {
201          break;
202        }
203      } else if (t.type != HtmlTokenType.TEXT) {
204        break;
205      }
206      ++nToMerge;
207    }
208    if (nToMerge == 0) { return token; }
209
210    int end = token.end;
211    do {
212      end = readToken().end;
213    } while (--nToMerge > 0);
214
215    return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
216  }
217
218  private static HtmlToken join(HtmlToken a, HtmlToken b) {
219    return HtmlToken.instance(a.start, b.end, a.type);
220  }
221
222  private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
223  private HtmlToken readToken() {
224    if (!lookahead.isEmpty()) {
225      return lookahead.remove();
226    } else if (splitter.hasNext()) {
227      return splitter.next();
228    } else {
229      return null;
230    }
231  }
232
233  private HtmlToken peekToken(int i) {
234    while (lookahead.size() <= i && splitter.hasNext()) {
235      lookahead.add(splitter.next());
236    }
237    return lookahead.size() > i ? lookahead.get(i) : null;
238  }
239
240  private void pushbackToken(HtmlToken token) {
241    lookahead.addFirst(token);
242  }
243
244  /** Can the attribute appear in HTML without a value. */
245  private static boolean isValuelessAttribute(String attribName) {
246    boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
247        Strings.toLowerCase(attribName));
248    return valueless;
249  }
250
251  // From http://issues.apache.org/jira/browse/XALANC-519
252  private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
253      "checked", "compact", "declare", "defer", "disabled",
254      "ismap", "multiple", "nohref", "noresize", "noshade",
255      "nowrap", "readonly", "selected");
256}
257
258/**
259 * A token stream that breaks a character stream into <tt>
260 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
261 * tokens.  The matching of attribute names and values is done in a later step.
262 */
263final class HtmlInputSplitter extends AbstractTokenStream {
264  /** The source of HTML character data. */
265  private final String input;
266  /** An offset into input. */
267  private int offset;
268  /** True iff the current character is inside a tag. */
269  private boolean inTag;
270  /**
271   * True if inside a script, xmp, listing, or similar tag whose content does
272   * not follow the normal escaping rules.
273   */
274  private boolean inEscapeExemptBlock;
275
276  /**
277   * Null or the name of the close tag required to end the current escape exempt
278   * block.
279   * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
280   * contain unescaped HTML input.
281   */
282  private String escapeExemptTagName = null;
283
284  private HtmlTextEscapingMode textEscapingMode;
285
286  public HtmlInputSplitter(String input) {
287    this.input = input;
288  }
289
290  /**
291   * Make sure that there is a token ready to yield in this.token.
292   */
293  @Override
294  protected HtmlToken produce() {
295    HtmlToken token = parseToken();
296    if (null == token) { return null; }
297
298    // Handle escape-exempt blocks.
299    // The parse() method is only dimly aware of escape-excempt blocks, so
300    // here we detect the beginning and ends of escape exempt blocks, and
301    // reclassify as UNESCAPED, any tokens that appear in the middle.
302    if (inEscapeExemptBlock) {
303      if (token.type != HtmlTokenType.SERVERCODE) {
304        // classify RCDATA as text since it can contain entities
305        token = reclassify(
306            token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
307                    ? HtmlTokenType.TEXT
308                    : HtmlTokenType.UNESCAPED));
309      }
310    } else {
311      switch (token.type) {
312        case TAGBEGIN:
313          {
314            String canonTagName = canonicalName(
315                token.start + 1, token.end);
316            if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
317                    canonTagName)) {
318              this.escapeExemptTagName = canonTagName;
319              this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
320                  canonTagName);
321            }
322            break;
323          }
324        case TAGEND:
325          this.inEscapeExemptBlock = null != this.escapeExemptTagName;
326          break;
327        default:
328          break;
329      }
330    }
331    return token;
332  }
333
334  /**
335   * States for a state machine for optimistically identifying tags and other
336   * html/xml/phpish structures.
337   */
338  private static enum State {
339    TAGNAME,
340    SLASH,
341    BANG,
342    BANG_DASH,
343    COMMENT,
344    COMMENT_DASH,
345    COMMENT_DASH_DASH,
346    DIRECTIVE,
347    DONE,
348    BOGUS_COMMENT,
349    SERVER_CODE,
350    SERVER_CODE_PCT,
351
352    // From HTML 5 section 8.1.2.6
353
354    // The text in CDATA and RCDATA elements must not contain any
355    // occurrences of the string "</" followed by characters that
356    // case-insensitively match the tag name of the element followed
357    // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
358    // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
359    // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
360    // that string is part of an escaping text span.
361
362    // An escaping text span is a span of text (in CDATA and RCDATA
363    // elements) and character entity references (in RCDATA elements)
364    // that starts with an escaping text span start that is not itself
365    // in an escaping text span, and ends at the next escaping text
366    // span end.
367
368    // An escaping text span start is a part of text that consists of
369    // the four character sequence "<!--".
370
371    // An escaping text span end is a part of text that consists of
372    // the three character sequence "-->".
373
374    // An escaping text span start may share its U+002D HYPHEN-MINUS characters
375    // with its corresponding escaping text span end.
376    UNESCAPED_LT_BANG,             // <!
377    UNESCAPED_LT_BANG_DASH,        // <!-
378    ESCAPING_TEXT_SPAN,            // Inside an escaping text span
379    ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
380    ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
381    ;
382  }
383
384  private HtmlToken lastNonIgnorable = null;
385  /**
386   * Breaks the character stream into tokens.
387   * This method returns a stream of tokens such that each token starts where
388   * the last token ended.
389   *
390   * <p>This property is useful as it allows fetch to collapse and reclassify
391   * ranges of tokens based on state that is easy to maintain there.
392   *
393   * <p>Later passes are responsible for throwing away useless tokens.
394   */
395  private HtmlToken parseToken() {
396    int start = offset;
397    int limit = input.length();
398    if (start == limit) { return null; }
399
400    int end = start + 1;
401    HtmlTokenType type;
402
403    char ch = input.charAt(start);
404    if (inTag) {
405      if ('>' == ch) {
406        type = HtmlTokenType.TAGEND;
407        inTag = false;
408      } else if ('/' == ch) {
409        if (end != limit && '>' == input.charAt(end)) {
410          type = HtmlTokenType.TAGEND;
411          inTag = false;
412          ++end;
413        } else {
414          type = HtmlTokenType.TEXT;
415        }
416      } else if ('=' == ch) {
417        type = HtmlTokenType.TEXT;
418      } else if ('"' == ch || '\'' == ch) {
419        type = HtmlTokenType.QSTRING;
420        int delim = ch;
421        for (; end < limit; ++end) {
422          if (input.charAt(end) == delim) {
423            ++end;
424            break;
425          }
426        }
427      } else if (!Character.isWhitespace(ch)) {
428        type = HtmlTokenType.TEXT;
429        for (; end < limit; ++end) {
430          ch = input.charAt(end);
431          // End a text chunk before />
432          if ((lastNonIgnorable == null
433               || !lastNonIgnorable.tokenInContextMatches(input, "="))
434              && '/' == ch && end + 1 < limit
435              && '>' == input.charAt(end + 1)) {
436            break;
437          } else if ('>' == ch || '=' == ch
438                     || Character.isWhitespace(ch)) {
439            break;
440          } else if ('"' == ch || '\'' == ch) {
441            if (end + 1 < limit) {
442              char ch2 = input.charAt(end + 1);
443              if (ch2 >= 0 && Character.isWhitespace(ch2)
444                  || ch2 == '>' || ch2 == '/') {
445                ++end;
446                break;
447              }
448            }
449          }
450        }
451      } else {
452        // We skip whitespace tokens inside tag bodies.
453        type = HtmlTokenType.IGNORABLE;
454        while (end < limit && Character.isWhitespace(input.charAt(end))) {
455          ++end;
456        }
457      }
458    } else {
459      if (ch == '<') {
460        if (end == limit) {
461          type = HtmlTokenType.TEXT;
462        } else {
463          ch = input.charAt(end);
464          type = null;
465          State state = null;
466          switch (ch) {
467            case '/':  // close tag?
468              state = State.SLASH;
469              ++end;
470              break;
471            case '!':  // Comment or declaration
472              if (!this.inEscapeExemptBlock) {
473                state = State.BANG;
474              } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
475                             escapeExemptTagName)) {
476                // Directives, and cdata suppressed in escape
477                // exempt mode as they could obscure the close of the
478                // escape exempty block, but comments are similar to escaping
479                // text spans, and are significant in all CDATA and RCDATA
480                // blocks except those inside <xmp> tags.
481                // See "Escaping text spans" in section 8.1.2.6 of HTML5.
482                // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
483                state = State.UNESCAPED_LT_BANG;
484              }
485              ++end;
486              break;
487            case '?':
488              if (!this.inEscapeExemptBlock) {
489                state = State.BOGUS_COMMENT;
490              }
491              ++end;
492              break;
493            case '%':
494              state = State.SERVER_CODE;
495              ++end;
496              break;
497            default:
498              if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
499                state = State.TAGNAME;
500                ++end;
501              } else if ('<' == ch) {
502                type = HtmlTokenType.TEXT;
503              } else {
504                ++end;
505              }
506              break;
507          }
508          if (null != state) {
509            charloop:
510            while (end < limit) {
511              ch = input.charAt(end);
512              switch (state) {
513                case TAGNAME:
514                  if (Character.isWhitespace(ch)
515                      || '>' == ch || '/' == ch || '<' == ch) {
516                    // End processing of an escape exempt block when we see
517                    // a corresponding end tag.
518                    if (this.inEscapeExemptBlock
519                        && '/' == input.charAt(start + 1)
520                        && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
521                        && canonicalName(start + 2, end)
522                            .equals(escapeExemptTagName)) {
523                      this.inEscapeExemptBlock = false;
524                      this.escapeExemptTagName = null;
525                      this.textEscapingMode = null;
526                    }
527                    type = HtmlTokenType.TAGBEGIN;
528                    // Don't process content as attributes if we're inside
529                    // an escape exempt block.
530                    inTag = !this.inEscapeExemptBlock;
531                    state = State.DONE;
532                    break charloop;
533                  }
534                  break;
535                case SLASH:
536                  if (Character.isLetter(ch)) {
537                    state = State.TAGNAME;
538                  } else {
539                    if ('<' == ch) {
540                      type = HtmlTokenType.TEXT;
541                    } else {
542                      ++end;
543                    }
544                    break charloop;
545                  }
546                  break;
547                case BANG:
548                  if ('-' == ch) {
549                    state = State.BANG_DASH;
550                  } else {
551                    state = State.DIRECTIVE;
552                  }
553                  break;
554                case BANG_DASH:
555                  if ('-' == ch) {
556                    state = State.COMMENT;
557                  } else {
558                    state = State.DIRECTIVE;
559                  }
560                  break;
561                case COMMENT:
562                  if ('-' == ch) {
563                    state = State.COMMENT_DASH;
564                  }
565                  break;
566                case COMMENT_DASH:
567                  state = ('-' == ch)
568                      ? State.COMMENT_DASH_DASH
569                      : State.COMMENT_DASH;
570                  break;
571                case COMMENT_DASH_DASH:
572                  if ('>' == ch) {
573                    state = State.DONE;
574                    type = HtmlTokenType.COMMENT;
575                  } else if ('-' == ch) {
576                    state = State.COMMENT_DASH_DASH;
577                  } else {
578                    state = State.COMMENT_DASH;
579                  }
580                  break;
581                case DIRECTIVE:
582                  if ('>' == ch) {
583                    type = HtmlTokenType.DIRECTIVE;
584                    state = State.DONE;
585                  }
586                  break;
587                case BOGUS_COMMENT:
588                  if ('>' == ch) {
589                    type = HtmlTokenType.QMARKMETA;
590                    state = State.DONE;
591                  }
592                  break;
593                case SERVER_CODE:
594                  if ('%' == ch) {
595                    state = State.SERVER_CODE_PCT;
596                  }
597                  break;
598                case SERVER_CODE_PCT:
599                  if ('>' == ch) {
600                    type = HtmlTokenType.SERVERCODE;
601                    state = State.DONE;
602                  } else if ('%' != ch) {
603                    state = State.SERVER_CODE;
604                  }
605                  break;
606                case UNESCAPED_LT_BANG:
607                  if ('-' == ch) {
608                    state = State.UNESCAPED_LT_BANG_DASH;
609                  } else {
610                    type = HtmlTokenType.TEXT;
611                    state = State.DONE;
612                  }
613                  break;
614                case UNESCAPED_LT_BANG_DASH:
615                  if ('-' == ch) {
616                    // According to HTML 5 section 8.1.2.6
617
618                    // An escaping text span start may share its
619                    // U+002D HYPHEN-MINUS characters with its
620                    // corresponding escaping text span end.
621                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
622                  } else {
623                    type = HtmlTokenType.TEXT;
624                    state = State.DONE;
625                  }
626                  break;
627                case ESCAPING_TEXT_SPAN:
628                  if ('-' == ch) {
629                    state = State.ESCAPING_TEXT_SPAN_DASH;
630                  }
631                  break;
632                case ESCAPING_TEXT_SPAN_DASH:
633                  if ('-' == ch) {
634                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
635                  } else {
636                    state = State.ESCAPING_TEXT_SPAN;
637                  }
638                  break;
639                case ESCAPING_TEXT_SPAN_DASH_DASH:
640                  if ('>' == ch) {
641                    type = HtmlTokenType.TEXT;
642                    state = State.DONE;
643                  } else if ('-' != ch) {
644                    state = State.ESCAPING_TEXT_SPAN;
645                  }
646                  break;
647                case DONE:
648                  throw new AssertionError(
649                      "Unexpectedly DONE while lexing HTML token stream");
650              }
651              ++end;
652              if (State.DONE == state) { break; }
653            }
654            if (end == limit) {
655              switch (state) {
656                case DONE:
657                  break;
658                case BOGUS_COMMENT:
659                  type = HtmlTokenType.QMARKMETA;
660                  break;
661                case COMMENT:
662                case COMMENT_DASH:
663                case COMMENT_DASH_DASH:
664                  type = HtmlTokenType.COMMENT;
665                  break;
666                case DIRECTIVE:
667                case SERVER_CODE:
668                case SERVER_CODE_PCT:
669                  type = HtmlTokenType.SERVERCODE;
670                  break;
671                case TAGNAME:
672                  type = HtmlTokenType.TAGBEGIN;
673                  break;
674                default:
675                  type = HtmlTokenType.TEXT;
676                  break;
677              }
678            }
679          }
680        }
681      } else {
682        type = null;
683      }
684    }
685    if (null == type) {
686      while (end < limit && '<' != input.charAt(end)) { ++end; }
687      type = HtmlTokenType.TEXT;
688    }
689
690    offset = end;
691    HtmlToken result = HtmlToken.instance(start, end, type);
692    if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
693    return result;
694  }
695
696  private String canonicalName(int start, int end) {
697    return HtmlLexer.canonicalName(input.substring(start, end));
698  }
699
700  private static boolean isIdentStart(char ch) {
701    return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
702  }
703
704  static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
705    return HtmlToken.instance(token.start, token.end, type);
706  }
707}
708
709
710/**
711 * A TokenStream that lazily fetches one token at a time.
712 *
713 * @author Mike Samuel <mikesamuel@gmail.com>
714 */
715abstract class AbstractTokenStream implements TokenStream {
716  private HtmlToken tok;
717
718  public final boolean hasNext() {
719    if (tok == null) { tok = produce(); }
720    return tok != null;
721  }
722
723  public HtmlToken next() {
724    if (this.tok == null) { this.tok = produce(); }
725    HtmlToken t = this.tok;
726    if (t == null) { throw new NoSuchElementException(); }
727    this.tok = null;
728    return t;
729  }
730
731  protected abstract HtmlToken produce();
732}
733