HtmlLexer.java revision 4e867904c8295537803c1c8a076e130df5674b58
1package org.owasp.html;
2
3import com.google.common.collect.ImmutableSet;
4import com.google.common.collect.Lists;
5import java.util.LinkedList;
6import java.util.NoSuchElementException;
7import java.util.Set;
8
9/**
10 * A flexible lexer for HTML.
11 * This is hairy code, but it is outside the TCB for the HTML sanitizer.
12 *
13 * @author Mike Samuel <mikesamuel@gmail.com>
14 */
15final class HtmlLexer extends AbstractTokenStream {
16  private final String input;
17  private final HtmlInputSplitter splitter;
18  private State state = State.OUTSIDE_TAG;
19
20  public HtmlLexer(String input) {
21    this.input = input;
22    this.splitter = new HtmlInputSplitter(input);
23  }
24
25  /**
26   * Normalize case of names that are not name-spaced.  This lower-cases HTML
27   * element and attribute names, but not ones for embedded SVG or MATHML.
28   */
29  static String canonicalName(String elementOrAttribName) {
30    return elementOrAttribName.indexOf(':') >= 0
31        ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName);
32  }
33
34  /**
35   * An fsm that lets us reclassify text tokens inside tags as attribute
36   * names/values
37   */
38  private static enum State {
39    OUTSIDE_TAG,
40    IN_TAG,
41    SAW_NAME,
42    SAW_EQ,
43    ;
44  }
45
46  /**
47   * Makes sure that this.token contains a token if one is available.
48   * This may require fetching and combining multiple tokens from the underlying
49   * splitter.
50   */
51  @Override
52  protected HtmlToken produce() {
53    HtmlToken token = readToken();
54    if (token == null) { return null; }
55
56    switch (token.type) {
57
58      // Keep track of whether we're inside a tag or not.
59      case TAGBEGIN:
60        state = State.IN_TAG;
61        break;
62      case TAGEND:
63        if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) {
64          // Distinguish <input type=checkbox checked=> from
65          // <input type=checkbox checked>
66          pushbackToken(token);
67          state = State.IN_TAG;
68          return HtmlToken.instance(
69              token.start, token.start, HtmlTokenType.ATTRVALUE);
70        }
71
72        state = State.OUTSIDE_TAG;
73        break;
74
75      // Drop ignorable tokens by zeroing out the one received and recursing
76      case IGNORABLE:
77        return produce();
78
79      // collapse adjacent text nodes if we're outside a tag, or otherwise,
80      // Recognize attribute names and values.
81      default:
82        switch (state) {
83          case OUTSIDE_TAG:
84            if (HtmlTokenType.TEXT == token.type
85                || HtmlTokenType.UNESCAPED == token.type) {
86              token = collapseSubsequent(token);
87            }
88            break;
89          case IN_TAG:
90            if (HtmlTokenType.TEXT == token.type
91                && !token.tokenInContextMatches(input, "=")) {
92              // Reclassify as attribute name
93              token = HtmlInputSplitter.reclassify(
94                  token, HtmlTokenType.ATTRNAME);
95              state = State.SAW_NAME;
96            }
97            break;
98          case SAW_NAME:
99            if (HtmlTokenType.TEXT == token.type) {
100              if (token.tokenInContextMatches(input, "=")) {
101                state = State.SAW_EQ;
102                // Skip the '=' token
103                return produce();
104              } else {
105                // Reclassify as attribute name
106                token = HtmlInputSplitter.reclassify(
107                    token, HtmlTokenType.ATTRNAME);
108              }
109            } else {
110              state = State.IN_TAG;
111            }
112            break;
113          case SAW_EQ:
114            if (HtmlTokenType.TEXT == token.type
115                || HtmlTokenType.QSTRING == token.type) {
116              if (HtmlTokenType.TEXT == token.type) {
117                // Collapse adjacent text nodes to properly handle
118                //   <a onclick=this.clicked=true>
119                //   <a title=foo bar>
120                token = collapseAttributeName(token);
121              }
122              // Reclassify as value
123              token = HtmlInputSplitter.reclassify(
124                  token, HtmlTokenType.ATTRVALUE);
125              state = State.IN_TAG;
126            }
127            break;
128        }
129        break;
130    }
131
132    return token;
133  }
134
135  /**
136   * Collapses all the following tokens of the same type into this.token.
137   */
138  private HtmlToken collapseSubsequent(HtmlToken token) {
139    HtmlToken collapsed = token;
140    for (HtmlToken next;
141         (next= peekToken(0)) != null && next.type == token.type;
142         readToken()) {
143      collapsed = join(collapsed, next);
144    }
145    return collapsed;
146  }
147
148  private HtmlToken collapseAttributeName(HtmlToken token) {
149    // We want to collapse tokens into the value that are not parts of an
150    // attribute value.  We should include any space or text adjacent to the
151    // value, but should stop at any of the following constructions:
152    //   space end-of-file              e.g. name=foo_
153    //   space valueless-attrib-name    e.g. name=foo checked
154    //   space tag-end                  e.g. name=foo />
155    //   space text space? '='          e.g. name=foo bar=
156    int nToMerge = 0;
157    for (HtmlToken t; (t = peekToken(nToMerge)) != null;) {
158      if (t.type == HtmlTokenType.IGNORABLE) {
159        HtmlToken tok = peekToken(nToMerge + 1);
160        if (tok == null) { break; }
161        if (tok.type != HtmlTokenType.TEXT) { break; }
162        if (isValuelessAttribute(input.substring(tok.start, tok.end))) {
163          break;
164        }
165        HtmlToken eq = peekToken(nToMerge + 2);
166        if (eq != null && eq.type == HtmlTokenType.IGNORABLE) {
167          eq = peekToken(nToMerge + 3);
168        }
169        if (eq == null || eq.tokenInContextMatches(input, "=")) {
170          break;
171        }
172      } else if (t.type != HtmlTokenType.TEXT) {
173        break;
174      }
175      ++nToMerge;
176    }
177    if (nToMerge == 0) { return token; }
178
179    int end = token.end;
180    do {
181      end = readToken().end;
182    } while (--nToMerge > 0);
183
184    return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT);
185  }
186
187  private static HtmlToken join(HtmlToken a, HtmlToken b) {
188    return HtmlToken.instance(a.start, b.end, a.type);
189  }
190
191  private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList();
192  private HtmlToken readToken() {
193    if (!lookahead.isEmpty()) {
194      return lookahead.remove();
195    } else if (splitter.hasNext()) {
196      return splitter.next();
197    } else {
198      return null;
199    }
200  }
201
202  private HtmlToken peekToken(int i) {
203    while (lookahead.size() <= i && splitter.hasNext()) {
204      lookahead.add(splitter.next());
205    }
206    return lookahead.size() > i ? lookahead.get(i) : null;
207  }
208
209  private void pushbackToken(HtmlToken token) {
210    lookahead.addFirst(token);
211  }
212
213  /** Can the attribute appear in HTML without a value. */
214  private static boolean isValuelessAttribute(String attribName) {
215    boolean valueless = VALUELESS_ATTRIB_NAMES.contains(
216        Strings.toLowerCase(attribName));
217    return valueless;
218  }
219
220  // From http://issues.apache.org/jira/browse/XALANC-519
221  private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of(
222      "checked", "compact", "declare", "defer", "disabled",
223      "ismap", "multiple", "nohref", "noresize", "noshade",
224      "nowrap", "readonly", "selected");
225}
226
227/**
228 * A token stream that breaks a character stream into <tt>
229 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt>
230 * tokens.  The matching of attribute names and values is done in a later step.
231 */
232final class HtmlInputSplitter extends AbstractTokenStream {
233  /** The source of HTML character data. */
234  private final String input;
235  /** An offset into input. */
236  private int offset;
237  /** True iff the current character is inside a tag. */
238  private boolean inTag;
239  /**
240   * True if inside a script, xmp, listing, or similar tag whose content does
241   * not follow the normal escaping rules.
242   */
243  private boolean inEscapeExemptBlock;
244
245  /**
246   * Null or the name of the close tag required to end the current escape exempt
247   * block.
248   * Preformatted tags include &lt;script&gt;, &lt;xmp&gt;, etc. that may
249   * contain unescaped HTML input.
250   */
251  private String escapeExemptTagName = null;
252
253  private HtmlTextEscapingMode textEscapingMode;
254
255  public HtmlInputSplitter(String input) {
256    this.input = input;
257  }
258
259  /**
260   * Make sure that there is a token ready to yield in this.token.
261   */
262  @Override
263  protected HtmlToken produce() {
264    HtmlToken token = parseToken();
265    if (null == token) { return null; }
266
267    // Handle escape-exempt blocks.
268    // The parse() method is only dimly aware of escape-excempt blocks, so
269    // here we detect the beginning and ends of escape exempt blocks, and
270    // reclassify as UNESCAPED, any tokens that appear in the middle.
271    if (inEscapeExemptBlock) {
272      if (token.type != HtmlTokenType.SERVERCODE) {
273        // classify RCDATA as text since it can contain entities
274        token = reclassify(
275            token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA
276                    ? HtmlTokenType.TEXT
277                    : HtmlTokenType.UNESCAPED));
278      }
279    } else {
280      switch (token.type) {
281        case TAGBEGIN:
282          {
283            String canonTagName = canonicalName(
284                token.start + 1, token.end);
285            if (HtmlTextEscapingMode.isTagFollowedByLiteralContent(
286                    canonTagName)) {
287              this.escapeExemptTagName = canonTagName;
288              this.textEscapingMode = HtmlTextEscapingMode.getModeForTag(
289                  canonTagName);
290            }
291            break;
292          }
293        case TAGEND:
294          this.inEscapeExemptBlock = null != this.escapeExemptTagName;
295          break;
296        default:
297          break;
298      }
299    }
300    return token;
301  }
302
303  /**
304   * States for a state machine for optimistically identifying tags and other
305   * html/xml/phpish structures.
306   */
307  private static enum State {
308    TAGNAME,
309    SLASH,
310    BANG,
311    BANG_DASH,
312    COMMENT,
313    COMMENT_DASH,
314    COMMENT_DASH_DASH,
315    DIRECTIVE,
316    DONE,
317    APP_DIRECTIVE,
318    APP_DIRECTIVE_QMARK,
319    SERVER_CODE,
320    SERVER_CODE_PCT,
321
322    // From HTML 5 section 8.1.2.6
323
324    // The text in CDATA and RCDATA elements must not contain any
325    // occurrences of the string "</" followed by characters that
326    // case-insensitively match the tag name of the element followed
327    // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF),
328    // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE,
329    // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless
330    // that string is part of an escaping text span.
331
332    // An escaping text span is a span of text (in CDATA and RCDATA
333    // elements) and character entity references (in RCDATA elements)
334    // that starts with an escaping text span start that is not itself
335    // in an escaping text span, and ends at the next escaping text
336    // span end.
337
338    // An escaping text span start is a part of text that consists of
339    // the four character sequence "<!--".
340
341    // An escaping text span end is a part of text that consists of
342    // the three character sequence "-->".
343
344    // An escaping text span start may share its U+002D HYPHEN-MINUS characters
345    // with its corresponding escaping text span end.
346    UNESCAPED_LT_BANG,             // <!
347    UNESCAPED_LT_BANG_DASH,        // <!-
348    ESCAPING_TEXT_SPAN,            // Inside an escaping text span
349    ESCAPING_TEXT_SPAN_DASH,       // Seen - inside an escaping text span
350    ESCAPING_TEXT_SPAN_DASH_DASH,  // Seen -- inside an escaping text span
351    ;
352  }
353
354  private HtmlToken lastNonIgnorable = null;
355  /**
356   * Breaks the character stream into tokens.
357   * This method returns a stream of tokens such that each token starts where
358   * the last token ended.
359   *
360   * <p>This property is useful as it allows fetch to collapse and reclassify
361   * ranges of tokens based on state that is easy to maintain there.
362   *
363   * <p>Later passes are responsible for throwing away useless tokens.
364   */
365  private HtmlToken parseToken() {
366    int start = offset;
367    int limit = input.length();
368    if (start == limit) { return null; }
369
370    int end = start + 1;
371    HtmlTokenType type;
372
373    char ch = input.charAt(start);
374    if (inTag) {
375      if ('>' == ch) {
376        type = HtmlTokenType.TAGEND;
377        inTag = false;
378      } else if ('/' == ch) {
379        if (end != limit && '>' == input.charAt(end)) {
380          type = HtmlTokenType.TAGEND;
381          inTag = false;
382          ++end;
383        } else {
384          type = HtmlTokenType.TEXT;
385        }
386      } else if ('=' == ch) {
387        type = HtmlTokenType.TEXT;
388      } else if ('"' == ch || '\'' == ch) {
389        type = HtmlTokenType.QSTRING;
390        int delim = ch;
391        for (; end < limit; ++end) {
392          if (input.charAt(end) == delim) {
393            ++end;
394            break;
395          }
396        }
397      } else if (!Character.isWhitespace(ch)) {
398        type = HtmlTokenType.TEXT;
399        for (; end < limit; ++end) {
400          ch = input.charAt(end);
401          // End a text chunk before />
402          if ((lastNonIgnorable == null
403               || !lastNonIgnorable.tokenInContextMatches(input, "="))
404              && '/' == ch && end + 1 < limit
405              && '>' == input.charAt(end + 1)) {
406            break;
407          } else if ('>' == ch || '=' == ch
408                     || Character.isWhitespace(ch)) {
409            break;
410          } else if ('"' == ch || '\'' == ch) {
411            if (end + 1 < limit) {
412              char ch2 = input.charAt(end + 1);
413              if (ch2 >= 0 && Character.isWhitespace(ch2)
414                  || ch2 == '>' || ch2 == '/') {
415                ++end;
416                break;
417              }
418            }
419          }
420        }
421      } else {
422        // We skip whitespace tokens inside tag bodies.
423        type = HtmlTokenType.IGNORABLE;
424        while (end < limit && Character.isWhitespace(input.charAt(end))) {
425          ++end;
426        }
427      }
428    } else {
429      if (ch == '<') {
430        if (end == limit) {
431          type = HtmlTokenType.TEXT;
432        } else {
433          ch = input.charAt(end);
434          type = null;
435          State state = null;
436          switch (ch) {
437            case '/':  // close tag?
438              state = State.SLASH;
439              ++end;
440              break;
441            case '!':  // Comment or declaration
442              if (!this.inEscapeExemptBlock) {
443                state = State.BANG;
444              } else if (HtmlTextEscapingMode.allowsEscapingTextSpan(
445                             escapeExemptTagName)) {
446                // Directives, and cdata suppressed in escape
447                // exempt mode as they could obscure the close of the
448                // escape exempty block, but comments are similar to escaping
449                // text spans, and are significant in all CDATA and RCDATA
450                // blocks except those inside <xmp> tags.
451                // See "Escaping text spans" in section 8.1.2.6 of HTML5.
452                // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions
453                state = State.UNESCAPED_LT_BANG;
454              }
455              ++end;
456              break;
457            case '?':
458              if (!this.inEscapeExemptBlock) {
459                state = State.APP_DIRECTIVE;
460              }
461              ++end;
462              break;
463            case '%':
464              state = State.SERVER_CODE;
465              ++end;
466              break;
467            default:
468              if (isIdentStart(ch) && !this.inEscapeExemptBlock) {
469                state = State.TAGNAME;
470                ++end;
471              } else if ('<' == ch) {
472                type = HtmlTokenType.TEXT;
473              } else {
474                ++end;
475              }
476              break;
477          }
478          if (null != state) {
479            charloop:
480            while (end < limit) {
481              ch = input.charAt(end);
482              switch (state) {
483                case TAGNAME:
484                  if (Character.isWhitespace(ch)
485                      || '>' == ch || '/' == ch || '<' == ch) {
486                    // End processing of an escape exempt block when we see
487                    // a corresponding end tag.
488                    if (this.inEscapeExemptBlock
489                        && '/' == input.charAt(start + 1)
490                        && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT
491                        && canonicalName(start + 2, end)
492                            .equals(escapeExemptTagName)) {
493                      this.inEscapeExemptBlock = false;
494                      this.escapeExemptTagName = null;
495                      this.textEscapingMode = null;
496                    }
497                    type = HtmlTokenType.TAGBEGIN;
498                    // Don't process content as attributes if we're inside
499                    // an escape exempt block.
500                    inTag = !this.inEscapeExemptBlock;
501                    state = State.DONE;
502                    break charloop;
503                  }
504                  break;
505                case SLASH:
506                  if (Character.isLetter(ch)) {
507                    state = State.TAGNAME;
508                  } else {
509                    if ('<' == ch) {
510                      type = HtmlTokenType.TEXT;
511                    } else {
512                      ++end;
513                    }
514                    break charloop;
515                  }
516                  break;
517                case BANG:
518                  if ('-' == ch) {
519                    state = State.BANG_DASH;
520                  } else {
521                    state = State.DIRECTIVE;
522                  }
523                  break;
524                case BANG_DASH:
525                  if ('-' == ch) {
526                    state = State.COMMENT;
527                  } else {
528                    state = State.DIRECTIVE;
529                  }
530                  break;
531                case COMMENT:
532                  if ('-' == ch) {
533                    state = State.COMMENT_DASH;
534                  }
535                  break;
536                case COMMENT_DASH:
537                  state = ('-' == ch)
538                      ? State.COMMENT_DASH_DASH
539                      : State.COMMENT_DASH;
540                  break;
541                case COMMENT_DASH_DASH:
542                  if ('>' == ch) {
543                    state = State.DONE;
544                    type = HtmlTokenType.COMMENT;
545                  } else if ('-' == ch) {
546                    state = State.COMMENT_DASH_DASH;
547                  } else {
548                    state = State.COMMENT_DASH;
549                  }
550                  break;
551                case DIRECTIVE:
552                  if ('>' == ch) {
553                    type = HtmlTokenType.DIRECTIVE;
554                    state = State.DONE;
555                  }
556                  break;
557                case APP_DIRECTIVE:
558                  if ('?' == ch) { state = State.APP_DIRECTIVE_QMARK; }
559                  break;
560                case APP_DIRECTIVE_QMARK:
561                  if ('>' == ch) {
562                    type = HtmlTokenType.DIRECTIVE;
563                    state = State.DONE;
564                  } else if ('?' != ch) {
565                    state = State.APP_DIRECTIVE;
566                  }
567                  break;
568                case SERVER_CODE:
569                  if ('%' == ch) {
570                    state = State.SERVER_CODE_PCT;
571                  }
572                  break;
573                case SERVER_CODE_PCT:
574                  if ('>' == ch) {
575                    type = HtmlTokenType.SERVERCODE;
576                    state = State.DONE;
577                  } else if ('%' != ch) {
578                    state = State.SERVER_CODE;
579                  }
580                  break;
581                case UNESCAPED_LT_BANG:
582                  if ('-' == ch) {
583                    state = State.UNESCAPED_LT_BANG_DASH;
584                  } else {
585                    type = HtmlTokenType.TEXT;
586                    state = State.DONE;
587                  }
588                  break;
589                case UNESCAPED_LT_BANG_DASH:
590                  if ('-' == ch) {
591                    // According to HTML 5 section 8.1.2.6
592
593                    // An escaping text span start may share its
594                    // U+002D HYPHEN-MINUS characters with its
595                    // corresponding escaping text span end.
596                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
597                  } else {
598                    type = HtmlTokenType.TEXT;
599                    state = State.DONE;
600                  }
601                  break;
602                case ESCAPING_TEXT_SPAN:
603                  if ('-' == ch) {
604                    state = State.ESCAPING_TEXT_SPAN_DASH;
605                  }
606                  break;
607                case ESCAPING_TEXT_SPAN_DASH:
608                  if ('-' == ch) {
609                    state = State.ESCAPING_TEXT_SPAN_DASH_DASH;
610                  } else {
611                    state = State.ESCAPING_TEXT_SPAN;
612                  }
613                  break;
614                case ESCAPING_TEXT_SPAN_DASH_DASH:
615                  if ('>' == ch) {
616                    type = HtmlTokenType.TEXT;
617                    state = State.DONE;
618                  } else if ('-' != ch) {
619                    state = State.ESCAPING_TEXT_SPAN;
620                  }
621                  break;
622                case DONE:
623                  throw new AssertionError(
624                      "Unexpectedly DONE while lexing HTML token stream");
625              }
626              ++end;
627              if (State.DONE == state) { break; }
628            }
629            if (end == limit) {
630              switch (state) {
631                case DONE:
632                  break;
633                case COMMENT:
634                case COMMENT_DASH:
635                case COMMENT_DASH_DASH:
636                  type = HtmlTokenType.COMMENT;
637                  break;
638                case DIRECTIVE:
639                case APP_DIRECTIVE:
640                case APP_DIRECTIVE_QMARK:
641                  type = HtmlTokenType.DIRECTIVE;
642                  break;
643                case SERVER_CODE:
644                case SERVER_CODE_PCT:
645                  type = HtmlTokenType.SERVERCODE;
646                  break;
647                case TAGNAME:
648                  type = HtmlTokenType.TAGBEGIN;
649                  break;
650                default:
651                  type = HtmlTokenType.TEXT;
652                  break;
653              }
654            }
655          }
656        }
657      } else {
658        type = null;
659      }
660    }
661    if (null == type) {
662      while (end < limit && '<' != input.charAt(end)) { ++end; }
663      type = HtmlTokenType.TEXT;
664    }
665
666    offset = end;
667    HtmlToken result = HtmlToken.instance(start, end, type);
668    if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; }
669    return result;
670  }
671
672  private String canonicalName(int start, int end) {
673    return HtmlLexer.canonicalName(input.substring(start, end));
674  }
675
676  private boolean isIdentStart(char ch) {
677    return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a');
678  }
679
680  static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) {
681    return HtmlToken.instance(token.start, token.end, type);
682  }
683}
684
685
686/**
687 * A TokenStream that lazily fetches one token at a time.
688 *
689 * @author msamuel@gmail.com (Mike Samuel)
690 */
691abstract class AbstractTokenStream implements TokenStream {
692  private HtmlToken tok;
693
694  public final boolean hasNext() {
695    if (tok == null) { tok = produce(); }
696    return tok != null;
697  }
698
699  public HtmlToken next() {
700    if (this.tok == null) { this.tok = produce(); }
701    HtmlToken t = this.tok;
702    if (t == null) { throw new NoSuchElementException(); }
703    this.tok = null;
704    return t;
705  }
706
707  protected abstract HtmlToken produce();
708}
709