HtmlLexer.java revision 4e867904c8295537803c1c8a076e130df5674b58
1package org.owasp.html; 2 3import com.google.common.collect.ImmutableSet; 4import com.google.common.collect.Lists; 5import java.util.LinkedList; 6import java.util.NoSuchElementException; 7import java.util.Set; 8 9/** 10 * A flexible lexer for HTML. 11 * This is hairy code, but it is outside the TCB for the HTML sanitizer. 12 * 13 * @author Mike Samuel <mikesamuel@gmail.com> 14 */ 15final class HtmlLexer extends AbstractTokenStream { 16 private final String input; 17 private final HtmlInputSplitter splitter; 18 private State state = State.OUTSIDE_TAG; 19 20 public HtmlLexer(String input) { 21 this.input = input; 22 this.splitter = new HtmlInputSplitter(input); 23 } 24 25 /** 26 * Normalize case of names that are not name-spaced. This lower-cases HTML 27 * element and attribute names, but not ones for embedded SVG or MATHML. 28 */ 29 static String canonicalName(String elementOrAttribName) { 30 return elementOrAttribName.indexOf(':') >= 0 31 ? elementOrAttribName : Strings.toLowerCase(elementOrAttribName); 32 } 33 34 /** 35 * An fsm that lets us reclassify text tokens inside tags as attribute 36 * names/values 37 */ 38 private static enum State { 39 OUTSIDE_TAG, 40 IN_TAG, 41 SAW_NAME, 42 SAW_EQ, 43 ; 44 } 45 46 /** 47 * Makes sure that this.token contains a token if one is available. 48 * This may require fetching and combining multiple tokens from the underlying 49 * splitter. 50 */ 51 @Override 52 protected HtmlToken produce() { 53 HtmlToken token = readToken(); 54 if (token == null) { return null; } 55 56 switch (token.type) { 57 58 // Keep track of whether we're inside a tag or not. 59 case TAGBEGIN: 60 state = State.IN_TAG; 61 break; 62 case TAGEND: 63 if (state == State.SAW_EQ && HtmlTokenType.TAGEND == token.type) { 64 // Distinguish <input type=checkbox checked=> from 65 // <input type=checkbox checked> 66 pushbackToken(token); 67 state = State.IN_TAG; 68 return HtmlToken.instance( 69 token.start, token.start, HtmlTokenType.ATTRVALUE); 70 } 71 72 state = State.OUTSIDE_TAG; 73 break; 74 75 // Drop ignorable tokens by zeroing out the one received and recursing 76 case IGNORABLE: 77 return produce(); 78 79 // collapse adjacent text nodes if we're outside a tag, or otherwise, 80 // Recognize attribute names and values. 81 default: 82 switch (state) { 83 case OUTSIDE_TAG: 84 if (HtmlTokenType.TEXT == token.type 85 || HtmlTokenType.UNESCAPED == token.type) { 86 token = collapseSubsequent(token); 87 } 88 break; 89 case IN_TAG: 90 if (HtmlTokenType.TEXT == token.type 91 && !token.tokenInContextMatches(input, "=")) { 92 // Reclassify as attribute name 93 token = HtmlInputSplitter.reclassify( 94 token, HtmlTokenType.ATTRNAME); 95 state = State.SAW_NAME; 96 } 97 break; 98 case SAW_NAME: 99 if (HtmlTokenType.TEXT == token.type) { 100 if (token.tokenInContextMatches(input, "=")) { 101 state = State.SAW_EQ; 102 // Skip the '=' token 103 return produce(); 104 } else { 105 // Reclassify as attribute name 106 token = HtmlInputSplitter.reclassify( 107 token, HtmlTokenType.ATTRNAME); 108 } 109 } else { 110 state = State.IN_TAG; 111 } 112 break; 113 case SAW_EQ: 114 if (HtmlTokenType.TEXT == token.type 115 || HtmlTokenType.QSTRING == token.type) { 116 if (HtmlTokenType.TEXT == token.type) { 117 // Collapse adjacent text nodes to properly handle 118 // <a onclick=this.clicked=true> 119 // <a title=foo bar> 120 token = collapseAttributeName(token); 121 } 122 // Reclassify as value 123 token = HtmlInputSplitter.reclassify( 124 token, HtmlTokenType.ATTRVALUE); 125 state = State.IN_TAG; 126 } 127 break; 128 } 129 break; 130 } 131 132 return token; 133 } 134 135 /** 136 * Collapses all the following tokens of the same type into this.token. 137 */ 138 private HtmlToken collapseSubsequent(HtmlToken token) { 139 HtmlToken collapsed = token; 140 for (HtmlToken next; 141 (next= peekToken(0)) != null && next.type == token.type; 142 readToken()) { 143 collapsed = join(collapsed, next); 144 } 145 return collapsed; 146 } 147 148 private HtmlToken collapseAttributeName(HtmlToken token) { 149 // We want to collapse tokens into the value that are not parts of an 150 // attribute value. We should include any space or text adjacent to the 151 // value, but should stop at any of the following constructions: 152 // space end-of-file e.g. name=foo_ 153 // space valueless-attrib-name e.g. name=foo checked 154 // space tag-end e.g. name=foo /> 155 // space text space? '=' e.g. name=foo bar= 156 int nToMerge = 0; 157 for (HtmlToken t; (t = peekToken(nToMerge)) != null;) { 158 if (t.type == HtmlTokenType.IGNORABLE) { 159 HtmlToken tok = peekToken(nToMerge + 1); 160 if (tok == null) { break; } 161 if (tok.type != HtmlTokenType.TEXT) { break; } 162 if (isValuelessAttribute(input.substring(tok.start, tok.end))) { 163 break; 164 } 165 HtmlToken eq = peekToken(nToMerge + 2); 166 if (eq != null && eq.type == HtmlTokenType.IGNORABLE) { 167 eq = peekToken(nToMerge + 3); 168 } 169 if (eq == null || eq.tokenInContextMatches(input, "=")) { 170 break; 171 } 172 } else if (t.type != HtmlTokenType.TEXT) { 173 break; 174 } 175 ++nToMerge; 176 } 177 if (nToMerge == 0) { return token; } 178 179 int end = token.end; 180 do { 181 end = readToken().end; 182 } while (--nToMerge > 0); 183 184 return HtmlToken.instance(token.start, end, HtmlTokenType.TEXT); 185 } 186 187 private static HtmlToken join(HtmlToken a, HtmlToken b) { 188 return HtmlToken.instance(a.start, b.end, a.type); 189 } 190 191 private final LinkedList<HtmlToken> lookahead = Lists.newLinkedList(); 192 private HtmlToken readToken() { 193 if (!lookahead.isEmpty()) { 194 return lookahead.remove(); 195 } else if (splitter.hasNext()) { 196 return splitter.next(); 197 } else { 198 return null; 199 } 200 } 201 202 private HtmlToken peekToken(int i) { 203 while (lookahead.size() <= i && splitter.hasNext()) { 204 lookahead.add(splitter.next()); 205 } 206 return lookahead.size() > i ? lookahead.get(i) : null; 207 } 208 209 private void pushbackToken(HtmlToken token) { 210 lookahead.addFirst(token); 211 } 212 213 /** Can the attribute appear in HTML without a value. */ 214 private static boolean isValuelessAttribute(String attribName) { 215 boolean valueless = VALUELESS_ATTRIB_NAMES.contains( 216 Strings.toLowerCase(attribName)); 217 return valueless; 218 } 219 220 // From http://issues.apache.org/jira/browse/XALANC-519 221 private static final Set<String> VALUELESS_ATTRIB_NAMES = ImmutableSet.of( 222 "checked", "compact", "declare", "defer", "disabled", 223 "ismap", "multiple", "nohref", "noresize", "noshade", 224 "nowrap", "readonly", "selected"); 225} 226 227/** 228 * A token stream that breaks a character stream into <tt> 229 * HtmlTokenType.{TEXT,TAGBEGIN,TAGEND,DIRECTIVE,COMMENT,CDATA,DIRECTIVE}</tt> 230 * tokens. The matching of attribute names and values is done in a later step. 231 */ 232final class HtmlInputSplitter extends AbstractTokenStream { 233 /** The source of HTML character data. */ 234 private final String input; 235 /** An offset into input. */ 236 private int offset; 237 /** True iff the current character is inside a tag. */ 238 private boolean inTag; 239 /** 240 * True if inside a script, xmp, listing, or similar tag whose content does 241 * not follow the normal escaping rules. 242 */ 243 private boolean inEscapeExemptBlock; 244 245 /** 246 * Null or the name of the close tag required to end the current escape exempt 247 * block. 248 * Preformatted tags include <script>, <xmp>, etc. that may 249 * contain unescaped HTML input. 250 */ 251 private String escapeExemptTagName = null; 252 253 private HtmlTextEscapingMode textEscapingMode; 254 255 public HtmlInputSplitter(String input) { 256 this.input = input; 257 } 258 259 /** 260 * Make sure that there is a token ready to yield in this.token. 261 */ 262 @Override 263 protected HtmlToken produce() { 264 HtmlToken token = parseToken(); 265 if (null == token) { return null; } 266 267 // Handle escape-exempt blocks. 268 // The parse() method is only dimly aware of escape-excempt blocks, so 269 // here we detect the beginning and ends of escape exempt blocks, and 270 // reclassify as UNESCAPED, any tokens that appear in the middle. 271 if (inEscapeExemptBlock) { 272 if (token.type != HtmlTokenType.SERVERCODE) { 273 // classify RCDATA as text since it can contain entities 274 token = reclassify( 275 token, (this.textEscapingMode == HtmlTextEscapingMode.RCDATA 276 ? HtmlTokenType.TEXT 277 : HtmlTokenType.UNESCAPED)); 278 } 279 } else { 280 switch (token.type) { 281 case TAGBEGIN: 282 { 283 String canonTagName = canonicalName( 284 token.start + 1, token.end); 285 if (HtmlTextEscapingMode.isTagFollowedByLiteralContent( 286 canonTagName)) { 287 this.escapeExemptTagName = canonTagName; 288 this.textEscapingMode = HtmlTextEscapingMode.getModeForTag( 289 canonTagName); 290 } 291 break; 292 } 293 case TAGEND: 294 this.inEscapeExemptBlock = null != this.escapeExemptTagName; 295 break; 296 default: 297 break; 298 } 299 } 300 return token; 301 } 302 303 /** 304 * States for a state machine for optimistically identifying tags and other 305 * html/xml/phpish structures. 306 */ 307 private static enum State { 308 TAGNAME, 309 SLASH, 310 BANG, 311 BANG_DASH, 312 COMMENT, 313 COMMENT_DASH, 314 COMMENT_DASH_DASH, 315 DIRECTIVE, 316 DONE, 317 APP_DIRECTIVE, 318 APP_DIRECTIVE_QMARK, 319 SERVER_CODE, 320 SERVER_CODE_PCT, 321 322 // From HTML 5 section 8.1.2.6 323 324 // The text in CDATA and RCDATA elements must not contain any 325 // occurrences of the string "</" followed by characters that 326 // case-insensitively match the tag name of the element followed 327 // by one of U+0009 CHARACTER TABULATION, U+000A LINE FEED (LF), 328 // U+000B LINE TABULATION, U+000C FORM FEED (FF), U+0020 SPACE, 329 // U+003E GREATER-THAN SIGN (>), or U+002F SOLIDUS (/), unless 330 // that string is part of an escaping text span. 331 332 // An escaping text span is a span of text (in CDATA and RCDATA 333 // elements) and character entity references (in RCDATA elements) 334 // that starts with an escaping text span start that is not itself 335 // in an escaping text span, and ends at the next escaping text 336 // span end. 337 338 // An escaping text span start is a part of text that consists of 339 // the four character sequence "<!--". 340 341 // An escaping text span end is a part of text that consists of 342 // the three character sequence "-->". 343 344 // An escaping text span start may share its U+002D HYPHEN-MINUS characters 345 // with its corresponding escaping text span end. 346 UNESCAPED_LT_BANG, // <! 347 UNESCAPED_LT_BANG_DASH, // <!- 348 ESCAPING_TEXT_SPAN, // Inside an escaping text span 349 ESCAPING_TEXT_SPAN_DASH, // Seen - inside an escaping text span 350 ESCAPING_TEXT_SPAN_DASH_DASH, // Seen -- inside an escaping text span 351 ; 352 } 353 354 private HtmlToken lastNonIgnorable = null; 355 /** 356 * Breaks the character stream into tokens. 357 * This method returns a stream of tokens such that each token starts where 358 * the last token ended. 359 * 360 * <p>This property is useful as it allows fetch to collapse and reclassify 361 * ranges of tokens based on state that is easy to maintain there. 362 * 363 * <p>Later passes are responsible for throwing away useless tokens. 364 */ 365 private HtmlToken parseToken() { 366 int start = offset; 367 int limit = input.length(); 368 if (start == limit) { return null; } 369 370 int end = start + 1; 371 HtmlTokenType type; 372 373 char ch = input.charAt(start); 374 if (inTag) { 375 if ('>' == ch) { 376 type = HtmlTokenType.TAGEND; 377 inTag = false; 378 } else if ('/' == ch) { 379 if (end != limit && '>' == input.charAt(end)) { 380 type = HtmlTokenType.TAGEND; 381 inTag = false; 382 ++end; 383 } else { 384 type = HtmlTokenType.TEXT; 385 } 386 } else if ('=' == ch) { 387 type = HtmlTokenType.TEXT; 388 } else if ('"' == ch || '\'' == ch) { 389 type = HtmlTokenType.QSTRING; 390 int delim = ch; 391 for (; end < limit; ++end) { 392 if (input.charAt(end) == delim) { 393 ++end; 394 break; 395 } 396 } 397 } else if (!Character.isWhitespace(ch)) { 398 type = HtmlTokenType.TEXT; 399 for (; end < limit; ++end) { 400 ch = input.charAt(end); 401 // End a text chunk before /> 402 if ((lastNonIgnorable == null 403 || !lastNonIgnorable.tokenInContextMatches(input, "=")) 404 && '/' == ch && end + 1 < limit 405 && '>' == input.charAt(end + 1)) { 406 break; 407 } else if ('>' == ch || '=' == ch 408 || Character.isWhitespace(ch)) { 409 break; 410 } else if ('"' == ch || '\'' == ch) { 411 if (end + 1 < limit) { 412 char ch2 = input.charAt(end + 1); 413 if (ch2 >= 0 && Character.isWhitespace(ch2) 414 || ch2 == '>' || ch2 == '/') { 415 ++end; 416 break; 417 } 418 } 419 } 420 } 421 } else { 422 // We skip whitespace tokens inside tag bodies. 423 type = HtmlTokenType.IGNORABLE; 424 while (end < limit && Character.isWhitespace(input.charAt(end))) { 425 ++end; 426 } 427 } 428 } else { 429 if (ch == '<') { 430 if (end == limit) { 431 type = HtmlTokenType.TEXT; 432 } else { 433 ch = input.charAt(end); 434 type = null; 435 State state = null; 436 switch (ch) { 437 case '/': // close tag? 438 state = State.SLASH; 439 ++end; 440 break; 441 case '!': // Comment or declaration 442 if (!this.inEscapeExemptBlock) { 443 state = State.BANG; 444 } else if (HtmlTextEscapingMode.allowsEscapingTextSpan( 445 escapeExemptTagName)) { 446 // Directives, and cdata suppressed in escape 447 // exempt mode as they could obscure the close of the 448 // escape exempty block, but comments are similar to escaping 449 // text spans, and are significant in all CDATA and RCDATA 450 // blocks except those inside <xmp> tags. 451 // See "Escaping text spans" in section 8.1.2.6 of HTML5. 452 // http://www.w3.org/html/wg/html5/#cdata-rcdata-restrictions 453 state = State.UNESCAPED_LT_BANG; 454 } 455 ++end; 456 break; 457 case '?': 458 if (!this.inEscapeExemptBlock) { 459 state = State.APP_DIRECTIVE; 460 } 461 ++end; 462 break; 463 case '%': 464 state = State.SERVER_CODE; 465 ++end; 466 break; 467 default: 468 if (isIdentStart(ch) && !this.inEscapeExemptBlock) { 469 state = State.TAGNAME; 470 ++end; 471 } else if ('<' == ch) { 472 type = HtmlTokenType.TEXT; 473 } else { 474 ++end; 475 } 476 break; 477 } 478 if (null != state) { 479 charloop: 480 while (end < limit) { 481 ch = input.charAt(end); 482 switch (state) { 483 case TAGNAME: 484 if (Character.isWhitespace(ch) 485 || '>' == ch || '/' == ch || '<' == ch) { 486 // End processing of an escape exempt block when we see 487 // a corresponding end tag. 488 if (this.inEscapeExemptBlock 489 && '/' == input.charAt(start + 1) 490 && textEscapingMode != HtmlTextEscapingMode.PLAIN_TEXT 491 && canonicalName(start + 2, end) 492 .equals(escapeExemptTagName)) { 493 this.inEscapeExemptBlock = false; 494 this.escapeExemptTagName = null; 495 this.textEscapingMode = null; 496 } 497 type = HtmlTokenType.TAGBEGIN; 498 // Don't process content as attributes if we're inside 499 // an escape exempt block. 500 inTag = !this.inEscapeExemptBlock; 501 state = State.DONE; 502 break charloop; 503 } 504 break; 505 case SLASH: 506 if (Character.isLetter(ch)) { 507 state = State.TAGNAME; 508 } else { 509 if ('<' == ch) { 510 type = HtmlTokenType.TEXT; 511 } else { 512 ++end; 513 } 514 break charloop; 515 } 516 break; 517 case BANG: 518 if ('-' == ch) { 519 state = State.BANG_DASH; 520 } else { 521 state = State.DIRECTIVE; 522 } 523 break; 524 case BANG_DASH: 525 if ('-' == ch) { 526 state = State.COMMENT; 527 } else { 528 state = State.DIRECTIVE; 529 } 530 break; 531 case COMMENT: 532 if ('-' == ch) { 533 state = State.COMMENT_DASH; 534 } 535 break; 536 case COMMENT_DASH: 537 state = ('-' == ch) 538 ? State.COMMENT_DASH_DASH 539 : State.COMMENT_DASH; 540 break; 541 case COMMENT_DASH_DASH: 542 if ('>' == ch) { 543 state = State.DONE; 544 type = HtmlTokenType.COMMENT; 545 } else if ('-' == ch) { 546 state = State.COMMENT_DASH_DASH; 547 } else { 548 state = State.COMMENT_DASH; 549 } 550 break; 551 case DIRECTIVE: 552 if ('>' == ch) { 553 type = HtmlTokenType.DIRECTIVE; 554 state = State.DONE; 555 } 556 break; 557 case APP_DIRECTIVE: 558 if ('?' == ch) { state = State.APP_DIRECTIVE_QMARK; } 559 break; 560 case APP_DIRECTIVE_QMARK: 561 if ('>' == ch) { 562 type = HtmlTokenType.DIRECTIVE; 563 state = State.DONE; 564 } else if ('?' != ch) { 565 state = State.APP_DIRECTIVE; 566 } 567 break; 568 case SERVER_CODE: 569 if ('%' == ch) { 570 state = State.SERVER_CODE_PCT; 571 } 572 break; 573 case SERVER_CODE_PCT: 574 if ('>' == ch) { 575 type = HtmlTokenType.SERVERCODE; 576 state = State.DONE; 577 } else if ('%' != ch) { 578 state = State.SERVER_CODE; 579 } 580 break; 581 case UNESCAPED_LT_BANG: 582 if ('-' == ch) { 583 state = State.UNESCAPED_LT_BANG_DASH; 584 } else { 585 type = HtmlTokenType.TEXT; 586 state = State.DONE; 587 } 588 break; 589 case UNESCAPED_LT_BANG_DASH: 590 if ('-' == ch) { 591 // According to HTML 5 section 8.1.2.6 592 593 // An escaping text span start may share its 594 // U+002D HYPHEN-MINUS characters with its 595 // corresponding escaping text span end. 596 state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 597 } else { 598 type = HtmlTokenType.TEXT; 599 state = State.DONE; 600 } 601 break; 602 case ESCAPING_TEXT_SPAN: 603 if ('-' == ch) { 604 state = State.ESCAPING_TEXT_SPAN_DASH; 605 } 606 break; 607 case ESCAPING_TEXT_SPAN_DASH: 608 if ('-' == ch) { 609 state = State.ESCAPING_TEXT_SPAN_DASH_DASH; 610 } else { 611 state = State.ESCAPING_TEXT_SPAN; 612 } 613 break; 614 case ESCAPING_TEXT_SPAN_DASH_DASH: 615 if ('>' == ch) { 616 type = HtmlTokenType.TEXT; 617 state = State.DONE; 618 } else if ('-' != ch) { 619 state = State.ESCAPING_TEXT_SPAN; 620 } 621 break; 622 case DONE: 623 throw new AssertionError( 624 "Unexpectedly DONE while lexing HTML token stream"); 625 } 626 ++end; 627 if (State.DONE == state) { break; } 628 } 629 if (end == limit) { 630 switch (state) { 631 case DONE: 632 break; 633 case COMMENT: 634 case COMMENT_DASH: 635 case COMMENT_DASH_DASH: 636 type = HtmlTokenType.COMMENT; 637 break; 638 case DIRECTIVE: 639 case APP_DIRECTIVE: 640 case APP_DIRECTIVE_QMARK: 641 type = HtmlTokenType.DIRECTIVE; 642 break; 643 case SERVER_CODE: 644 case SERVER_CODE_PCT: 645 type = HtmlTokenType.SERVERCODE; 646 break; 647 case TAGNAME: 648 type = HtmlTokenType.TAGBEGIN; 649 break; 650 default: 651 type = HtmlTokenType.TEXT; 652 break; 653 } 654 } 655 } 656 } 657 } else { 658 type = null; 659 } 660 } 661 if (null == type) { 662 while (end < limit && '<' != input.charAt(end)) { ++end; } 663 type = HtmlTokenType.TEXT; 664 } 665 666 offset = end; 667 HtmlToken result = HtmlToken.instance(start, end, type); 668 if (type != HtmlTokenType.IGNORABLE) { lastNonIgnorable = result; } 669 return result; 670 } 671 672 private String canonicalName(int start, int end) { 673 return HtmlLexer.canonicalName(input.substring(start, end)); 674 } 675 676 private boolean isIdentStart(char ch) { 677 return ch >= 'A' && ch <= 'z' && (ch <= 'Z' || ch >= 'a'); 678 } 679 680 static HtmlToken reclassify(HtmlToken token, HtmlTokenType type) { 681 return HtmlToken.instance(token.start, token.end, type); 682 } 683} 684 685 686/** 687 * A TokenStream that lazily fetches one token at a time. 688 * 689 * @author msamuel@gmail.com (Mike Samuel) 690 */ 691abstract class AbstractTokenStream implements TokenStream { 692 private HtmlToken tok; 693 694 public final boolean hasNext() { 695 if (tok == null) { tok = produce(); } 696 return tok != null; 697 } 698 699 public HtmlToken next() { 700 if (this.tok == null) { this.tok = produce(); } 701 HtmlToken t = this.tok; 702 if (t == null) { throw new NoSuchElementException(); } 703 this.tok = null; 704 return t; 705 } 706 707 protected abstract HtmlToken produce(); 708} 709