/*
 * Copyright (C) 2010 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.google.streamhtmlparser.impl;

import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.google.streamhtmlparser.ExternalState;
import com.google.streamhtmlparser.HtmlParser;
import com.google.streamhtmlparser.ParseException;
import com.google.streamhtmlparser.util.CharacterRecorder;
import com.google.streamhtmlparser.util.EntityResolver;
import com.google.streamhtmlparser.util.HtmlUtils;

import java.util.Locale;
import java.util.Map;

/**
 * A custom specialized parser - ported from the main C++ version - used to
 * implement context-aware escaping of run-time data in web-application
 * templates.
 *
 * <p>This is the main class in the package. It implements the
 * {@code HtmlParser} interface.
 *
 * <p>This class is not thread-safe, in particular you cannot invoke any
 * state changing operations (such as {@code parse}) from multiple threads
 * on the same object.
 *
 * <p>If you are looking at this class, chances are very high you are
 * implementing Auto-Escaping for a new template system. Please see the
 * landing page including a design document at
 * <a href="http://go/autoescape">Auto-Escape Landing Page</a>.
 */
public class HtmlParserImpl extends GenericParser implements HtmlParser {

  /*
   * Internal representation of the parser state, which is at a
   * finer-granularity than the external state as given to callers.
   * The relationship between InternalState and ExternalState is a
   * many-to-one relationship (see initializeStateMapping()).
   *
   * The constants are initialized in declaration order, which is the same
   * order the original static initializer block used.
   */
  private static final InternalState TEXT =
      InternalState.getInstanceHtml("TEXT");
  private static final InternalState TAG_START =
      InternalState.getInstanceHtml("TAG_START");
  private static final InternalState TAG_NAME =
      InternalState.getInstanceHtml("TAG_NAME");
  private static final InternalState DECL_START =
      InternalState.getInstanceHtml("DECL_START");
  private static final InternalState DECL_BODY =
      InternalState.getInstanceHtml("DECL_BODY");
  private static final InternalState COM_OPEN =
      InternalState.getInstanceHtml("COM_OPEN");
  private static final InternalState COM_BODY =
      InternalState.getInstanceHtml("COM_BODY");
  private static final InternalState COM_DASH =
      InternalState.getInstanceHtml("COM_DASH");
  private static final InternalState COM_DASH_DASH =
      InternalState.getInstanceHtml("COM_DASH_DASH");
  private static final InternalState PI =
      InternalState.getInstanceHtml("PI");
  private static final InternalState PI_MAY_END =
      InternalState.getInstanceHtml("PI_MAY_END");
  private static final InternalState TAG_SPACE =
      InternalState.getInstanceHtml("TAG_SPACE");
  private static final InternalState TAG_CLOSE =
      InternalState.getInstanceHtml("TAG_CLOSE");
  private static final InternalState ATTR =
      InternalState.getInstanceHtml("ATTR");
  private static final InternalState ATTR_SPACE =
      InternalState.getInstanceHtml("ATTR_SPACE");
  private static final InternalState VALUE =
      InternalState.getInstanceHtml("VALUE");
  private static final InternalState VALUE_TEXT =
      InternalState.getInstanceHtml("VALUE_TEXT");
  private static final InternalState VALUE_Q_START =
      InternalState.getInstanceHtml("VALUE_Q_START");
  private static final InternalState VALUE_Q =
      InternalState.getInstanceHtml("VALUE_Q");
  private static final InternalState VALUE_DQ_START =
      InternalState.getInstanceHtml("VALUE_DQ_START");
  private static final InternalState VALUE_DQ =
      InternalState.getInstanceHtml("VALUE_DQ");
  private static final InternalState CDATA_COM_START =
      InternalState.getInstanceHtml("CDATA_COM_START");
  private static final InternalState CDATA_COM_START_DASH =
      InternalState.getInstanceHtml("CDATA_COM_START_DASH");
  private static final InternalState CDATA_COM_BODY =
      InternalState.getInstanceHtml("CDATA_COM_BODY");
  private static final InternalState CDATA_COM_DASH =
      InternalState.getInstanceHtml("CDATA_COM_DASH");
  private static final InternalState CDATA_COM_DASH_DASH =
      InternalState.getInstanceHtml("CDATA_COM_DASH_DASH");
  private static final InternalState CDATA_TEXT =
      InternalState.getInstanceHtml("CDATA_TEXT");
  private static final InternalState CDATA_LT =
      InternalState.getInstanceHtml("CDATA_LT");
  private static final InternalState CDATA_MAY_CLOSE =
      InternalState.getInstanceHtml("CDATA_MAY_CLOSE");
  private static final InternalState JS_FILE =
      InternalState.getInstanceHtml("JS_FILE");
  private static final InternalState CSS_FILE =
      InternalState.getInstanceHtml("CSS_FILE");

  private static final Map<InternalState, ExternalState> STATE_MAPPING =
      Maps.newHashMap();
  static {
    initializeStateMapping();
  }

  private static final ParserStateTable STATE_TABLE = new ParserStateTable();
  static {
    initializeParserStateTable();
  }

  private final CharacterRecorder tag;
  private final CharacterRecorder attr;
  private final CharacterRecorder value;
  private final CharacterRecorder cdataCloseTag;
  private final EntityResolver entityResolver;
  private final JavascriptParserImpl jsParser;
  private boolean insideJavascript;
  private int valueIndex;
  // True iff InsertText() was called at the start of a URL attribute value.
  private boolean textInsideUrlValue;

  /**
   * Creates an {@code HtmlParserImpl} object.
   *
   * <p>Both for performance reasons and to leverage a state-flow machine
   * that is automatically generated from Python for multiple target
   * languages, this object uses a static {@code ParserStateTable} that
   * is read-only and obtained from the generated code in {@code HtmlParserFsm}.
   * That code also maintains the mapping from internal states
   * ({@code InternalState}) to external states ({@code ExternalState}).
   */
  public HtmlParserImpl() {
    super(STATE_TABLE, STATE_MAPPING, TEXT);
    tag = new CharacterRecorder();
    attr = new CharacterRecorder();
    value = new CharacterRecorder();
    cdataCloseTag = new CharacterRecorder();
    entityResolver = new EntityResolver();
    jsParser = new JavascriptParserImpl();
    insideJavascript = false;
    valueIndex = 0;
    textInsideUrlValue = false;
  }

  /**
   * Creates an {@code HtmlParserImpl} that is a copy of the one provided.
   * Every recorder and sub-parser is copied through its own copy
   * constructor, so the two parsers do not share these objects.
   *
   * @param aHtmlParserImpl the {@code HtmlParserImpl} object to copy
   */
  public HtmlParserImpl(HtmlParserImpl aHtmlParserImpl) {
    super(aHtmlParserImpl);
    tag = new CharacterRecorder(aHtmlParserImpl.tag);
    attr = new CharacterRecorder(aHtmlParserImpl.attr);
    value = new CharacterRecorder(aHtmlParserImpl.value);
    cdataCloseTag = new CharacterRecorder(aHtmlParserImpl.cdataCloseTag);
    entityResolver = new EntityResolver(aHtmlParserImpl.entityResolver);
    jsParser = new JavascriptParserImpl(aHtmlParserImpl.jsParser);
    insideJavascript = aHtmlParserImpl.insideJavascript;
    valueIndex = aHtmlParserImpl.valueIndex;
    textInsideUrlValue = aHtmlParserImpl.textInsideUrlValue;
  }

  /**
   * Returns {@code true} when the javascript flag is set and the parser is
   * either inside an attribute value or in one of the CDATA / JS-file
   * internal states.
   */
  @Override
  public boolean inJavascript() {
    return insideJavascript
        && (getState() == STATE_VALUE || isCdataOrJsFileState(currentState));
  }

  /**
   * Returns {@code true} when the parser is inside javascript and the
   * javascript parser reports a single- or double-quoted state.
   */
  @Override
  public boolean isJavascriptQuoted() {
    if (!inJavascript()) {
      return false;
    }
    ExternalState jsParserState = jsParser.getState();
    return jsParserState == JavascriptParserImpl.STATE_Q
        || jsParserState == JavascriptParserImpl.STATE_DQ;
  }

  /**
   * Returns {@code true} when the external state is either the attribute
   * name state or the attribute value state.
   */
  @Override
  public boolean inAttribute() {
    ExternalState extState = getState();
    return extState == STATE_ATTR || extState == STATE_VALUE;
  }

  /**
   * Returns {@code true} if and only if the parser is currently within
   * a CSS context. A CSS context is one of the below:
   * <ul>
   * <li>Inside a STYLE tag.
   * <li>Inside a STYLE attribute.
   * <li>Inside a CSS file when the parser was reset in the CSS mode.
   * </ul>
   *
   * @return {@code true} if and only if the parser is inside CSS
   */
  @Override
  public boolean inCss() {
    return currentState == CSS_FILE
        || (getState() == STATE_VALUE
            && getAttributeType() == ATTR_TYPE.STYLE)
        || "style".equals(getTag());
  }

  /**
   * Classifies the attribute currently being parsed.
   *
   * @return {@code ATTR_TYPE.NONE} when not inside an attribute, otherwise
   *     the JS / URI / STYLE classification given by {@code HtmlUtils}, with
   *     {@code ATTR_TYPE.REGULAR} as the fallback
   */
  @Override
  public ATTR_TYPE getAttributeType() {
    if (!inAttribute()) {
      return ATTR_TYPE.NONE;
    }
    String attribute = getAttribute();
    if (HtmlUtils.isAttributeJavascript(attribute)) {
      return ATTR_TYPE.JS;
    }
    if (HtmlUtils.isAttributeUri(attribute)) {
      return ATTR_TYPE.URI;
    }
    if (HtmlUtils.isAttributeStyle(attribute)) {
      return ATTR_TYPE.STYLE;
    }

    // Special logic to handle the "content" attribute of the "meta" tag.
    if ("meta".equals(getTag()) && "content".equals(attribute)) {
      HtmlUtils.META_REDIRECT_TYPE redirectType =
          HtmlUtils.parseContentAttributeForUrl(getValue());
      if (redirectType == HtmlUtils.META_REDIRECT_TYPE.URL_START
          || redirectType == HtmlUtils.META_REDIRECT_TYPE.URL) {
        return ATTR_TYPE.URI;
      }
    }

    return ATTR_TYPE.REGULAR;
  }

  /** Returns the current external state of the embedded javascript parser. */
  @Override
  public ExternalState getJavascriptState() {
    return jsParser.getState();
  }

  /**
   * Returns {@code true} when the parser is in one of the single- or
   * double-quoted attribute value states.
   */
  @Override
  public boolean isAttributeQuoted() {
    return currentState == VALUE_Q_START
        || currentState == VALUE_Q
        || currentState == VALUE_DQ_START
        || currentState == VALUE_DQ;
  }

  /**
   * Returns the recorded tag name in lower case.
   *
   * <p>Lower-casing uses {@code Locale.ROOT} rather than the default locale:
   * the result is compared against ASCII literals such as {@code "script"},
   * and locale-sensitive case mapping (for example under a Turkish locale,
   * where {@code 'I'} maps to a dotless i) would make those comparisons fail.
   */
  @Override
  public String getTag() {
    return tag.getContent().toLowerCase(Locale.ROOT);
  }

  /**
   * Returns the recorded attribute name in lower case, or the empty string
   * when the parser is not inside an attribute. Lower-casing is
   * locale-independent for the same reason as in {@link #getTag()}.
   */
  @Override
  public String getAttribute() {
    return inAttribute() ? attr.getContent().toLowerCase(Locale.ROOT) : "";
  }

  /**
   * Returns the recorded attribute value, or the empty string when the
   * parser is not inside an attribute value.
   */
  @Override
  public String getValue() {
    return (getState() == STATE_VALUE) ? value.getContent() : "";
  }

  /**
   * Returns the number of value characters processed so far, or {@code 0}
   * when the parser is not inside an attribute value.
   */
  @Override
  public int getValueIndex() {
    if (getState() != STATE_VALUE) {
      return 0;
    }
    return valueIndex;
  }

  /**
   * Returns {@code true} when the parser is at the very start of a URL
   * inside a URI-typed attribute value and no {@link #insertText()} hint
   * was received at that position.
   */
  @Override
  public boolean isUrlStart() {
    // False when not inside an HTML attribute value
    if (getState() != STATE_VALUE) {
      return false;
    }

    // Or when the HTML attribute is not of URI type.
    if (getAttributeType() != ATTR_TYPE.URI) {
      return false;
    }

    // Or when we received an InsertText() directive at the start of a URL.
    if (textInsideUrlValue) {
      return false;
    }

    if ("meta".equals(getTag())) {
      // At this point, we know we are in the "content" attribute
      // or we would not have the URI attribute type.
      return HtmlUtils.parseContentAttributeForUrl(getValue())
          == HtmlUtils.META_REDIRECT_TYPE.URL_START;
    }

    // For all other URI attributes, check if we are at index 0.
    return getValueIndex() == 0;
  }

  /**
   * {@inheritDoc}
   *
   * Resets the state of the parser to a state consistent with the
   * {@code Mode} provided. This will reset finer-grained state
   * information back to a default value, hence use only when
   * you want to parse text from a very clean slate.
   *
   * <p>See the {@link HtmlParser.Mode} enum for information on all
   * the valid modes.
   *
   * @param mode is an enum representing the high-level state of the parser
   * @throws IllegalArgumentException if the mode is not one handled here
   */
  @Override
  public void resetMode(Mode mode) {
    insideJavascript = false;
    tag.reset();
    attr.reset();
    value.reset();
    cdataCloseTag.reset();
    valueIndex = 0;
    textInsideUrlValue = false;
    jsParser.reset();

    switch (mode) {
      case HTML:
        currentState = TEXT;
        break;
      case JS:
        currentState = JS_FILE;
        insideJavascript = true;
        break;
      case CSS:
        currentState = CSS_FILE;
        break;
      case HTML_IN_TAG:
        currentState = TAG_SPACE;
        break;
      default:
        throw new IllegalArgumentException("Did not recognize Mode: " +
                                           mode.toString());
    }
  }

  /**
   * Resets the state of the parser to the initial state of parsing HTML.
   */
  @Override
  public void reset() {
    super.reset();
    resetMode(Mode.HTML);
  }

  /**
   * A specialized directive to tell the parser there is some content
   * that will be inserted here but that it will not get to parse. Used
   * by the template system that may not be able to give some content
   * to the parser but wants it to know there typically will be content
   * inserted at that point. This is a hint used in corner cases within
   * parsing of HTML attribute names and values where content we do not
   * get to see could affect our parsing and alter our current state.
   *
   * <p>The two cases where {@code #insertText()} affects our parsing are:
   * <ul>
   * <li>We are at the start of the value of a URL-accepting HTML attribute. In
   * that case, we change internal state to no longer be considered at the
   * start of the URL. This may affect what escaping template systems may want
   * to perform on the HTML attribute value. We avoid injecting fake data and
   * hence do not modify the current index of the value as determined by
   * {@link #getValueIndex()}</li>
   * <li>We just transitioned from an attribute name to an attribute value
   * (by parsing the separating {@code '='} character). In that case, we
   * change internal state to be now inside a non-quoted HTML attribute
   * value.</li>
   * </ul>
   *
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  @Override
  public void insertText() throws ParseException {
    // Case: Inside URL attribute value.
    if (getState() == STATE_VALUE
        && getAttributeType() == ATTR_TYPE.URI
        && isUrlStart()) {
      textInsideUrlValue = true;
    }
    // Case: Before parsing any attribute value.
    if (currentState == VALUE) {
      setNextState(VALUE_TEXT);
    }
  }

  /**
   * State-entry hook: dispatches to the per-state "enter" helper for the
   * given state. Only {@code TAG_CLOSE} may override the next state; every
   * other state returns {@code expectedNextState} unchanged.
   *
   * @param currentState the state the hook is invoked for
   * @param expectedNextState the state to return unless overridden
   * @param input the character read (unused here)
   * @return the next state to go to
   */
  @Override
  protected InternalState handleEnterState(InternalState currentState,
                                           InternalState expectedNextState,
                                           char input) {
    InternalState nextState = expectedNextState;
    if (currentState == TAG_NAME) {
      enterTagName();
    } else if (currentState == ATTR) {
      enterAttribute();
    } else if (currentState == TAG_CLOSE) {
      nextState = tagClose(currentState);
    } else if (currentState == CDATA_MAY_CLOSE) {
      enterStateCdataMayClose();
    } else if (currentState == VALUE) {
      enterValue();
    } else if (isValueContentState(currentState)) {
      enterValueContent();
    }
    return nextState;
  }

  /**
   * State-exit hook: dispatches to the per-state "exit" helper for the
   * given state. Only {@code CDATA_MAY_CLOSE} may override the next state.
   *
   * @param currentState the state the hook is invoked for
   * @param expectedNextState the state to return unless overridden
   * @param input the character read
   * @return the next state to go to
   */
  @Override
  protected InternalState handleExitState(InternalState currentState,
                                          InternalState expectedNextState,
                                          char input) {
    InternalState nextState = expectedNextState;
    if (currentState == TAG_NAME) {
      exitTagName();
    } else if (currentState == ATTR) {
      exitAttribute();
    } else if (currentState == CDATA_MAY_CLOSE) {
      nextState = exitStateCdataMayClose(nextState, input);
    } else if (isValueContentState(currentState)) {
      exitValueContent();
    }
    return nextState;
  }

  /**
   * In-state hook: forwards the character to the CDATA or attribute-value
   * handler depending on the given state. Never changes the state.
   *
   * @param currentState the state the hook is invoked for
   * @param input the character read
   * @return {@code currentState}, unchanged
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  @Override
  protected InternalState handleInState(InternalState currentState,
                                        char input) throws ParseException {
    if (isCdataOrJsFileState(currentState)) {
      inStateCdata(input);
    } else if (isValueContentState(currentState)) {
      inStateValue(input);
    }
    return currentState;
  }

  /**
   * Invokes recording on all CharacterRecorder objects. Currently we do
   * not check that one and only one of them is recording. I did a fair
   * bit of testing on the C++ parser and was not convinced there is
   * such a guarantee.
   */
  @Override
  protected void record(char input) {
    attr.maybeRecord(input);
    tag.maybeRecord(input);
    value.maybeRecord(input);
    cdataCloseTag.maybeRecord(input);
  }

  /**
   * Returns {@code true} for the CDATA internal states and for
   * {@code JS_FILE}, i.e. the states in which characters are offered to the
   * javascript parser.
   */
  private static boolean isCdataOrJsFileState(InternalState state) {
    return state == CDATA_TEXT
        || state == CDATA_COM_START
        || state == CDATA_COM_START_DASH
        || state == CDATA_COM_BODY
        || state == CDATA_COM_DASH
        || state == CDATA_COM_DASH_DASH
        || state == CDATA_LT
        || state == CDATA_MAY_CLOSE
        || state == JS_FILE;
  }

  /**
   * Returns {@code true} for the three states that hold the actual content
   * of an attribute value (unquoted, single-quoted, double-quoted).
   */
  private static boolean isValueContentState(InternalState state) {
    return state == VALUE_TEXT || state == VALUE_Q || state == VALUE_DQ;
  }

  /**
   * Starts recording the name of the HTML tag. Called when the parser
   * enters a new tag.
   */
  private void enterTagName() {
    tag.startRecording();
  }

  /**
   * Stops recording the tag name. A recorded name that starts with
   * {@code '/'} (a closing tag) is discarded by resetting the recorder.
   */
  private void exitTagName() {
    tag.stopRecording();
    String tagString = tag.getContent();
    if (!tagString.isEmpty() && tagString.charAt(0) == '/') {
      tag.reset();
    }
  }

  /**
   * Starts recording the name of the HTML attribute. Called when the parser
   * enters a new HTML attribute.
   */
  private void enterAttribute() {
    attr.startRecording();
  }

  /** Stops recording the name of the HTML attribute. */
  private void exitAttribute() {
    attr.stopRecording();
  }

  /**
   * Tracks the index within the HTML attribute value and initializes
   * the javascript parser for attributes that take javascript.
   *
   * <p>Called when the parser enters a new HTML attribute value.
   */
  private void enterValue() {
    valueIndex = 0;
    textInsideUrlValue = false;
    if (HtmlUtils.isAttributeJavascript(getAttribute())) {
      entityResolver.reset();
      jsParser.reset();
      insideJavascript = true;
    } else {
      insideJavascript = false;
    }
  }

  /**
   * Starts recording the contents of the attribute value.
   *
   * <p>Called when entering an attribute value.
   */
  private void enterValueContent() {
    value.startRecording();
  }

  /**
   * Stops the recording of the attribute value and exits javascript
   * (in case we were inside it).
   */
  private void exitValueContent() {
    value.stopRecording();
    insideJavascript = false;
  }

  /**
   * Processes javascript after performing entity resolution and updates
   * the position within the attribute value.
   * If the status of the entity resolution is <code>IN_PROGRESS</code>,
   * we don't invoke the javascript parser.
   *
   * <p>Called for every character inside an attribute value.
   *
   * @param input character read
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  private void inStateValue(char input) throws ParseException {
    valueIndex++;
    if (insideJavascript) {
      EntityResolver.Status status = entityResolver.processChar(input);
      if (status == EntityResolver.Status.COMPLETED) {
        jsParser.parse(entityResolver.getEntity());
        entityResolver.reset();
      } else if (status == EntityResolver.Status.NOT_STARTED) {
        jsParser.parse(input);
      }
    }
  }

  /**
   * Handles the tag it finished reading.
   *
   * <p>For a script tag, it initializes the javascript parser. For all
   * tags that are recognized to have CDATA values
   * (including the script tag), it switches the CDATA state to handle them
   * properly. For code simplification, CDATA and RCDATA sections are
   * treated the same.
   *
   * <p>Called when the parser leaves a tag definition.
   *
   * @param state current state
   * @return state next state, could be the same as current state
   */
  private InternalState tagClose(InternalState state) {
    InternalState nextState = state;
    String tagName = getTag();
    if ("script".equals(tagName)) {
      nextState = CDATA_TEXT;
      jsParser.reset();
      insideJavascript = true;
    } else if ("style".equals(tagName)
               || "title".equals(tagName)
               || "textarea".equals(tagName)) {
      nextState = CDATA_TEXT;
      insideJavascript = false;
    }
    return nextState;
  }

  /**
   * Feeds the character to the javascript parser for processing.
   *
   * <p>Called inside CDATA blocks to parse javascript.
   *
   * @param input character read
   * @throws ParseException if an unrecoverable error occurred during parsing
   */
  private void inStateCdata(char input) throws ParseException {
    if (insideJavascript) {
      jsParser.parse(input);
    }
  }

  /**
   * Starts recording. This is so we find the closing tag name in order to
   * know if the tag is going to be closed or not.
   *
   * <p>Called when encountering a '<' character in a CDATA section.
   */
  private void enterStateCdataMayClose() {
    cdataCloseTag.startRecording();
  }

  /**
   * Determines whether to close the tag element. It closes it if it finds
   * the corresponding end tag. Called when reading what could be a
   * closing CDATA tag.
   *
   * @param expectedNextState the expected state to go to next
   *     unless we want to change it here
   * @param input the character read
   * @return the next state to go to
   */
  private InternalState exitStateCdataMayClose(
      InternalState expectedNextState,
      char input) {
    InternalState nextState = expectedNextState;
    cdataCloseTag.stopRecording();
    String cdataCloseTagString = cdataCloseTag.getContent();
    Preconditions.checkState(!cdataCloseTagString.isEmpty()
        && cdataCloseTagString.charAt(0) == '/');  // Developer error.

    // The recorded text starts with '/'; compare what follows it against
    // the name of the tag that opened this CDATA section.
    if (cdataCloseTagString.substring(1).equalsIgnoreCase(getTag())
        && (input == '>' || HtmlUtils.isHtmlSpace(input))) {
      tag.clear();
      insideJavascript = false;
    } else {
      nextState = CDATA_TEXT;
    }
    return nextState;
  }


  // ======================================================= //
  // SECTION BELOW WILL ALL BE AUTO-GENERATED IN FUTURE.     //
  // ======================================================= //

  private static void registerMapping(InternalState internalState,
                                      ExternalState externalState) {
    STATE_MAPPING.put(internalState, externalState);
  }

  private static void initializeStateMapping() {
    // Each parser implementation must map the error state appropriately.
    registerMapping(InternalState.INTERNAL_ERROR_STATE, HtmlParser.STATE_ERROR);

    registerMapping(TEXT, HtmlParser.STATE_TEXT);
    registerMapping(TAG_START, HtmlParser.STATE_TAG);
    registerMapping(TAG_NAME, HtmlParser.STATE_TAG);
    registerMapping(DECL_START, HtmlParser.STATE_TEXT);
    registerMapping(DECL_BODY, HtmlParser.STATE_TEXT);
    registerMapping(COM_OPEN, HtmlParser.STATE_TEXT);
    registerMapping(COM_BODY, HtmlParser.STATE_COMMENT);
    registerMapping(COM_DASH, HtmlParser.STATE_COMMENT);
    registerMapping(COM_DASH_DASH, HtmlParser.STATE_COMMENT);
    registerMapping(PI, HtmlParser.STATE_TEXT);
    registerMapping(PI_MAY_END, HtmlParser.STATE_TEXT);
    registerMapping(TAG_SPACE, HtmlParser.STATE_TAG);
    registerMapping(TAG_CLOSE, HtmlParser.STATE_TEXT);
    registerMapping(ATTR, HtmlParser.STATE_ATTR);
    registerMapping(ATTR_SPACE, HtmlParser.STATE_ATTR);
    registerMapping(VALUE, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_TEXT, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_Q_START, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_Q, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_DQ_START, HtmlParser.STATE_VALUE);
    registerMapping(VALUE_DQ, HtmlParser.STATE_VALUE);
    registerMapping(CDATA_COM_START, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_START_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_BODY, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_COM_DASH_DASH, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_TEXT, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_LT, HtmlParser.STATE_TEXT);
    registerMapping(CDATA_MAY_CLOSE, HtmlParser.STATE_TEXT);
    registerMapping(JS_FILE, HtmlParser.STATE_JS_FILE);
    registerMapping(CSS_FILE, HtmlParser.STATE_CSS_FILE);
  }

  private static void registerTransition(String expression,
                                         InternalState source,
                                         InternalState to) {
    // It seems too silly to go through a StateTableTransition here
    // but it adds extra data checking.
    StateTableTransition stt = new StateTableTransition(expression,
                                                        source, to);
    STATE_TABLE.setExpression(stt.getExpression(), stt.getFrom(),
                              stt.getTo());
  }

  // NOTE: The "[:default:]" transition should be registered before any
  //   other transitions for a given state or it will over-write them.
  private static void initializeParserStateTable() {
    registerTransition("[:default:]", CSS_FILE, CSS_FILE);
    registerTransition("[:default:]", JS_FILE, JS_FILE);
    registerTransition("[:default:]", CDATA_MAY_CLOSE, CDATA_TEXT);
    registerTransition(" \t\n\r", CDATA_MAY_CLOSE, TAG_SPACE);
    registerTransition(">", CDATA_MAY_CLOSE, TEXT);
    registerTransition("A-Za-z0-9/_:-", CDATA_MAY_CLOSE, CDATA_MAY_CLOSE);
    registerTransition("[:default:]", CDATA_LT, CDATA_TEXT);
    registerTransition("!", CDATA_LT, CDATA_COM_START);
    registerTransition("/", CDATA_LT, CDATA_MAY_CLOSE);
    registerTransition("[:default:]", CDATA_TEXT, CDATA_TEXT);
    registerTransition("<", CDATA_TEXT, CDATA_LT);
    registerTransition("[:default:]", CDATA_COM_DASH_DASH, CDATA_COM_BODY);
    registerTransition(">", CDATA_COM_DASH_DASH, CDATA_TEXT);
    registerTransition("-", CDATA_COM_DASH_DASH, CDATA_COM_DASH_DASH);
    registerTransition("[:default:]", CDATA_COM_DASH, CDATA_COM_BODY);
    registerTransition("-", CDATA_COM_DASH, CDATA_COM_DASH_DASH);
    registerTransition("[:default:]", CDATA_COM_BODY, CDATA_COM_BODY);
    registerTransition("-", CDATA_COM_BODY, CDATA_COM_DASH);
    registerTransition("[:default:]", CDATA_COM_START_DASH, CDATA_TEXT);
    registerTransition("-", CDATA_COM_START_DASH, CDATA_COM_BODY);
    registerTransition("[:default:]", CDATA_COM_START, CDATA_TEXT);
    registerTransition("-", CDATA_COM_START, CDATA_COM_START_DASH);
    registerTransition("[:default:]", VALUE_DQ, VALUE_DQ);
    registerTransition("\"", VALUE_DQ, TAG_SPACE);
    registerTransition("[:default:]", VALUE_DQ_START, VALUE_DQ);
    registerTransition("\"", VALUE_DQ_START, TAG_SPACE);
    registerTransition("[:default:]", VALUE_Q, VALUE_Q);
    registerTransition("\'", VALUE_Q, TAG_SPACE);
    registerTransition("[:default:]", VALUE_Q_START, VALUE_Q);
    registerTransition("\'", VALUE_Q_START, TAG_SPACE);
    registerTransition("[:default:]", VALUE_TEXT, VALUE_TEXT);
    registerTransition(" \t\n\r", VALUE_TEXT, TAG_SPACE);
    registerTransition(">", VALUE_TEXT, TAG_CLOSE);
    registerTransition("[:default:]", VALUE, VALUE_TEXT);
    registerTransition(">", VALUE, TAG_CLOSE);
    registerTransition(" \t\n\r", VALUE, VALUE);
    registerTransition("\"", VALUE, VALUE_DQ_START);
    registerTransition("\'", VALUE, VALUE_Q_START);
    registerTransition("=", ATTR_SPACE, VALUE);
    registerTransition("/", ATTR_SPACE, TAG_SPACE);
    registerTransition("A-Za-z0-9_:-", ATTR_SPACE, ATTR);
    registerTransition(" \t\n\r", ATTR_SPACE, ATTR_SPACE);
    registerTransition(">", ATTR_SPACE, TAG_CLOSE);
    registerTransition(" \t\n\r", ATTR, ATTR_SPACE);
    registerTransition("=", ATTR, VALUE);
    registerTransition("/", ATTR, TAG_SPACE);
    registerTransition(">", ATTR, TAG_CLOSE);
    registerTransition("A-Za-z0-9_:.-", ATTR, ATTR);
    registerTransition("[:default:]", TAG_CLOSE, TEXT);
    registerTransition("<", TAG_CLOSE, TAG_START);
    registerTransition("/", TAG_SPACE, TAG_SPACE);
    registerTransition("A-Za-z0-9_:-", TAG_SPACE, ATTR);
    registerTransition(" \t\n\r", TAG_SPACE, TAG_SPACE);
    registerTransition(">", TAG_SPACE, TAG_CLOSE);
    registerTransition("[:default:]", PI_MAY_END, PI);
    registerTransition(">", PI_MAY_END, TEXT);
    registerTransition("[:default:]", PI, PI);
    registerTransition("?", PI, PI_MAY_END);
    registerTransition("[:default:]", COM_DASH_DASH, COM_BODY);
    registerTransition(">", COM_DASH_DASH, TEXT);
    registerTransition("-", COM_DASH_DASH, COM_DASH_DASH);
    registerTransition("[:default:]", COM_DASH, COM_BODY);
    registerTransition("-", COM_DASH, COM_DASH_DASH);
    registerTransition("[:default:]", COM_BODY, COM_BODY);
    registerTransition("-", COM_BODY, COM_DASH);
    registerTransition("[:default:]", COM_OPEN, TEXT);
    registerTransition("-", COM_OPEN, COM_BODY);
    registerTransition("[:default:]", DECL_BODY, DECL_BODY);
    registerTransition(">", DECL_BODY, TEXT);
    registerTransition("[:default:]", DECL_START, DECL_BODY);
    registerTransition(">", DECL_START, TEXT);
    registerTransition("-", DECL_START, COM_OPEN);
    registerTransition(">", TAG_NAME, TAG_CLOSE);
    registerTransition(" \t\n\r", TAG_NAME, TAG_SPACE);
    registerTransition("A-Za-z0-9/_:-", TAG_NAME, TAG_NAME);

    // Manual change to remain in-sync with CL 10597850 in C HtmlParser.
    registerTransition("[:default:]", TAG_START, TEXT);
    registerTransition("<", TAG_START, TAG_START);
    // End of manual change.

    registerTransition("!", TAG_START, DECL_START);
    registerTransition("?", TAG_START, PI);
    registerTransition("A-Za-z0-9/_:-", TAG_START, TAG_NAME);
    registerTransition("[:default:]", TEXT, TEXT);
    registerTransition("<", TEXT, TAG_START);
  }
}