1/** 2 * Copyright (c) 2004, Google Inc. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16package com.android.mail.lib.html.parser; 17 18import com.android.mail.lib.base.CharEscapers; 19import com.android.mail.lib.base.CharMatcher; 20import com.android.mail.lib.base.StringUtil; 21import com.android.mail.lib.base.X; 22import com.google.common.collect.Lists; 23 24import java.io.PrintWriter; 25import java.io.StringWriter; 26import java.util.ArrayList; 27import java.util.Arrays; 28import java.util.List; 29 30 31/** 32 * HtmlDocument is a container for a list of html nodes, and represents the 33 * entire html document. It contains toHTML() method which prints out the html 34 * text, toXHTML for printing out XHTML text and toString() which prints out in 35 * debug format. 36 * 37 * @author jlim@google.com (Jing Yee Lim) 38 */ 39public class HtmlDocument { 40 /** List of Node objects */ 41 private final List<Node> nodes; 42 43 /** 44 * Creates a Html document. 45 * @param nodes list of html nodes 46 */ 47 public HtmlDocument(List<Node> nodes) { 48 this.nodes = nodes; 49 } 50 51 /** Gets the list of nodes */ 52 public List<Node> getNodes() { 53 return nodes; 54 } 55 56 /** Returns a HTML string for the current document */ 57 public String toHTML() { 58 StringBuilder sb = new StringBuilder(nodes.size() * 10); 59 for (Node n : nodes) { 60 n.toHTML(sb); 61 } 62 return sb.toString(); 63 } 64 65 /** Returns a XHTML string for the current document */ 66 public String toXHTML() { 67 StringBuilder sb = new StringBuilder(nodes.size() * 10); 68 for (Node n : nodes) { 69 n.toXHTML(sb); 70 } 71 return sb.toString(); 72 } 73 74 /** 75 * Returns, as much as possible, original content of preparsed nodes. This 76 * is only different from toHTML() if the nodes were created with original 77 * content, e.g., by HtmlParser in preserve mode. 78 */ 79 public String toOriginalHTML() { 80 StringBuilder sb = new StringBuilder(nodes.size() * 10); 81 for (Node n : nodes) { 82 n.toOriginalHTML(sb); 83 } 84 return sb.toString(); 85 } 86 87 /** Returns the HTML document in debug format */ 88 @Override 89 public String toString() { 90 StringWriter strWriter = new StringWriter(); 91 accept(new DebugPrinter(new PrintWriter(strWriter))); 92 return strWriter.toString(); 93 } 94 95 /** 96 * Creates start Tag Node. 97 * @see HtmlDocument#createTag(HTML.Element, List, String, String) 98 */ 99 public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) { 100 return createTag(element, attributes, null, null); 101 } 102 103 /** 104 * Creates start Tag Node. 105 * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String) 106 */ 107 public static Tag createTag(HTML.Element element, 108 List<TagAttribute> attributes, String originalHtmlBeforeAttributes, 109 String originalHtmlAfterAttributes) { 110 return new Tag(element, attributes, false, originalHtmlBeforeAttributes, 111 originalHtmlAfterAttributes); 112 } 113 114 /** 115 * Creates self-terminating Tag Node. 116 * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String) 117 */ 118 public static Tag createSelfTerminatingTag(HTML.Element element, 119 List<TagAttribute> attributes) { 120 return createSelfTerminatingTag(element, attributes, null, null); 121 } 122 123 /** 124 * Creates self-terminating Tag Node. 125 * @see HtmlDocument#createTag(HTML.Element, List, String, String) 126 */ 127 public static Tag createSelfTerminatingTag(HTML.Element element, 128 List<TagAttribute> attributes, String originalHtmlBeforeAttributes, 129 String originalHtmlAfterAttributes) { 130 return new Tag(element, attributes, true, originalHtmlBeforeAttributes, 131 originalHtmlAfterAttributes); 132 } 133 134 /** 135 * @see HtmlDocument#createEndTag(HTML.Element, String) 136 */ 137 public static EndTag createEndTag(HTML.Element element) { 138 return createEndTag(element, null); 139 } 140 141 /** 142 * @see HtmlDocument.EndTag#EndTag(HTML.Element, String) 143 */ 144 public static EndTag createEndTag(HTML.Element element, String originalHtml) { 145 return new EndTag(element, originalHtml); 146 } 147 148 /** 149 * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String) 150 */ 151 public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) { 152 return createTagAttribute(attr, value, null); 153 } 154 155 /** 156 * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String) 157 */ 158 public static TagAttribute createTagAttribute(HTML.Attribute attr, 159 String value, String originalHtml) { 160 X.assertTrue(attr != null); 161 return new TagAttribute(attr, value, originalHtml); 162 } 163 164 /** 165 * @see HtmlDocument#createText(String, String) 166 */ 167 public static Text createText(String text) { 168 return createText(text, null); 169 } 170 171 /** 172 * Creates a Text node. 173 * @see UnescapedText#UnescapedText(String, String) 174 */ 175 public static Text createText(String text, String original) { 176 return new UnescapedText(text, original); 177 } 178 179 /** 180 * Creates a Text node where the content hasn't been unescaped yet (this will 181 * be done lazily). 182 */ 183 public static Text createEscapedText(String htmlText, String original) { 184 return new EscapedText(htmlText, original); 185 } 186 187 /** 188 * Creates an Comment node. 189 * @see Comment#Comment(String) 190 */ 191 public static Comment createHtmlComment(String content) { 192 return new Comment(content); 193 } 194 195 /** 196 * Creates a CDATA node. 197 * @see CDATA#CDATA(String) 198 */ 199 public static CDATA createCDATA(String text) { 200 return new CDATA(text); 201 } 202 203 /** Accepts a Visitor */ 204 public void accept(Visitor v) { 205 v.start(); 206 for (Node node : nodes) { 207 node.accept(v); 208 } 209 v.finish(); 210 } 211 212 /** 213 * @param filter results of this filter replace the existing nodes 214 * @return new document with filtered nodes 215 */ 216 public HtmlDocument filter(MultiplexFilter filter) { 217 filter.start(); 218 List<Node> newNodes = new ArrayList<Node>(); 219 for (Node node : nodes) { 220 filter.filter(node, newNodes); 221 } 222 filter.finish(newNodes); 223 return new HtmlDocument(newNodes); 224 } 225 226 /** 227 * Html node 228 */ 229 public static abstract class Node { 230 231 /** Accepts a visitor */ 232 public abstract void accept(Visitor visitor); 233 234 /** Converts to HTML */ 235 public String toHTML() { 236 StringBuilder sb = new StringBuilder(); 237 toHTML(sb); 238 return sb.toString(); 239 } 240 241 /** Converts to HTML */ 242 public abstract void toHTML(StringBuilder sb); 243 244 /** Converts to XHTML */ 245 public String toXHTML() { 246 StringBuilder sb = new StringBuilder(); 247 toXHTML(sb); 248 return sb.toString(); 249 } 250 251 /** Converts to XHTML */ 252 public abstract void toXHTML(StringBuilder sb); 253 254 /** 255 * @return Original if it's available; otherwise, returns 256 * <code>toHTML()</code> 257 */ 258 public String toOriginalHTML() { 259 StringBuilder sb = new StringBuilder(); 260 toOriginalHTML(sb); 261 return sb.toString(); 262 } 263 264 /** 265 * @param sb Destination of HTML to be appended. Appends original if it's 266 * available; otherwise, appends <code>toHTML()</code> 267 */ 268 public abstract void toOriginalHTML(StringBuilder sb); 269 } 270 271 /** 272 * HTML comment node. 273 */ 274 public static class Comment extends Node { 275 276 private final String content; 277 278 /** 279 * @param content Raw comment, including "<!--" and "-->". 280 */ 281 public Comment(String content) { 282 this.content = content; 283 } 284 285 @Override 286 public void accept(Visitor visitor) { 287 visitor.visitComment(this); 288 } 289 290 /** 291 * Emit original unchanged. 292 * @param sb Destination of result. 293 */ 294 @Override 295 public void toHTML(StringBuilder sb) { 296 sb.append(content); 297 } 298 299 /** 300 * Emit original unchanged. 301 * @param sb Destination of result. 302 */ 303 @Override 304 public void toXHTML(StringBuilder sb) { 305 sb.append(content); 306 } 307 308 /** 309 * Emit original unchanged. 310 * @param sb Destination of result. 311 */ 312 @Override 313 public void toOriginalHTML(StringBuilder sb) { 314 sb.append(content); 315 } 316 317 /** 318 * @return Original unchanged. 319 */ 320 public String getContent() { 321 return content; 322 } 323 } 324 325 /** 326 * Text node 327 */ 328 public static abstract class Text extends Node { 329 330 /** 331 * unaltered original content of this node 332 */ 333 private final String originalHtml; 334 335 /** 336 * content of this node in HTML format 337 */ 338 private String html; 339 340 /** 341 * @param originalHtml Unaltered original HTML. If not null, 342 * toOriginalHTML() will return this. 343 */ 344 protected Text(String originalHtml) { 345 this.originalHtml = originalHtml; 346 } 347 348 /** 349 * Gets the plain, unescaped text. 350 */ 351 abstract public String getText(); 352 353 // Returns true if it contains only white space 354 public boolean isWhitespace() { 355 String text = getText(); 356 int len = text.length(); 357 for (int i = 0; i < len; i++) { 358 if (!Character.isWhitespace(text.charAt(i))) { 359 return false; 360 } 361 } 362 return true; 363 } 364 365 @Override 366 public boolean equals(Object o) { 367 if (o == this) { 368 return true; 369 } 370 if (o instanceof Text) { 371 Text that = (Text) o; 372 373 return this.originalHtml == null ? that.originalHtml == null 374 : this.originalHtml.equals(that.originalHtml); 375 } 376 return false; 377 } 378 379 @Override 380 public int hashCode() { 381 return originalHtml == null ? 0 : originalHtml.hashCode(); 382 } 383 384 @Override 385 public String toString() { 386 return getText(); 387 } 388 389 /** Extends Node.accept */ 390 @Override 391 public void accept(Visitor visitor) { 392 visitor.visitText(this); 393 } 394 395 /** 396 * Gets the HTML, with HTML entities escaped. 397 */ 398 @Override 399 public void toHTML(StringBuilder sb) { 400 if (html == null) { 401 html = CharEscapers.asciiHtmlEscaper().escape(getText()); 402 } 403 sb.append(html); 404 } 405 406 /** 407 * @see HtmlDocument.Text#toHTML(StringBuilder) 408 */ 409 @Override 410 public void toXHTML(StringBuilder sb) { 411 toHTML(sb); 412 } 413 414 /** 415 * @param sb Appends original HTML to this if available. Otherwise, 416 * same as toHTML(). 417 */ 418 @Override 419 public void toOriginalHTML(StringBuilder sb) { 420 if (originalHtml != null) { 421 sb.append(originalHtml); 422 } else { 423 toHTML(sb); 424 } 425 } 426 427 /** 428 * @return the original HTML (possibly with entities unescaped if the 429 * document was malformed). May be null if original HTML was not preserved 430 * (see constructor argument of {@link HtmlParser}) 431 */ 432 public String getOriginalHTML() { 433 return originalHtml; 434 } 435 } 436 437 /** 438 * {@link Text} implementation where the given text is assumed to have been 439 * already HTML unescaped. 440 */ 441 private static class UnescapedText extends Text { 442 /** 443 * content of this node as plain, unescaped text 444 */ 445 protected final String text; 446 447 private UnescapedText(String plainText, String originalHtml) { 448 super(originalHtml); 449 X.assertTrue(plainText != null); 450 this.text = plainText; 451 } 452 453 @Override public String getText() { 454 return text; 455 } 456 } 457 458 /** 459 * {@link Text} implementation where the given text is not unescaped yet, and 460 * unescaping will only be done lazily. 461 */ 462 private static class EscapedText extends Text { 463 private final String htmlText; 464 private String text; 465 466 private EscapedText(String htmlText, String originalHtml) { 467 super(originalHtml); 468 this.htmlText = htmlText; 469 } 470 471 @Override public String getText() { 472 if (text == null) { 473 text = StringUtil.unescapeHTML(htmlText); 474 } 475 return text; 476 } 477 } 478 479 /** 480 * CDATA node is a subclass of Text node. 481 */ 482 public static class CDATA extends UnescapedText { 483 private CDATA(String text) { 484 super(text, text); 485 } 486 487 @Override public void toHTML(StringBuilder sb) { 488 // Do not htmlescape CDATA text 489 sb.append(text); 490 } 491 492 @Override public void toXHTML(StringBuilder sb) { 493 sb.append("<![CDATA[") 494 .append(text) 495 .append("]]>"); 496 } 497 } 498 499 /** 500 * Tag is a HTML open tag. 501 */ 502 public static class Tag extends Node { 503 // The element 504 private final HTML.Element element; 505 506 // List of TagAttribute objects. This may be null. 507 private List<TagAttribute> attributes; 508 509 private final boolean isSelfTerminating; 510 511 private final String originalHtmlBeforeAttributes; 512 513 private final String originalHtmlAfterAttributes; 514 515 /** 516 * @param element the HTML4 element 517 * @param attributes list of TagAttribute objects, may be null 518 * @param isSelfTerminating 519 * @param originalHtmlBeforeAttributes Original tag's full content before 520 * first attribute, including beginning '<'. This should not 521 * include preceeding whitespace for the first attribute, as that 522 * should be included in the attribute node. If not null, tag will 523 * preserve this original content. e.g., if original tag were 524 * "<foO bar='zbc'>", case of foO would be preserved. This 525 * method does not validate that 526 * <code>originalHtmlBeforeAttributes</code> is a valid tag String. 527 * @param originalHtmlAfterAttributes Full content of original tag after 528 * last attribute, including ending '>'. If not null, tag will 529 * preserve this original content. e.g., if original tag were 530 * "<foo bar='zbc' >", the spaces before '>' be preserved. 531 * This method does not validate that 532 * <code>originalHtmlAfterAttributes</code> is a valid tag String. 533 */ 534 private Tag(HTML.Element element, List<TagAttribute> attributes, 535 boolean isSelfTerminating, String originalHtmlBeforeAttributes, 536 String originalHtmlAfterAttributes) { 537 X.assertTrue(element != null); 538 this.element = element; 539 this.attributes = attributes; 540 this.isSelfTerminating = isSelfTerminating; 541 this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes; 542 this.originalHtmlAfterAttributes = originalHtmlAfterAttributes; 543 } 544 545 /** Gets the name */ 546 public String getName() { 547 return element.getName(); 548 } 549 550 /** Gets the element */ 551 public HTML.Element getElement() { 552 return element; 553 } 554 555 /** Adds an attribute */ 556 public void addAttribute(HTML.Attribute attr, String value) { 557 X.assertTrue(attr != null); 558 addAttribute(new TagAttribute(attr, value, null)); 559 } 560 561 /** Adds an attribute */ 562 public void addAttribute(TagAttribute attr) { 563 X.assertTrue(attr != null); 564 if (attributes == null) { 565 attributes = new ArrayList<TagAttribute>(); 566 } 567 attributes.add(attr); 568 } 569 570 /** Gets the list of attributes, note that this maybe null. */ 571 public List<TagAttribute> getAttributes() { 572 return attributes; 573 } 574 575 /** Finds and returns a TagAttribute, or null if not found */ 576 public TagAttribute getAttribute(HTML.Attribute attr) { 577 if (attributes != null) { 578 for (TagAttribute attribute : attributes) { 579 if (attribute.getAttribute().equals(attr)) { 580 return attribute; 581 } 582 } 583 } 584 return null; 585 } 586 587 /** 588 * Finds and returns list of TagAttribute of given attribute 589 * type, or empty list if not found, 590 */ 591 public List<TagAttribute> getAttributes(HTML.Attribute attr) { 592 List<TagAttribute> result = Lists.newArrayList(); 593 if (attributes != null) { 594 for (TagAttribute attribute : attributes) { 595 if (attribute.getAttribute().equals(attr)) { 596 result.add(attribute); 597 } 598 } 599 } 600 return result; 601 } 602 603 /** Returns debug string */ 604 @Override 605 public String toString() { 606 StringBuilder sb = new StringBuilder(); 607 sb.append("Start Tag: "); 608 sb.append(element.getName()); 609 if (attributes != null) { 610 for (TagAttribute attr : attributes) { 611 sb.append(' '); 612 sb.append(attr.toString()); 613 } 614 } 615 return sb.toString(); 616 } 617 618 /** Implements Node.accept */ 619 @Override 620 public void accept(Visitor visitor) { 621 visitor.visitTag(this); 622 } 623 624 /** Implements Node.toHTML */ 625 @Override 626 public void toHTML(StringBuilder sb) { 627 serialize(sb, SerializeType.HTML); 628 } 629 630 @Override 631 public void toXHTML(StringBuilder sb) { 632 serialize(sb, SerializeType.XHTML); 633 } 634 635 @Override 636 public void toOriginalHTML(StringBuilder sb) { 637 serialize(sb, SerializeType.ORIGINAL_HTML); 638 } 639 640 /** 641 * Specifies format of serialized output. 642 */ 643 private enum SerializeType { 644 ORIGINAL_HTML, HTML, XHTML 645 } 646 647 private void serialize(StringBuilder sb, SerializeType type) { 648 // before attributes 649 if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) { 650 sb.append(originalHtmlBeforeAttributes); 651 } else { 652 sb.append('<'); 653 sb.append(element.getName()); 654 } 655 656 // attributes 657 if (attributes != null) { 658 for (TagAttribute attr : attributes) { 659 // attribute includes leading whitespace, so we needn't add it here 660 if (type == SerializeType.ORIGINAL_HTML) { 661 attr.toOriginalHTML(sb); 662 } else if (type == SerializeType.HTML) { 663 attr.toHTML(sb); 664 } else { 665 attr.toXHTML(sb); 666 } 667 } 668 } 669 670 // after attributes 671 if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) { 672 sb.append(originalHtmlAfterAttributes); 673 } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) { 674 sb.append(" />"); 675 } else { 676 sb.append('>'); 677 } 678 } 679 680 public boolean isSelfTerminating() { 681 return isSelfTerminating; 682 } 683 684 public String getOriginalHtmlBeforeAttributes() { 685 return originalHtmlBeforeAttributes; 686 } 687 688 public String getOriginalHtmlAfterAttributes() { 689 return originalHtmlAfterAttributes; 690 } 691 } 692 693 /** 694 * EndTag is a closing HTML tag. 695 */ 696 public static class EndTag extends Node { 697 // The element 698 private final HTML.Element element; 699 700 private final String originalHtml; 701 702 /** 703 * @param element The HTML.Element element. Can not be null. 704 * @param originalHtml Full content of original tag, including beginning 705 * and ending '<' and '>'. If not null, tag will preserve this original 706 * content. e.g., if original tag were "</foo >", the space after foo 707 * would be preserved. This method does not validate that originalHtml is a 708 * valid tag String. 709 */ 710 private EndTag(HTML.Element element, String originalHtml) { 711 X.assertTrue(element != null); 712 this.element = element; 713 this.originalHtml = originalHtml; 714 } 715 716 /** Gets the name */ 717 public String getName() { 718 return element.getName(); 719 } 720 721 /** Gets the element */ 722 public HTML.Element getElement() { 723 return element; 724 } 725 726 /** Returns debug string */ 727 @Override 728 public String toString() { 729 return "End Tag: " + element.getName(); 730 } 731 732 /** Implements Node.accept */ 733 @Override 734 public void accept(Visitor visitor) { 735 visitor.visitEndTag(this); 736 } 737 738 /** Implements Node.toHTML */ 739 @Override 740 public void toHTML(StringBuilder sb) { 741 sb.append("</"); 742 sb.append(element.getName()); 743 sb.append('>'); 744 } 745 746 @Override 747 public void toXHTML(StringBuilder sb) { 748 toHTML(sb); 749 } 750 751 @Override 752 public void toOriginalHTML(StringBuilder sb) { 753 if (originalHtml != null) { 754 sb.append(originalHtml); 755 } else { 756 toHTML(sb); 757 } 758 } 759 } 760 761 /** 762 * TagAttribute represents an attribute in a HTML tag. 763 */ 764 public static class TagAttribute { 765 private final HTML.Attribute attribute; 766 private String value; 767 private String originalHtml; 768 769 /** 770 * @param attribute the HTML.Attribute. Can't be null. 771 * @param value The value in plain-text format. This can be null if the 772 * attribute has no value. 773 * @param originalHtml If not null, toOriginalHTML() will preserve original 774 * content. This should contain any leading whitespace from the 775 * original. 776 */ 777 private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) { 778 X.assertTrue(attribute != null); 779 this.attribute = attribute; 780 this.value = value; 781 this.originalHtml = originalHtml; 782 } 783 784 /** Gets the name */ 785 public String getName() { 786 return attribute.getName(); 787 } 788 789 /** Gets the HTML.Attribute information */ 790 public HTML.Attribute getAttribute() { 791 return attribute; 792 } 793 794 /** 795 * Sets the attribute value. 796 * This value must be in plain-text, not html-escaped. 797 * This can be null, if the attribute has no values. 798 * This clears <code>originalHtml_</code> if it were set, so 799 * <code>toOriginalHTML()</code> might not preserve original any more. 800 */ 801 public void setValue(String value) { 802 this.value = value; 803 originalHtml = null; 804 } 805 806 /** Returns the attribute value in plain-text, never null */ 807 public String getValue() { 808 return value != null ? value : ""; 809 } 810 811 /** Returns true if the attribute value is not empty */ 812 public boolean hasValue() { 813 return value != null; 814 } 815 816 /** 817 * Writes out the attribute in HTML format with all necessary preceding 818 * whitespace. Emits originalHtml_ if it were specified to the constructor. 819 * Otherwise, emits a new name="value" string with a single preceding space. 820 */ 821 public void toHTML(StringBuilder sb) { 822 sb.append(' '); 823 sb.append(attribute.getName()); 824 if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) { 825 sb.append("=\""); 826 sb.append(CharEscapers.asciiHtmlEscaper().escape(value)); 827 sb.append("\""); 828 } 829 } 830 831 /** Returns the attribute html string */ 832 public String toHTML() { 833 StringBuilder sb = new StringBuilder(); 834 toHTML(sb); 835 return sb.toString(); 836 } 837 838 /** 839 * Writes out the attribute in XHTML format (value is always appended, 840 * even if it is empty) with all necessary preceeding whitespace. 841 */ 842 public void toXHTML(StringBuilder sb) { 843 sb.append(' '); 844 sb.append(attribute.getName()).append("=\""); 845 846 // Assume that value-less attribute are boolean attributes like "disabled" 847 if (hasValue()) { 848 sb.append(CharEscapers.asciiHtmlEscaper().escape(value)); 849 } else { 850 sb.append(attribute.getName()); 851 } 852 853 sb.append("\""); 854 } 855 856 /** Returns the attribute XHTML string */ 857 public String toXHTML() { 858 StringBuilder sb = new StringBuilder(); 859 toXHTML(sb); 860 return sb.toString(); 861 } 862 863 /** 864 * @param sb Destination to which attribute is written, in its original 865 * preparsed form if possible. 866 */ 867 public void toOriginalHTML(StringBuilder sb) { 868 if (originalHtml != null) { 869 sb.append(originalHtml); 870 } else { 871 toHTML(sb); 872 } 873 } 874 875 /** 876 * Writes out the attribute in its original form as it was parsed.. 877 */ 878 public String toOriginalHTML() { 879 StringBuilder sb = new StringBuilder(); 880 toOriginalHTML(sb); 881 return sb.toString(); 882 } 883 884 @Override 885 public String toString() { 886 return "{" + attribute.getName() + "=" + value + "}"; 887 } 888 } 889 890 /** 891 * Filter is like Visitor, except it implies that the nodes may be changed, 892 * whereas HtmlDocument.Visitor just implies that the nodes are iterated 893 * over. A Filter can behave just like a Visitor if it merely returns the 894 * same node that it visited. Also, methods may be called on a node to change 895 * the values it contains. Alternatively, a new node entirely can be created 896 * and returned, which will essentially replace the previous node with the 897 * new node in the document tree. A node may be removed by returning null 898 * instead of a node. 899 */ 900 public static interface Filter { 901 /** This is called first */ 902 void start(); 903 904 /** A text node */ 905 Text visitText(Text n); 906 907 /** An open tag */ 908 Tag visitTag(Tag n); 909 910 /** End tag */ 911 EndTag visitEndTag(EndTag n); 912 913 /** HTML comment */ 914 Comment visitComment(Comment n); 915 916 /* Called at the end. */ 917 void finish(); 918 } 919 920 /** 921 * Like Filter, except each node may be replaced by multiple nodes. Also, 922 * does not do double dispatch accept/visit. 923 */ 924 public static interface MultiplexFilter { 925 /** 926 * Called first. 927 */ 928 void start(); 929 930 /** 931 * @param originalNode node to filter 932 * @param out Destination to which this object appends nodes to replace 933 * originalNode. Can not be null. 934 */ 935 void filter(Node originalNode, List<Node> out); 936 937 /** 938 * Called at the end. 939 * @param out Destination to which this object appends nodes at the end of 940 * the document. Can not be null. 941 */ 942 void finish(List<Node> out); 943 } 944 945 /** 946 * Converts a normal {@link Filter} into a {@link MultiplexFilter}. 947 */ 948 public static class MultiplexFilterAdapter implements MultiplexFilter { 949 950 private final Filter filter; 951 952 public MultiplexFilterAdapter(Filter filter) { 953 this.filter = filter; 954 } 955 956 public void start() { 957 filter.start(); 958 } 959 960 public void filter(Node originalNode, List<Node> out) { 961 if (originalNode == null) { 962 return; 963 } 964 965 Node resultNode; 966 if (originalNode instanceof Tag) { 967 resultNode = filter.visitTag((Tag) originalNode); 968 } else if (originalNode instanceof Text) { 969 resultNode = filter.visitText((Text) originalNode); 970 } else if (originalNode instanceof EndTag) { 971 resultNode = filter.visitEndTag((EndTag) originalNode); 972 } else if (originalNode instanceof Comment) { 973 resultNode = filter.visitComment((Comment) originalNode); 974 } else { 975 throw new IllegalArgumentException("unknown node type: " + originalNode.getClass()); 976 } 977 978 if (resultNode != null) { 979 out.add(resultNode); 980 } 981 } 982 983 public void finish(List<Node> out) { 984 filter.finish(); 985 } 986 } 987 988 /** 989 * Like Filter, except each node may be replaced by multiple nodes. Also, 990 * does not do double dispatch accept/visit. Dispatches filterNode() to 991 * node-specific methods. 992 */ 993 public static abstract class SimpleMultiplexFilter implements MultiplexFilter { 994 995 /** 996 * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List) 997 */ 998 public void filter(Node originalNode, List<Node> out) { 999 if (originalNode == null) { 1000 return; 1001 } 1002 1003 if (originalNode instanceof Tag) { 1004 filterTag((Tag) originalNode, out); 1005 } else if (originalNode instanceof Text) { 1006 filterText((Text) originalNode, out); 1007 } else if (originalNode instanceof EndTag) { 1008 filterEndTag((EndTag) originalNode, out); 1009 } else if (originalNode instanceof Comment) { 1010 filterComment((Comment) originalNode, out); 1011 } else { 1012 throw new IllegalArgumentException("unknown node type: " 1013 + originalNode.getClass()); 1014 } 1015 } 1016 1017 public abstract void filterTag(Tag originalTag, List<Node> out); 1018 1019 public abstract void filterText(Text originalText, List<Node> out); 1020 1021 public abstract void filterEndTag(EndTag originalEndTag, List<Node> out); 1022 1023 public void filterComment(Comment originalComment, List<Node> out) { 1024 } 1025 } 1026 1027 /** 1028 * Contains a list of filters which are applied, in order, to each Node. The 1029 * output of each becomes the input to the next. As soon as one returns an 1030 * empty list it breaks the chain. 1031 */ 1032 public static class MultiplexFilterChain implements MultiplexFilter { 1033 1034 private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>(); 1035 1036 /** 1037 * @param sourceFilters these filters are applied in List order 1038 */ 1039 public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) { 1040 filters.addAll(sourceFilters); 1041 } 1042 1043 /** 1044 * @see HtmlDocument.MultiplexFilter#start() 1045 */ 1046 public void start() { 1047 for (MultiplexFilter filter : filters) { 1048 filter.start(); 1049 } 1050 } 1051 1052 /** 1053 * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List) 1054 */ 1055 public void filter(Node originalNode, List<Node> out) { 1056 List<Node> result = new ArrayList<Node>(); 1057 result.add(originalNode); 1058 1059 // loop through filters until one returns nothing, or until we're out of 1060 // filters 1061 for (MultiplexFilter filter : filters) { 1062 if (result.isEmpty()) { 1063 return; 1064 } 1065 1066 // apply filter to each node and collect results 1067 List<Node> newResult = new ArrayList<Node>(); 1068 for (Node node : result) { 1069 filter.filter(node, newResult); 1070 } 1071 result = newResult; 1072 } 1073 1074 out.addAll(result); 1075 } 1076 1077 /** 1078 * @see HtmlDocument.MultiplexFilter#finish(List) 1079 */ 1080 public void finish(List<Node> out) { 1081 List<Node> result = new ArrayList<Node>(); 1082 1083 // loop through filters until one returns nothing, or until we're out of 1084 // filters 1085 for (MultiplexFilter filter : filters) { 1086 // apply filter to each node and collect results 1087 List<Node> newResult = new ArrayList<Node>(); 1088 for (Node node : result) { 1089 filter.filter(node, newResult); 1090 } 1091 filter.finish(newResult); 1092 result = newResult; 1093 } 1094 1095 out.addAll(result); 1096 } 1097 } 1098 1099 /** 1100 * Html visitor allows external code to iterate through the nodes in the 1101 * document. See HtmlDocument.accept. 1102 */ 1103 public static interface Visitor { 1104 /** This is called first */ 1105 void start(); 1106 1107 /** A text node */ 1108 void visitText(Text n); 1109 1110 /** An open tag */ 1111 void visitTag(Tag n); 1112 1113 /** End tag */ 1114 void visitEndTag(EndTag n); 1115 1116 /** comment */ 1117 void visitComment(Comment n); 1118 1119 /* Called at the end. */ 1120 void finish(); 1121 } 1122 1123 /** 1124 * An implementation of the Visitor interface which simply delegates its 1125 * methods to a wrapped instance of another Visitor. 1126 * 1127 * <p>This is useful for chaining Visitors together. 1128 */ 1129 public static class VisitorWrapper implements Visitor { 1130 private final Visitor wrapped; 1131 1132 protected VisitorWrapper(Visitor wrap) { 1133 wrapped = wrap; 1134 } 1135 1136 public void start() { 1137 wrapped.start(); 1138 } 1139 1140 public void visitText(Text n) { 1141 wrapped.visitText(n); 1142 } 1143 1144 public void visitTag(Tag n) { 1145 wrapped.visitTag(n); 1146 } 1147 1148 public void visitEndTag(EndTag n) { 1149 wrapped.visitEndTag(n); 1150 } 1151 1152 public void visitComment(Comment n) { 1153 wrapped.visitComment(n); 1154 } 1155 1156 public void finish() { 1157 wrapped.finish(); 1158 } 1159 } 1160 1161 /** 1162 * A special helper Visitor that builds a HtmlDocument. 1163 */ 1164 public static class Builder implements Visitor { 1165 private final boolean preserveComments; 1166 private final List<Node> nodes = new ArrayList<Node>(); 1167 private HtmlDocument doc; 1168 1169 /** 1170 * @see Builder#Builder(boolean) 1171 */ 1172 public Builder() { 1173 this(false); 1174 } 1175 1176 /** 1177 * @param preserveComments If false, ignores Comment nodes 1178 */ 1179 public Builder(boolean preserveComments) { 1180 this.preserveComments = preserveComments; 1181 } 1182 1183 public void addNode(Node node) { 1184 nodes.add(node); 1185 } 1186 public void start() { 1187 } 1188 public void visitText(Text t) { 1189 addNode(t); 1190 } 1191 public void visitTag(Tag t) { 1192 addNode(t); 1193 } 1194 public void visitComment(Comment n) { 1195 if (preserveComments) { 1196 addNode(n); 1197 } 1198 } 1199 public void visitEndTag(EndTag t) { 1200 addNode(t); 1201 } 1202 public void finish() { 1203 doc = new HtmlDocument(nodes); 1204 } 1205 1206 /** Gets the html document that has been constructed */ 1207 public HtmlDocument getDocument() { 1208 return doc; 1209 } 1210 } 1211 1212 /** 1213 * A Visitor that prints out the html document in debug format. 1214 */ 1215 public static class DebugPrinter implements Visitor { 1216 1217 private final PrintWriter writer; 1218 1219 public DebugPrinter(PrintWriter writer) { 1220 this.writer = writer; 1221 } 1222 1223 public void start() { 1224 } 1225 1226 public void visitText(Text t) { 1227 writeCollapsed("TEXT", t.getText()); 1228 } 1229 1230 public void visitComment(Comment n) { 1231 writeCollapsed("COMMENT", n.getContent()); 1232 } 1233 1234 private void writeCollapsed(String type, String s) { 1235 writer.print(type); 1236 writer.print(": "); 1237 String noNewlines = s.replace("\n", " "); 1238 // Use CharMatcher#WHITESPACE? 1239 String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' '); 1240 writer.print(collapsed); 1241 } 1242 1243 public void visitTag(Tag tag) { 1244 writer.print("==<" + tag.getName() + ">"); 1245 List<TagAttribute> attributes = tag.getAttributes(); 1246 if (attributes != null) { 1247 1248 // Attribute values 1249 List<String> attrs = new ArrayList<String>(); 1250 for (TagAttribute a : attributes) { 1251 attrs.add("[" + a.getName() + " : " + a.getValue() + "]"); 1252 } 1253 String[] array = attrs.toArray(new String[attrs.size()]); 1254 1255 // Sort the attributes so that it's easier to read and compare 1256 Arrays.sort(array); 1257 for (int i = 0; i < array.length; i++) { 1258 writer.print(" " + array[i]); 1259 } 1260 } 1261 writer.println(); 1262 } 1263 1264 public void visitEndTag(EndTag endtag) { 1265 writer.println("==</" + endtag.getName() + ">"); 1266 } 1267 1268 public void finish() { 1269 } 1270 } 1271 1272}