1/**
2 * Copyright (c) 2004, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package com.google.android.mail.common.html.parser;
18
19import com.google.android.mail.common.base.CharEscapers;
20import com.google.android.mail.common.base.CharMatcher;
21import com.google.android.mail.common.base.StringUtil;
22import com.google.android.mail.common.base.X;
23import com.google.common.collect.Lists;
24
25import java.io.PrintWriter;
26import java.io.StringWriter;
27import java.util.ArrayList;
28import java.util.Arrays;
29import java.util.List;
30
31
32/**
33 * HtmlDocument is a container for a list of html nodes, and represents the
34 * entire html document. It contains toHTML() method which prints out the html
35 * text, toXHTML for printing out XHTML text and toString() which prints out in
36 * debug format.
37 *
38 * @author jlim@google.com (Jing Yee Lim)
39 */
40public class HtmlDocument {
41  /** List of Node objects */
42  private final List<Node> nodes;
43
44  /**
45   * Creates a Html document.
46   * @param nodes list of html nodes
47   */
48  public HtmlDocument(List<Node> nodes) {
49    this.nodes = nodes;
50  }
51
52  /** Gets the list of nodes */
53  public List<Node> getNodes() {
54    return nodes;
55  }
56
57  /** Returns a HTML string for the current document */
58  public String toHTML() {
59    StringBuilder sb = new StringBuilder(nodes.size() * 10);
60    for (Node n : nodes) {
61      n.toHTML(sb);
62    }
63    return sb.toString();
64  }
65
66  /** Returns a XHTML string for the current document */
67  public String toXHTML() {
68    StringBuilder sb = new StringBuilder(nodes.size() * 10);
69    for (Node n : nodes) {
70      n.toXHTML(sb);
71    }
72    return sb.toString();
73  }
74
75  /**
76   * Returns, as much as possible, original content of preparsed nodes.  This
77   * is only different from toHTML() if the nodes were created with original
78   * content, e.g., by HtmlParser in preserve mode.
79   */
80  public String toOriginalHTML() {
81    StringBuilder sb = new StringBuilder(nodes.size() * 10);
82    for (Node n : nodes) {
83      n.toOriginalHTML(sb);
84    }
85    return sb.toString();
86  }
87
88  /** Returns the HTML document in debug format */
89  @Override
90  public String toString() {
91    StringWriter strWriter = new StringWriter();
92    accept(new DebugPrinter(new PrintWriter(strWriter)));
93    return strWriter.toString();
94  }
95
96  /**
97   * Creates start Tag Node.
98   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
99   */
100  public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
101    return createTag(element, attributes, null, null);
102  }
103
104  /**
105   * Creates start Tag Node.
106   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
107   */
108  public static Tag createTag(HTML.Element element,
109      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
110      String originalHtmlAfterAttributes) {
111    return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
112        originalHtmlAfterAttributes);
113  }
114
115  /**
116   * Creates self-terminating Tag Node.
117   * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
118   */
119  public static Tag createSelfTerminatingTag(HTML.Element element,
120      List<TagAttribute> attributes) {
121    return createSelfTerminatingTag(element, attributes, null, null);
122  }
123
124  /**
125   * Creates self-terminating Tag Node.
126   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
127   */
128  public static Tag createSelfTerminatingTag(HTML.Element element,
129      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
130      String originalHtmlAfterAttributes) {
131    return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
132        originalHtmlAfterAttributes);
133  }
134
135  /**
136   * @see HtmlDocument#createEndTag(HTML.Element, String)
137   */
138  public static EndTag createEndTag(HTML.Element element) {
139    return createEndTag(element, null);
140  }
141
142  /**
143   * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
144   */
145  public static EndTag createEndTag(HTML.Element element, String originalHtml) {
146    return new EndTag(element, originalHtml);
147  }
148
149  /**
150   * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
151   */
152  public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
153    return createTagAttribute(attr, value, null);
154  }
155
156  /**
157   * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
158   */
159  public static TagAttribute createTagAttribute(HTML.Attribute attr,
160      String value, String originalHtml) {
161    X.assertTrue(attr != null);
162    return new TagAttribute(attr, value, originalHtml);
163  }
164
165  /**
166   * @see HtmlDocument#createText(String, String)
167   */
168  public static Text createText(String text) {
169    return createText(text, null);
170  }
171
172  /**
173   * Creates a Text node.
174   * @see UnescapedText#UnescapedText(String, String)
175   */
176  public static Text createText(String text, String original) {
177    return new UnescapedText(text, original);
178  }
179
180  /**
181   * Creates a Text node where the content hasn't been unescaped yet (this will
182   * be done lazily).
183   */
184  public static Text createEscapedText(String htmlText, String original) {
185    return new EscapedText(htmlText, original);
186  }
187
188  /**
189   * Creates an Comment node.
190   * @see Comment#Comment(String)
191   */
192  public static Comment createHtmlComment(String content) {
193    return new Comment(content);
194  }
195
196  /**
197   * Creates a CDATA node.
198   * @see CDATA#CDATA(String)
199   */
200  public static CDATA createCDATA(String text) {
201    return new CDATA(text);
202  }
203
204  /** Accepts a Visitor */
205  public void accept(Visitor v) {
206    v.start();
207    for (Node node : nodes) {
208      node.accept(v);
209    }
210    v.finish();
211  }
212
213  /**
214   * @param filter results of this filter replace the existing nodes
215   * @return new document with filtered nodes
216   */
217  public HtmlDocument filter(MultiplexFilter filter) {
218    filter.start();
219    List<Node> newNodes = new ArrayList<Node>();
220    for (Node node : nodes) {
221      filter.filter(node, newNodes);
222    }
223    filter.finish(newNodes);
224    return new HtmlDocument(newNodes);
225  }
226
227  /**
228   * Html node
229   */
230  public static abstract class Node {
231
232    /** Accepts a visitor */
233    public abstract void accept(Visitor visitor);
234
235    /** Converts to HTML */
236    public String toHTML() {
237      StringBuilder sb = new StringBuilder();
238      toHTML(sb);
239      return sb.toString();
240    }
241
242    /** Converts to HTML */
243    public abstract void toHTML(StringBuilder sb);
244
245    /** Converts to XHTML */
246    public String toXHTML() {
247      StringBuilder sb = new StringBuilder();
248      toXHTML(sb);
249      return sb.toString();
250    }
251
252    /** Converts to XHTML */
253    public abstract void toXHTML(StringBuilder sb);
254
255    /**
256     * @return Original if it's available; otherwise, returns
257     * <code>toHTML()</code>
258     */
259    public String toOriginalHTML() {
260      StringBuilder sb = new StringBuilder();
261      toOriginalHTML(sb);
262      return sb.toString();
263    }
264
265    /**
266     * @param sb Destination of HTML to be appended.  Appends original if it's
267     * available; otherwise, appends <code>toHTML()</code>
268     */
269    public abstract void toOriginalHTML(StringBuilder sb);
270  }
271
272  /**
273   * HTML comment node.
274   */
275  public static class Comment extends Node {
276
277    private final String content;
278
279    /**
280     * @param content Raw comment, including "&lt;!--" and "--&gt;".
281     */
282    public Comment(String content) {
283      this.content = content;
284    }
285
286    @Override
287    public void accept(Visitor visitor) {
288      visitor.visitComment(this);
289    }
290
291    /**
292     * Emit original unchanged.
293     * @param sb Destination of result.
294     */
295    @Override
296    public void toHTML(StringBuilder sb) {
297      sb.append(content);
298    }
299
300    /**
301     * Emit original unchanged.
302     * @param sb Destination of result.
303     */
304    @Override
305    public void toXHTML(StringBuilder sb) {
306      sb.append(content);
307    }
308
309    /**
310     * Emit original unchanged.
311     * @param sb Destination of result.
312     */
313    @Override
314    public void toOriginalHTML(StringBuilder sb) {
315      sb.append(content);
316    }
317
318    /**
319     * @return Original unchanged.
320     */
321    public String getContent() {
322      return content;
323    }
324  }
325
326  /**
327   * Text node
328   */
329  public static abstract class Text extends Node {
330
331    /**
332     * unaltered original content of this node
333     */
334    private final String originalHtml;
335
336    /**
337     * content of this node in HTML format
338     */
339    private String html;
340
341    /**
342     * @param originalHtml Unaltered original HTML. If not null,
343     *        toOriginalHTML() will return this.
344     */
345    protected Text(String originalHtml) {
346      this.originalHtml = originalHtml;
347    }
348
349    /**
350     * Gets the plain, unescaped text.
351     */
352    abstract public String getText();
353
354    // Returns true if it contains only white space
355    public boolean isWhitespace() {
356      String text = getText();
357      int len = text.length();
358      for (int i = 0; i < len; i++) {
359        if (!Character.isWhitespace(text.charAt(i))) {
360          return false;
361        }
362      }
363      return true;
364    }
365
366    @Override
367    public boolean equals(Object o) {
368      if (o == this) {
369        return true;
370      }
371      if (o instanceof Text) {
372        Text that = (Text) o;
373
374        return this.originalHtml == null ? that.originalHtml == null
375            : this.originalHtml.equals(that.originalHtml);
376      }
377      return false;
378    }
379
380    @Override
381    public int hashCode() {
382      return originalHtml == null ? 0 : originalHtml.hashCode();
383    }
384
385    @Override
386    public String toString() {
387      return getText();
388    }
389
390    /** Extends Node.accept */
391    @Override
392    public void accept(Visitor visitor) {
393      visitor.visitText(this);
394    }
395
396    /**
397     * Gets the HTML, with HTML entities escaped.
398     */
399    @Override
400    public void toHTML(StringBuilder sb) {
401      if (html == null) {
402        html = CharEscapers.asciiHtmlEscaper().escape(getText());
403      }
404      sb.append(html);
405    }
406
407    /**
408     * @see HtmlDocument.Text#toHTML(StringBuilder)
409     */
410    @Override
411    public void toXHTML(StringBuilder sb) {
412      toHTML(sb);
413    }
414
415    /**
416     * @param sb Appends original HTML to this if available.  Otherwise,
417     * same as toHTML().
418     */
419    @Override
420    public void toOriginalHTML(StringBuilder sb) {
421      if (originalHtml != null) {
422        sb.append(originalHtml);
423      } else {
424        toHTML(sb);
425      }
426    }
427
428    /**
429     * @return the original HTML (possibly with entities unescaped if the
430     * document was malformed). May be null if original HTML was not preserved
431     * (see constructor argument of {@link HtmlParser})
432     */
433    public String getOriginalHTML() {
434      return originalHtml;
435    }
436  }
437
438  /**
439   * {@link Text} implementation where the given text is assumed to have been
440   * already HTML unescaped.
441   */
442  private static class UnescapedText extends Text {
443    /**
444     * content of this node as plain, unescaped text
445     */
446    protected final String text;
447
448    private UnescapedText(String plainText, String originalHtml) {
449      super(originalHtml);
450      X.assertTrue(plainText != null);
451      this.text = plainText;
452    }
453
454    @Override public String getText() {
455      return text;
456    }
457  }
458
459  /**
460   * {@link Text} implementation where the given text is not unescaped yet, and
461   * unescaping will only be done lazily.
462   */
463  private static class EscapedText extends Text {
464    private final String htmlText;
465    private String text;
466
467    private EscapedText(String htmlText, String originalHtml) {
468      super(originalHtml);
469      this.htmlText = htmlText;
470    }
471
472    @Override public String getText() {
473      if (text == null) {
474        text = StringUtil.unescapeHTML(htmlText);
475      }
476      return text;
477    }
478  }
479
480  /**
481   * CDATA node is a subclass of Text node.
482   */
483  public static class CDATA extends UnescapedText {
484    private CDATA(String text) {
485      super(text, text);
486    }
487
488    @Override public void toHTML(StringBuilder sb) {
489      // Do not htmlescape CDATA text
490      sb.append(text);
491    }
492
493    @Override public void toXHTML(StringBuilder sb) {
494      sb.append("<![CDATA[")
495        .append(text)
496        .append("]]>");
497    }
498  }
499
500  /**
501   * Tag is a HTML open tag.
502   */
503  public static class Tag extends Node {
504    // The element
505    private final HTML.Element element;
506
507    // List of TagAttribute objects. This may be null.
508    private List<TagAttribute> attributes;
509
510    private final boolean isSelfTerminating;
511
512    private final String originalHtmlBeforeAttributes;
513
514    private final String originalHtmlAfterAttributes;
515
516    /**
517     * @param element the HTML4 element
518     * @param attributes list of TagAttribute objects, may be null
519     * @param isSelfTerminating
520     * @param originalHtmlBeforeAttributes Original tag's full content before
521     *        first attribute, including beginning '&lt;'. This should not
522     *        include preceeding whitespace for the first attribute, as that
523     *        should be included in the attribute node. If not null, tag will
524     *        preserve this original content. e.g., if original tag were
525     *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
526     *        method does not validate that
527     *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
528     * @param originalHtmlAfterAttributes Full content of original tag after
529     *        last attribute, including ending '>'. If not null, tag will
530     *        preserve this original content. e.g., if original tag were
531     *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' be preserved.
532     *        This method does not validate that
533     *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
534     */
535    private Tag(HTML.Element element, List<TagAttribute> attributes,
536        boolean isSelfTerminating, String originalHtmlBeforeAttributes,
537        String originalHtmlAfterAttributes) {
538      X.assertTrue(element != null);
539      this.element = element;
540      this.attributes = attributes;
541      this.isSelfTerminating = isSelfTerminating;
542      this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
543      this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
544    }
545
546    /** Gets the name */
547    public String getName() {
548      return element.getName();
549    }
550
551    /** Gets the element */
552    public HTML.Element getElement() {
553      return element;
554    }
555
556    /** Adds an attribute */
557    public void addAttribute(HTML.Attribute attr, String value) {
558      X.assertTrue(attr != null);
559      addAttribute(new TagAttribute(attr, value, null));
560    }
561
562    /** Adds an attribute */
563    public void addAttribute(TagAttribute attr) {
564      X.assertTrue(attr != null);
565      if (attributes == null) {
566        attributes = new ArrayList<TagAttribute>();
567      }
568      attributes.add(attr);
569    }
570
571    /** Gets the list of attributes, note that this maybe null. */
572    public List<TagAttribute> getAttributes() {
573      return attributes;
574    }
575
576    /** Finds and returns a TagAttribute, or null if not found */
577    public TagAttribute getAttribute(HTML.Attribute attr) {
578      if (attributes != null) {
579        for (TagAttribute attribute : attributes) {
580          if (attribute.getAttribute().equals(attr)) {
581            return attribute;
582          }
583        }
584      }
585      return null;
586    }
587
588    /**
589     * Finds and returns list of TagAttribute of given attribute
590     * type, or empty list if not found,
591     */
592    public List<TagAttribute> getAttributes(HTML.Attribute attr) {
593      List<TagAttribute> result = Lists.newArrayList();
594      if (attributes != null) {
595        for (TagAttribute attribute : attributes) {
596          if (attribute.getAttribute().equals(attr)) {
597            result.add(attribute);
598          }
599        }
600      }
601      return result;
602    }
603
604    /** Returns debug string */
605    @Override
606    public String toString() {
607      StringBuilder sb = new StringBuilder();
608      sb.append("Start Tag: ");
609      sb.append(element.getName());
610      if (attributes != null) {
611        for (TagAttribute attr : attributes) {
612          sb.append(' ');
613          sb.append(attr.toString());
614        }
615      }
616      return sb.toString();
617    }
618
619    /** Implements Node.accept */
620    @Override
621    public void accept(Visitor visitor) {
622      visitor.visitTag(this);
623    }
624
625    /** Implements Node.toHTML */
626    @Override
627    public void toHTML(StringBuilder sb) {
628      serialize(sb, SerializeType.HTML);
629    }
630
631    @Override
632    public void toXHTML(StringBuilder sb) {
633      serialize(sb, SerializeType.XHTML);
634    }
635
636    @Override
637    public void toOriginalHTML(StringBuilder sb) {
638      serialize(sb, SerializeType.ORIGINAL_HTML);
639    }
640
641    /**
642     * Specifies format of serialized output.
643     */
644    private enum SerializeType {
645      ORIGINAL_HTML, HTML, XHTML
646    }
647
648    private void serialize(StringBuilder sb, SerializeType type) {
649      // before attributes
650      if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
651        sb.append(originalHtmlBeforeAttributes);
652      } else {
653        sb.append('<');
654        sb.append(element.getName());
655      }
656
657      // attributes
658      if (attributes != null) {
659        for (TagAttribute attr : attributes) {
660          // attribute includes leading whitespace, so we needn't add it here
661          if (type == SerializeType.ORIGINAL_HTML) {
662            attr.toOriginalHTML(sb);
663          } else if (type == SerializeType.HTML) {
664            attr.toHTML(sb);
665          } else {
666            attr.toXHTML(sb);
667          }
668        }
669      }
670
671      // after attributes
672      if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
673        sb.append(originalHtmlAfterAttributes);
674      } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
675        sb.append(" />");
676      } else {
677        sb.append('>');
678      }
679    }
680
681    public boolean isSelfTerminating() {
682      return isSelfTerminating;
683    }
684
685    public String getOriginalHtmlBeforeAttributes() {
686      return originalHtmlBeforeAttributes;
687    }
688
689    public String getOriginalHtmlAfterAttributes() {
690      return originalHtmlAfterAttributes;
691    }
692  }
693
694  /**
695   * EndTag is a closing HTML tag.
696   */
697  public static class EndTag extends Node {
698    // The element
699    private final HTML.Element element;
700
701    private final String originalHtml;
702
703    /**
704     * @param element The HTML.Element element.  Can not be null.
705     * @param originalHtml Full content of original tag, including beginning
706     * and ending '<' and '>'.  If not null, tag will preserve this original
707     * content. e.g., if original tag were "&lt;/foo &gt;", the space after foo
708     * would be preserved.  This method does not validate that originalHtml is a
709     * valid tag String.
710     */
711    private EndTag(HTML.Element element, String originalHtml) {
712      X.assertTrue(element != null);
713      this.element = element;
714      this.originalHtml = originalHtml;
715    }
716
717    /** Gets the name */
718    public String getName() {
719      return element.getName();
720    }
721
722    /** Gets the element */
723    public HTML.Element getElement() {
724      return element;
725    }
726
727    /** Returns debug string */
728    @Override
729    public String toString() {
730      return "End Tag: " + element.getName();
731    }
732
733    /** Implements Node.accept */
734    @Override
735    public void accept(Visitor visitor) {
736      visitor.visitEndTag(this);
737    }
738
739    /** Implements Node.toHTML */
740    @Override
741    public void toHTML(StringBuilder sb) {
742      sb.append("</");
743      sb.append(element.getName());
744      sb.append('>');
745    }
746
747    @Override
748    public void toXHTML(StringBuilder sb) {
749      toHTML(sb);
750    }
751
752    @Override
753    public void toOriginalHTML(StringBuilder sb) {
754      if (originalHtml != null) {
755        sb.append(originalHtml);
756      } else {
757        toHTML(sb);
758      }
759    }
760  }
761
762  /**
763   * TagAttribute represents an attribute in a HTML tag.
764   */
765  public static class TagAttribute {
766    private final HTML.Attribute attribute;
767    private String value;
768    private String originalHtml;
769
770    /**
771     * @param attribute the HTML.Attribute. Can't be null.
772     * @param value The value in plain-text format. This can be null if the
773     *        attribute has no value.
774     * @param originalHtml If not null, toOriginalHTML() will preserve original
775     *        content. This should contain any leading whitespace from the
776     *        original.
777     */
778    private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
779      X.assertTrue(attribute != null);
780      this.attribute = attribute;
781      this.value = value;
782      this.originalHtml = originalHtml;
783    }
784
785    /** Gets the name */
786    public String getName() {
787      return attribute.getName();
788    }
789
790    /** Gets the HTML.Attribute information */
791    public HTML.Attribute getAttribute() {
792      return attribute;
793    }
794
795    /**
796     * Sets the attribute value.
797     * This value must be in plain-text, not html-escaped.
798     * This can be null, if the attribute has no values.
799     * This clears <code>originalHtml_</code> if it were set, so
800     * <code>toOriginalHTML()</code> might not preserve original any more.
801     */
802    public void setValue(String value) {
803      this.value = value;
804      originalHtml = null;
805    }
806
807    /** Returns the attribute value in plain-text, never null */
808    public String getValue() {
809      return value != null ? value : "";
810    }
811
812    /** Returns true if the attribute value is not empty */
813    public boolean hasValue() {
814      return value != null;
815    }
816
817    /**
818     * Writes out the attribute in HTML format with all necessary preceding
819     * whitespace. Emits originalHtml_ if it were specified to the constructor.
820     * Otherwise, emits a new name="value" string with a single preceding space.
821     */
822    public void toHTML(StringBuilder sb) {
823      sb.append(' ');
824      sb.append(attribute.getName());
825      if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
826        sb.append("=\"");
827        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
828        sb.append("\"");
829      }
830    }
831
832    /** Returns the attribute html string */
833    public String toHTML() {
834      StringBuilder sb = new StringBuilder();
835      toHTML(sb);
836      return sb.toString();
837    }
838
839    /**
840     * Writes out the attribute in XHTML format (value is always appended,
841     * even if it is empty) with all necessary preceeding whitespace.
842     */
843    public void toXHTML(StringBuilder sb) {
844      sb.append(' ');
845      sb.append(attribute.getName()).append("=\"");
846
847      // Assume that value-less attribute are boolean attributes like "disabled"
848      if (hasValue()) {
849        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
850      } else {
851        sb.append(attribute.getName());
852      }
853
854      sb.append("\"");
855    }
856
857    /** Returns the attribute XHTML string */
858    public String toXHTML() {
859      StringBuilder sb = new StringBuilder();
860      toXHTML(sb);
861      return sb.toString();
862    }
863
864    /**
865     * @param sb Destination to which attribute is written, in its original
866     * preparsed form if possible.
867     */
868    public void toOriginalHTML(StringBuilder sb) {
869      if (originalHtml != null) {
870        sb.append(originalHtml);
871      } else {
872        toHTML(sb);
873      }
874    }
875
876    /**
877     * Writes out the attribute in its original form as it was parsed..
878     */
879    public String toOriginalHTML() {
880      StringBuilder sb = new StringBuilder();
881      toOriginalHTML(sb);
882      return sb.toString();
883    }
884
885    @Override
886    public String toString() {
887      return "{" + attribute.getName() + "=" + value + "}";
888    }
889  }
890
891  /**
892   * Filter is like Visitor, except it implies that the nodes may be changed,
893   * whereas HtmlDocument.Visitor just implies that the nodes are iterated
894   * over. A Filter can behave just like a Visitor if it merely returns the
895   * same node that it visited. Also, methods may be called on a node to change
896   * the values it contains. Alternatively, a new node entirely can be created
897   * and returned, which will essentially replace the previous node with the
898   * new node in the document tree. A node may be removed by returning null
899   * instead of a node.
900   */
901  public static interface Filter {
902    /** This is called first */
903    void start();
904
905    /** A text node */
906    Text visitText(Text n);
907
908    /** An open tag */
909    Tag visitTag(Tag n);
910
911    /** End tag */
912    EndTag visitEndTag(EndTag n);
913
914    /** HTML comment */
915    Comment visitComment(Comment n);
916
917    /* Called at the end. */
918    void finish();
919  }
920
921  /**
922   * Like Filter, except each node may be replaced by multiple nodes.  Also,
923   * does not do double dispatch accept/visit.
924   */
925  public static interface MultiplexFilter {
926    /**
927     * Called first.
928     */
929    void start();
930
931    /**
932     * @param originalNode node to filter
933     * @param out Destination to which this object appends nodes to replace
934     * originalNode.  Can not be null.
935     */
936    void filter(Node originalNode, List<Node> out);
937
938    /**
939     * Called at the end.
940     * @param out Destination to which this object appends nodes at the end of
941     * the document.  Can not be null.
942     */
943    void finish(List<Node> out);
944  }
945
946  /**
947   * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
948   */
949  public static class MultiplexFilterAdapter implements MultiplexFilter {
950
951    private final Filter filter;
952
953    public MultiplexFilterAdapter(Filter filter) {
954      this.filter = filter;
955    }
956
957    public void start() {
958      filter.start();
959    }
960
961    public void filter(Node originalNode, List<Node> out) {
962      if (originalNode == null) {
963        return;
964      }
965
966      Node resultNode;
967      if (originalNode instanceof Tag) {
968        resultNode = filter.visitTag((Tag) originalNode);
969      } else if (originalNode instanceof Text) {
970        resultNode = filter.visitText((Text) originalNode);
971      } else if (originalNode instanceof EndTag) {
972        resultNode = filter.visitEndTag((EndTag) originalNode);
973      } else if (originalNode instanceof Comment) {
974        resultNode = filter.visitComment((Comment) originalNode);
975      } else {
976        throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
977      }
978
979      if (resultNode != null) {
980        out.add(resultNode);
981      }
982    }
983
984    public void finish(List<Node> out) {
985      filter.finish();
986    }
987  }
988
989  /**
990   * Like Filter, except each node may be replaced by multiple nodes.  Also,
991   * does not do double dispatch accept/visit.  Dispatches filterNode() to
992   * node-specific methods.
993   */
994  public static abstract class SimpleMultiplexFilter implements MultiplexFilter {
995
996    /**
997     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
998     */
999    public void filter(Node originalNode, List<Node> out) {
1000      if (originalNode == null) {
1001        return;
1002      }
1003
1004      if (originalNode instanceof Tag) {
1005        filterTag((Tag) originalNode, out);
1006      } else if (originalNode instanceof Text) {
1007        filterText((Text) originalNode, out);
1008      } else if (originalNode instanceof EndTag) {
1009        filterEndTag((EndTag) originalNode, out);
1010      } else if (originalNode instanceof Comment) {
1011        filterComment((Comment) originalNode, out);
1012      } else {
1013        throw new IllegalArgumentException("unknown node type: "
1014            + originalNode.getClass());
1015      }
1016    }
1017
1018    public abstract void filterTag(Tag originalTag, List<Node> out);
1019
1020    public abstract void filterText(Text originalText, List<Node> out);
1021
1022    public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);
1023
1024    public void filterComment(Comment originalComment, List<Node> out) {
1025    }
1026  }
1027
1028  /**
1029   * Contains a list of filters which are applied, in order, to each Node.  The
1030   * output of each becomes the input to the next.  As soon as one returns an
1031   * empty list it breaks the chain.
1032   */
1033  public static class MultiplexFilterChain implements MultiplexFilter {
1034
1035    private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();
1036
1037    /**
1038     * @param sourceFilters these filters are applied in List order
1039     */
1040    public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
1041      filters.addAll(sourceFilters);
1042    }
1043
1044    /**
1045     * @see HtmlDocument.MultiplexFilter#start()
1046     */
1047    public void start() {
1048      for (MultiplexFilter filter : filters) {
1049        filter.start();
1050      }
1051    }
1052
1053    /**
1054     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
1055     */
1056    public void filter(Node originalNode, List<Node> out) {
1057      List<Node> result = new ArrayList<Node>();
1058      result.add(originalNode);
1059
1060      // loop through filters until one returns nothing, or until we're out of
1061      // filters
1062      for (MultiplexFilter filter : filters) {
1063        if (result.isEmpty()) {
1064          return;
1065        }
1066
1067        // apply filter to each node and collect results
1068        List<Node> newResult = new ArrayList<Node>();
1069        for (Node node : result) {
1070          filter.filter(node, newResult);
1071        }
1072        result = newResult;
1073      }
1074
1075      out.addAll(result);
1076    }
1077
1078    /**
1079     * @see HtmlDocument.MultiplexFilter#finish(List)
1080     */
1081    public void finish(List<Node> out) {
1082      List<Node> result = new ArrayList<Node>();
1083
1084      // loop through filters until one returns nothing, or until we're out of
1085      // filters
1086      for (MultiplexFilter filter : filters) {
1087        // apply filter to each node and collect results
1088        List<Node> newResult = new ArrayList<Node>();
1089        for (Node node : result) {
1090          filter.filter(node, newResult);
1091        }
1092        filter.finish(newResult);
1093        result = newResult;
1094      }
1095
1096      out.addAll(result);
1097    }
1098  }
1099
1100  /**
1101   * Html visitor allows external code to iterate through the nodes in the
1102   * document. See HtmlDocument.accept.
1103   */
  public static interface Visitor {
    /** This is called first, before any of the visit methods. */
    void start();

    /**
     * Visits a text node.
     *
     * @param n the text node
     */
    void visitText(Text n);

    /**
     * Visits an open tag.
     *
     * @param n the open tag
     */
    void visitTag(Tag n);

    /**
     * Visits an end tag.
     *
     * @param n the end tag
     */
    void visitEndTag(EndTag n);

    /**
     * Visits a comment.
     *
     * @param n the comment node
     */
    void visitComment(Comment n);

    /** Called at the end. */
    void finish();
  }
1123
1124  /**
1125   * An implementation of the Visitor interface which simply delegates its
1126   * methods to a wrapped instance of another Visitor.
1127   *
1128   * <p>This is useful for chaining Visitors together.
1129   */
1130  public static class VisitorWrapper implements Visitor {
1131    private final Visitor wrapped;
1132
1133    protected VisitorWrapper(Visitor wrap) {
1134      wrapped = wrap;
1135    }
1136
1137    public void start() {
1138      wrapped.start();
1139    }
1140
1141    public void visitText(Text n) {
1142      wrapped.visitText(n);
1143    }
1144
1145    public void visitTag(Tag n) {
1146      wrapped.visitTag(n);
1147    }
1148
1149    public void visitEndTag(EndTag n) {
1150      wrapped.visitEndTag(n);
1151    }
1152
1153    public void visitComment(Comment n) {
1154      wrapped.visitComment(n);
1155    }
1156
1157    public void finish() {
1158      wrapped.finish();
1159    }
1160  }
1161
1162  /**
1163   * A special helper Visitor that builds a HtmlDocument.
1164   */
1165  public static class Builder implements Visitor {
1166    private final boolean preserveComments;
1167    private final List<Node> nodes = new ArrayList<Node>();
1168    private HtmlDocument doc;
1169
1170    /**
1171     * @see Builder#Builder(boolean)
1172     */
1173    public Builder() {
1174      this(false);
1175    }
1176
1177    /**
1178     * @param preserveComments If false, ignores Comment nodes
1179     */
1180    public Builder(boolean preserveComments) {
1181      this.preserveComments = preserveComments;
1182    }
1183
1184    public void addNode(Node node) {
1185      nodes.add(node);
1186    }
1187    public void start() {
1188    }
1189    public void visitText(Text t) {
1190      addNode(t);
1191    }
1192    public void visitTag(Tag t) {
1193      addNode(t);
1194    }
1195    public void visitComment(Comment n) {
1196      if (preserveComments) {
1197        addNode(n);
1198      }
1199    }
1200    public void visitEndTag(EndTag t) {
1201      addNode(t);
1202    }
1203    public void finish() {
1204      doc = new HtmlDocument(nodes);
1205    }
1206
1207    /** Gets the html document that has been constructed */
1208    public HtmlDocument getDocument() {
1209      return doc;
1210    }
1211  }
1212
1213  /**
1214   * A Visitor that prints out the html document in debug format.
1215   */
1216  public static class DebugPrinter implements Visitor {
1217
1218    private final PrintWriter writer;
1219
1220    public DebugPrinter(PrintWriter writer) {
1221      this.writer = writer;
1222    }
1223
1224    public void start() {
1225    }
1226
1227    public void visitText(Text t) {
1228      writeCollapsed("TEXT", t.getText());
1229    }
1230
1231    public void visitComment(Comment n) {
1232      writeCollapsed("COMMENT", n.getContent());
1233    }
1234
1235    private void writeCollapsed(String type, String s) {
1236      writer.print(type);
1237      writer.print(": ");
1238      String noNewlines = s.replace("\n", " ");
1239      // Use CharMatcher#WHITESPACE?
1240      String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
1241      writer.print(collapsed);
1242    }
1243
1244    public void visitTag(Tag tag) {
1245      writer.print("==<" + tag.getName() + ">");
1246      List<TagAttribute> attributes = tag.getAttributes();
1247      if (attributes != null) {
1248
1249        // Attribute values
1250        List<String> attrs = new ArrayList<String>();
1251        for (TagAttribute a : attributes) {
1252          attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
1253        }
1254        String[] array = attrs.toArray(new String[attrs.size()]);
1255
1256        // Sort the attributes so that it's easier to read and compare
1257        Arrays.sort(array);
1258        for (int i = 0; i < array.length; i++) {
1259          writer.print(" " + array[i]);
1260        }
1261      }
1262      writer.println();
1263    }
1264
1265    public void visitEndTag(EndTag endtag) {
1266      writer.println("==</" + endtag.getName() + ">");
1267    }
1268
1269    public void finish() {
1270    }
1271  }
1272
1273}