1/**
2 * Copyright (c) 2004, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package com.android.mail.lib.html.parser;
17
18import com.android.mail.lib.base.CharEscapers;
19import com.android.mail.lib.base.CharMatcher;
20import com.android.mail.lib.base.StringUtil;
21import com.android.mail.lib.base.X;
22import com.google.common.collect.Lists;
23
24import java.io.PrintWriter;
25import java.io.StringWriter;
26import java.util.ArrayList;
27import java.util.Arrays;
28import java.util.List;
29
30
31/**
32 * HtmlDocument is a container for a list of html nodes, and represents the
33 * entire html document. It contains toHTML() method which prints out the html
34 * text, toXHTML for printing out XHTML text and toString() which prints out in
35 * debug format.
36 *
37 * @author jlim@google.com (Jing Yee Lim)
38 */
39public class HtmlDocument {
40  /** List of Node objects */
41  private final List<Node> nodes;
42
43  /**
44   * Creates a Html document.
45   * @param nodes list of html nodes
46   */
47  public HtmlDocument(List<Node> nodes) {
48    this.nodes = nodes;
49  }
50
51  /** Gets the list of nodes */
52  public List<Node> getNodes() {
53    return nodes;
54  }
55
56  /** Returns a HTML string for the current document */
57  public String toHTML() {
58    StringBuilder sb = new StringBuilder(nodes.size() * 10);
59    for (Node n : nodes) {
60      n.toHTML(sb);
61    }
62    return sb.toString();
63  }
64
65  /** Returns a XHTML string for the current document */
66  public String toXHTML() {
67    StringBuilder sb = new StringBuilder(nodes.size() * 10);
68    for (Node n : nodes) {
69      n.toXHTML(sb);
70    }
71    return sb.toString();
72  }
73
74  /**
75   * Returns, as much as possible, original content of preparsed nodes.  This
76   * is only different from toHTML() if the nodes were created with original
77   * content, e.g., by HtmlParser in preserve mode.
78   */
79  public String toOriginalHTML() {
80    StringBuilder sb = new StringBuilder(nodes.size() * 10);
81    for (Node n : nodes) {
82      n.toOriginalHTML(sb);
83    }
84    return sb.toString();
85  }
86
87  /** Returns the HTML document in debug format */
88  @Override
89  public String toString() {
90    StringWriter strWriter = new StringWriter();
91    accept(new DebugPrinter(new PrintWriter(strWriter)));
92    return strWriter.toString();
93  }
94
95  /**
96   * Creates start Tag Node.
97   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
98   */
99  public static Tag createTag(HTML.Element element, List<TagAttribute> attributes) {
100    return createTag(element, attributes, null, null);
101  }
102
103  /**
104   * Creates start Tag Node.
105   * @see HtmlDocument.Tag#Tag(HTML.Element, List, boolean, String, String)
106   */
107  public static Tag createTag(HTML.Element element,
108      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
109      String originalHtmlAfterAttributes) {
110    return new Tag(element, attributes, false, originalHtmlBeforeAttributes,
111        originalHtmlAfterAttributes);
112  }
113
114  /**
115   * Creates self-terminating Tag Node.
116   * @see HtmlDocument#createSelfTerminatingTag(HTML.Element, List, String, String)
117   */
118  public static Tag createSelfTerminatingTag(HTML.Element element,
119      List<TagAttribute> attributes) {
120    return createSelfTerminatingTag(element, attributes, null, null);
121  }
122
123  /**
124   * Creates self-terminating Tag Node.
125   * @see HtmlDocument#createTag(HTML.Element, List, String, String)
126   */
127  public static Tag createSelfTerminatingTag(HTML.Element element,
128      List<TagAttribute> attributes, String originalHtmlBeforeAttributes,
129      String originalHtmlAfterAttributes) {
130    return new Tag(element, attributes, true, originalHtmlBeforeAttributes,
131        originalHtmlAfterAttributes);
132  }
133
134  /**
135   * @see HtmlDocument#createEndTag(HTML.Element, String)
136   */
137  public static EndTag createEndTag(HTML.Element element) {
138    return createEndTag(element, null);
139  }
140
141  /**
142   * @see HtmlDocument.EndTag#EndTag(HTML.Element, String)
143   */
144  public static EndTag createEndTag(HTML.Element element, String originalHtml) {
145    return new EndTag(element, originalHtml);
146  }
147
148  /**
149   * @see HtmlDocument#createTagAttribute(HTML.Attribute, String, String)
150   */
151  public static TagAttribute createTagAttribute(HTML.Attribute attr, String value) {
152    return createTagAttribute(attr, value, null);
153  }
154
155  /**
156   * @see HtmlDocument.TagAttribute#TagAttribute(HTML.Attribute, String, String)
157   */
158  public static TagAttribute createTagAttribute(HTML.Attribute attr,
159      String value, String originalHtml) {
160    X.assertTrue(attr != null);
161    return new TagAttribute(attr, value, originalHtml);
162  }
163
164  /**
165   * @see HtmlDocument#createText(String, String)
166   */
167  public static Text createText(String text) {
168    return createText(text, null);
169  }
170
171  /**
172   * Creates a Text node.
173   * @see UnescapedText#UnescapedText(String, String)
174   */
175  public static Text createText(String text, String original) {
176    return new UnescapedText(text, original);
177  }
178
179  /**
180   * Creates a Text node where the content hasn't been unescaped yet (this will
181   * be done lazily).
182   */
183  public static Text createEscapedText(String htmlText, String original) {
184    return new EscapedText(htmlText, original);
185  }
186
187  /**
188   * Creates an Comment node.
189   * @see Comment#Comment(String)
190   */
191  public static Comment createHtmlComment(String content) {
192    return new Comment(content);
193  }
194
195  /**
196   * Creates a CDATA node.
197   * @see CDATA#CDATA(String)
198   */
199  public static CDATA createCDATA(String text) {
200    return new CDATA(text);
201  }
202
203  /** Accepts a Visitor */
204  public void accept(Visitor v) {
205    v.start();
206    for (Node node : nodes) {
207      node.accept(v);
208    }
209    v.finish();
210  }
211
212  /**
213   * @param filter results of this filter replace the existing nodes
214   * @return new document with filtered nodes
215   */
216  public HtmlDocument filter(MultiplexFilter filter) {
217    filter.start();
218    List<Node> newNodes = new ArrayList<Node>();
219    for (Node node : nodes) {
220      filter.filter(node, newNodes);
221    }
222    filter.finish(newNodes);
223    return new HtmlDocument(newNodes);
224  }
225
226  /**
227   * Html node
228   */
229  public static abstract class Node {
230
231    /** Accepts a visitor */
232    public abstract void accept(Visitor visitor);
233
234    /** Converts to HTML */
235    public String toHTML() {
236      StringBuilder sb = new StringBuilder();
237      toHTML(sb);
238      return sb.toString();
239    }
240
241    /** Converts to HTML */
242    public abstract void toHTML(StringBuilder sb);
243
244    /** Converts to XHTML */
245    public String toXHTML() {
246      StringBuilder sb = new StringBuilder();
247      toXHTML(sb);
248      return sb.toString();
249    }
250
251    /** Converts to XHTML */
252    public abstract void toXHTML(StringBuilder sb);
253
254    /**
255     * @return Original if it's available; otherwise, returns
256     * <code>toHTML()</code>
257     */
258    public String toOriginalHTML() {
259      StringBuilder sb = new StringBuilder();
260      toOriginalHTML(sb);
261      return sb.toString();
262    }
263
264    /**
265     * @param sb Destination of HTML to be appended.  Appends original if it's
266     * available; otherwise, appends <code>toHTML()</code>
267     */
268    public abstract void toOriginalHTML(StringBuilder sb);
269  }
270
271  /**
272   * HTML comment node.
273   */
274  public static class Comment extends Node {
275
276    private final String content;
277
278    /**
279     * @param content Raw comment, including "&lt;!--" and "--&gt;".
280     */
281    public Comment(String content) {
282      this.content = content;
283    }
284
285    @Override
286    public void accept(Visitor visitor) {
287      visitor.visitComment(this);
288    }
289
290    /**
291     * Emit original unchanged.
292     * @param sb Destination of result.
293     */
294    @Override
295    public void toHTML(StringBuilder sb) {
296      sb.append(content);
297    }
298
299    /**
300     * Emit original unchanged.
301     * @param sb Destination of result.
302     */
303    @Override
304    public void toXHTML(StringBuilder sb) {
305      sb.append(content);
306    }
307
308    /**
309     * Emit original unchanged.
310     * @param sb Destination of result.
311     */
312    @Override
313    public void toOriginalHTML(StringBuilder sb) {
314      sb.append(content);
315    }
316
317    /**
318     * @return Original unchanged.
319     */
320    public String getContent() {
321      return content;
322    }
323  }
324
325  /**
326   * Text node
327   */
328  public static abstract class Text extends Node {
329
330    /**
331     * unaltered original content of this node
332     */
333    private final String originalHtml;
334
335    /**
336     * content of this node in HTML format
337     */
338    private String html;
339
340    /**
341     * @param originalHtml Unaltered original HTML. If not null,
342     *        toOriginalHTML() will return this.
343     */
344    protected Text(String originalHtml) {
345      this.originalHtml = originalHtml;
346    }
347
348    /**
349     * Gets the plain, unescaped text.
350     */
351    abstract public String getText();
352
353    // Returns true if it contains only white space
354    public boolean isWhitespace() {
355      String text = getText();
356      int len = text.length();
357      for (int i = 0; i < len; i++) {
358        if (!Character.isWhitespace(text.charAt(i))) {
359          return false;
360        }
361      }
362      return true;
363    }
364
365    @Override
366    public boolean equals(Object o) {
367      if (o == this) {
368        return true;
369      }
370      if (o instanceof Text) {
371        Text that = (Text) o;
372
373        return this.originalHtml == null ? that.originalHtml == null
374            : this.originalHtml.equals(that.originalHtml);
375      }
376      return false;
377    }
378
379    @Override
380    public int hashCode() {
381      return originalHtml == null ? 0 : originalHtml.hashCode();
382    }
383
384    @Override
385    public String toString() {
386      return getText();
387    }
388
389    /** Extends Node.accept */
390    @Override
391    public void accept(Visitor visitor) {
392      visitor.visitText(this);
393    }
394
395    /**
396     * Gets the HTML, with HTML entities escaped.
397     */
398    @Override
399    public void toHTML(StringBuilder sb) {
400      if (html == null) {
401        html = CharEscapers.asciiHtmlEscaper().escape(getText());
402      }
403      sb.append(html);
404    }
405
406    /**
407     * @see HtmlDocument.Text#toHTML(StringBuilder)
408     */
409    @Override
410    public void toXHTML(StringBuilder sb) {
411      toHTML(sb);
412    }
413
414    /**
415     * @param sb Appends original HTML to this if available.  Otherwise,
416     * same as toHTML().
417     */
418    @Override
419    public void toOriginalHTML(StringBuilder sb) {
420      if (originalHtml != null) {
421        sb.append(originalHtml);
422      } else {
423        toHTML(sb);
424      }
425    }
426
427    /**
428     * @return the original HTML (possibly with entities unescaped if the
429     * document was malformed). May be null if original HTML was not preserved
430     * (see constructor argument of {@link HtmlParser})
431     */
432    public String getOriginalHTML() {
433      return originalHtml;
434    }
435  }
436
437  /**
438   * {@link Text} implementation where the given text is assumed to have been
439   * already HTML unescaped.
440   */
441  private static class UnescapedText extends Text {
442    /**
443     * content of this node as plain, unescaped text
444     */
445    protected final String text;
446
447    private UnescapedText(String plainText, String originalHtml) {
448      super(originalHtml);
449      X.assertTrue(plainText != null);
450      this.text = plainText;
451    }
452
453    @Override public String getText() {
454      return text;
455    }
456  }
457
458  /**
459   * {@link Text} implementation where the given text is not unescaped yet, and
460   * unescaping will only be done lazily.
461   */
462  private static class EscapedText extends Text {
463    private final String htmlText;
464    private String text;
465
466    private EscapedText(String htmlText, String originalHtml) {
467      super(originalHtml);
468      this.htmlText = htmlText;
469    }
470
471    @Override public String getText() {
472      if (text == null) {
473        text = StringUtil.unescapeHTML(htmlText);
474      }
475      return text;
476    }
477  }
478
479  /**
480   * CDATA node is a subclass of Text node.
481   */
482  public static class CDATA extends UnescapedText {
483    private CDATA(String text) {
484      super(text, text);
485    }
486
487    @Override public void toHTML(StringBuilder sb) {
488      // Do not htmlescape CDATA text
489      sb.append(text);
490    }
491
492    @Override public void toXHTML(StringBuilder sb) {
493      sb.append("<![CDATA[")
494        .append(text)
495        .append("]]>");
496    }
497  }
498
499  /**
500   * Tag is a HTML open tag.
501   */
502  public static class Tag extends Node {
503    // The element
504    private final HTML.Element element;
505
506    // List of TagAttribute objects. This may be null.
507    private List<TagAttribute> attributes;
508
509    private final boolean isSelfTerminating;
510
511    private final String originalHtmlBeforeAttributes;
512
513    private final String originalHtmlAfterAttributes;
514
515    /**
516     * @param element the HTML4 element
517     * @param attributes list of TagAttribute objects, may be null
518     * @param isSelfTerminating
519     * @param originalHtmlBeforeAttributes Original tag's full content before
520     *        first attribute, including beginning '&lt;'. This should not
521     *        include preceeding whitespace for the first attribute, as that
522     *        should be included in the attribute node. If not null, tag will
523     *        preserve this original content. e.g., if original tag were
524     *        "&lt;foO bar='zbc'&gt;", case of foO would be preserved. This
525     *        method does not validate that
526     *        <code>originalHtmlBeforeAttributes</code> is a valid tag String.
527     * @param originalHtmlAfterAttributes Full content of original tag after
528     *        last attribute, including ending '>'. If not null, tag will
529     *        preserve this original content. e.g., if original tag were
530     *        "&lt;foo bar='zbc'  &gt;", the spaces before '&gt;' be preserved.
531     *        This method does not validate that
532     *        <code>originalHtmlAfterAttributes</code> is a valid tag String.
533     */
534    private Tag(HTML.Element element, List<TagAttribute> attributes,
535        boolean isSelfTerminating, String originalHtmlBeforeAttributes,
536        String originalHtmlAfterAttributes) {
537      X.assertTrue(element != null);
538      this.element = element;
539      this.attributes = attributes;
540      this.isSelfTerminating = isSelfTerminating;
541      this.originalHtmlBeforeAttributes = originalHtmlBeforeAttributes;
542      this.originalHtmlAfterAttributes = originalHtmlAfterAttributes;
543    }
544
545    /** Gets the name */
546    public String getName() {
547      return element.getName();
548    }
549
550    /** Gets the element */
551    public HTML.Element getElement() {
552      return element;
553    }
554
555    /** Adds an attribute */
556    public void addAttribute(HTML.Attribute attr, String value) {
557      X.assertTrue(attr != null);
558      addAttribute(new TagAttribute(attr, value, null));
559    }
560
561    /** Adds an attribute */
562    public void addAttribute(TagAttribute attr) {
563      X.assertTrue(attr != null);
564      if (attributes == null) {
565        attributes = new ArrayList<TagAttribute>();
566      }
567      attributes.add(attr);
568    }
569
570    /** Gets the list of attributes, note that this maybe null. */
571    public List<TagAttribute> getAttributes() {
572      return attributes;
573    }
574
575    /** Finds and returns a TagAttribute, or null if not found */
576    public TagAttribute getAttribute(HTML.Attribute attr) {
577      if (attributes != null) {
578        for (TagAttribute attribute : attributes) {
579          if (attribute.getAttribute().equals(attr)) {
580            return attribute;
581          }
582        }
583      }
584      return null;
585    }
586
587    /**
588     * Finds and returns list of TagAttribute of given attribute
589     * type, or empty list if not found,
590     */
591    public List<TagAttribute> getAttributes(HTML.Attribute attr) {
592      List<TagAttribute> result = Lists.newArrayList();
593      if (attributes != null) {
594        for (TagAttribute attribute : attributes) {
595          if (attribute.getAttribute().equals(attr)) {
596            result.add(attribute);
597          }
598        }
599      }
600      return result;
601    }
602
603    /** Returns debug string */
604    @Override
605    public String toString() {
606      StringBuilder sb = new StringBuilder();
607      sb.append("Start Tag: ");
608      sb.append(element.getName());
609      if (attributes != null) {
610        for (TagAttribute attr : attributes) {
611          sb.append(' ');
612          sb.append(attr.toString());
613        }
614      }
615      return sb.toString();
616    }
617
618    /** Implements Node.accept */
619    @Override
620    public void accept(Visitor visitor) {
621      visitor.visitTag(this);
622    }
623
624    /** Implements Node.toHTML */
625    @Override
626    public void toHTML(StringBuilder sb) {
627      serialize(sb, SerializeType.HTML);
628    }
629
630    @Override
631    public void toXHTML(StringBuilder sb) {
632      serialize(sb, SerializeType.XHTML);
633    }
634
635    @Override
636    public void toOriginalHTML(StringBuilder sb) {
637      serialize(sb, SerializeType.ORIGINAL_HTML);
638    }
639
640    /**
641     * Specifies format of serialized output.
642     */
643    private enum SerializeType {
644      ORIGINAL_HTML, HTML, XHTML
645    }
646
647    private void serialize(StringBuilder sb, SerializeType type) {
648      // before attributes
649      if (type == SerializeType.ORIGINAL_HTML && originalHtmlBeforeAttributes != null) {
650        sb.append(originalHtmlBeforeAttributes);
651      } else {
652        sb.append('<');
653        sb.append(element.getName());
654      }
655
656      // attributes
657      if (attributes != null) {
658        for (TagAttribute attr : attributes) {
659          // attribute includes leading whitespace, so we needn't add it here
660          if (type == SerializeType.ORIGINAL_HTML) {
661            attr.toOriginalHTML(sb);
662          } else if (type == SerializeType.HTML) {
663            attr.toHTML(sb);
664          } else {
665            attr.toXHTML(sb);
666          }
667        }
668      }
669
670      // after attributes
671      if (type == SerializeType.ORIGINAL_HTML && originalHtmlAfterAttributes != null) {
672        sb.append(originalHtmlAfterAttributes);
673      } else if (type == SerializeType.XHTML && (isSelfTerminating || getElement().isEmpty())) {
674        sb.append(" />");
675      } else {
676        sb.append('>');
677      }
678    }
679
680    public boolean isSelfTerminating() {
681      return isSelfTerminating;
682    }
683
684    public String getOriginalHtmlBeforeAttributes() {
685      return originalHtmlBeforeAttributes;
686    }
687
688    public String getOriginalHtmlAfterAttributes() {
689      return originalHtmlAfterAttributes;
690    }
691  }
692
693  /**
694   * EndTag is a closing HTML tag.
695   */
696  public static class EndTag extends Node {
697    // The element
698    private final HTML.Element element;
699
700    private final String originalHtml;
701
702    /**
703     * @param element The HTML.Element element.  Can not be null.
704     * @param originalHtml Full content of original tag, including beginning
705     * and ending '<' and '>'.  If not null, tag will preserve this original
706     * content. e.g., if original tag were "&lt;/foo &gt;", the space after foo
707     * would be preserved.  This method does not validate that originalHtml is a
708     * valid tag String.
709     */
710    private EndTag(HTML.Element element, String originalHtml) {
711      X.assertTrue(element != null);
712      this.element = element;
713      this.originalHtml = originalHtml;
714    }
715
716    /** Gets the name */
717    public String getName() {
718      return element.getName();
719    }
720
721    /** Gets the element */
722    public HTML.Element getElement() {
723      return element;
724    }
725
726    /** Returns debug string */
727    @Override
728    public String toString() {
729      return "End Tag: " + element.getName();
730    }
731
732    /** Implements Node.accept */
733    @Override
734    public void accept(Visitor visitor) {
735      visitor.visitEndTag(this);
736    }
737
738    /** Implements Node.toHTML */
739    @Override
740    public void toHTML(StringBuilder sb) {
741      sb.append("</");
742      sb.append(element.getName());
743      sb.append('>');
744    }
745
746    @Override
747    public void toXHTML(StringBuilder sb) {
748      toHTML(sb);
749    }
750
751    @Override
752    public void toOriginalHTML(StringBuilder sb) {
753      if (originalHtml != null) {
754        sb.append(originalHtml);
755      } else {
756        toHTML(sb);
757      }
758    }
759  }
760
761  /**
762   * TagAttribute represents an attribute in a HTML tag.
763   */
764  public static class TagAttribute {
765    private final HTML.Attribute attribute;
766    private String value;
767    private String originalHtml;
768
769    /**
770     * @param attribute the HTML.Attribute. Can't be null.
771     * @param value The value in plain-text format. This can be null if the
772     *        attribute has no value.
773     * @param originalHtml If not null, toOriginalHTML() will preserve original
774     *        content. This should contain any leading whitespace from the
775     *        original.
776     */
777    private TagAttribute(HTML.Attribute attribute, String value, String originalHtml) {
778      X.assertTrue(attribute != null);
779      this.attribute = attribute;
780      this.value = value;
781      this.originalHtml = originalHtml;
782    }
783
784    /** Gets the name */
785    public String getName() {
786      return attribute.getName();
787    }
788
789    /** Gets the HTML.Attribute information */
790    public HTML.Attribute getAttribute() {
791      return attribute;
792    }
793
794    /**
795     * Sets the attribute value.
796     * This value must be in plain-text, not html-escaped.
797     * This can be null, if the attribute has no values.
798     * This clears <code>originalHtml_</code> if it were set, so
799     * <code>toOriginalHTML()</code> might not preserve original any more.
800     */
801    public void setValue(String value) {
802      this.value = value;
803      originalHtml = null;
804    }
805
806    /** Returns the attribute value in plain-text, never null */
807    public String getValue() {
808      return value != null ? value : "";
809    }
810
811    /** Returns true if the attribute value is not empty */
812    public boolean hasValue() {
813      return value != null;
814    }
815
816    /**
817     * Writes out the attribute in HTML format with all necessary preceding
818     * whitespace. Emits originalHtml_ if it were specified to the constructor.
819     * Otherwise, emits a new name="value" string with a single preceding space.
820     */
821    public void toHTML(StringBuilder sb) {
822      sb.append(' ');
823      sb.append(attribute.getName());
824      if (value != null && attribute.getType() != HTML.Attribute.BOOLEAN_TYPE) {
825        sb.append("=\"");
826        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
827        sb.append("\"");
828      }
829    }
830
831    /** Returns the attribute html string */
832    public String toHTML() {
833      StringBuilder sb = new StringBuilder();
834      toHTML(sb);
835      return sb.toString();
836    }
837
838    /**
839     * Writes out the attribute in XHTML format (value is always appended,
840     * even if it is empty) with all necessary preceeding whitespace.
841     */
842    public void toXHTML(StringBuilder sb) {
843      sb.append(' ');
844      sb.append(attribute.getName()).append("=\"");
845
846      // Assume that value-less attribute are boolean attributes like "disabled"
847      if (hasValue()) {
848        sb.append(CharEscapers.asciiHtmlEscaper().escape(value));
849      } else {
850        sb.append(attribute.getName());
851      }
852
853      sb.append("\"");
854    }
855
856    /** Returns the attribute XHTML string */
857    public String toXHTML() {
858      StringBuilder sb = new StringBuilder();
859      toXHTML(sb);
860      return sb.toString();
861    }
862
863    /**
864     * @param sb Destination to which attribute is written, in its original
865     * preparsed form if possible.
866     */
867    public void toOriginalHTML(StringBuilder sb) {
868      if (originalHtml != null) {
869        sb.append(originalHtml);
870      } else {
871        toHTML(sb);
872      }
873    }
874
875    /**
876     * Writes out the attribute in its original form as it was parsed..
877     */
878    public String toOriginalHTML() {
879      StringBuilder sb = new StringBuilder();
880      toOriginalHTML(sb);
881      return sb.toString();
882    }
883
884    @Override
885    public String toString() {
886      return "{" + attribute.getName() + "=" + value + "}";
887    }
888  }
889
890  /**
891   * Filter is like Visitor, except it implies that the nodes may be changed,
892   * whereas HtmlDocument.Visitor just implies that the nodes are iterated
893   * over. A Filter can behave just like a Visitor if it merely returns the
894   * same node that it visited. Also, methods may be called on a node to change
895   * the values it contains. Alternatively, a new node entirely can be created
896   * and returned, which will essentially replace the previous node with the
897   * new node in the document tree. A node may be removed by returning null
898   * instead of a node.
899   */
900  public static interface Filter {
901    /** This is called first */
902    void start();
903
904    /** A text node */
905    Text visitText(Text n);
906
907    /** An open tag */
908    Tag visitTag(Tag n);
909
910    /** End tag */
911    EndTag visitEndTag(EndTag n);
912
913    /** HTML comment */
914    Comment visitComment(Comment n);
915
916    /* Called at the end. */
917    void finish();
918  }
919
920  /**
921   * Like Filter, except each node may be replaced by multiple nodes.  Also,
922   * does not do double dispatch accept/visit.
923   */
924  public static interface MultiplexFilter {
925    /**
926     * Called first.
927     */
928    void start();
929
930    /**
931     * @param originalNode node to filter
932     * @param out Destination to which this object appends nodes to replace
933     * originalNode.  Can not be null.
934     */
935    void filter(Node originalNode, List<Node> out);
936
937    /**
938     * Called at the end.
939     * @param out Destination to which this object appends nodes at the end of
940     * the document.  Can not be null.
941     */
942    void finish(List<Node> out);
943  }
944
945  /**
946   * Converts a normal {@link Filter} into a {@link MultiplexFilter}.
947   */
948  public static class MultiplexFilterAdapter implements MultiplexFilter {
949
950    private final Filter filter;
951
952    public MultiplexFilterAdapter(Filter filter) {
953      this.filter = filter;
954    }
955
956    public void start() {
957      filter.start();
958    }
959
960    public void filter(Node originalNode, List<Node> out) {
961      if (originalNode == null) {
962        return;
963      }
964
965      Node resultNode;
966      if (originalNode instanceof Tag) {
967        resultNode = filter.visitTag((Tag) originalNode);
968      } else if (originalNode instanceof Text) {
969        resultNode = filter.visitText((Text) originalNode);
970      } else if (originalNode instanceof EndTag) {
971        resultNode = filter.visitEndTag((EndTag) originalNode);
972      } else if (originalNode instanceof Comment) {
973        resultNode = filter.visitComment((Comment) originalNode);
974      } else {
975        throw new IllegalArgumentException("unknown node type: " + originalNode.getClass());
976      }
977
978      if (resultNode != null) {
979        out.add(resultNode);
980      }
981    }
982
983    public void finish(List<Node> out) {
984      filter.finish();
985    }
986  }
987
988  /**
989   * Like Filter, except each node may be replaced by multiple nodes.  Also,
990   * does not do double dispatch accept/visit.  Dispatches filterNode() to
991   * node-specific methods.
992   */
993  public static abstract class SimpleMultiplexFilter implements MultiplexFilter {
994
995    /**
996     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
997     */
998    public void filter(Node originalNode, List<Node> out) {
999      if (originalNode == null) {
1000        return;
1001      }
1002
1003      if (originalNode instanceof Tag) {
1004        filterTag((Tag) originalNode, out);
1005      } else if (originalNode instanceof Text) {
1006        filterText((Text) originalNode, out);
1007      } else if (originalNode instanceof EndTag) {
1008        filterEndTag((EndTag) originalNode, out);
1009      } else if (originalNode instanceof Comment) {
1010        filterComment((Comment) originalNode, out);
1011      } else {
1012        throw new IllegalArgumentException("unknown node type: "
1013            + originalNode.getClass());
1014      }
1015    }
1016
1017    public abstract void filterTag(Tag originalTag, List<Node> out);
1018
1019    public abstract void filterText(Text originalText, List<Node> out);
1020
1021    public abstract void filterEndTag(EndTag originalEndTag, List<Node> out);
1022
1023    public void filterComment(Comment originalComment, List<Node> out) {
1024    }
1025  }
1026
1027  /**
1028   * Contains a list of filters which are applied, in order, to each Node.  The
1029   * output of each becomes the input to the next.  As soon as one returns an
1030   * empty list it breaks the chain.
1031   */
1032  public static class MultiplexFilterChain implements MultiplexFilter {
1033
1034    private final List<MultiplexFilter> filters = new ArrayList<MultiplexFilter>();
1035
1036    /**
1037     * @param sourceFilters these filters are applied in List order
1038     */
1039    public MultiplexFilterChain(List<MultiplexFilter> sourceFilters) {
1040      filters.addAll(sourceFilters);
1041    }
1042
1043    /**
1044     * @see HtmlDocument.MultiplexFilter#start()
1045     */
1046    public void start() {
1047      for (MultiplexFilter filter : filters) {
1048        filter.start();
1049      }
1050    }
1051
1052    /**
1053     * @see HtmlDocument.MultiplexFilter#filter(HtmlDocument.Node, List)
1054     */
1055    public void filter(Node originalNode, List<Node> out) {
1056      List<Node> result = new ArrayList<Node>();
1057      result.add(originalNode);
1058
1059      // loop through filters until one returns nothing, or until we're out of
1060      // filters
1061      for (MultiplexFilter filter : filters) {
1062        if (result.isEmpty()) {
1063          return;
1064        }
1065
1066        // apply filter to each node and collect results
1067        List<Node> newResult = new ArrayList<Node>();
1068        for (Node node : result) {
1069          filter.filter(node, newResult);
1070        }
1071        result = newResult;
1072      }
1073
1074      out.addAll(result);
1075    }
1076
1077    /**
1078     * @see HtmlDocument.MultiplexFilter#finish(List)
1079     */
1080    public void finish(List<Node> out) {
1081      List<Node> result = new ArrayList<Node>();
1082
1083      // loop through filters until one returns nothing, or until we're out of
1084      // filters
1085      for (MultiplexFilter filter : filters) {
1086        // apply filter to each node and collect results
1087        List<Node> newResult = new ArrayList<Node>();
1088        for (Node node : result) {
1089          filter.filter(node, newResult);
1090        }
1091        filter.finish(newResult);
1092        result = newResult;
1093      }
1094
1095      out.addAll(result);
1096    }
1097  }
1098
1099  /**
1100   * Html visitor allows external code to iterate through the nodes in the
1101   * document. See HtmlDocument.accept.
1102   */
1103  public static interface Visitor {
1104    /** This is called first */
1105    void start();
1106
1107    /** A text node */
1108    void visitText(Text n);
1109
1110    /** An open tag */
1111    void visitTag(Tag n);
1112
1113    /** End tag */
1114    void visitEndTag(EndTag n);
1115
1116    /** comment */
1117    void visitComment(Comment n);
1118
1119    /* Called at the end. */
1120    void finish();
1121  }
1122
1123  /**
1124   * An implementation of the Visitor interface which simply delegates its
1125   * methods to a wrapped instance of another Visitor.
1126   *
1127   * <p>This is useful for chaining Visitors together.
1128   */
1129  public static class VisitorWrapper implements Visitor {
1130    private final Visitor wrapped;
1131
1132    protected VisitorWrapper(Visitor wrap) {
1133      wrapped = wrap;
1134    }
1135
1136    public void start() {
1137      wrapped.start();
1138    }
1139
1140    public void visitText(Text n) {
1141      wrapped.visitText(n);
1142    }
1143
1144    public void visitTag(Tag n) {
1145      wrapped.visitTag(n);
1146    }
1147
1148    public void visitEndTag(EndTag n) {
1149      wrapped.visitEndTag(n);
1150    }
1151
1152    public void visitComment(Comment n) {
1153      wrapped.visitComment(n);
1154    }
1155
1156    public void finish() {
1157      wrapped.finish();
1158    }
1159  }
1160
1161  /**
1162   * A special helper Visitor that builds a HtmlDocument.
1163   */
1164  public static class Builder implements Visitor {
1165    private final boolean preserveComments;
1166    private final List<Node> nodes = new ArrayList<Node>();
1167    private HtmlDocument doc;
1168
1169    /**
1170     * @see Builder#Builder(boolean)
1171     */
1172    public Builder() {
1173      this(false);
1174    }
1175
1176    /**
1177     * @param preserveComments If false, ignores Comment nodes
1178     */
1179    public Builder(boolean preserveComments) {
1180      this.preserveComments = preserveComments;
1181    }
1182
1183    public void addNode(Node node) {
1184      nodes.add(node);
1185    }
1186    public void start() {
1187    }
1188    public void visitText(Text t) {
1189      addNode(t);
1190    }
1191    public void visitTag(Tag t) {
1192      addNode(t);
1193    }
1194    public void visitComment(Comment n) {
1195      if (preserveComments) {
1196        addNode(n);
1197      }
1198    }
1199    public void visitEndTag(EndTag t) {
1200      addNode(t);
1201    }
1202    public void finish() {
1203      doc = new HtmlDocument(nodes);
1204    }
1205
1206    /** Gets the html document that has been constructed */
1207    public HtmlDocument getDocument() {
1208      return doc;
1209    }
1210  }
1211
1212  /**
1213   * A Visitor that prints out the html document in debug format.
1214   */
1215  public static class DebugPrinter implements Visitor {
1216
1217    private final PrintWriter writer;
1218
1219    public DebugPrinter(PrintWriter writer) {
1220      this.writer = writer;
1221    }
1222
1223    public void start() {
1224    }
1225
1226    public void visitText(Text t) {
1227      writeCollapsed("TEXT", t.getText());
1228    }
1229
1230    public void visitComment(Comment n) {
1231      writeCollapsed("COMMENT", n.getContent());
1232    }
1233
1234    private void writeCollapsed(String type, String s) {
1235      writer.print(type);
1236      writer.print(": ");
1237      String noNewlines = s.replace("\n", " ");
1238      // Use CharMatcher#WHITESPACE?
1239      String collapsed = CharMatcher.LEGACY_WHITESPACE.trimAndCollapseFrom(noNewlines, ' ');
1240      writer.print(collapsed);
1241    }
1242
1243    public void visitTag(Tag tag) {
1244      writer.print("==<" + tag.getName() + ">");
1245      List<TagAttribute> attributes = tag.getAttributes();
1246      if (attributes != null) {
1247
1248        // Attribute values
1249        List<String> attrs = new ArrayList<String>();
1250        for (TagAttribute a : attributes) {
1251          attrs.add("[" + a.getName() + " : " + a.getValue() + "]");
1252        }
1253        String[] array = attrs.toArray(new String[attrs.size()]);
1254
1255        // Sort the attributes so that it's easier to read and compare
1256        Arrays.sort(array);
1257        for (int i = 0; i < array.length; i++) {
1258          writer.print(" " + array[i]);
1259        }
1260      }
1261      writer.println();
1262    }
1263
1264    public void visitEndTag(EndTag endtag) {
1265      writer.println("==</" + endtag.getName() + ">");
1266    }
1267
1268    public void finish() {
1269    }
1270  }
1271
1272}