1993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira/**
2993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Copyright (c) 2004, Google Inc.
3993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira *
4993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Licensed under the Apache License, Version 2.0 (the "License");
5993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * you may not use this file except in compliance with the License.
6993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * You may obtain a copy of the License at
7993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira *
8993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira *     http://www.apache.org/licenses/LICENSE-2.0
9993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira *
10993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Unless required by applicable law or agreed to in writing, software
11993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * distributed under the License is distributed on an "AS IS" BASIS,
12993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * See the License for the specific language governing permissions and
14993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * limitations under the License.
15993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */
161bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedypackage com.google.android.mail.common.html.parser;
17993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Caoimport android.text.Spanned;
1977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
201bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.CharMatcher;
211bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.Preconditions;
221bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.X;
23f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedyimport com.google.common.annotations.VisibleForTesting;
24993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport com.google.common.collect.ImmutableSet;
25993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
26993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.ArrayList;
27993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Arrays;
28993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Collections;
29993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.List;
30993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Set;
31993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Stack;
32993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.logging.Logger;
33993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
/**
 * HtmlTree represents parsed, well-formed HTML text. It provides
 * methods to convert that text to plain text. It also provides methods
 * to find well-formed blocks of text, for quote detection.
 *
 * We don't really build an HTML tree data structure. Instead, for
 * efficiency, and for the ability to do a simple in-order traversal
 * of the tree, we simply keep a linear list of nodes (in-order).
 * The {@code begins} and {@code ends} lists keep track of the starting
 * and ending nodes:
 *
 * For a string node, begins[node] = ends[node] = node
 * For an open tag, begins[node] = node, ends[node] = the matching end tag
 * For a close tag, begins[node] = the matching open tag, ends[node] = node
 *
 * @author jlim@google.com (Jing Yee Lim)
 */
51993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereirapublic class HtmlTree {
52f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  // http://www.w3.org/TR/html4/struct/text.html#h-9.1
53f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n");
54993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
55993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
56993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * An interface that allows clients to provide their own implementation
5777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao   * of a {@link Converter}.
58993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
5977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  public static interface ConverterFactory {
60993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
6177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * Creates a new instance of a {@link Converter} to convert
6277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * the contents of an {@link HtmlTree} to some resulting object.
63993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
6477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    Converter createInstance();
65993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
66993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
67993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
6877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao   * An interface for an object which converts a single HtmlTree into some object
69993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
7077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  public static interface Converter<T> {
71993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
72993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Adds the given node {@code n} to plain text.
73993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     *
74993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * @param n The node to convert to text.
75993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * @param nodeNum The number of the node among the list of all notes.
76993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * @param endNum The number of the ending node if this is a start node,
77993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     *    otherwise the same as {@code nodeNum}.
78993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
79993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    void addNode(HtmlDocument.Node n, int nodeNum, int endNum);
80993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
81993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
82993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Returns the current length of the plain text.
83993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
84993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int getPlainTextLength();
85993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
86993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
8777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * Returns the current built object.
88993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
8977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    T getObject();
90993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
91993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
92993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** A factory that produces converters of the default implementation. */
9377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  private static final ConverterFactory DEFAULT_CONVERTER_FACTORY =
9477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao      new ConverterFactory() {
95f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy        @Override
9677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        public Converter<String> createInstance() {
97993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          return new DefaultPlainTextConverter();
98993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
99993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      };
100993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
101993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Contains html nodes */
102993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>();
103993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
104993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Keeps track of beginning and end of each node */
105993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private final Stack<Integer> begins = new Stack<Integer>();
106993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private final Stack<Integer> ends = new Stack<Integer>();
107993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
108993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Plain text (lazy creation) */
109993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private String plainText;
110993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
11177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  /** Constructed span (lazy creation) */
11277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  private Spanned constructedSpan;
11377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
114993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** The html string (lazy creation) */
115993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private String html;
116993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
117993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** textPositions[node pos] = the text position */
118993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private int[] textPositions;
119993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
12077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  private ConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY;
121993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
122993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  // For debugging only
123993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private static final boolean DEBUG = false;
124993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
125993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private static final Logger logger = Logger.getLogger(HtmlTree.class.getName());
126993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
127993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
128993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
129993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** HtmlTree can only be constructed from this package */
130993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  HtmlTree() {
131993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
132993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
133993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
13477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao   * Sets a new {@link ConverterFactory} to be used to convert
135993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * the contents of this tree to plaintext.
136993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
13777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  public void setConverterFactory(ConverterFactory factory) {
138993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (factory == null) {
139993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      throw new NullPointerException("factory must not be null");
140993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
141993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    converterFactory = factory;
142993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
143993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
144993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
145993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Gets the list of node objects. A node can be either a
146993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Tag, EngTag or a String object.
147993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return the nodes of the tree
148993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
149993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public List<HtmlDocument.Node> getNodesList() {
150993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return Collections.unmodifiableList(nodes);
151993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
152993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
153993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
154993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return number of nodes
155993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
156993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public int getNumNodes() {
157993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return nodes.size();
158993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
159993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
160993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
161f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * Returns number of matching open tag node, or {@code endTagNodeNum} itself
162f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * if it does not point to a closing tag.
163f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   */
164f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  public int findOpenTag(int endTagNodeNum) {
165f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size());
166f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    return begins.get(endTagNodeNum);
167f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  }
168f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy
169f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  /**
170f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * Returns number of matching closing tag node, or {@code openTagNodeNum} itself
171f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * if it does not point to an open tag or points to an open tag with no closing one.
172f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   */
173f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  public int findEndTag(int openTagNodeNum) {
174f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size());
175f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    return ends.get(openTagNodeNum);
176f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  }
177f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy
178f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  /**
179f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself
180f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * if it does not point to an open/closing tag (e.g text node or comment).
181f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   */
182f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  public int findPairedTag(int tagNodeNum) {
183f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size());
184f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    int openNodeNum = begins.get(tagNodeNum);
185f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    int endNodeNum = ends.get(tagNodeNum);
186f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum;
187f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  }
188f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy
189f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  /**
190993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Gets the entire html.
191993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
192993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getHtml() {
193993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return getHtml(-1);
194993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
195993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
196993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
197993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
198993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * specified size.
199993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
200993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getHtml(int wrapSize) {
201993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (html == null) {
202993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      html = getHtml(0, nodes.size(), wrapSize);
203993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
204993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return html;
205993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
206993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
207993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Gets parts of the html */
208993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getHtml(int fromNode, int toNode) {
209993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return getHtml(fromNode, toNode, -1);
210993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
211993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
212993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
213993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Gets parts of the html, if wrapSize is > 0, it tries
214993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * to do wrapping at the specified size.
215993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
216993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getHtml(int fromNode, int toNode, int wrapSize) {
217993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
218993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
219993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int estSize = (toNode - fromNode) * 10;
220993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    StringBuilder sb = new StringBuilder(estSize);
221993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int lastWrapIndex = 0;      // used for wrapping
222993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int n = fromNode; n < toNode; n++) {
223993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HtmlDocument.Node node = nodes.get(n);
224993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      node.toHTML(sb);
225993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // TODO: maybe we can be smarter about this and not add newlines
226993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // within <pre> tags, unless the whole long line is encompassed
227993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // by the <pre> tag.
228993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (wrapSize > 0) {
229993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // We can only wrap if the last outputted node is an element that
230993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // breaks the flow. Otherwise, we risk the possibility of inserting
231993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // spaces where they shouldn't be.
232993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if ((node instanceof HtmlDocument.Tag &&
233993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
234993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            (node instanceof HtmlDocument.EndTag &&
235993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              ((HtmlDocument.EndTag) node).getElement().breaksFlow())) {
236993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // Check to see if there is a newline in the most recent node's html.
237993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n');
238993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (recentNewLine != -1) {
239993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            lastWrapIndex += recentNewLine;
240993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
241993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // If the last index - last index of a newline is greater than
242993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // wrapSize, add a newline.
243993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (((sb.length() - 1) - lastWrapIndex) > wrapSize) {
244993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            sb.append('\n');
245993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            lastWrapIndex = sb.length() - 1;
246993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
247993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
248993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
249993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
250993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
251993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return sb.toString();
252993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
253993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
254993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
255993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Convert a html region into chunks of html code, each containing
256993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * roughly chunkSize characters.
257993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
258993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) {
259993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(fromNode >= 0 && toNode <= nodes.size());
260993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
261993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    ArrayList<String> chunks = new ArrayList<String>();
262993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
263993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Do a best effort attempt to not split apart certain elements (as of now,
264993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // just the <textarea>). We cannot guarantee that they will not be split
265993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // because the client may specify endpoint nodes that land in the middle
266993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // of an element (although this shouldn't happen if the endpoints returned
267993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // by createBlocks() are properly used).
268993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int stack = 0;
269993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    boolean balanced = true;
270993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
271993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    StringBuilder sb = new StringBuilder(chunkSize + 256);
272993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int n = fromNode; n < toNode; n++) {
273993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HtmlDocument.Node node = nodes.get(n);
274993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      node.toHTML(sb);
275993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
276993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (node instanceof HtmlDocument.Tag) {
277993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.TEXTAREA_ELEMENT.equals(
278f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy            ((HtmlDocument.Tag) node).getElement())) {
279993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          stack++;
280993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
281993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
282993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (node instanceof HtmlDocument.EndTag) {
283993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.TEXTAREA_ELEMENT.equals(
284f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy            ((HtmlDocument.EndTag) node).getElement())) {
285993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (stack == 0) {
286993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            balanced = false;
287993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          } else {
288993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            stack--;
289993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
290993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
291993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
292993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
293993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (stack == 0 && sb.length() >= chunkSize) {
294993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        chunks.add(sb.toString());
295993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        sb.setLength(0);
296993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
297993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
298993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
299993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Don't forget the last chunk!
300993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (sb.length() > 0) {
301993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      chunks.add(sb.toString());
302993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
303993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
304993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // If the tree is not balanced (cut off in the middle of a node), log
305993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // debug data. Clients should fix their code so that the endpoints from
306993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // createBlocks() are properly used.
307993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (!balanced || stack != 0) {
308993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n");
309993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug.append(getHtml());
310993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug.append("\nfromNode: ").append(fromNode);
311993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug.append("\ntoNode: ").append(toNode);
312993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug.append("\nNum nodes_: ").append(getNumNodes());
313993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      for (String chunk : chunks) {
314993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        debug.append("\nChunk:\n").append(chunk);
315993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
316993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      logger.severe(debug.toString());
317993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
318993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
319993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return chunks;
320993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
321993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
322993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
323993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Returns height (maximum length from root to a leaf) of the HTML tree.
324993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return height of the HTML tree.
325993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
326993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public int getTreeHeight() {
327993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int currentHeight = 0;
328993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int maxHeight = 0;
329993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
330993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int i = 0; i < nodes.size(); i++) {
331993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HtmlDocument.Node node = nodes.get(i);
332993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (node instanceof HtmlDocument.Tag) {
333993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        currentHeight++;
334993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (currentHeight > maxHeight) {
335993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          maxHeight = currentHeight;
336993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
337993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (((HtmlDocument.Tag) node).getElement().isEmpty()) {
338993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // Empty tags have no closing pair, so decrease counter here.
339993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          currentHeight--;
340993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
341993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      } else if (node instanceof HtmlDocument.EndTag) {
342993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        currentHeight--;
343993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
344993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
345993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
346993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // TODO(anatol): make this value cachable?
347993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return maxHeight;
348993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
349993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
350993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
351993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  // Creating well-formed blocks within the html tree.
352993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
  /**
   * A Block represents a region of an html tree that
   * <ol>
   * <li>is well-formed, i.e. for each node in the block, all its descendants
   * are also contained in the block, so it is safe to wrap the region
   * within a {@code <table>}, a {@code <div>}, etc.; and
   * <li>starts at the beginning of a "line", e.g. at a {@code <div>} or a
   * {@code <br>}.
   * </ol>
   * The region is described by a pair of node indices, the end being
   * non-inclusive.
   */
  public static class Block {
    /** The starting node of the block. */
    public int start_node;

    /** The ending node (non-inclusive to the block). */
    public int end_node;
  }
367993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
368993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
369993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Creates a list of Blocks, given a text-range.
370993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * We may create multiple blocks if one single well-formed Block cannot be
371993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * created.
372993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *
373993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @param textStart beginning plain-text offset
374993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @param textEnd beginning plain-text offset
375993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @param minNode the smallest node number
376993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @param maxNode the largest node number
377993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return a list of 0 or more Block objects, never null
378993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
379993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) {
380993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
381993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    ArrayList<Block> blocks = new ArrayList<Block>();
382993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int startNode = Math.max(getBlockStart(textStart), minNode);
383993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int endNode = Math.min(getBlockEnd(textEnd), maxNode);
384993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
385993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (DEBUG) {
386993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug("Creating block: " +
387993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            "text pos: " + textStart + "-" + textEnd + "\n" +
388993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            "node pos: " + startNode + "-" + endNode + "\n" +
389993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            plainText.substring(textStart, textEnd));
390993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
391993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
392993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Split up the block [start, end) into one or more blocks that
393993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // are well-formed, and begins at a "line" boundary.
394993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int blockStart = -1;
395993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int n = startNode; n < endNode;) {
396993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
397993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // The node n spans [nBegin, nEnd]
398993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      int nBegin = begins.get(n);
399993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      int nEnd = ends.get(n);
400993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
401993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (blockStart == -1) {
402993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // Check if this is a valid start node
403993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (nBegin >= n && nEnd <= endNode &&
404993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            canBeginBlockAt(n)) {
405993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          blockStart = n;
406993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          n = nEnd + 1;
407993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else {
408993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          n++;
409993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
410993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        continue;
411993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
412993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
413993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // If the node [nBegin, nEnd) lies completely within
414993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // the region then proceed to the (nEnd + 1).
415993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (nBegin >= blockStart && nEnd < endNode) {
416993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        n = nEnd + 1;
417993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        continue;
418993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
419993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
420993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // If we got here, we have to break up the region into one
421993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // or more blocks because the current node cannot be included
422993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // in the region.
423993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (DEBUG) {
424993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        debug("Forcing new block: " + n + " ("  + nBegin + " " + nEnd +
425993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              ") exceeds (" + blockStart + " " + endNode + ")");
426993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
427993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      Block b = new Block();
428993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      b.start_node = blockStart;
429993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      b.end_node = n;
430993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      blocks.add(b);
431993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
432993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      blockStart = -1;
433993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      n++;
434993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
435993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
436993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Last block
437993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (blockStart != -1) {
438993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      Block b = new Block();
439993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      b.start_node = blockStart;
440993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      b.end_node = endNode;
441993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      blocks.add(b);
442993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
443993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
444993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (DEBUG) {
445993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      for (int i = 0; i < blocks.size(); i++) {
446993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        Block b = blocks.get(i);
447993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        debug("Block " + i + "/" + blocks.size() + ": " +
448993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              b.start_node + "-" + b.end_node + " " +
449993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              getPlainText(b.start_node, b.end_node));
450993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
451993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
452993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
453993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return blocks;
454993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
455993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
456993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
457993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Checks if a block can begin starting from a node position
458993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
459993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private boolean canBeginBlockAt(int nodePos) {
460993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int textPos = textPositions[nodePos];
461993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
462993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Make sure that we don't exceed the text position, this happens
463993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // for the last tag nodes.
464993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (textPos == plainText.length()) {
465993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      textPos--;
466993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
467993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
468993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Scan backwards to check if a nodePos is at the beginning
469993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // of a line.
470993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int i = textPos; i > 0; i--) {
471993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      char ch = plainText.charAt(i);
472993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (ch == '\n') {
473993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        return true;
474993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
475f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy      if (i < textPos && !HTML_WHITESPACE.matches(ch)) {
476993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        return false;
477993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
478993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
479993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return true;
480993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
481993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
482993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
483993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Returns the start of a block given a text-pos
484993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
485993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private int getBlockStart(int textPos) {
486993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = Arrays.binarySearch(textPositions, textPos);
487993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (nodenum >= 0) {
488993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Got an exact node alignment. Get the outer most pos that
489993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // matches the text position
490993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) {
491993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        nodenum--;
492993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
493993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    } else {
494993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // textPos matches the middle of a node.
495993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      nodenum = -nodenum - 1;
496993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
497993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
498993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
499993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return nodenum;
500993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
501993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
502993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
503993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Returns the end of a block given a text-pos
504993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
505993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private int getBlockEnd(int textPos) {
506993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = Arrays.binarySearch(textPositions, textPos);
507993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (nodenum >= 0) {
508993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Got an exact node alignment.
509993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) {
510993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        nodenum++;
511993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
512993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    } else {
513993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // textPos matches the middle of a node.
514993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      nodenum = -nodenum - 2;
515993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
516993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(nodenum >= 0 && nodenum <= nodes.size());
517993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return nodenum;
518993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
519993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
520993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
521993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  // Plain text view of the html tree
522993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
523993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
524993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return the plain-text position corresponding to the node
525993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
526993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public int getTextPosition(int node) {
527993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return textPositions[node];
528993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
529993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
530993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
531993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return a plain-text String of the html tree
532993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
533993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getPlainText() {
534993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (plainText == null) {
535993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      convertToPlainText();
536993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
537993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return plainText;
538993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
539993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
540993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
541993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @return a plain-text String of a part of the html tree
542993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
543993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public String getPlainText(int fromNode, int toNode) {
544993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (plainText == null) {
545993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      convertToPlainText();
546993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
547993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int textstart = textPositions[fromNode];
548993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int textend = textPositions[toNode];
549993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return plainText.substring(textstart, textend);
550993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
551993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
552993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
553993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Converts the html tree to plain text.
554993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * We simply iterate through the nodes in the tree.
555993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * As we output the plain-text, we keep track of the text position
556993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * of each node.
557993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * For String nodes, we replace '\n' with ' ' unless we're in a
558993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * <pre> block.
559993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
560993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private void convertToPlainText() {
561993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(plainText == null && textPositions == null);
562993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
563993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int numNodes = nodes.size();
564993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
565993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Keeps track of start text position of each node, including a last
566993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // entry for the size of the text.
567993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    textPositions = new int[numNodes + 1];
568993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
56977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    Converter<String> converter = (Converter<String>) converterFactory.createInstance();
570993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
571993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int i = 0; i < numNodes; i++) {
572993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      textPositions[i] = converter.getPlainTextLength();
573993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      converter.addNode(nodes.get(i), i, ends.get(i));
574993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
575993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
576993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Add a last entry, so that textPositions_[nodes_.size()] is valid.
577993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    textPositions[numNodes] = converter.getPlainTextLength();
578993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
57977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    plainText = converter.getObject();
580993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
581993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (DEBUG) {
582993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      debug("Plain text: " + plainText);
583993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
584993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      for (int i = 0; i < nodes.size(); i++) {
585993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        int textPos = textPositions[i];
586993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        String text = plainText.substring(textPos, textPositions[i + 1]);
587993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        debug("At " + i + ": pos=" + textPos + " " +  text);
588993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
589993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
590993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
591993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
59277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    //------------------------------------------------------------------------
59377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    // Spanned view of the html tree
59477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    //------------------------------------------------------------------------
59577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    /**
59677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * @return a Spanned representation of the html tree
59777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     */
59877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    public Spanned getSpanned() {
59977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        if (constructedSpan == null) {
60077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao            convertToSpan();
60177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        }
60277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        return constructedSpan;
60377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    }
60477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
60577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    /**
60677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * Converts the html tree to plain text.
60777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * We simply iterate through the nodes in the tree.
60877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * As we output the plain-text, we keep track of the text position
60977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * of each node.
61077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * For String nodes, we replace '\n' with ' ' unless we're in a
61177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     * <pre> block.
61277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao     */
61377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    private void convertToSpan() {
61477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        X.assertTrue(constructedSpan == null);
61577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
61677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        int numNodes = nodes.size();
61777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
61877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        Converter<Spanned> converter = (Converter<Spanned>) converterFactory.createInstance();
61977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
62077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        for (int i = 0; i < numNodes; i++) {
62177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao            converter.addNode(nodes.get(i), i, ends.get(i));
62277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        }
62377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
62477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao        constructedSpan = converter.getObject();
62577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    }
62677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao
627993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
628993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Encapsulates the logic for outputting plain text with respect to text
629993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * segments, white space separators, line breaks, and quote marks.
630993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
631f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy  @VisibleForTesting
632993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  static final class PlainTextPrinter {
    /**
     * Separators are whitespace inserted between segments of text. The
     * semantics are such that between any two segments of text, there is
     * at most one separator. As such, separators are ordered in increasing
     * priority, and setting a separator multiple times between text will
     * result in the single separator with the highest priority being used.
     * For example, a LineBreak (one newline) will override a Space, but will
     * be overridden by a BlankLine (two newlines).
     */
    static enum Separator {
      // The values here must be ordered by increasing priority, as the
      // enum's ordinal() method is used when determining if a new separator
      // should override an existing one. Do not reorder or insert constants
      // without keeping that ordering.
      None,
      Space,      // single space
      LineBreak,  // single new line
      BlankLine   // two new lines
    }
651993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
652993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // White space characters that are collapsed as a single space.
653993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Note that characters such as the non-breaking whitespace
654993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // and full-width spaces are not equivalent to the normal spaces.
655993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f";
656993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
657993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
658993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Determines if the given character is considered an HTML space character.
659993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Consecutive HTML space characters are collapsed into a single space when
660993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * not within a PRE element.
661993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
662993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private static boolean isHtmlWhiteSpace(char ch) {
663993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0;
664993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
665993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
    // The buffer in which we accumulate the converted plain text.
    private final StringBuilder sb = new StringBuilder();

    // How many <blockquote> blocks we are currently in.
    private int quoteDepth = 0;

    // How many logical newlines are at the end of the buffer we've output.
    // Note that we can't simply count the newlines at the end of the output
    // buffer because a logical new line may be followed by quote marks.
    //
    // We initialize the value to 2 so that we consume any initial separators,
    // since we don't need separators at the beginning of the output. This also
    // results in correctly outputting any quote marks at the beginning of the
    // output if the first piece of text is within a BLOCKQUOTE element.
    private int endingNewLines = 2;

    // The next separator to be inserted between two text nodes; starts out
    // as None, i.e. nothing is pending.
    private Separator separator = Separator.None;
684993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
685993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /** Returns the current length of the text. */
686993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final int getTextLength() {
687993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      return sb.length();
688993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
689993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
690993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /** Returns the current text. */
691993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final String getText() {
692993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      return sb.toString();
693993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
694993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
695993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
696993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Sets the next separator between two text nodes. A Space separator is
697993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * used if there is any whitespace between the two text nodes when there is
698993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * no intervening element that breaks flow. This is automatically handled
699993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * by the {@link #appendNormalText} function so the client never needs to
700993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * specify this separator.
701993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
702993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * A LineBreak separator (single new line) is used if text segments are
703993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR,
704993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * etc.). The client should set this separator for opening and closing tags
705993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * of any element that breaks flow.
706993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
707993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * A BlankLine separator (two new lines) should be set for opening and
708993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * closing P tags.
709993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
710993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * If this method is called multiple times between text nodes, a
711993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * separator with a higher priority will override that of a lower priority.
712993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
713993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void setSeparator(Separator newSeparator) {
714993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (newSeparator.ordinal() > separator.ordinal()) {
715993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        separator = newSeparator;
716993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
717993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
718993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
719993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /** Increments the current quote depth of the text. */
720993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void incQuoteDepth() {
721993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      quoteDepth++;
722993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
723993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
724993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /** Decrements the current quote depth of the text. */
725993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void decQuoteDepth() {
726993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      quoteDepth = Math.max(0, quoteDepth - 1);
727993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
728993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
729993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
730993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Normalizes the HTML whitespace in the given {@code text} and appends it
731993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * as the next segment of text. This will flush any separator that should
732993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * be appended before the text, as well as any quote marks that should
733993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * follow the last newline if the quote depth is non-zero.
734993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
735993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void appendNormalText(String text) {
736993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (text.length() == 0) {
737993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        return;
738993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
739993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0));
740993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1));
741993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
742993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Strip beginning and ending whitespace.
743993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text);
744993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
745993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Collapse whitespace within the text.
746993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' ');
747993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
748993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (startsWithSpace) {
749993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        setSeparator(Separator.Space);
750993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
751993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
752993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      appendTextDirect(text);
753993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
754993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (endsWithSpace) {
755993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        setSeparator(Separator.Space);
756993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
757993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
758993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
759993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
760993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Appends the given text, preserving all whitespace. This is used for
761993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * appending text in a PRE element.
762993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
763993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void appendPreText(String text) {
764993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // We're in a <pre> block. Split the text into lines, and append
765993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // each line with appendTextDirect() to preserve white space.
766993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      String[] lines = text.split("[\\r\\n]", -1);
767993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
768993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // split() will always return an array with at least one element.
769993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      appendTextDirect(lines[0]);
770993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
771993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // For all of the remaining lines, we append a newline first, which
772993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // takes care of any quote marks that we need to output if the quote
773993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // depth is non-zero.
774993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      for (int i = 1; i < lines.length; i++) {
775993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        appendNewLine();
776993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        appendTextDirect(lines[i]);
777993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
778993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
779993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
780993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
781993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Appends the {@code text} directly to the output, taking into account
782993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * any separator that should be appended before it, and any quote marks
783993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * that should follow the last newline if the quote depth is non-zero.
784993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
785993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * {@code text} must not contain any new lines--in order to handle
786993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * quoting correctly, it is up to the caller to either normalize away the
787993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * newlines, or split the text up into separate lines and handle new lines
788993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * with the {@link #appendNewLine} method.
789993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
790993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * The original {@code text} is not modified in any way. Use this method
791993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * when you need to preserve the original white space.
792993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * <p>
793993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * If the given {@code text} is non empty, this method will result in
794993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * {@code endingNewLines} being reset to 0.
795993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
796993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void appendTextDirect(String text) {
797993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (text.length() == 0) {
798993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        return;
799993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
800993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      Preconditions.checkArgument(text.indexOf('\n') < 0,
801993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira                                  "text must not contain newlines.");
802993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      flushSeparator();
803993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      maybeAddQuoteMarks(true);
804993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      sb.append(text);
805993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      endingNewLines = 0;
806993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
807993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
808993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
809993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Appends a forced line break, which is the equivalent of a BR element.
810993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
811993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    final void appendForcedLineBreak() {
812993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      flushSeparator();
813993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      appendNewLine();
814993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
815993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
816993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
817993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Appends any pending separator to the output buffer. This should be
818993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * called before appending text to the buffer.
819993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
820993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void flushSeparator() {
821993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      switch (separator) {
822993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        case Space:
823993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (endingNewLines == 0) {
824993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            // Only append a space separator if we are not following a new
825993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            // line character. For example, we don't append a separator
826993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            // space after a <br> tag, since the <br>'s newline fulfills the
827993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            // space separation requirement.
828993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            sb.append(" ");
829993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
830993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          break;
831993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        case LineBreak:
832993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          while (endingNewLines < 1) {
833993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            appendNewLine();
834993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
835993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          break;
836993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        case BlankLine:
837993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          while (endingNewLines < 2) {
838993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            appendNewLine();
839993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
840993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          break;
841993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
842993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      separator = Separator.None;
843993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
844993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
845993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
846993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Adds a newline to the output. This handles any quote marks that should
847993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * follow any previous new lines, and increments {@code endingNewLines}.
848993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
849993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void appendNewLine() {
850993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      maybeAddQuoteMarks(false);
851993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      sb.append('\n');
852993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      endingNewLines++;
853993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
854993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
855993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    /**
856993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * Adds quote marks to the output if we are at the beginning of a line.
857993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * One '>' character is used for every level of quoting we are in.
858993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     *
859993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     * @param includeEndingSpace Includes a single space after the quote marks.
860993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira     */
861993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void maybeAddQuoteMarks(boolean includeEndingSpace) {
862993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // We only need to add quote marks if we are at the beginning of line.
863993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (endingNewLines > 0 && quoteDepth > 0) {
864993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        for (int i = 0; i < quoteDepth; i++) {
865993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          sb.append('>');
866993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
867993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (includeEndingSpace) {
868993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          sb.append(' ');
869993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
870993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
871993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
872993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
873993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
874993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
875993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Contains the logic for converting the contents of one HtmlTree into
876993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * plaintext.
877993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
87877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao  public static class DefaultPlainTextConverter implements Converter<String> {
879993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
880993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private static final Set<HTML.Element> BLANK_LINE_ELEMENTS =
881993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        ImmutableSet.of(
882993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            HTML4.P_ELEMENT,
883993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            HTML4.BLOCKQUOTE_ELEMENT,
884993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            HTML4.PRE_ELEMENT);
885993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
886993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private final PlainTextPrinter printer = new PlainTextPrinter();
887993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
888993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private int preDepth = 0;
889c56b233013cf107c702ef9f61305282670ad804aScott Kennedy    private int styleDepth = 0;
890993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
891f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    @Override
892993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) {
893993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (n instanceof HtmlDocument.Text) {        // A string node
894993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
895993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HtmlDocument.Text textNode = (HtmlDocument.Text) n;
896993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        String str = textNode.getText();
897993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
898993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (preDepth > 0) {
899993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.appendPreText(str);
900993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
901c56b233013cf107c702ef9f61305282670ad804aScott Kennedy        } else if (styleDepth > 0) {
902c56b233013cf107c702ef9f61305282670ad804aScott Kennedy          // Append nothing
903993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else {
904993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.appendNormalText(str);
905993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
906993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
907993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      } else if (n instanceof HtmlDocument.Tag) {
908993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
909993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // Check for linebreaking tags.
910993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HtmlDocument.Tag tag = (HtmlDocument.Tag) n;
911993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HTML.Element element = tag.getElement();
912993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
913993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (BLANK_LINE_ELEMENTS.contains(element)) {
914993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
915993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
916993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (HTML4.BR_ELEMENT.equals(element)) {
917993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // The <BR> element is special in that it always adds a newline.
918993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.appendForcedLineBreak();
919993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
920993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (element.breaksFlow()) {
921993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // All other elements that break the flow add a LineBreak separator.
922993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
923993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
924993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (HTML4.HR_ELEMENT.equals(element)) {
925993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            printer.appendNormalText("________________________________");
926993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
927993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
928993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
929993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
930993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
931993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.incQuoteDepth();
932993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
933993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (HTML4.PRE_ELEMENT.equals(element)) {
934993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          preDepth++;
935c56b233013cf107c702ef9f61305282670ad804aScott Kennedy        } else if (HTML4.STYLE_ELEMENT.equals(element)) {
936c56b233013cf107c702ef9f61305282670ad804aScott Kennedy          styleDepth++;
937993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
938993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
939993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      } else if (n instanceof HtmlDocument.EndTag) {
940993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
941993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        // Check for linebreaking tags.
942993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n;
943993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HTML.Element element = endTag.getElement();
944993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
945993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (BLANK_LINE_ELEMENTS.contains(element)) {
946993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.setSeparator(PlainTextPrinter.Separator.BlankLine);
947993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
948993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (element.breaksFlow()) {
949993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // All other elements that break the flow add a LineBreak separator.
950993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.setSeparator(PlainTextPrinter.Separator.LineBreak);
951993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
952993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
953993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) {
954993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          printer.decQuoteDepth();
955993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
956993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (HTML4.PRE_ELEMENT.equals(element)) {
957993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          preDepth--;
958c56b233013cf107c702ef9f61305282670ad804aScott Kennedy        } else if (HTML4.STYLE_ELEMENT.equals(element)) {
959c56b233013cf107c702ef9f61305282670ad804aScott Kennedy          styleDepth--;
960993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
961993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
962993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
963993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
964f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    @Override
965993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    public final int getPlainTextLength() {
966993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      return printer.getTextLength();
967993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
968993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
969f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy    @Override
97077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao    public final String getObject() {
971993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      return printer.getText();
972993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
973993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
974993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
975993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
976993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  // The following methods are used to build the html tree.
977993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  //------------------------------------------------------------------------
/**
 * For building the html tree. Holds the {@code parent} values saved by
 * {@code addStartTag()} so that {@code addEndTag()} can restore them.
 */
private Stack<Integer> stack;
/**
 * Node index of the start tag that is currently open, or -1 when no start
 * tag is open.
 */
private int parent;
981993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
982993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Starts the build process */
983993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void start() {
984993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    stack = new Stack<Integer>();
985993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    parent = -1;
986993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
987993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
988993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Finishes the build process */
989993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void finish() {
990993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(stack.size() == 0);
991993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(parent == -1);
992993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
993993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
994993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
995f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy   * Adds a html start tag, there must followed later by a call to addEndTag()
996993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * to add the matching end tag
997993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
998993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void addStartTag(HtmlDocument.Tag t) {
999993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = nodes.size();
1000993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    addNode(t, nodenum, -1);
1001993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1002993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    stack.add(parent);
1003993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    parent = nodenum;
1004993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1005993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1006993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
1007993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Adds a html end tag, this must be preceded by a previous matching open tag
1008993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
1009993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void addEndTag(HtmlDocument.EndTag t) {
1010993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = nodes.size();
1011993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    addNode(t, parent, nodenum);
1012993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1013993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (parent != -1) {
1014993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      ends.set(parent, nodenum);
1015993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
1016993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1017993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    parent = stack.pop();
1018993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1019993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1020993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Adds a singular tag that does not have a corresponding end tag */
1021993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void addSingularTag(HtmlDocument.Tag t) {
1022993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = nodes.size();
1023993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    addNode(t, nodenum, nodenum);
1024993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1025993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1026993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
1027993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Adds a text
1028993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @param t a plain-text string
1029993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
1030993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  void addText(HtmlDocument.Text t) {
1031993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int nodenum = nodes.size();
1032993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    addNode(t, nodenum, nodenum);
1033993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1034993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1035993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Adds a node */
1036993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private void addNode(HtmlDocument.Node n, int begin, int end) {
1037993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    nodes.add(n);
1038993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    begins.add(begin);
1039993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    ends.add(end);
1040993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1041993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1042993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** For debugging */
1043993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private static final void debug(String str) {
1044993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    logger.finest(str);
1045993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
1046993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
1047993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira}