/**
 * Copyright (c) 2004, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
15993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 161bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedypackage com.google.android.mail.common.html.parser; 17993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Caoimport android.text.Spanned; 1977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 201bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.CharMatcher; 211bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.Preconditions; 221bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.X; 23f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedyimport com.google.common.annotations.VisibleForTesting; 24993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport com.google.common.collect.ImmutableSet; 25993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 26993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.ArrayList; 27993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Arrays; 28993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Collections; 29993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.List; 30993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Set; 31993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.Stack; 32993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.logging.Logger; 33993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 34993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira/** 35993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * HtmlTree represents a parsed and well-formed html text, it provides 36993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * methods to convert to plain text. It also provides methods to find 37993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * well-formed blocks of text, for quote detection. 
38993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 39993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * We don't really build a html tree data structure. Instead, for 40993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * efficiency, and for the ability to do a simple in-order-traversal 41993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * of the tree, we simply keeps a linear list of nodes (in-order). 42993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * The begin_ and end_ arrays keeps track of the starting end ending 43993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * nodes: 44993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 45993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For a string node, begin_[node] = end_[node] = node 46993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For an open tag, begin_[node] = node, end_[node] = the matching end tag 47993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For a close tag, end_[node] = the matching open tag, end_[node] = node 48993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 49993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @author jlim@google.com (Jing Yee Lim) 50993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 51993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereirapublic class HtmlTree { 52f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy // http://www.w3.org/TR/html4/struct/text.html#h-9.1 53f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy private static final CharMatcher HTML_WHITESPACE = CharMatcher.anyOf(" \t\f\u200b\r\n"); 54993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 55993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 56993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * An interface that allows clients to provide their own implementation 5777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * of a {@link Converter}. 
58993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 5977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public static interface ConverterFactory { 60993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 6177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * Creates a new instance of a {@link Converter} to convert 6277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * the contents of an {@link HtmlTree} to some resulting object. 63993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 6477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao Converter createInstance(); 65993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 66993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 67993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 6877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * An interface for an object which converts a single HtmlTree into some object 69993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 7077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public static interface Converter<T> { 71993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 72993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds the given node {@code n} to plain text. 73993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 74993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param n The node to convert to text. 75993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param nodeNum The number of the node among the list of all notes. 76993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param endNum The number of the ending node if this is a start node, 77993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * otherwise the same as {@code nodeNum}. 
78993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 79993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void addNode(HtmlDocument.Node n, int nodeNum, int endNum); 80993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 81993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 82993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Returns the current length of the plain text. 83993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 84993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int getPlainTextLength(); 85993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 86993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 8777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * Returns the current built object. 88993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 8977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao T getObject(); 90993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 91993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 92993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** A factory that produces converters of the default implementation. 
*/ 9377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao private static final ConverterFactory DEFAULT_CONVERTER_FACTORY = 9477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao new ConverterFactory() { 95f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy @Override 9677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public Converter<String> createInstance() { 97993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return new DefaultPlainTextConverter(); 98993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 99993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira }; 100993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 101993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Contains html nodes */ 102993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final List<HtmlDocument.Node> nodes = new ArrayList<HtmlDocument.Node>(); 103993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 104993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Keeps track of beginning and end of each node */ 105993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final Stack<Integer> begins = new Stack<Integer>(); 106993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final Stack<Integer> ends = new Stack<Integer>(); 107993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 108993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Plain text (lazy creation) */ 109993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private String plainText; 110993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 11177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao /** Constructed span (lazy creation) */ 11277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao private Spanned constructedSpan; 11377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 114993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** The html string (lazy creation) */ 115993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private String html; 116993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 
117993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** textPositions[node pos] = the text position */ 118993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int[] textPositions; 119993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 12077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao private ConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY; 121993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 122993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // For debugging only 123993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final boolean DEBUG = false; 124993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 125993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final Logger logger = Logger.getLogger(HtmlTree.class.getName()); 126993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 127993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 128993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 129993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** HtmlTree can only be constructed from this package */ 130993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlTree() { 131993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 132993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 133993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 13477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * Sets a new {@link ConverterFactory} to be used to convert 135993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * the contents of this tree to plaintext. 
136993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 13777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public void setConverterFactory(ConverterFactory factory) { 138993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (factory == null) { 139993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira throw new NullPointerException("factory must not be null"); 140993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 141993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira converterFactory = factory; 142993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 143993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 144993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 145993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Gets the list of node objects. A node can be either a 146993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Tag, EngTag or a String object. 147993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return the nodes of the tree 148993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 149993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public List<HtmlDocument.Node> getNodesList() { 150993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return Collections.unmodifiableList(nodes); 151993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 152993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 153993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 154993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return number of nodes 155993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 156993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public int getNumNodes() { 157993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return nodes.size(); 158993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 159993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 160993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 161f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * Returns number of matching open 
tag node, or {@code endTagNodeNum} itself 162f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * if it does not point to a closing tag. 163f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy */ 164f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy public int findOpenTag(int endTagNodeNum) { 165f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy X.assertTrue(endTagNodeNum >= 0 && endTagNodeNum < nodes.size()); 166f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy return begins.get(endTagNodeNum); 167f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy } 168f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy 169f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy /** 170f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * Returns number of matching closing tag node, or {@code openTagNodeNum} itself 171f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * if it does not point to an open tag or points to an open tag with no closing one. 172f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy */ 173f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy public int findEndTag(int openTagNodeNum) { 174f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy X.assertTrue(openTagNodeNum >= 0 && openTagNodeNum < nodes.size()); 175f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy return ends.get(openTagNodeNum); 176f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy } 177f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy 178f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy /** 179f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * Returns number of matching open/closing tag node, or {@code tagNodeNum} itself 180f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * if it does not point to an open/closing tag (e.g text node or comment). 
181f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy */ 182f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy public int findPairedTag(int tagNodeNum) { 183f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy X.assertTrue(tagNodeNum >= 0 && tagNodeNum < nodes.size()); 184f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy int openNodeNum = begins.get(tagNodeNum); 185f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy int endNodeNum = ends.get(tagNodeNum); 186f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy return tagNodeNum == openNodeNum ? endNodeNum : openNodeNum; 187f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy } 188f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy 189f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy /** 190993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Gets the entire html. 191993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 192993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getHtml() { 193993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return getHtml(-1); 194993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 195993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 196993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 197993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the 198993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * specified size. 
199993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 200993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getHtml(int wrapSize) { 201993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (html == null) { 202993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira html = getHtml(0, nodes.size(), wrapSize); 203993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 204993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return html; 205993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 206993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 207993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Gets parts of the html */ 208993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getHtml(int fromNode, int toNode) { 209993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return getHtml(fromNode, toNode, -1); 210993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 211993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 212993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 213993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Gets parts of the html, if wrapSize is > 0, it tries 214993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * to do wrapping at the specified size. 
215993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 216993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getHtml(int fromNode, int toNode, int wrapSize) { 217993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 218993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 219993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int estSize = (toNode - fromNode) * 10; 220993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira StringBuilder sb = new StringBuilder(estSize); 221993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int lastWrapIndex = 0; // used for wrapping 222993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int n = fromNode; n < toNode; n++) { 223993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Node node = nodes.get(n); 224993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira node.toHTML(sb); 225993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // TODO: maybe we can be smarter about this and not add newlines 226993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // within <pre> tags, unless the whole long line is encompassed 227993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // by the <pre> tag. 228993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (wrapSize > 0) { 229993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // We can only wrap if the last outputted node is an element that 230993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // breaks the flow. Otherwise, we risk the possibility of inserting 231993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // spaces where they shouldn't be. 
232993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if ((node instanceof HtmlDocument.Tag && 233993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ((HtmlDocument.Tag) node).getElement().breaksFlow()) || 234993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira (node instanceof HtmlDocument.EndTag && 235993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ((HtmlDocument.EndTag) node).getElement().breaksFlow())) { 236993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Check to see if there is a newline in the most recent node's html. 237993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int recentNewLine = sb.substring(lastWrapIndex + 1).lastIndexOf('\n'); 238993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (recentNewLine != -1) { 239993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira lastWrapIndex += recentNewLine; 240993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 241993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // If the last index - last index of a newline is greater than 242993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // wrapSize, add a newline. 
243993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (((sb.length() - 1) - lastWrapIndex) > wrapSize) { 244993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append('\n'); 245993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira lastWrapIndex = sb.length() - 1; 246993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 247993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 248993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 249993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 250993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 251993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return sb.toString(); 252993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 253993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 254993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 255993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Convert a html region into chunks of html code, each containing 256993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * roughly chunkSize characters. 257993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 258993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public ArrayList<String> getHtmlChunks(int fromNode, int toNode, int chunkSize) { 259993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(fromNode >= 0 && toNode <= nodes.size()); 260993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 261993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ArrayList<String> chunks = new ArrayList<String>(); 262993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 263993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Do a best effort attempt to not split apart certain elements (as of now, 264993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // just the <textarea>). 
We cannot guarantee that they will not be split 265993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // because the client may specify endpoint nodes that land in the middle 266993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // of an element (although this shouldn't happen if the endpoints returned 267993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // by createBlocks() are properly used). 268993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int stack = 0; 269993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira boolean balanced = true; 270993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 271993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira StringBuilder sb = new StringBuilder(chunkSize + 256); 272993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int n = fromNode; n < toNode; n++) { 273993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Node node = nodes.get(n); 274993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira node.toHTML(sb); 275993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 276993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (node instanceof HtmlDocument.Tag) { 277993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.TEXTAREA_ELEMENT.equals( 278f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy ((HtmlDocument.Tag) node).getElement())) { 279993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira stack++; 280993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 281993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 282993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (node instanceof HtmlDocument.EndTag) { 283993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.TEXTAREA_ELEMENT.equals( 284f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy ((HtmlDocument.EndTag) node).getElement())) { 285993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (stack == 0) { 286993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira balanced = false; 
287993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 288993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira stack--; 289993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 290993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 291993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 292993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 293993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (stack == 0 && sb.length() >= chunkSize) { 294993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira chunks.add(sb.toString()); 295993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.setLength(0); 296993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 297993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 298993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 299993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Don't forget the last chunk! 300993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (sb.length() > 0) { 301993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira chunks.add(sb.toString()); 302993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 303993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 304993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // If the tree is not balanced (cut off in the middle of a node), log 305993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // debug data. Clients should fix their code so that the endpoints from 306993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // createBlocks() are properly used. 
307993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (!balanced || stack != 0) { 308993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira StringBuilder debug = new StringBuilder("Returning unbalanced HTML:\n"); 309993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug.append(getHtml()); 310993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug.append("\nfromNode: ").append(fromNode); 311993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug.append("\ntoNode: ").append(toNode); 312993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug.append("\nNum nodes_: ").append(getNumNodes()); 313993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (String chunk : chunks) { 314993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug.append("\nChunk:\n").append(chunk); 315993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 316993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira logger.severe(debug.toString()); 317993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 318993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 319993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return chunks; 320993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 321993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 322993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 323993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Returns height (maximum length from root to a leaf) of the HTML tree. 324993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return height of the HTML tree. 
325993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 326993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public int getTreeHeight() { 327993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int currentHeight = 0; 328993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int maxHeight = 0; 329993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 330993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 0; i < nodes.size(); i++) { 331993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Node node = nodes.get(i); 332993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (node instanceof HtmlDocument.Tag) { 333993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira currentHeight++; 334993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (currentHeight > maxHeight) { 335993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira maxHeight = currentHeight; 336993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 337993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (((HtmlDocument.Tag) node).getElement().isEmpty()) { 338993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Empty tags have no closing pair, so decrease counter here. 339993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira currentHeight--; 340993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 341993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (node instanceof HtmlDocument.EndTag) { 342993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira currentHeight--; 343993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 344993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 345993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 346993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // TODO(anatol): make this value cachable? 
347993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return maxHeight; 348993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 349993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 350993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 351993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Creating well-formed blocks within the html tree. 352993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 353993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 354993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * A Block represents a region of a html tree that 355993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 1) is well-formed, i.e. for each node in the block, all its descendants 356993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * are also contained in the block. So it's safe to wrap the region 357993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * within a <table> or <div>, etc. 358993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 2) starts at the beginning of a "line", e.g. a <div>, a <br>. 
359993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 360993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public static class Block { 361993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /* The starting node */ 362993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public int start_node; 363993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 364993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /* The ending node (non-inclusive to the block) */ 365993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public int end_node; 366993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 367993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 368993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 369993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Creates a list of Blocks, given a text-range. 370993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * We may create multiple blocks if one single well-formed Block cannot be 371993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * created. 
372993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 373993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param textStart beginning plain-text offset 374993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param textEnd beginning plain-text offset 375993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param minNode the smallest node number 376993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param maxNode the largest node number 377993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return a list of 0 or more Block objects, never null 378993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 379993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public ArrayList<Block> createBlocks(int textStart, int textEnd, int minNode, int maxNode) { 380993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 381993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ArrayList<Block> blocks = new ArrayList<Block>(); 382993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int startNode = Math.max(getBlockStart(textStart), minNode); 383993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int endNode = Math.min(getBlockEnd(textEnd), maxNode); 384993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 385993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (DEBUG) { 386993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug("Creating block: " + 387993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira "text pos: " + textStart + "-" + textEnd + "\n" + 388993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira "node pos: " + startNode + "-" + endNode + "\n" + 389993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira plainText.substring(textStart, textEnd)); 390993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 391993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 392993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Split up the block [start, end) into one or more blocks that 393993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // 
are well-formed, and begins at a "line" boundary. 394993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int blockStart = -1; 395993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int n = startNode; n < endNode;) { 396993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 397993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The node n spans [nBegin, nEnd] 398993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nBegin = begins.get(n); 399993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nEnd = ends.get(n); 400993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 401993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (blockStart == -1) { 402993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Check if this is a valid start node 403993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (nBegin >= n && nEnd <= endNode && 404993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira canBeginBlockAt(n)) { 405993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira blockStart = n; 406993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira n = nEnd + 1; 407993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 408993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira n++; 409993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 410993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira continue; 411993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 412993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 413993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // If the node [nBegin, nEnd) lies completely within 414993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // the region then proceed to the (nEnd + 1). 
415993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (nBegin >= blockStart && nEnd < endNode) { 416993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira n = nEnd + 1; 417993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira continue; 418993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 419993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 420993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // If we got here, we have to break up the region into one 421993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // or more blocks because the current node cannot be included 422993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // in the region. 423993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (DEBUG) { 424993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug("Forcing new block: " + n + " (" + nBegin + " " + nEnd + 425993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ") exceeds (" + blockStart + " " + endNode + ")"); 426993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 427993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira Block b = new Block(); 428993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira b.start_node = blockStart; 429993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira b.end_node = n; 430993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira blocks.add(b); 431993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 432993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira blockStart = -1; 433993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira n++; 434993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 435993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 436993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Last block 437993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (blockStart != -1) { 438993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira Block b = new Block(); 439993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira b.start_node = blockStart; 
440993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira b.end_node = endNode; 441993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira blocks.add(b); 442993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 443993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 444993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (DEBUG) { 445993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 0; i < blocks.size(); i++) { 446993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira Block b = blocks.get(i); 447993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug("Block " + i + "/" + blocks.size() + ": " + 448993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira b.start_node + "-" + b.end_node + " " + 449993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira getPlainText(b.start_node, b.end_node)); 450993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 451993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 452993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 453993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return blocks; 454993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 455993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 456993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 457993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Checks if a block can begin starting from a node position 458993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 459993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private boolean canBeginBlockAt(int nodePos) { 460993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int textPos = textPositions[nodePos]; 461993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 462993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Make sure that we don't exceed the text position, this happens 463993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // for the last tag nodes. 
464993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (textPos == plainText.length()) { 465993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira textPos--; 466993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 467993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 468993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Scan backwards to check if a nodePos is at the beginning 469993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // of a line. 470993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = textPos; i > 0; i--) { 471993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira char ch = plainText.charAt(i); 472993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (ch == '\n') { 473993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return true; 474993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 475f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy if (i < textPos && !HTML_WHITESPACE.matches(ch)) { 476993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return false; 477993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 478993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 479993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return true; 480993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 481993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 482993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 483993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Returns the start of a block given a text-pos 484993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 485993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int getBlockStart(int textPos) { 486993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = Arrays.binarySearch(textPositions, textPos); 487993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (nodenum >= 0) { 488993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Got an exact node alignment. 
Get the outer most pos that 489993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // matches the text position 490993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while ((nodenum - 1) >= 0 && textPositions[nodenum - 1] == textPos) { 491993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira nodenum--; 492993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 493993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 494993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // textPos matches the middle of a node. 495993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira nodenum = -nodenum - 1; 496993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 497993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 498993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 499993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return nodenum; 500993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 501993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 502993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 503993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Returns the end of a block given a text-pos 504993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 505993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int getBlockEnd(int textPos) { 506993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = Arrays.binarySearch(textPositions, textPos); 507993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (nodenum >= 0) { 508993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Got an exact node alignment. 
509993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while ((nodenum + 1) < textPositions.length && textPositions[nodenum + 1] == textPos) { 510993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira nodenum++; 511993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 512993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 513993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // textPos matches the middle of a node. 514993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira nodenum = -nodenum - 2; 515993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 516993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(nodenum >= 0 && nodenum <= nodes.size()); 517993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return nodenum; 518993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 519993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 520993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 521993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Plain text view of the html tree 522993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 523993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 524993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return the plain-text position corresponding to the node 525993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 526993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public int getTextPosition(int node) { 527993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return textPositions[node]; 528993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 529993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 530993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 531993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return a plain-text String of the html tree 532993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 
533993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getPlainText() { 534993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (plainText == null) { 535993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira convertToPlainText(); 536993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 537993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return plainText; 538993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 539993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 540993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 541993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @return a plain-text String of a part of the html tree 542993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 543993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public String getPlainText(int fromNode, int toNode) { 544993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (plainText == null) { 545993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira convertToPlainText(); 546993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 547993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int textstart = textPositions[fromNode]; 548993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int textend = textPositions[toNode]; 549993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return plainText.substring(textstart, textend); 550993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 551993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 552993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 553993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Converts the html tree to plain text. 554993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * We simply iterate through the nodes in the tree. 555993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * As we output the plain-text, we keep track of the text position 556993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * of each node. 
557993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For String nodes, we replace '\n' with ' ' unless we're in a 558993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <pre> block. 559993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 560993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void convertToPlainText() { 561993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(plainText == null && textPositions == null); 562993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 563993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int numNodes = nodes.size(); 564993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 565993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Keeps track of start text position of each node, including a last 566993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // entry for the size of the text. 567993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira textPositions = new int[numNodes + 1]; 568993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 56977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao Converter<String> converter = (Converter<String>) converterFactory.createInstance(); 570993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 571993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 0; i < numNodes; i++) { 572993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira textPositions[i] = converter.getPlainTextLength(); 573993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira converter.addNode(nodes.get(i), i, ends.get(i)); 574993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 575993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 576993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Add a last entry, so that textPositions_[nodes_.size()] is valid. 
577993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira textPositions[numNodes] = converter.getPlainTextLength(); 578993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 57977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao plainText = converter.getObject(); 580993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 581993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (DEBUG) { 582993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug("Plain text: " + plainText); 583993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 584993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 0; i < nodes.size(); i++) { 585993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int textPos = textPositions[i]; 586993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira String text = plainText.substring(textPos, textPositions[i + 1]); 587993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira debug("At " + i + ": pos=" + textPos + " " + text); 588993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 589993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 590993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 591993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 59277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao //------------------------------------------------------------------------ 59377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao // Spanned view of the html tree 59477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao //------------------------------------------------------------------------ 59577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao /** 59677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * @return a Spanned representation of the html tree 59777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao */ 59877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public Spanned getSpanned() { 59977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao if (constructedSpan == null) { 60077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao convertToSpan(); 
60177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao } 60277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao return constructedSpan; 60377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao } 60477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 60577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao /** 60677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * Converts the html tree to plain text. 60777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * We simply iterate through the nodes in the tree. 60877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * As we output the plain-text, we keep track of the text position 60977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * of each node. 61077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * For String nodes, we replace '\n' with ' ' unless we're in a 61177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao * <pre> block. 61277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao */ 61377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao private void convertToSpan() { 61477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao X.assertTrue(constructedSpan == null); 61577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 61677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao int numNodes = nodes.size(); 61777b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 61877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao Converter<Spanned> converter = (Converter<Spanned>) converterFactory.createInstance(); 61977b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 62077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao for (int i = 0; i < numNodes; i++) { 62177b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao converter.addNode(nodes.get(i), i, ends.get(i)); 62277b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao } 62377b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 62477b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao constructedSpan = converter.getObject(); 62577b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao } 62677b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao 627993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 
628993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Encapsulates the logic for outputting plain text with respect to text 629993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * segments, white space separators, line breaks, and quote marks. 630993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 631f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy @VisibleForTesting 632993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira static final class PlainTextPrinter { 633993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 634993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Separators are whitespace inserted between segments of text. The 635993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * semantics are such that between any two segments of text, there is 636993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * at most one separator. As such, separators are ordered in increasing 637993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * priority, and setting a separator multiple times between text will 638993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * result in the single separator with the highest priority being used. 639993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For example, a LineBreak (one newline) will override a Space, but will 640993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * be overriden by a BlankLine (two newlines). 641993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 642993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira static enum Separator { 643993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The values here must be ordered by increasing priority, as the 644993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // enum's ordinal() method is used when determining if a new separator 645993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // should override an existing one. 
646993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira None, 647993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira Space, // single space 648993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira LineBreak, // single new line 649993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira BlankLine // two new lines 650993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 651993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 652993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // White space characters that are collapsed as a single space. 653993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Note that characters such as the non-breaking whitespace 654993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // and full-width spaces are not equivalent to the normal spaces. 655993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final String HTML_SPACE_EQUIVALENTS = " \n\r\t\f"; 656993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 657993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 658993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Determines if the given character is considered an HTML space character. 659993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Consecutive HTML space characters are collapsed into a single space when 660993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * not within a PRE element. 
661993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 662993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static boolean isHtmlWhiteSpace(char ch) { 663993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return HTML_SPACE_EQUIVALENTS.indexOf(ch) >= 0; 664993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 665993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 666993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The buffer in which we accumulate the converted plain text 667993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final StringBuilder sb = new StringBuilder(); 668993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 669993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // How many <blockquote> blocks we are in. 670993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int quoteDepth = 0; 671993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 672993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // How many logical newlines are at the end of the buffer we've outputted. 673993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Note that we can't simply count the newlines at the end of the output 674993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // buffer because a logical new line may be followed by quote marks. 675993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // 676993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // We initialize the value to 2 so that we consume any initial separators, 677993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // since we don't need separators at the beginning of the output. This also 678993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // results in correctly outputting any quote marks at the beginning of the 679993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // output if the first piece of text is within a BLOCKQUOTE element. 
680993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int endingNewLines = 2; 681993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 682993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The next separator to be inserted between two text nodes. 683993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private Separator separator = Separator.None; 684993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 685993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Returns the current length of the text. */ 686993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final int getTextLength() { 687993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return sb.length(); 688993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 689993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 690993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Returns the current text. */ 691993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final String getText() { 692993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return sb.toString(); 693993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 694993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 695993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 696993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Sets the next separator between two text nodes. A Space separator is 697993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * used if there is any whitespace between the two text nodes when there is 698993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * no intervening element that breaks flow. This is automatically handled 699993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * by the {@link #appendNormalText} function so the client never needs to 700993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * specify this separator. 
701993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 702993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * A LineBreak separator (single new line) is used if text segments are 703993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * separated or enclosed by elements that break flow (e.g. DIV, TABLE, HR, 704993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * etc.). The client should set this separator for opening and closing tags 705993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * of any element that breaks flow. 706993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 707993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * A BlankLine separator (two new lines) should be set for opening and 708993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * closing P tags. 709993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 710993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * If this method is called multiple times between text nodes, a 711993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * separator with a higher priority will override that of a lower priority. 712993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 713993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void setSeparator(Separator newSeparator) { 714993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (newSeparator.ordinal() > separator.ordinal()) { 715993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira separator = newSeparator; 716993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 717993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 718993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 719993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Increments the current quote depth of the text. 
*/ 720993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void incQuoteDepth() { 721993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira quoteDepth++; 722993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 723993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 724993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Decrements the current quote depth of the text. */ 725993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void decQuoteDepth() { 726993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira quoteDepth = Math.max(0, quoteDepth - 1); 727993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 728993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 729993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 730993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Normalizes the HTML whitespace in the given {@code text} and appends it 731993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * as the next segment of text. This will flush any separator that should 732993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * be appended before the text, as well as any quote marks that should 733993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * follow the last newline if the quote depth is non-zero. 
734993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 735993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void appendNormalText(String text) { 736993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (text.length() == 0) { 737993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return; 738993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 739993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira boolean startsWithSpace = isHtmlWhiteSpace(text.charAt(0)); 740993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira boolean endsWithSpace = isHtmlWhiteSpace(text.charAt(text.length() - 1)); 741993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 742993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Strip beginning and ending whitespace. 743993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).trimFrom(text); 744993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 745993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Collapse whitespace within the text. 
746993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira text = CharMatcher.anyOf(HTML_SPACE_EQUIVALENTS).collapseFrom(text, ' '); 747993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 748993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (startsWithSpace) { 749993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira setSeparator(Separator.Space); 750993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 751993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 752993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendTextDirect(text); 753993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 754993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (endsWithSpace) { 755993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira setSeparator(Separator.Space); 756993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 757993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 758993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 759993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 760993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Appends the given text, preserving all whitespace. This is used for 761993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * appending text in a PRE element. 762993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 763993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void appendPreText(String text) { 764993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // We're in a <pre> block. Split the text into lines, and append 765993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // each line with appendTextDirect() to preserve white space. 766993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira String[] lines = text.split("[\\r\\n]", -1); 767993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 768993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // split() will always return an array with at least one element. 
769993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendTextDirect(lines[0]); 770993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 771993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // For all of the remaining lines, we append a newline first, which 772993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // takes care of any quote marks that we need to output if the quote 773993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // depth is non-zero. 774993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 1; i < lines.length; i++) { 775993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendNewLine(); 776993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendTextDirect(lines[i]); 777993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 778993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 779993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 780993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 781993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Appends the {@code text} directly to the output, taking into account 782993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * any separator that should be appended before it, and any quote marks 783993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * that should follow the last newline if the quote depth is non-zero. 784993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 785993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * {@code text} must not contain any new lines--in order to handle 786993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * quoting correctly, it is up to the caller to either normalize away the 787993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * newlines, or split the text up into separate lines and handle new lines 788993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * with the {@link #appendNewLine} method. 
789993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 790993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * The original {@code text} is not modified in any way. Use this method 791993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * when you need to preserve the original white space. 792993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <p> 793993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * If the given {@code text} is non empty, this method will result in 794993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * {@code endingNewLines} being reset to 0. 795993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 796993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void appendTextDirect(String text) { 797993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (text.length() == 0) { 798993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return; 799993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 800993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira Preconditions.checkArgument(text.indexOf('\n') < 0, 801993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira "text must not contain newlines."); 802993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira flushSeparator(); 803993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira maybeAddQuoteMarks(true); 804993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append(text); 805993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira endingNewLines = 0; 806993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 807993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 808993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 809993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Appends a forced line break, which is the equivalent of a BR element. 
810993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 811993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira final void appendForcedLineBreak() { 812993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira flushSeparator(); 813993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendNewLine(); 814993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 815993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 816993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 817993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Appends any pending separator to the output buffer. This should be 818993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * called before appending text to the buffer. 819993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 820993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void flushSeparator() { 821993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira switch (separator) { 822993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira case Space: 823993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (endingNewLines == 0) { 824993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Only append a space separator if we are not following a new 825993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // line character. For example, we don't append a separator 826993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // space after a <br> tag, since the <br>'s newline fulfills the 827993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // space separation requirement. 
828993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append(" "); 829993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 830993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira break; 831993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira case LineBreak: 832993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while (endingNewLines < 1) { 833993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendNewLine(); 834993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 835993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira break; 836993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira case BlankLine: 837993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while (endingNewLines < 2) { 838993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira appendNewLine(); 839993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 840993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira break; 841993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 842993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira separator = Separator.None; 843993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 844993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 845993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 846993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds a newline to the output. This handles any quote marks that should 847993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * follow any previous new lines, and increments {@code endingNewLines}. 
848993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 849993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void appendNewLine() { 850993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira maybeAddQuoteMarks(false); 851993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append('\n'); 852993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira endingNewLines++; 853993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 854993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 855993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 856993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds quote marks to the output if we are at the beginning of a line. 857993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * One '>' character is used for every level of quoting we are in. 858993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 859993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param includeEndingSpace Includes a single space after the quote marks. 860993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 861993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void maybeAddQuoteMarks(boolean includeEndingSpace) { 862993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // We only need to add quote marks if we are at the beginning of line. 
863993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (endingNewLines > 0 && quoteDepth > 0) { 864993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = 0; i < quoteDepth; i++) { 865993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append('>'); 866993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 867993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (includeEndingSpace) { 868993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira sb.append(' '); 869993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 870993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 871993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 872993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 873993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 874993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 875993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Contains the logic for converting the contents of one HtmlTree into 876993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * plaintext. 
877993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 87877b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public static class DefaultPlainTextConverter implements Converter<String> { 879993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 880993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final Set<HTML.Element> BLANK_LINE_ELEMENTS = 881993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ImmutableSet.of( 882993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.P_ELEMENT, 883993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.BLOCKQUOTE_ELEMENT, 884993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.PRE_ELEMENT); 885993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 886993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final PlainTextPrinter printer = new PlainTextPrinter(); 887993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 888993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int preDepth = 0; 889c56b233013cf107c702ef9f61305282670ad804aScott Kennedy private int styleDepth = 0; 890993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 891f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy @Override 892993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void addNode(HtmlDocument.Node n, int nodeNum, int endNum) { 893993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (n instanceof HtmlDocument.Text) { // A string node 894993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 895993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Text textNode = (HtmlDocument.Text) n; 896993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira String str = textNode.getText(); 897993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 898993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (preDepth > 0) { 899993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.appendPreText(str); 900993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 
901c56b233013cf107c702ef9f61305282670ad804aScott Kennedy } else if (styleDepth > 0) { 902c56b233013cf107c702ef9f61305282670ad804aScott Kennedy // Append nothing 903993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 904993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.appendNormalText(str); 905993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 906993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 907993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (n instanceof HtmlDocument.Tag) { 908993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 909993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Check for linebreaking tags. 910993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Tag tag = (HtmlDocument.Tag) n; 911993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = tag.getElement(); 912993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 913993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (BLANK_LINE_ELEMENTS.contains(element)) { 914993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 915993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 916993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.BR_ELEMENT.equals(element)) { 917993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The <BR> element is special in that it always adds a newline. 918993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.appendForcedLineBreak(); 919993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 920993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (element.breaksFlow()) { 921993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // All other elements that break the flow add a LineBreak separator. 
922993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 923993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 924993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.HR_ELEMENT.equals(element)) { 925993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.appendNormalText("________________________________"); 926993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 927993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 928993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 929993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 930993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 931993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.incQuoteDepth(); 932993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 933993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.PRE_ELEMENT.equals(element)) { 934993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira preDepth++; 935c56b233013cf107c702ef9f61305282670ad804aScott Kennedy } else if (HTML4.STYLE_ELEMENT.equals(element)) { 936c56b233013cf107c702ef9f61305282670ad804aScott Kennedy styleDepth++; 937993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 938993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 939993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (n instanceof HtmlDocument.EndTag) { 940993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 941993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Check for linebreaking tags. 
942993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.EndTag endTag = (HtmlDocument.EndTag) n; 943993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = endTag.getElement(); 944993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 945993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (BLANK_LINE_ELEMENTS.contains(element)) { 946993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.setSeparator(PlainTextPrinter.Separator.BlankLine); 947993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 948993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (element.breaksFlow()) { 949993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // All other elements that break the flow add a LineBreak separator. 950993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.setSeparator(PlainTextPrinter.Separator.LineBreak); 951993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 952993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 953993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.BLOCKQUOTE_ELEMENT.equals(element)) { 954993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira printer.decQuoteDepth(); 955993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 956993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.PRE_ELEMENT.equals(element)) { 957993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira preDepth--; 958c56b233013cf107c702ef9f61305282670ad804aScott Kennedy } else if (HTML4.STYLE_ELEMENT.equals(element)) { 959c56b233013cf107c702ef9f61305282670ad804aScott Kennedy styleDepth--; 960993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 961993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 962993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 963993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 964f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy @Override 965993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public final int getPlainTextLength() { 
966993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return printer.getTextLength(); 967993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 968993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 969f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy @Override 97077b4c2c31d7601665c337ce5cbc9d84fb9332be8Jin Cao public final String getObject() { 971993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return printer.getText(); 972993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 973993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 974993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 975993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 976993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // The following methods are used to build the html tree. 977993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira //------------------------------------------------------------------------ 978993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** For building the html tree */ 979993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private Stack<Integer> stack; 980993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int parent; 981993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 982993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Starts the build process */ 983993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void start() { 984993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira stack = new Stack<Integer>(); 985993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira parent = -1; 986993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 987993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 988993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Finishes the build process */ 989993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void finish() { 990993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(stack.size() == 0); 
991993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(parent == -1); 992993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 993993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 994993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 995f13c3be6a8772a9d6d35c247340d14b863a64befScott Kennedy * Adds a html start tag, there must followed later by a call to addEndTag() 996993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * to add the matching end tag 997993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 998993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void addStartTag(HtmlDocument.Tag t) { 999993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = nodes.size(); 1000993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addNode(t, nodenum, -1); 1001993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1002993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira stack.add(parent); 1003993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira parent = nodenum; 1004993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1005993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1006993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 1007993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds a html end tag, this must be preceded by a previous matching open tag 1008993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 1009993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void addEndTag(HtmlDocument.EndTag t) { 1010993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = nodes.size(); 1011993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addNode(t, parent, nodenum); 1012993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1013993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (parent != -1) { 1014993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ends.set(parent, nodenum); 1015993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1016993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy 
Pereira 1017993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira parent = stack.pop(); 1018993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1019993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1020993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Adds a singular tag that does not have a corresponding end tag */ 1021993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void addSingularTag(HtmlDocument.Tag t) { 1022993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = nodes.size(); 1023993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addNode(t, nodenum, nodenum); 1024993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1025993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1026993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 1027993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds a text 1028993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @param t a plain-text string 1029993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 1030993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void addText(HtmlDocument.Text t) { 1031993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int nodenum = nodes.size(); 1032993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addNode(t, nodenum, nodenum); 1033993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1034993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1035993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Adds a node */ 1036993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void addNode(HtmlDocument.Node n, int begin, int end) { 1037993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira nodes.add(n); 1038993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira begins.add(begin); 1039993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ends.add(end); 1040993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1041993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1042993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 
For debugging */ 1043993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final void debug(String str) { 1044993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira logger.finest(str); 1045993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 1046993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 1047993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira}