/**
 * Copyright (c) 2004, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.android.mail.lib.html.parser;

import com.android.mail.lib.base.CharMatcher;
import com.android.mail.lib.base.Preconditions;
import com.android.mail.lib.base.X;
import com.google.common.collect.ImmutableSet;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.logging.Logger;

/**
 * HtmlTree represents a parsed and well-formed html text. It provides
 * methods to convert to plain text. It also provides methods to find
 * well-formed blocks of text, for quote detection.
 *
 * We don't really build a html tree data structure. Instead, for
 * efficiency, and for the ability to do a simple in-order traversal
 * of the tree, we simply keep a linear list of nodes (in-order).
 * The begin_ and end_ arrays keep track of the starting and ending
 * nodes:
 *
 * For a string node, begin_[node] = end_[node] = node
 * For an open tag, begin_[node] = node, end_[node] = the matching end tag
 * For a close tag, begin_[node] = the matching open tag, end_[node] = node
 *
 * @author jlim@google.com (Jing Yee Lim)
 */
public class HtmlTree {

  /**
   * An interface that allows clients to provide their own implementation
   * of a {@link PlainTextConverter}.
*/ public static interface PlainTextConverterFactory { /** * Creates a new instance of a {@link PlainTextConverter} to convert * the contents of an {@link HtmlTree} to plain text. */ PlainTextConverter createInstance(); } /** * An interface for an object which converts a single HtmlTree into * plaintext. */ public static interface PlainTextConverter { /** * Adds the given node {@code n} to plain text. * * @param n The node to convert to text. * @param nodeNum The number of the node among the list of all notes. * @param endNum The number of the ending node if this is a start node, * otherwise the same as {@code nodeNum}. */ void addNode(HtmlDocument.Node n, int nodeNum, int endNum); /** * Returns the current length of the plain text. */ int getPlainTextLength(); /** * Returns the current plain text. */ String getPlainText(); } /** A factory that produces converters of the default implementation. */ private static final PlainTextConverterFactory DEFAULT_CONVERTER_FACTORY = new PlainTextConverterFactory() { public PlainTextConverter createInstance() { return new DefaultPlainTextConverter(); } }; /** Contains html nodes */ private final List nodes = new ArrayList(); /** Keeps track of beginning and end of each node */ private final Stack begins = new Stack(); private final Stack ends = new Stack(); /** Plain text (lazy creation) */ private String plainText; /** The html string (lazy creation) */ private String html; /** textPositions[node pos] = the text position */ private int[] textPositions; private PlainTextConverterFactory converterFactory = DEFAULT_CONVERTER_FACTORY; // For debugging only private static final boolean DEBUG = false; private static final Logger logger = Logger.getLogger(HtmlTree.class.getName()); //------------------------------------------------------------------------ /** HtmlTree can only be constructed from this package */ HtmlTree() { } /** * Sets a new {@link PlainTextConverterFactory} to be used to convert * the contents of this tree to 
plaintext.
   *
   * @param factory the factory to use from now on; must not be null
   * @throws NullPointerException if {@code factory} is null
   */
  public void setPlainTextConverterFactory(PlainTextConverterFactory factory) {
    if (factory == null) {
      throw new NullPointerException("factory must not be null");
    }
    converterFactory = factory;
  }

  /**
   * Gets the list of node objects. A node can be either a
   * Tag, EndTag or a string node.
   *
   * @return an unmodifiable view of the nodes of the tree
   */
  public List getNodesList() {
    return Collections.unmodifiableList(nodes);
  }

  /**
   * @return number of nodes
   */
  public int getNumNodes() {
    return nodes.size();
  }

  /**
   * Gets the entire html, without any wrapping.
   */
  public String getHtml() {
    return getHtml(-1);
  }

  /**
   * Wrap size the cached {@code html} string was rendered with. All
   * non-positive wrap sizes mean "no wrapping" and are stored as 0.
   */
  private int cachedHtmlWrapSize;

  /**
   * Gets the entire html, if wrapSize is > 0, it tries to do wrapping at the
   * specified size.
   *
   * The result is cached. The cache is only reused when the requested wrap
   * size matches the one the cached string was built with, so alternating
   * between wrapped and unwrapped requests never returns a stale rendering.
   *
   * @param wrapSize the line length to wrap at, or a value <= 0 for no wrapping
   * @return the html of all nodes
   */
  public String getHtml(int wrapSize) {
    // Every non-positive value behaves the same (no wrapping), so they share
    // one cache slot.
    int effectiveWrapSize = Math.max(wrapSize, 0);
    if (html == null || cachedHtmlWrapSize != effectiveWrapSize) {
      html = getHtml(0, nodes.size(), wrapSize);
      cachedHtmlWrapSize = effectiveWrapSize;
    }
    return html;
  }

  /**
   * Gets parts of the html, without any wrapping.
   *
   * @param fromNode index of the first node to include
   * @param toNode index one past the last node to include
   */
  public String getHtml(int fromNode, int toNode) {
    return getHtml(fromNode, toNode, -1);
  }

  /**
   * Gets parts of the html, if wrapSize is > 0, it tries
   * to do wrapping at the specified size.
   *
   * A newline is only ever appended directly after a start or end tag whose
   * element breaks the flow, and only when the current output line has grown
   * longer than {@code wrapSize} characters.
   *
   * @param fromNode index of the first node to include; must be >= 0
   * @param toNode index one past the last node to include; must be
   *        <= the number of nodes
   * @param wrapSize the line length to wrap at, or a value <= 0 for no wrapping
   * @return the html of the nodes in [fromNode, toNode)
   */
  public String getHtml(int fromNode, int toNode, int wrapSize) {
    X.assertTrue(fromNode >= 0 && toNode <= nodes.size());

    int estSize = (toNode - fromNode) * 10;
    StringBuilder sb = new StringBuilder(estSize);

    // Absolute index in sb of the most recent newline seen so far; -1 stands
    // for the (virtual) line start before the first character.
    int lastNewlineIndex = -1;
    for (int n = fromNode; n < toNode; n++) {
      HtmlDocument.Node node = nodes.get(n);
      node.toHTML(sb);
      // TODO: maybe we can be smarter about this and not add newlines
      // inside preformatted content (e.g. <pre>), unless the whole long
      // line is encompassed by it.

      // We can only wrap if the last outputted node is an element that
      // breaks the flow. Otherwise, we risk the possibility of inserting
      // spaces where they shouldn't be.
      if (wrapSize > 0 && isFlowBreakingNode(node)) {
        // Pick up any newline contained in the html emitted since the
        // last check, so the current line length is measured from it.
        lastNewlineIndex = findLastNewlineAfter(sb, lastNewlineIndex);

        // Number of characters on the current (last) output line.
        int currentLineLength = sb.length() - 1 - lastNewlineIndex;
        if (currentLineLength > wrapSize) {
          sb.append('\n');
          lastNewlineIndex = sb.length() - 1;
        }
      }
    }

    return sb.toString();
  }

  /**
   * Returns true if {@code node} is a start tag or an end tag whose element
   * breaks the flow.
   */
  private static boolean isFlowBreakingNode(HtmlDocument.Node node) {
    return (node instanceof HtmlDocument.Tag &&
              ((HtmlDocument.Tag) node).getElement().breaksFlow()) ||
           (node instanceof HtmlDocument.EndTag &&
              ((HtmlDocument.EndTag) node).getElement().breaksFlow());
  }

  /**
   * Returns the absolute index of the last '\n' in {@code sb} located
   * strictly after {@code floor}, or {@code floor} itself if there is none.
   * Scans backwards and never looks at or before {@code floor}, so no part
   * of the buffer is copied or re-scanned.
   */
  private static int findLastNewlineAfter(StringBuilder sb, int floor) {
    for (int i = sb.length() - 1; i > floor; i--) {
      if (sb.charAt(i) == '\n') {
        return i;
      }
    }
    return floor;
  }

  /**
   * Convert a html region into chunks of html code, each containing
   * roughly chunkSize characters.
   */
  public ArrayList getHtmlChunks(int fromNode, int toNode, int chunkSize) {
    X.assertTrue(fromNode >= 0 && toNode <= nodes.size());

    ArrayList chunks = new ArrayList();

    // Do a best effort attempt to not split apart certain elements (as of now,
    // just the