/**
 * Copyright (c) 2004, Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package com.google.android.mail.common.html.parser;

import com.google.android.mail.common.base.X;
import com.google.android.mail.common.html.parser.HtmlDocument.EndTag;
import com.google.common.io.ByteStreams;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

28993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira/**
29993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * HtmlTreeBuilder builds a well-formed HtmlTree.
30993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira *
31993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @see HtmlTree
32993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @author jlim@google.com (Jing Yee Lim)
33993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */
34993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereirapublic class HtmlTreeBuilder implements HtmlDocument.Visitor {
35993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
36993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName());
37993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
38993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Stack contains HTML4.Element objects to keep track of unclosed tags */
39993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private final List<HTML.Element> stack = new ArrayList<HTML.Element>();
40993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private final TableFixer tableFixer = new TableFixer();
41993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private HtmlTree tree;
42993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private boolean built = false;
43993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
44993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Gets the built html tree */
45993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public HtmlTree getTree() {
46993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    X.assertTrue(built);
47993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return tree;
48993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
49993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
50993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlDocument.Visitor.start */
51993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void start() {
52993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tree = new HtmlTree();
53993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tree.start();
54993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
55993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
56993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlDocument.Visitor.finish */
57993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void finish() {
58993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Close all tags
59993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    while (stack.size() > 0) {
60993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      addMissingEndTag();
61993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
62993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tableFixer.finish();
63993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tree.finish();
64993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
65993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    built = true;
66993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
67993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
68993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlDocument.Visitor.visitTag */
69993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void visitTag(HtmlDocument.Tag t) {
70993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tableFixer.seeTag(t);
71993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
72993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HTML.Element element = t.getElement();
73993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (element.isEmpty()) {
74993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tree.addSingularTag(t);
75993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    } else if (t.isSelfTerminating()) {
76993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Explicitly create a non-selfterminating open tag and add it to the tree
77993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // and also immediately add the corresponding close tag. This is done
78993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list
79993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // will be balanced consistently.
80993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Otherwise there is a possibility of "<span /></span>" for example, if
81993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // the created tree is converted to string through toXHTML.
82993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tree.addStartTag(HtmlDocument.createTag(element,
83993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          t.getAttributes(), t.getOriginalHtmlBeforeAttributes(),
84993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          t.getOriginalHtmlAfterAttributes()));
85993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      EndTag end = HtmlDocument.createEndTag(element);
86993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tableFixer.seeEndTag(end);
87993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tree.addEndTag(end);
88993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    } else {
89993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tree.addStartTag(t);
90993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      push(element);                       // Track the open tags
91993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
92993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
93993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
94993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlVisitor.visit */
95993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void visitEndTag(HtmlDocument.EndTag t) {
96993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
97993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Here we pop back to the start tag
98993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HTML.Element element = t.getElement();
99993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    int pos = findStartTag(element);
100993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    if (pos >= 0) {
101993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
102993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Add missing end-tags if any
103993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      while (pos < stack.size() - 1) {
104993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        addMissingEndTag();
105993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
106993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
107993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      pop();
108993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tableFixer.seeEndTag(t);
109993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      tree.addEndTag(t);
110993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
111993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    } else {
112993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // Not found, ignore this end tag
113993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      logger.finest("Ignoring end tag: " + element.getName());
114993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
115993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
116993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
117993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlDocument.Visitor.visitText */
118993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void visitText(HtmlDocument.Text t) {
119993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tableFixer.seeText(t);
120993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tree.addText(t);
121993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
122993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
123993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Implements HtmlDocument.Visitor.visitComment */
124993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public void visitComment(HtmlDocument.Comment n) {
125993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // ignore
126993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
127993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
128993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Finds the start tag from the stack, returns -1 if not found */
129993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private int findStartTag(HTML.Element element) {
130993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    for (int i = stack.size() - 1; i >= 0; i--) {
131993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HTML.Element e = stack.get(i);
132993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (e == element) {
133993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        return i;
134993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
135993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
136993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return -1;
137993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
138993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
139993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
140993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Adds a close tag corresponding to a tag on the stack, if
141993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * the tag needs a close tag.
142993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
143993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private void addMissingEndTag() {
144993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HTML.Element element = pop();
145993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
146993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element);
147993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tableFixer.seeEndTag(endTag);
148993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    tree.addEndTag(endTag);
149993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
150993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
151993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Pushes a tag onto the stack */
152993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private void push(HTML.Element element) {
153993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    stack.add(element);
154993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
155993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
156993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** Pops an elemnt from the stack */
157993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  private HTML.Element pop() {
158993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    return stack.remove(stack.size() - 1);
159993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
160993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
161993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /**
162993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * The TableFixer makes sure that a <table> structure is more or less well
163993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * formed. Note that it only ensures that data within the <table> tag doesn't
164993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * "leak out" of the table.
165993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *
166993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * For instance, all the tags here are balanced with end tags. But the
167993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * 'outside' text ends up leaking out of the table.
168993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * <table><tr><td bgcolor=yellow>
169993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * <table><table>inside</table><td>outside</td></table>
170993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * </td></tr></table>
171993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *
172993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * The TableFixer makes sure that
173993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * 1) Within a table:, text and other elements are enclosed within a TD.
174993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *    A TD tag is inserted where necessary.
175993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * 2) All table structure tags are enclosed within a <table>. A TABLE tag
176993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *    is inserted where necessary.
177993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *
178993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * Note that the TableFixer only adds open tags, it doesn't add end tags.
179993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * The HtmlTreeVerifier ensures that all open tags are properly matched
180993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * up and closed.
181993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   *
182993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   * @author Jing Yee Lim (jlim@google.com)
183993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira   */
184993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  class TableFixer {
185993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
186993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private int tables = 0;             // table nesting level
187993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
188993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // States within a <table>
189993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    static final int NULL = 0;
190993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    static final int IN_CELL = 1;       // in a <td> or <th> tag
191993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    static final int IN_CAPTION = 2;    // in a <caption> tag
192993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
193993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private int state;
194993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
195993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    void seeTag(HtmlDocument.Tag tag) {
196993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HTML.Element element = tag.getElement();
197993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (element.getType() == HTML.Element.TABLE_TYPE) {
198993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
199993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.TABLE_ELEMENT.equals(element)) {
200993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (tables > 0) {
201993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            ensureCellState();
202993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
203993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          tables++;
204993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          state = NULL;
205993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
206993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else {
207993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // Make sure that we're in a table
208993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          ensureTableState();
209993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
210993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // In cell/caption?
211993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (HTML4.TD_ELEMENT.equals(element) ||
212993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira              HTML4.TH_ELEMENT.equals(element)) {
213993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            state = IN_CELL;
214993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
215993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          } else if (HTML4.CAPTION_ELEMENT.equals(element)) {
216993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            state = IN_CAPTION;
217993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
218993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
219993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      } else {
220993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (tables > 0) {
221993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
222993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // Ok to have a form element outside a table cell.
223993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // e.g. <TR><FORM><TD>...
224993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          if (!HTML4.FORM_ELEMENT.equals(element)) {
225993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            ensureCellState();
226993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          }
227993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
228993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
229993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
230993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
231993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    void seeEndTag(HtmlDocument.EndTag endTag) {
232993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      HTML.Element element= endTag.getElement();
233993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
234993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) {
235993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
236993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        if (HTML4.TD_ELEMENT.equals(element) ||
237993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            HTML4.TR_ELEMENT.equals(element) ||
238993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira            HTML4.TH_ELEMENT.equals(element)) {
239993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          // End of a cell
240993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          state = NULL;
241993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
242993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption
243993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          state = NULL;
244993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
245993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table
246993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          X.assertTrue(tables > 0);
247993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          tables--;
248993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          state = (tables > 0) ? IN_CELL : NULL;
249993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        }
250993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
251993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
252993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
253993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    void seeText(HtmlDocument.Text textNode) {
254993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // If we're in a table, but not in a cell or caption, and the
255993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      // text is not whitespace, add a <TD>
256993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (tables > 0 &&
257993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          state == NULL &&
258993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          !textNode.isWhitespace()) {
259993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        ensureCellState();
260993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
261993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
262993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
263993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    void finish() {
264993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      X.assertTrue(tables == 0);
265993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      X.assertTrue(state == NULL);
266993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
267993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
268993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Ensure that we're within a TABLE
269993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void ensureTableState() {
270993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (tables == 0) {
271993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        push(HTML4.TABLE_ELEMENT);
272993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
273993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HtmlDocument.Tag tableTag =
274993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira          HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null);
275993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        tree.addStartTag(tableTag);
276993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
277993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        tables++;
278993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
279993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
280993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
281993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    // Ensure that we're within a TD or TH cell
282993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    private void ensureCellState() {
283993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      if (state != IN_CELL) {
284993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        push(HTML4.TD_ELEMENT);
285993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
286993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null);
287993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        tree.addStartTag(tdTag);
288993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
289993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira        state = IN_CELL;
290993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira      }
291993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    }
292993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
293993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
294993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  /** For testing */
295993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  public static void main(String[] args) throws IOException {
296993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    logger.setLevel(Level.FINEST);
297993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
298993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    String html = new String(ByteStreams.toByteArray(System.in));
299993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HtmlParser parser = new HtmlParser();
300993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HtmlDocument doc = parser.parse(html);
301993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
302993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    HtmlTreeBuilder builder = new HtmlTreeBuilder();
303993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    doc.accept(builder);
304993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    String outputHtml = builder.getTree().getHtml();
305993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira
306993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira    System.out.println(outputHtml);
307993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira  }
308993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira}