1/**
2 * Copyright (c) 2004, Google Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *     http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16package com.google.android.mail.common.html.parser;
17
18import com.google.android.mail.common.base.X;
19import com.google.android.mail.common.html.parser.HtmlDocument.EndTag;
20import com.google.common.io.ByteStreams;
21
22import java.io.IOException;
23import java.util.ArrayList;
24import java.util.List;
25import java.util.logging.Level;
26import java.util.logging.Logger;
27
28/**
29 * HtmlTreeBuilder builds a well-formed HtmlTree.
30 *
31 * @see HtmlTree
32 * @author jlim@google.com (Jing Yee Lim)
33 */
34public class HtmlTreeBuilder implements HtmlDocument.Visitor {
35
36  private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName());
37
38  /** Stack contains HTML4.Element objects to keep track of unclosed tags */
39  private final List<HTML.Element> stack = new ArrayList<HTML.Element>();
40  private final TableFixer tableFixer = new TableFixer();
41  private HtmlTree tree;
42  private boolean built = false;
43
44  /** Gets the built html tree */
45  public HtmlTree getTree() {
46    X.assertTrue(built);
47    return tree;
48  }
49
50  /** Implements HtmlDocument.Visitor.start */
51  public void start() {
52    tree = new HtmlTree();
53    tree.start();
54  }
55
56  /** Implements HtmlDocument.Visitor.finish */
57  public void finish() {
58    // Close all tags
59    while (stack.size() > 0) {
60      addMissingEndTag();
61    }
62    tableFixer.finish();
63    tree.finish();
64
65    built = true;
66  }
67
68  /** Implements HtmlDocument.Visitor.visitTag */
69  public void visitTag(HtmlDocument.Tag t) {
70    tableFixer.seeTag(t);
71
72    HTML.Element element = t.getElement();
73    if (element.isEmpty()) {
74      tree.addSingularTag(t);
75    } else if (t.isSelfTerminating()) {
76      // Explicitly create a non-selfterminating open tag and add it to the tree
77      // and also immediately add the corresponding close tag. This is done
78      // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list
79      // will be balanced consistently.
80      // Otherwise there is a possibility of "<span /></span>" for example, if
81      // the created tree is converted to string through toXHTML.
82      tree.addStartTag(HtmlDocument.createTag(element,
83          t.getAttributes(), t.getOriginalHtmlBeforeAttributes(),
84          t.getOriginalHtmlAfterAttributes()));
85      EndTag end = HtmlDocument.createEndTag(element);
86      tableFixer.seeEndTag(end);
87      tree.addEndTag(end);
88    } else {
89      tree.addStartTag(t);
90      push(element);                       // Track the open tags
91    }
92  }
93
94  /** Implements HtmlVisitor.visit */
95  public void visitEndTag(HtmlDocument.EndTag t) {
96
97    // Here we pop back to the start tag
98    HTML.Element element = t.getElement();
99    int pos = findStartTag(element);
100    if (pos >= 0) {
101
102      // Add missing end-tags if any
103      while (pos < stack.size() - 1) {
104        addMissingEndTag();
105      }
106
107      pop();
108      tableFixer.seeEndTag(t);
109      tree.addEndTag(t);
110
111    } else {
112      // Not found, ignore this end tag
113      logger.finest("Ignoring end tag: " + element.getName());
114    }
115  }
116
117  /** Implements HtmlDocument.Visitor.visitText */
118  public void visitText(HtmlDocument.Text t) {
119    tableFixer.seeText(t);
120    tree.addText(t);
121  }
122
123  /** Implements HtmlDocument.Visitor.visitComment */
124  public void visitComment(HtmlDocument.Comment n) {
125    // ignore
126  }
127
128  /** Finds the start tag from the stack, returns -1 if not found */
129  private int findStartTag(HTML.Element element) {
130    for (int i = stack.size() - 1; i >= 0; i--) {
131      HTML.Element e = stack.get(i);
132      if (e == element) {
133        return i;
134      }
135    }
136    return -1;
137  }
138
139  /**
140   * Adds a close tag corresponding to a tag on the stack, if
141   * the tag needs a close tag.
142   */
143  private void addMissingEndTag() {
144    HTML.Element element = pop();
145
146    HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element);
147    tableFixer.seeEndTag(endTag);
148    tree.addEndTag(endTag);
149  }
150
151  /** Pushes a tag onto the stack */
152  private void push(HTML.Element element) {
153    stack.add(element);
154  }
155
156  /** Pops an elemnt from the stack */
157  private HTML.Element pop() {
158    return stack.remove(stack.size() - 1);
159  }
160
161  /**
162   * The TableFixer makes sure that a <table> structure is more or less well
163   * formed. Note that it only ensures that data within the <table> tag doesn't
164   * "leak out" of the table.
165   *
166   * For instance, all the tags here are balanced with end tags. But the
167   * 'outside' text ends up leaking out of the table.
168   * <table><tr><td bgcolor=yellow>
169   * <table><table>inside</table><td>outside</td></table>
170   * </td></tr></table>
171   *
172   * The TableFixer makes sure that
173   * 1) Within a table:, text and other elements are enclosed within a TD.
174   *    A TD tag is inserted where necessary.
175   * 2) All table structure tags are enclosed within a <table>. A TABLE tag
176   *    is inserted where necessary.
177   *
178   * Note that the TableFixer only adds open tags, it doesn't add end tags.
179   * The HtmlTreeVerifier ensures that all open tags are properly matched
180   * up and closed.
181   *
182   * @author Jing Yee Lim (jlim@google.com)
183   */
184  class TableFixer {
185
186    private int tables = 0;             // table nesting level
187
188    // States within a <table>
189    static final int NULL = 0;
190    static final int IN_CELL = 1;       // in a <td> or <th> tag
191    static final int IN_CAPTION = 2;    // in a <caption> tag
192
193    private int state;
194
195    void seeTag(HtmlDocument.Tag tag) {
196      HTML.Element element = tag.getElement();
197      if (element.getType() == HTML.Element.TABLE_TYPE) {
198
199        if (HTML4.TABLE_ELEMENT.equals(element)) {
200          if (tables > 0) {
201            ensureCellState();
202          }
203          tables++;
204          state = NULL;
205
206        } else {
207          // Make sure that we're in a table
208          ensureTableState();
209
210          // In cell/caption?
211          if (HTML4.TD_ELEMENT.equals(element) ||
212              HTML4.TH_ELEMENT.equals(element)) {
213            state = IN_CELL;
214
215          } else if (HTML4.CAPTION_ELEMENT.equals(element)) {
216            state = IN_CAPTION;
217          }
218        }
219      } else {
220        if (tables > 0) {
221
222          // Ok to have a form element outside a table cell.
223          // e.g. <TR><FORM><TD>...
224          if (!HTML4.FORM_ELEMENT.equals(element)) {
225            ensureCellState();
226          }
227        }
228      }
229    }
230
231    void seeEndTag(HtmlDocument.EndTag endTag) {
232      HTML.Element element= endTag.getElement();
233
234      if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) {
235
236        if (HTML4.TD_ELEMENT.equals(element) ||
237            HTML4.TR_ELEMENT.equals(element) ||
238            HTML4.TH_ELEMENT.equals(element)) {
239          // End of a cell
240          state = NULL;
241
242        } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption
243          state = NULL;
244
245        } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table
246          X.assertTrue(tables > 0);
247          tables--;
248          state = (tables > 0) ? IN_CELL : NULL;
249        }
250      }
251    }
252
253    void seeText(HtmlDocument.Text textNode) {
254      // If we're in a table, but not in a cell or caption, and the
255      // text is not whitespace, add a <TD>
256      if (tables > 0 &&
257          state == NULL &&
258          !textNode.isWhitespace()) {
259        ensureCellState();
260      }
261    }
262
263    void finish() {
264      X.assertTrue(tables == 0);
265      X.assertTrue(state == NULL);
266    }
267
268    // Ensure that we're within a TABLE
269    private void ensureTableState() {
270      if (tables == 0) {
271        push(HTML4.TABLE_ELEMENT);
272
273        HtmlDocument.Tag tableTag =
274          HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null);
275        tree.addStartTag(tableTag);
276
277        tables++;
278      }
279    }
280
281    // Ensure that we're within a TD or TH cell
282    private void ensureCellState() {
283      if (state != IN_CELL) {
284        push(HTML4.TD_ELEMENT);
285
286        HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null);
287        tree.addStartTag(tdTag);
288
289        state = IN_CELL;
290      }
291    }
292  }
293
294  /** For testing */
295  public static void main(String[] args) throws IOException {
296    logger.setLevel(Level.FINEST);
297
298    String html = new String(ByteStreams.toByteArray(System.in));
299    HtmlParser parser = new HtmlParser();
300    HtmlDocument doc = parser.parse(html);
301
302    HtmlTreeBuilder builder = new HtmlTreeBuilder();
303    doc.accept(builder);
304    String outputHtml = builder.getTree().getHtml();
305
306    System.out.println(outputHtml);
307  }
308}