1993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira/** 2993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Copyright (c) 2004, Google Inc. 3993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 4993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Licensed under the Apache License, Version 2.0 (the "License"); 5993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * you may not use this file except in compliance with the License. 6993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * You may obtain a copy of the License at 7993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 8993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * http://www.apache.org/licenses/LICENSE-2.0 9993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 10993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Unless required by applicable law or agreed to in writing, software 11993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * distributed under the License is distributed on an "AS IS" BASIS, 12993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * See the License for the specific language governing permissions and 14993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * limitations under the License. 15993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 161bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedypackage com.google.android.mail.common.html.parser; 17993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 181bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.base.X; 191bdbfefe4b144c7b031a1d9242a0fa061a0ae6b5Scott Kennedyimport com.google.android.mail.common.html.parser.HtmlDocument.EndTag; 20993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport com.google.common.io.ByteStreams; 21993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 22993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.io.IOException; 23993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.ArrayList; 24993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.List; 25993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.logging.Level; 26993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereiraimport java.util.logging.Logger; 27993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 28993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira/** 29993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * HtmlTreeBuilder builds a well-formed HtmlTree. 30993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 31993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @see HtmlTree 32993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @author jlim@google.com (Jing Yee Lim) 33993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 34993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereirapublic class HtmlTreeBuilder implements HtmlDocument.Visitor { 35993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 36993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private static final Logger logger = Logger.getLogger(HtmlTreeBuilder.class.getName()); 37993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 38993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Stack contains HTML4.Element objects to keep track of unclosed tags */ 39993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final List<HTML.Element> stack = new ArrayList<HTML.Element>(); 40993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private final TableFixer tableFixer = new TableFixer(); 41993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private HtmlTree tree; 42993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private boolean built = false; 43993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 44993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Gets the built html tree */ 45993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public HtmlTree getTree() { 46993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(built); 47993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return tree; 48993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 49993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 50993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlDocument.Visitor.start */ 51993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void start() { 52993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree = new HtmlTree(); 53993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.start(); 54993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 55993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 56993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlDocument.Visitor.finish */ 57993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void finish() { 58993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Close all tags 59993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while (stack.size() > 0) { 60993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addMissingEndTag(); 61993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 62993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.finish(); 63993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.finish(); 64993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 65993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira built = true; 66993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 67993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 68993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlDocument.Visitor.visitTag */ 69993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void visitTag(HtmlDocument.Tag t) { 70993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.seeTag(t); 71993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 72993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = t.getElement(); 73993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (element.isEmpty()) { 74993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addSingularTag(t); 75993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (t.isSelfTerminating()) { 76993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Explicitly create a non-selfterminating open tag and add it to the tree 77993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // and also immediately add the corresponding close tag. This is done 78993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // so that the toHTML, toXHTML and toOriginalHTML of the tree's node list 79993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // will be balanced consistently. 80993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Otherwise there is a possibility of "<span /></span>" for example, if 81993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // the created tree is converted to string through toXHTML. 82993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addStartTag(HtmlDocument.createTag(element, 83993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira t.getAttributes(), t.getOriginalHtmlBeforeAttributes(), 84993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira t.getOriginalHtmlAfterAttributes())); 85993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira EndTag end = HtmlDocument.createEndTag(element); 86993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.seeEndTag(end); 87993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addEndTag(end); 88993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 89993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addStartTag(t); 90993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira push(element); // Track the open tags 91993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 92993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 93993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 94993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlVisitor.visit */ 95993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void visitEndTag(HtmlDocument.EndTag t) { 96993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 97993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Here we pop back to the start tag 98993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = t.getElement(); 99993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira int pos = findStartTag(element); 100993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (pos >= 0) { 101993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 102993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Add missing end-tags if any 103993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira while (pos < stack.size() - 1) { 104993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira addMissingEndTag(); 105993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 106993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 107993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira pop(); 108993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.seeEndTag(t); 109993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addEndTag(t); 110993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 111993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 112993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Not found, ignore this end tag 113993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira logger.finest("Ignoring end tag: " + element.getName()); 114993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 115993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 116993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 117993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlDocument.Visitor.visitText */ 118993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void visitText(HtmlDocument.Text t) { 119993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.seeText(t); 120993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addText(t); 121993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 122993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 123993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Implements HtmlDocument.Visitor.visitComment */ 124993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public void visitComment(HtmlDocument.Comment n) { 125993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // ignore 126993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 127993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 128993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Finds the start tag from the stack, returns -1 if not found */ 129993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int findStartTag(HTML.Element element) { 130993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira for (int i = stack.size() - 1; i >= 0; i--) { 131993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element e = stack.get(i); 132993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (e == element) { 133993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return i; 134993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 135993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 136993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return -1; 137993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 138993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 139993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 140993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Adds a close tag corresponding to a tag on the stack, if 141993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * the tag needs a close tag. 142993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 143993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void addMissingEndTag() { 144993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = pop(); 145993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 146993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.EndTag endTag = HtmlDocument.createEndTag(element); 147993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tableFixer.seeEndTag(endTag); 148993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addEndTag(endTag); 149993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 150993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 151993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Pushes a tag onto the stack */ 152993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void push(HTML.Element element) { 153993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira stack.add(element); 154993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 155993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 156993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** Pops an elemnt from the stack */ 157993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private HTML.Element pop() { 158993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira return stack.remove(stack.size() - 1); 159993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 160993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 161993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** 162993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * The TableFixer makes sure that a <table> structure is more or less well 163993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * formed. Note that it only ensures that data within the <table> tag doesn't 164993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * "leak out" of the table. 165993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 166993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * For instance, all the tags here are balanced with end tags. But the 167993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 'outside' text ends up leaking out of the table. 168993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <table><tr><td bgcolor=yellow> 169993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * <table><table>inside</table><td>outside</td></table> 170993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * </td></tr></table> 171993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 172993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * The TableFixer makes sure that 173993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 1) Within a table:, text and other elements are enclosed within a TD. 174993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * A TD tag is inserted where necessary. 175993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 2) All table structure tags are enclosed within a <table>. A TABLE tag 176993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * is inserted where necessary. 177993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 178993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * Note that the TableFixer only adds open tags, it doesn't add end tags. 179993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * The HtmlTreeVerifier ensures that all open tags are properly matched 180993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * up and closed. 181993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * 182993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira * @author Jing Yee Lim (jlim@google.com) 183993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira */ 184993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira class TableFixer { 185993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 186993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int tables = 0; // table nesting level 187993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 188993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // States within a <table> 189993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira static final int NULL = 0; 190993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira static final int IN_CELL = 1; // in a <td> or <th> tag 191993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira static final int IN_CAPTION = 2; // in a <caption> tag 192993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 193993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private int state; 194993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 195993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void seeTag(HtmlDocument.Tag tag) { 196993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element = tag.getElement(); 197993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (element.getType() == HTML.Element.TABLE_TYPE) { 198993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 199993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.TABLE_ELEMENT.equals(element)) { 200993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (tables > 0) { 201993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ensureCellState(); 202993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 203993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tables++; 204993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = NULL; 205993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 206993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 207993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Make sure that we're in a table 208993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ensureTableState(); 209993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 210993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // In cell/caption? 211993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.TD_ELEMENT.equals(element) || 212993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.TH_ELEMENT.equals(element)) { 213993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = IN_CELL; 214993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 215993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.CAPTION_ELEMENT.equals(element)) { 216993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = IN_CAPTION; 217993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 218993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 219993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else { 220993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (tables > 0) { 221993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 222993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Ok to have a form element outside a table cell. 223993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // e.g. <TR><FORM><TD>... 224993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (!HTML4.FORM_ELEMENT.equals(element)) { 225993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ensureCellState(); 226993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 227993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 228993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 229993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 230993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 231993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void seeEndTag(HtmlDocument.EndTag endTag) { 232993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML.Element element= endTag.getElement(); 233993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 234993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (tables > 0 && element.getType() == HTML.Element.TABLE_TYPE) { 235993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 236993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (HTML4.TD_ELEMENT.equals(element) || 237993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.TR_ELEMENT.equals(element) || 238993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HTML4.TH_ELEMENT.equals(element)) { 239993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // End of a cell 240993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = NULL; 241993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 242993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.CAPTION_ELEMENT.equals(element)) { // End caption 243993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = NULL; 244993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 245993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } else if (HTML4.TABLE_ELEMENT.equals(element)) { // End table 246993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(tables > 0); 247993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tables--; 248993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = (tables > 0) ? IN_CELL : NULL; 249993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 250993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 251993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 252993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 253993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void seeText(HtmlDocument.Text textNode) { 254993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // If we're in a table, but not in a cell or caption, and the 255993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // text is not whitespace, add a <TD> 256993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (tables > 0 && 257993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state == NULL && 258993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira !textNode.isWhitespace()) { 259993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira ensureCellState(); 260993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 261993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 262993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 263993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira void finish() { 264993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(tables == 0); 265993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira X.assertTrue(state == NULL); 266993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 267993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 268993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Ensure that we're within a TABLE 269993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void ensureTableState() { 270993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (tables == 0) { 271993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira push(HTML4.TABLE_ELEMENT); 272993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 273993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Tag tableTag = 274993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.createTag(HTML4.TABLE_ELEMENT, null); 275993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addStartTag(tableTag); 276993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 277993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tables++; 278993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 279993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 280993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 281993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira // Ensure that we're within a TD or TH cell 282993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira private void ensureCellState() { 283993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira if (state != IN_CELL) { 284993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira push(HTML4.TD_ELEMENT); 285993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 286993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument.Tag tdTag = HtmlDocument.createTag(HTML4.TD_ELEMENT, null); 287993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira tree.addStartTag(tdTag); 288993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 289993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira state = IN_CELL; 290993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 291993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 292993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 293993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 294993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira /** For testing */ 295993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira public static void main(String[] args) throws IOException { 296993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira logger.setLevel(Level.FINEST); 297993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 298993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira String html = new String(ByteStreams.toByteArray(System.in)); 299993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlParser parser = new HtmlParser(); 300993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlDocument doc = parser.parse(html); 301993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 302993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira HtmlTreeBuilder builder = new HtmlTreeBuilder(); 303993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira doc.accept(builder); 304993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira String outputHtml = builder.getTree().getHtml(); 305993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira 306993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira System.out.println(outputHtml); 307993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira } 308993ef2674bf860a84c5c17e51a7a9e13e5d56504Mindy Pereira}