156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/* 256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Copyright (C) 2010 Google Inc. 356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Licensed under the Apache License, Version 2.0 (the "License"); 556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * you may not use this file except in compliance with the License. 656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * You may obtain a copy of the License at 756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * http://www.apache.org/licenses/LICENSE-2.0 956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 1056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Unless required by applicable law or agreed to in writing, software 1156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * distributed under the License is distributed on an "AS IS" BASIS, 1256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 1356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * See the License for the specific language governing permissions and 1456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * limitations under the License. 1556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 1656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 1756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpackage com.google.streamhtmlparser; 1856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 1956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport com.google.streamhtmlparser.impl.HtmlParserImpl; 2056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 2156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.Set; 2256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonimport java.util.logging.Logger; 2356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 2456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson/** 2556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A factory class to obtain instances of an {@link HtmlParser}. 2656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Currently each instance is a new object given these are fairly 2756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * light-weight. 2856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 2956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>In the unlikely case that this class fails to initialize properly 3056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * (a developer error), an error is emitted to the error console and the logs 3156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * and the specialized parser creation methods will throw 3256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * an {@link AssertionError} on all invokations. 3356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 3456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodsonpublic class HtmlParserFactory { 3556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 3656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final Logger logger = 3756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson Logger.getLogger(HtmlParserFactory.class.getName()); 3856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 3956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 4056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * To provide additional options when creating an {@code HtmlParser} using 4156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@link HtmlParserFactory#createParserInAttribute(HtmlParser.ATTR_TYPE, 4256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * boolean, Set)} 4356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 4456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public enum AttributeOptions { 4556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 4656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 4756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Indicates that the attribute value is Javascript-quoted. Only takes 4856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * effect for Javascript-accepting attributes - as identified by 4956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@link HtmlParser.ATTR_TYPE#JS} - and only when the attribute is also 5056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * HTML quoted. 5156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 5256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson JS_QUOTED, 5356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 5456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 5556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Indicates the attribute value is only a part of a URL as opposed to a 5656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * full URL. In particular, the value is not at the start of a URL and 5756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * hence does not necessitate validation of the URL scheme. 5856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Only valid for URI-accepting attributes - as identified by 5956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@link HtmlParser.ATTR_TYPE#URI}. 6056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 6156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson URL_PARTIAL, 6256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 6356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 6456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 6556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * To provide additional options when creating an {@code HtmlParser} using 6656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@link HtmlParserFactory#createParserInMode(HtmlParser.Mode, Set)} 6756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 6856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public enum ModeOptions { 6956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 7056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 7156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Indicates that the parser is inside a quoted {@code String}. Only 7256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * valid in the {@link HtmlParser.Mode#JS} mode. 7356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 7456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson JS_QUOTED 7556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 7656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 7756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInDefaultAttr = createParser(); 7856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInDefaultAttrQ = createParser(); 7956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInUriAttrComplete = createParser(); 8056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInUriAttrQComplete = createParser(); 8156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInUriAttrPartial = createParser(); 8256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInUriAttrQPartial = createParser(); 8356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInJsAttr = createParser(); 8456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInJsAttrQ = createParser(); 8556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInQJsAttr = createParser(); 8656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInStyleAttr = createParser(); 8756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInStyleAttrQ = createParser(); 8856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static final HtmlParser parserInJsQ = createParser(); 8956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 9056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 9156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Protects all the createParserXXX methods by throwing a run-time exception 9256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * if this class failed to initialize properly. 9356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 9456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static boolean initSuccess = false; 9556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 9656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson static { 9756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson try { 9856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson initializeParsers(); 9956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson initSuccess = true; 10056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } catch (ParseException e) { 10156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Log a severe error and print it to stderr along with a stack trace. 10256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson String error = HtmlParserFactory.class.getName() + 10356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson " Failed initialization: " + e.getMessage(); 10456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson logger.severe(error); 10556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson System.err.println(error); 10656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson e.printStackTrace(); 10756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 10856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 10956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 11056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Static class. 11156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private HtmlParserFactory() { 11256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } // COV_NF_LINE 11356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 11456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 11556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Returns an {@code HtmlParser} object ready to parse HTML input. 11656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 11756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return an {@code HtmlParser} in the provided mode 11856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 11956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public static HtmlParser createParser() { 12056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return new HtmlParserImpl(); 12156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 12256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 12356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 12456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Returns an {@code HtmlParser} object initialized with the 12556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * requested Mode. Provide non {@code null} options to provide 12656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * a more precise initialization with the desired Mode. 12756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 12856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param mode the mode to reset the parser with 12956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param options additional options or {@code null} for none 13056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return an {@code HtmlParser} in the provided mode 13156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws AssertionError when this class failed to initialize 13256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 13356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public static HtmlParser createParserInMode(HtmlParser.Mode mode, 13456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson Set<ModeOptions> options) { 13556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson requireInitialized(); 13656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 13756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (options != null && options.contains(ModeOptions.JS_QUOTED)) 13856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return createParser(parserInJsQ); 13956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 14056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // With no options given, this method is just a convenience wrapper for 14156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // the two calls below. 14256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlParser parser = new HtmlParserImpl(); 14356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser.resetMode(mode); 14456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return parser; 14556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 14656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 14756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 14856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Returns an {@code HtmlParser} that is a copy of the one 14956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * supplied. It holds the same internal state and hence can 15056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * proceed with parsing in-lieu of the supplied parser. 15156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 15256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param aHtmlParser a {@code HtmlParser} to copy from 15356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return an {@code HtmlParser} that is a copy of the provided one 15456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws AssertionError when this class failed to initialize 15556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 15656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public static HtmlParser createParser(HtmlParser aHtmlParser) { 15756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson requireInitialized(); 15856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 15956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Should never get a ClassCastException since there is only one 16056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // implementation of the HtmlParser interface. 16156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return new HtmlParserImpl((HtmlParserImpl) aHtmlParser); 16256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 16356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 16456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 16556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * A very specialized {@code HtmlParser} accessor that returns a parser 16656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * in a state where it expects to read the value of an attribute 16756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * of an HTML tag. This is only useful when the parser has not seen a 16856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * certain HTML tag and an attribute name and needs to continue parsing 16956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * from a state as though it has. 17056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 17156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>For example, to create a parser in a state akin to that 17256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * after the parser has parsed "<a href=\"", invoke: 17356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <pre> 17456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * createParserInAttribute(HtmlParser.ATTR_TYPE.URI, true)} 17556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * </pre> 17656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 17756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>You must provide the proper value of quoting or the parser 17856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * will go into an unexpected state. 17956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * As a special-case, when called with the {@code HtmlParser.ATTR_TYPE} 18056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * of {@code HtmlParser.ATTR_TYPE.NONE}, the parser is created in a state 18156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * inside an HTML tag where it expects an attribute name not an attribute 18256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * value. It becomes equivalent to a parser initialized in the 18356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * {@code HTML_IN_TAG} mode. 18456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 18556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param attrtype the attribute type which the parser should be in 18656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param quoted whether the attribute value is enclosed in double quotes 18756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @param options additional options or {@code null} for none 18856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @return an {@code HtmlParser} initialized in the given attribute type 18956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * and quoting 19056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws AssertionError when this class failed to initialize 19156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 19256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson public static HtmlParser createParserInAttribute( 19356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlParser.ATTR_TYPE attrtype, 19456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson boolean quoted, Set<AttributeOptions> options) { 19556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson requireInitialized(); 19656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 19756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson HtmlParser parser; 19856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson switch (attrtype) { 19956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case REGULAR: 20056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser( 20156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson quoted ? parserInDefaultAttrQ : parserInDefaultAttr); 20256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 20356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case URI: 20456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (options != null && options.contains(AttributeOptions.URL_PARTIAL)) 20556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser( 20656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson quoted ? parserInUriAttrQPartial : parserInUriAttrPartial); 20756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson else 20856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser( 20956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson quoted ? parserInUriAttrQComplete : parserInUriAttrComplete); 21056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 21156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case JS: 21256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Note: We currently do not support the case of the value being 21356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // inside a Javascript quoted string that is in an unquoted HTML 21456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // attribute, such as <a href=bla onmouseover=alert('[VALUE')>. 21556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // It would be simple to add but currently we assume Javascript 21656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // quoted attribute values are always HTML quoted. 21756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (quoted) { 21856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (options != null && options.contains(AttributeOptions.JS_QUOTED)) 21956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser(parserInQJsAttr); 22056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson else 22156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser(parserInJsAttrQ); 22256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } else { 22356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser(parserInJsAttr); 22456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 22556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 22656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case STYLE: 22756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParser( 22856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson quoted ? parserInStyleAttrQ : parserInStyleAttr); 22956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 23056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson case NONE: 23156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parser = createParserInMode(HtmlParser.Mode.HTML_IN_TAG, null); 23256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson break; 23356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson default: 23456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson throw new IllegalArgumentException( 23556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson "Did not recognize ATTR_TYPE given: " + attrtype); 23656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 23756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson return parser; 23856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 23956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 24056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 24156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Initializes a set of static parsers to be subsequently used 24256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * by the various createParserXXX methods. 24356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * The parsers are set to their proper states by making them parse 24456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * an appropriate HTML input fragment. This approach is the most likely 24556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * to ensure all their internal state is consistent. 24656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 24756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>In the very unexpected case of the parsing failing (developer error), 24856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * this class will fail to initialize properly. 24956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 25056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <p>In addition: 25156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <ul> 25256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>The HTML tag is set to a fictitious name {@code xparsertag}. 25356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>The attribute name is chosen to match the required attribute type. 25456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * When several possibilities exist, one is chosen arbitrarily. 25556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * <li>If quoting is required, a double quote is provided after the '='. 25656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * </ul> 25756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 25856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws ParseException if parsing failed. 25956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 26056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void initializeParsers() throws ParseException { 26156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInDefaultAttr.parse("<xparsertag htmlparser="); 26256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInDefaultAttrQ.parse("<xparsertag htmlparser=\""); 26356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 26456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Chosing the "src" attribute, one of several possible names here 26556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInUriAttrComplete.parse("<xparsertag src="); 26656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInUriAttrQComplete.parse("<xparsertag src=\""); 26756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 26856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // To support a parser that is initialized within a URL parameter 26956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // rather than at the beginning of a URL. We use a fake domain 27056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // (example.com from RFC 2606 <http://www.rfc-editor.org/rfc/rfc2606.txt>) 27156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // and a fake query parameter. 27256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson final String fakeUrlPrefix = "http://example.com/fakequeryparam="; 27356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInUriAttrPartial.parse("<xparsertag src=" + fakeUrlPrefix); 27456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInUriAttrQPartial.parse("<xparsertag src=\"" + fakeUrlPrefix); 27556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 27656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Using onmouse= which is a fictitious attribute name that the parser 27756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // understands as being a valid javascript-enabled attribute. Chosing fake 27856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // names may help during debugging. 27956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInJsAttr.parse("<xparsertag onmouse="); 28056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInJsAttrQ.parse("<xparsertag onmouse=\""); 28156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Single quote added as the Javascript is itself quoted. 28256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInQJsAttr.parse("<xparsertag onmouse=\"'"); 28356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 28456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // A parser in the Javascript context within a (single) quoted string. 28556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInJsQ.resetMode(HtmlParser.Mode.JS); 28656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInJsQ.parse("var fakeparservar='"); 28756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 28856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson // Chosing the "style" attribute as it is the only option 28956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInStyleAttr.parse("<xparsertag style="); 29056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson parserInStyleAttrQ.parse("<xparsertag style=\""); 29156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 29256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson 29356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson /** 29456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * Throws an {@link AssertionError} if the class was not initialized 29556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * correctly, otherwise simply returns. This is to protect against the 29656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * possibility the needed parsers were not created successfully during 29756ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * static initialized, which can only happen due to an error during 29856ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * development of this library. 29956ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * 30056ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson * @throws AssertionError when this class failed to initialize 30156ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson */ 30256ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson private static void requireInitialized() { 30356ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson if (!initSuccess) 30456ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson throw new AssertionError("HtmlParserFactory failed initialization."); 30556ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson } 30656ed4167b942ec265f9cee70ac4d71d10b3835ceBen Dodson} 307