1// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2//
3// TagSoup is licensed under the Apache License,
4// Version 2.0.  You may obtain a copy of this license at
5// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6// additional legal rights not granted by this license.
7//
8// TagSoup is distributed in the hope that it will be useful, but
9// unless required by applicable law or agreed to in writing, TagSoup
10// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11// OF ANY KIND, either express or implied; not even the implied warranty
12// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13//
14//
15// The TagSoup parser
16
17package org.ccil.cowan.tagsoup;
18import java.util.HashMap;
19import java.util.ArrayList;
20import java.util.Locale;
21import java.io.*;
22import java.net.URL;
23import java.net.URLConnection;
24import org.xml.sax.*;
25import org.xml.sax.helpers.DefaultHandler;
26import org.xml.sax.ext.LexicalHandler;
27
28
29/**
30The SAX parser class.
31**/
32public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
33
34	// XMLReader implementation
35
36	private ContentHandler theContentHandler = this;
37	private LexicalHandler theLexicalHandler = this;
38	private DTDHandler theDTDHandler = this;
39	private ErrorHandler theErrorHandler = this;
40	private EntityResolver theEntityResolver = this;
41	private Schema theSchema;
42	private Scanner theScanner;
43	private AutoDetector theAutoDetector;
44
45	// Default values for feature flags
46
47	private static boolean DEFAULT_NAMESPACES = true;
48	private static boolean DEFAULT_IGNORE_BOGONS = false;
49	private static boolean DEFAULT_BOGONS_EMPTY = false;
50        private static boolean DEFAULT_ROOT_BOGONS = true;
51	private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
52	private static boolean DEFAULT_TRANSLATE_COLONS = false;
53	private static boolean DEFAULT_RESTART_ELEMENTS = true;
54	private static boolean DEFAULT_IGNORABLE_WHITESPACE = false;
55	private static boolean DEFAULT_CDATA_ELEMENTS = true;
56
57	// Feature flags.
58
59	private boolean namespaces = DEFAULT_NAMESPACES;
60	private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
61	private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
62        private boolean rootBogons = DEFAULT_ROOT_BOGONS;
63	private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;
64	private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
65	private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
66	private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
67	private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;
68
69	/**
70	A value of "true" indicates namespace URIs and unprefixed local
71	names for element and attribute names will be available.
72	**/
73	public final static String namespacesFeature =
74		"http://xml.org/sax/features/namespaces";
75
76	/**
77	A value of "true" indicates that XML qualified names (with prefixes)
78	and attributes (including xmlns* attributes) will be available.
79	We don't support this value.
80	**/
81	public final static String namespacePrefixesFeature =
82		"http://xml.org/sax/features/namespace-prefixes";
83
84	/**
85	Reports whether this parser processes external general entities
86	(it doesn't).
87	**/
88	public final static String externalGeneralEntitiesFeature =
89		"http://xml.org/sax/features/external-general-entities";
90
91	/**
92	Reports whether this parser processes external parameter entities
93	(it doesn't).
94	**/
95	public final static String externalParameterEntitiesFeature =
96		"http://xml.org/sax/features/external-parameter-entities";
97
98	/**
99	May be examined only during a parse, after the startDocument()
100	callback has been completed; read-only. The value is true if
101	the document specified standalone="yes" in its XML declaration,
102	and otherwise is false.  (It's always false.)
103	**/
104	public final static String isStandaloneFeature =
105		"http://xml.org/sax/features/is-standalone";
106
107	/**
108	A value of "true" indicates that the LexicalHandler will report
109	the beginning and end of parameter entities (it won't).
110	**/
111	public final static String lexicalHandlerParameterEntitiesFeature =
112		"http://xml.org/sax/features/lexical-handler/parameter-entities";
113
114	/**
115	A value of "true" indicates that system IDs in declarations will
116	be absolutized (relative to their base URIs) before reporting.
117	(This returns true but doesn't actually do anything.)
118	**/
119	public final static String resolveDTDURIsFeature =
120		"http://xml.org/sax/features/resolve-dtd-uris";
121
122	/**
123	Has a value of "true" if all XML names (for elements,
124	prefixes, attributes, entities, notations, and local
125	names), as well as Namespace URIs, will have been interned
126	using java.lang.String.intern. This supports fast testing of
127	equality/inequality against string constants, rather than forcing
128	slower calls to String.equals().  (We always intern.)
129	**/
130	public final static String stringInterningFeature =
131		"http://xml.org/sax/features/string-interning";
132
133	/**
134	Returns "true" if the Attributes objects passed by this
135	parser in ContentHandler.startElement() implement the
136	org.xml.sax.ext.Attributes2 interface.	(They don't.)
137	**/
138
139	public final static String useAttributes2Feature =
140		"http://xml.org/sax/features/use-attributes2";
141
142	/**
143	Returns "true" if the Locator objects passed by this parser
144	in ContentHandler.setDocumentLocator() implement the
145	org.xml.sax.ext.Locator2 interface.  (They don't.)
146	**/
147	public final static String useLocator2Feature =
148		"http://xml.org/sax/features/use-locator2";
149
150	/**
151	Returns "true" if, when setEntityResolver is given an object
152	implementing the org.xml.sax.ext.EntityResolver2 interface,
153	those new methods will be used.  (They won't be.)
154	**/
155	public final static String useEntityResolver2Feature =
156		"http://xml.org/sax/features/use-entity-resolver2";
157
158	/**
159	Controls whether the parser is reporting all validity errors
160	(We don't report any validity errors.)
161	**/
162	public final static String validationFeature =
163		"http://xml.org/sax/features/validation";
164
165	/**
166	Controls whether the parser reports Unicode normalization
167	errors as described in section 2.13 and Appendix B of the XML
168	1.1 Recommendation.  (We don't normalize.)
169	**/
170	public final static String unicodeNormalizationCheckingFeature =
171"http://xml.org/sax/features/unicode-normalization-checking";
172
173	/**
174	Controls whether, when the namespace-prefixes feature is set,
175	the parser treats namespace declaration attributes as being in
176	the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
177	**/
178	public final static String xmlnsURIsFeature =
179		"http://xml.org/sax/features/xmlns-uris";
180
181	/**
182	Returns "true" if the parser supports both XML 1.1 and XML 1.0.
183	(Always false.)
184	**/
185	public final static String XML11Feature =
186		"http://xml.org/sax/features/xml-1.1";
187
188	/**
189	A value of "true" indicates that the parser will ignore
190	unknown elements.
191	**/
192	public final static String ignoreBogonsFeature =
193		"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
194
195	/**
196	A value of "true" indicates that the parser will give unknown
197	elements a content model of EMPTY; a value of "false", a
198	content model of ANY.
199	**/
200	public final static String bogonsEmptyFeature =
201		"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
202
203	/**
204	A value of "true" indicates that the parser will allow unknown
205	elements to be the root element.
206	**/
207	public final static String rootBogonsFeature =
208		"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
209
210	/**
211	A value of "true" indicates that the parser will return default
212	attribute values for missing attributes that have default values.
213	**/
214	public final static String defaultAttributesFeature =
215		"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
216
217	/**
218	A value of "true" indicates that the parser will
219	translate colons into underscores in names.
220	**/
221	public final static String translateColonsFeature =
222		"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
223
224	/**
225	A value of "true" indicates that the parser will
226	attempt to restart the restartable elements.
227	**/
228	public final static String restartElementsFeature =
229		"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
230
231	/**
232	A value of "true" indicates that the parser will
233	transmit whitespace in element-only content via the SAX
234	ignorableWhitespace callback.  Normally this is not done,
235	because HTML is an SGML application and SGML suppresses
236	such whitespace.
237	**/
238	public final static String ignorableWhitespaceFeature =
239		"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
240
241	/**
242	A value of "true" indicates that the parser will treat CDATA
243	elements specially.  Normally true, since the input is by
244	default HTML.
245	**/
246	public final static String CDATAElementsFeature =
247		"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
248
249	/**
250	Used to see some syntax events that are essential in some
251	applications: comments, CDATA delimiters, selected general
252	entity inclusions, and the start and end of the DTD (and
253	declaration of document element name). The Object must implement
254	org.xml.sax.ext.LexicalHandler.
255	**/
256	public final static String lexicalHandlerProperty =
257		"http://xml.org/sax/properties/lexical-handler";
258
259	/**
260	Specifies the Scanner object this Parser uses.
261	**/
262	public final static String scannerProperty =
263		"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
264
265	/**
266	Specifies the Schema object this Parser uses.
267	**/
268	public final static String schemaProperty =
269		"http://www.ccil.org/~cowan/tagsoup/properties/schema";
270
271	/**
272	Specifies the AutoDetector (for encoding detection) this Parser uses.
273	**/
274	public final static String autoDetectorProperty =
275		"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
276
277	// Due to sucky Java order of initialization issues, these
278	// entries are maintained separately from the initial values of
279	// the corresponding instance variables, but care must be taken
280	// to keep them in sync.
281
282	private HashMap theFeatures = new HashMap();
283	{
284		theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
285		theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
286		theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
287		theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
288		theFeatures.put(isStandaloneFeature, Boolean.FALSE);
289		theFeatures.put(lexicalHandlerParameterEntitiesFeature,
290			Boolean.FALSE);
291		theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
292		theFeatures.put(stringInterningFeature, Boolean.TRUE);
293		theFeatures.put(useAttributes2Feature, Boolean.FALSE);
294		theFeatures.put(useLocator2Feature, Boolean.FALSE);
295		theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
296		theFeatures.put(validationFeature, Boolean.FALSE);
297		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
298		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
299		theFeatures.put(XML11Feature, Boolean.FALSE);
300		theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
301		theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
302		theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
303		theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
304		theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
305		theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
306		theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
307		theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
308		}
309
310	// Private clone of Boolean.valueOf that is guaranteed to return
311	// Boolean.TRUE or Boolean.FALSE
312	private static Boolean truthValue(boolean b) {
313		return b ? Boolean.TRUE : Boolean.FALSE;
314		}
315
316
317	public boolean getFeature (String name)
318		throws SAXNotRecognizedException, SAXNotSupportedException {
319		Boolean b = (Boolean)theFeatures.get(name);
320		if (b == null) {
321			throw new SAXNotRecognizedException("Unknown feature " + name);
322			}
323		return b.booleanValue();
324		}
325
326	public void setFeature (String name, boolean value)
327	throws SAXNotRecognizedException, SAXNotSupportedException {
328		Boolean b = (Boolean)theFeatures.get(name);
329		if (b == null) {
330			throw new SAXNotRecognizedException("Unknown feature " + name);
331			}
332		if (value) theFeatures.put(name, Boolean.TRUE);
333		else theFeatures.put(name, Boolean.FALSE);
334
335		if (name.equals(namespacesFeature)) namespaces = value;
336		else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
337		else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
338		else if (name.equals(rootBogonsFeature)) rootBogons = value;
339		else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
340		else if (name.equals(translateColonsFeature)) translateColons = value;
341		else if (name.equals(restartElementsFeature)) restartElements = value;
342		else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
343		else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
344		}
345
346	public Object getProperty (String name)
347	throws SAXNotRecognizedException, SAXNotSupportedException {
348		if (name.equals(lexicalHandlerProperty)) {
349			return theLexicalHandler == this ? null : theLexicalHandler;
350			}
351		else if (name.equals(scannerProperty)) {
352			return theScanner;
353			}
354		else if (name.equals(schemaProperty)) {
355			return theSchema;
356			}
357		else if (name.equals(autoDetectorProperty)) {
358			return theAutoDetector;
359			}
360		else {
361			throw new SAXNotRecognizedException("Unknown property " + name);
362			}
363		}
364
365	public void setProperty (String name, Object value)
366	throws SAXNotRecognizedException, SAXNotSupportedException {
367		if (name.equals(lexicalHandlerProperty)) {
368			if (value == null) {
369				theLexicalHandler = this;
370				}
371			else if (value instanceof LexicalHandler) {
372				theLexicalHandler = (LexicalHandler)value;
373				}
374			else {
375				throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
376				}
377			}
378		else if (name.equals(scannerProperty)) {
379			if (value instanceof Scanner) {
380				theScanner = (Scanner)value;
381				}
382			else {
383				throw new SAXNotSupportedException("Your scanner is not a Scanner");
384				}
385			}
386		else if (name.equals(schemaProperty)) {
387			if (value instanceof Schema) {
388				theSchema = (Schema)value;
389				}
390			else {
391				 throw new SAXNotSupportedException("Your schema is not a Schema");
392				}
393			}
394		else if (name.equals(autoDetectorProperty)) {
395			if (value instanceof AutoDetector) {
396				theAutoDetector = (AutoDetector)value;
397				}
398			else {
399				throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
400				}
401			}
402		else {
403			throw new SAXNotRecognizedException("Unknown property " + name);
404			}
405		}
406
407	public void setEntityResolver (EntityResolver resolver) {
408		theEntityResolver = (resolver == null) ? this : resolver;
409		}
410
411	public EntityResolver getEntityResolver () {
412		return (theEntityResolver == this) ? null : theEntityResolver;
413		}
414
415	public void setDTDHandler (DTDHandler handler) {
416		theDTDHandler = (handler == null) ? this : handler;
417		}
418
419	public DTDHandler getDTDHandler () {
420		return (theDTDHandler == this) ? null : theDTDHandler;
421		}
422
423	public void setContentHandler (ContentHandler handler) {
424		theContentHandler = (handler == null) ? this : handler;
425		}
426
427	public ContentHandler getContentHandler () {
428		return (theContentHandler == this) ? null : theContentHandler;
429		}
430
431	public void setErrorHandler (ErrorHandler handler) {
432		theErrorHandler = (handler == null) ? this : handler;
433		}
434
435	public ErrorHandler getErrorHandler () {
436		return (theErrorHandler == this) ? null : theErrorHandler;
437		}
438
439	public void parse (InputSource input) throws IOException, SAXException {
440		setup();
441		Reader r = getReader(input);
442		theContentHandler.startDocument();
443		theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
444		if (theScanner instanceof Locator) {
445			theContentHandler.setDocumentLocator((Locator)theScanner);
446			}
447		if (!(theSchema.getURI().equals("")))
448			theContentHandler.startPrefixMapping(theSchema.getPrefix(),
449				theSchema.getURI());
450		theScanner.scan(r, this);
451		}
452
453	public void parse (String systemid) throws IOException, SAXException {
454		parse(new InputSource(systemid));
455		}
456
457	// Sets up instance variables that haven't been set by setFeature
458	private void setup() {
459		if (theSchema == null) theSchema = new HTMLSchema();
460		if (theScanner == null) theScanner = new HTMLScanner();
461		if (theAutoDetector == null) {
462			theAutoDetector = new AutoDetector() {
463				public Reader autoDetectingReader(InputStream i) {
464					return new InputStreamReader(i);
465					}
466				};
467			}
468		theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
469		thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
470		theNewElement = null;
471		theAttributeName = null;
472		thePITarget = null;
473		theSaved = null;
474		theEntity = 0;
475		virginStack = true;
476                theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
477		}
478
479	// Return a Reader based on the contents of an InputSource
480	// Buffer both the InputStream and the Reader
481	private Reader getReader(InputSource s) throws SAXException, IOException {
482		Reader r = s.getCharacterStream();
483		InputStream i = s.getByteStream();
484		String encoding = s.getEncoding();
485		String publicid = s.getPublicId();
486		String systemid = s.getSystemId();
487		if (r == null) {
488			if (i == null) i = getInputStream(publicid, systemid);
489//			i = new BufferedInputStream(i);
490			if (encoding == null) {
491				r = theAutoDetector.autoDetectingReader(i);
492				}
493			else {
494				try {
495					r = new InputStreamReader(i, encoding);
496					}
497				catch (UnsupportedEncodingException e) {
498					r = new InputStreamReader(i);
499					}
500				}
501			}
502//		r = new BufferedReader(r);
503		return r;
504		}
505
506	// Get an InputStream based on a publicid and a systemid
507	private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
508		URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
509		URL url = new URL(basis, systemid);
510		URLConnection c = url.openConnection();
511		return c.getInputStream();
512		}
513		// We don't process publicids (who uses them anyhow?)
514
515	// ScanHandler implementation
516
517	private Element theNewElement = null;
518	private String theAttributeName = null;
519	private boolean theDoctypeIsPresent = false;
520	private String theDoctypePublicId = null;
521	private String theDoctypeSystemId = null;
522	private String theDoctypeName = null;
523	private String thePITarget = null;
524	private Element theStack = null;
525	private Element theSaved = null;
526	private Element thePCDATA = null;
527	private int theEntity = 0;	// needs to support chars past U+FFFF
528
529	public void adup(char[] buff, int offset, int length) throws SAXException {
530		if (theNewElement == null || theAttributeName == null) return;
531		theNewElement.setAttribute(theAttributeName, null, theAttributeName);
532		theAttributeName = null;
533		}
534
535	public void aname(char[] buff, int offset, int length) throws SAXException {
536		if (theNewElement == null) return;
537		// Currently we don't rely on Schema to canonicalize
538		// attribute names.
539		theAttributeName = makeName(buff, offset, length).toLowerCase(Locale.ROOT);
540//		System.err.println("%% Attribute name " + theAttributeName);
541		}
542
543	public void aval(char[] buff, int offset, int length) throws SAXException {
544		if (theNewElement == null || theAttributeName == null) return;
545		String value = new String(buff, offset, length);
546//		System.err.println("%% Attribute value [" + value + "]");
547		value = expandEntities(value);
548		theNewElement.setAttribute(theAttributeName, null, value);
549		theAttributeName = null;
550//		System.err.println("%% Aval done");
551		}
552
553	// Expand entity references in attribute values selectively.
554	// Currently we expand a reference iff it is properly terminated
555	// with a semicolon.
556	private String expandEntities(String src) {
557		int refStart = -1;
558		int len = src.length();
559		char[] dst = new char[len];
560		int dstlen = 0;
561		for (int i = 0; i < len; i++) {
562			char ch = src.charAt(i);
563			dst[dstlen++] = ch;
564//			System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] ");
565			if (ch == '&' && refStart == -1) {
566				// start of a ref excluding &
567				refStart = dstlen;
568//				System.err.println("start of ref");
569				}
570			else if (refStart == -1) {
571				// not in a ref
572//				System.err.println("not in ref");
573				}
574			else if (Character.isLetter(ch) ||
575					Character.isDigit(ch) ||
576					ch == '#') {
577				// valid entity char
578//				System.err.println("valid");
579				}
580			else if (ch == ';') {
581				// properly terminated ref
582//				System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]");
583				int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
584//				System.err.println(" = " + ent);
585				if (ent > 0xFFFF) {
586					ent -= 0x10000;
587					dst[refStart - 1] = (char)((ent>>10) + 0xD800);
588					dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
589					dstlen = refStart + 1;
590					}
591				else if (ent != 0) {
592					dst[refStart - 1] = (char)ent;
593					dstlen = refStart;
594					}
595				refStart = -1;
596				}
597			else {
598				// improperly terminated ref
599//				System.err.println("end of ref");
600				refStart = -1;
601				}
602			}
603		return new String(dst, 0, dstlen);
604		}
605
606	public void entity(char[] buff, int offset, int length) throws SAXException {
607		theEntity = lookupEntity(buff, offset, length);
608		}
609
610	// Process numeric character references,
611	// deferring to the schema for named ones.
612	private int lookupEntity(char[] buff, int offset, int length) {
613		int result = 0;
614		if (length < 1) return result;
615//		System.err.println("%% Entity at " + offset + " " + length);
616//		System.err.println("%% Got entity [" + new String(buff, offset, length) + "]");
617		if (buff[offset] == '#') {
618                        if (length > 1 && (buff[offset+1] == 'x'
619                                        || buff[offset+1] == 'X')) {
620                                try {
621                                        return Integer.parseInt(new String(buff, offset + 2, length - 2), 16);
622                                        }
623                                catch (NumberFormatException e) { return 0; }
624                                }
625                        try {
626                                return Integer.parseInt(new String(buff, offset + 1, length - 1), 10);
627                                }
628                        catch (NumberFormatException e) { return 0; }
629                        }
630		return theSchema.getEntity(new String(buff, offset, length));
631		}
632
633	public void eof(char[] buff, int offset, int length) throws SAXException {
634		if (virginStack) rectify(thePCDATA);
635		while (theStack.next() != null) {
636			pop();
637			}
638		if (!(theSchema.getURI().equals("")))
639			theContentHandler.endPrefixMapping(theSchema.getPrefix());
640		theContentHandler.endDocument();
641		}
642
643	public void etag(char[] buff, int offset, int length) throws SAXException {
644		if (etag_cdata(buff, offset, length)) return;
645		etag_basic(buff, offset, length);
646		}
647
648	private static char[] etagchars = {'<', '/', '>'};
649	public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
650		String currentName = theStack.name();
651		// If this is a CDATA element and the tag doesn't match,
652		// or isn't properly formed (junk after the name),
653		// restart CDATA mode and process the tag as characters.
654		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
655			boolean realTag = (length == currentName.length());
656			if (realTag) {
657				for (int i = 0; i < length; i++) {
658					if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
659						realTag = false;
660						break;
661						}
662					}
663				}
664			if (!realTag) {
665				theContentHandler.characters(etagchars, 0, 2);
666				theContentHandler.characters(buff, offset, length);
667				theContentHandler.characters(etagchars, 2, 1);
668				theScanner.startCDATA();
669				return true;
670				}
671			}
672		return false;
673		}
674
675	public void etag_basic(char[] buff, int offset, int length) throws SAXException {
676		theNewElement = null;
677		String name;
678		if (length != 0) {
679			// Canonicalize case of name
680			name = makeName(buff, offset, length);
681//			System.err.println("got etag [" + name + "]");
682			ElementType type = theSchema.getElementType(name);
683			if (type == null) return;	// mysterious end-tag
684			name = type.name();
685			}
686		else {
687			name = theStack.name();
688			}
689//		System.err.println("%% Got end of " + name);
690
691		Element sp;
692		boolean inNoforce = false;
693		for (sp = theStack; sp != null; sp = sp.next()) {
694			if (sp.name().equals(name)) break;
695			if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
696			}
697
698		if (sp == null) return;		// Ignore unknown etags
699		if (sp.next() == null || sp.next().next() == null) return;
700		if (inNoforce) {		// inside an F_NOFORCE element?
701			sp.preclose();		// preclose the matching element
702			}
703		else {			// restartably pop everything above us
704			while (theStack != sp) {
705				restartablyPop();
706				}
707			pop();
708			}
709		// pop any preclosed elements now at the top
710		while (theStack.isPreclosed()) {
711			pop();
712			}
713		restart(null);
714		}
715
716	// Push restartables on the stack if possible
717	// e is the next element to be started, if we know what it is
718	private void restart(Element e) throws SAXException {
719		while (theSaved != null && theStack.canContain(theSaved) &&
720				(e == null || theSaved.canContain(e))) {
721			Element next = theSaved.next();
722			push(theSaved);
723			theSaved = next;
724			}
725		}
726
727	// Pop the stack irrevocably
728	private void pop() throws SAXException {
729		if (theStack == null) return;		// empty stack
730		String name = theStack.name();
731		String localName = theStack.localName();
732		String namespace = theStack.namespace();
733		String prefix = prefixOf(name);
734
735//		System.err.println("%% Popping " + name);
736		if (!namespaces) namespace = localName = "";
737		theContentHandler.endElement(namespace, localName, name);
738		if (foreign(prefix, namespace)) {
739			theContentHandler.endPrefixMapping(prefix);
740//			System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace);
741			}
742		Attributes atts = theStack.atts();
743		for (int i = atts.getLength() - 1; i >= 0; i--) {
744			String attNamespace = atts.getURI(i);
745			String attPrefix = prefixOf(atts.getQName(i));
746			if (foreign(attPrefix, attNamespace)) {
747				theContentHandler.endPrefixMapping(attPrefix);
748//			System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace);
749				}
750			}
751		theStack = theStack.next();
752		}
753
754	// Pop the stack restartably
755	private void restartablyPop() throws SAXException {
756		Element popped = theStack;
757		pop();
758		if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
759			popped.anonymize();
760			popped.setNext(theSaved);
761			theSaved = popped;
762			}
763		}
764
765	// Push element onto stack
766	private boolean virginStack = true;
767	private void push(Element e) throws SAXException {
768		String name = e.name();
769		String localName = e.localName();
770		String namespace = e.namespace();
771		String prefix = prefixOf(name);
772
773//		System.err.println("%% Pushing " + name);
774		e.clean();
775		if (!namespaces) namespace = localName = "";
776                if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
777                    try {
778                        theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
779                    } catch (IOException ew) { }   // Can't be thrown for root I believe.
780                }
781		if (foreign(prefix, namespace)) {
782			theContentHandler.startPrefixMapping(prefix, namespace);
783//			System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace);
784			}
785		Attributes atts = e.atts();
786		int len = atts.getLength();
787		for (int i = 0; i < len; i++) {
788			String attNamespace = atts.getURI(i);
789			String attPrefix = prefixOf(atts.getQName(i));
790			if (foreign(attPrefix, attNamespace)) {
791				theContentHandler.startPrefixMapping(attPrefix, attNamespace);
792//				System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace);
793				}
794			}
795		theContentHandler.startElement(namespace, localName, name, e.atts());
796		e.setNext(theStack);
797		theStack = e;
798		virginStack = false;
799		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
800			theScanner.startCDATA();
801			}
802		}
803
804	// Get the prefix from a QName
805	private String prefixOf(String name) {
806		int i = name.indexOf(':');
807		String prefix = "";
808		if (i != -1) prefix = name.substring(0, i);
809//		System.err.println("%% " + prefix + " is prefix of " + name);
810		return prefix;
811		}
812
813	// Return true if we have a foreign name
814	private boolean foreign(String prefix, String namespace) {
815//		System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- ");
816		boolean foreign = !(prefix.equals("") || namespace.equals("") ||
817			namespace.equals(theSchema.getURI()));
818//		System.err.println(foreign);
819		return foreign;
820		}
821
822        /**
823         * Parsing the complete XML Document Type Definition is way too complex,
824         * but for many simple cases we can extract something useful from it.
825         *
826         * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
827         *  DeclSep     ::= PEReference | S
828         *  intSubset   ::= (markupdecl | DeclSep)*
829         *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
830         *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
831         */
832	public void decl(char[] buff, int offset, int length) throws SAXException {
833		String s = new String(buff, offset, length);
834		String name = null;
835		String systemid = null;
836		String publicid = null;
837		String[] v = split(s);
838		if (v.length > 0 && "DOCTYPE".equalsIgnoreCase(v[0])) {
839			if (theDoctypeIsPresent) return;		// one doctype only!
840			theDoctypeIsPresent = true;
841			if (v.length > 1) {
842				name = v[1];
843				if (v.length>3 && "SYSTEM".equals(v[2])) {
844				systemid = v[3];
845				}
846			else if (v.length > 3 && "PUBLIC".equals(v[2])) {
847				publicid = v[3];
848				if (v.length > 4) {
849					systemid = v[4];
850					}
851				else {
852					systemid = "";
853					}
854                    }
855                }
856            }
857		publicid = trimquotes(publicid);
858		systemid = trimquotes(systemid);
859		if (name != null) {
860			publicid = cleanPublicid(publicid);
861			theLexicalHandler.startDTD(name, publicid, systemid);
862			theLexicalHandler.endDTD();
863			theDoctypeName = name;
864			theDoctypePublicId = publicid;
865		if (theScanner instanceof Locator) {    // Must resolve systemid
866                    theDoctypeSystemId  = ((Locator)theScanner).getSystemId();
867                    try {
868                        theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
869                    } catch (Exception e) {}
870                }
871            }
872        }
873
874	// If the String is quoted, trim the quotes.
875	private static String trimquotes(String in) {
876		if (in == null) return in;
877		int length = in.length();
878		if (length == 0) return in;
879		char s = in.charAt(0);
880		char e = in.charAt(length - 1);
881		if (s == e && (s == '\'' || s == '"')) {
882			in = in.substring(1, in.length() - 1);
883			}
884		return in;
885		}
886
	// Split the supplied String into words or phrases separated by whitespace.
	// Recognises quotes around a phrase and doesn't split it.
889	private static String[] split(String val) throws IllegalArgumentException {
890		val = val.trim();
891		if (val.length() == 0) {
892			return new String[0];
893			}
894		else {
895			ArrayList l = new ArrayList();
896			int s = 0;
897			int e = 0;
898			boolean sq = false;	// single quote
899			boolean dq = false;	// double quote
900			char lastc = 0;
901			int len = val.length();
902			for (e=0; e < len; e++) {
903				char c = val.charAt(e);
904				if (!dq && c == '\'' && lastc != '\\') {
905				sq = !sq;
906				if (s < 0) s = e;
907				}
908			else if (!sq && c == '\"' && lastc != '\\') {
909				dq = !dq;
910				if (s < 0) s = e;
911				}
912			else if (!sq && !dq) {
913				if (Character.isWhitespace(c)) {
914					if (s >= 0) l.add(val.substring(s, e));
915					s = -1;
916					}
917				else if (s < 0 && c != ' ') {
918					s = e;
919					}
920				}
921			lastc = c;
922			}
923		l.add(val.substring(s, e));
924		return (String[])l.toArray(new String[0]);
925		}
926        }
927
928	// Replace junk in publicids with spaces
929	private static String legal =
930		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
931
932	private String cleanPublicid(String src) {
933		if (src == null) return null;
934		int len = src.length();
935		StringBuffer dst = new StringBuffer(len);
936		boolean suppressSpace = true;
937		for (int i = 0; i < len; i++) {
938			char ch = src.charAt(i);
939			if (legal.indexOf(ch) != -1) { 	// legal but not whitespace
940				dst.append(ch);
941				suppressSpace = false;
942				}
943			else if (suppressSpace) {	// normalizable whitespace or junk
944				;
945				}
946			else {
947				dst.append(' ');
948				suppressSpace = true;
949				}
950			}
951//		System.err.println("%% Publicid [" + dst.toString().trim() + "]");
952		return dst.toString().trim();	// trim any final junk whitespace
953		}
954
955
956	public void gi(char[] buff, int offset, int length) throws SAXException {
957		if (theNewElement != null) return;
958		String name = makeName(buff, offset, length);
959		if (name == null) return;
960		ElementType type = theSchema.getElementType(name);
961		if (type == null) {
962			// Suppress unknown elements if ignore-bogons is on
963			if (ignoreBogons) return;
964			int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY;
965			int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT);
966			theSchema.elementType(name, bogonModel, bogonMemberOf, 0);
967			if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name());
968			type = theSchema.getElementType(name);
969			}
970
971		theNewElement = new Element(type, defaultAttributes);
972//		System.err.println("%% Got GI " + theNewElement.name());
973		}
974
	// Reports a CDATA section: the text in buff[offset .. offset+length-1]
	// is handed to pcdata(), bracketed by the lexical handler's
	// startCDATA() and endCDATA() callbacks.
	public void cdsect(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.startCDATA();
		pcdata(buff, offset, length);
		theLexicalHandler.endCDATA();
		}
980	public void pcdata(char[] buff, int offset, int length) throws SAXException {
981		if (length == 0) return;
982		boolean allWhite = true;
983		for (int i = 0; i < length; i++) {
984			if (!Character.isWhitespace(buff[offset+i])) {
985				allWhite = false;
986				}
987			}
988		if (allWhite && !theStack.canContain(thePCDATA)) {
989			if (ignorableWhitespace) {
990				theContentHandler.ignorableWhitespace(buff, offset, length);
991				}
992			}
993		else {
994			rectify(thePCDATA);
995			theContentHandler.characters(buff, offset, length);
996			}
997		}
998
999	public void pitarget(char[] buff, int offset, int length) throws SAXException {
1000		if (theNewElement != null) return;
1001		thePITarget = makeName(buff, offset, length).replace(':', '_');
1002		}
1003
1004	public void pi(char[] buff, int offset, int length) throws SAXException {
1005		if (theNewElement != null || thePITarget == null) return;
1006		if ("xml".equalsIgnoreCase(thePITarget)) return;
1007//		if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI");
1008		if (length > 0 && buff[length - 1] == '?') length--;	// remove trailing ?
1009		theContentHandler.processingInstruction(thePITarget,
1010			new String(buff, offset, length));
1011		thePITarget = null;
1012		}
1013
1014	public void stagc(char[] buff, int offset, int length) throws SAXException {
1015//		System.err.println("%% Start-tag");
1016		if (theNewElement == null) return;
1017		rectify(theNewElement);
1018		if (theStack.model() == Schema.M_EMPTY) {
1019			// Force an immediate end tag
1020			etag_basic(buff, offset, length);
1021			}
1022		}
1023
1024	public void stage(char[] buff, int offset, int length) throws SAXException {
1025//		System.err.println("%% Empty-tag");
1026		if (theNewElement == null) return;
1027		rectify(theNewElement);
1028		// Force an immediate end tag
1029		etag_basic(buff, offset, length);
1030		}
1031
1032	// Comment buffer is twice the size of the output buffer
	// NOTE(review): cmnt() below does not touch this buffer; check the rest
	// of the file for users before changing or removing it.
	private char[] theCommentBuffer = new char[2000];
	// Passes the comment text buff[offset .. offset+length-1] straight
	// through to the lexical handler.
	public void cmnt(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.comment(buff, offset, length);
		}
1037
1038	// Rectify the stack, pushing and popping as needed
1039	// so that the argument can be safely pushed
	// The argument e is the element about to be added: callers in this
	// file pass either theNewElement or thePCDATA.
	private void rectify(Element e) throws SAXException {
		Element sp;
		// Phase 1: search the stack, top first, for an entry that can
		// contain e.  If there is none, wrap e in a fresh element of e's
		// parent type (chained to e through setNext) and search again for
		// a container of that wrapper.  The loop ends with sp set to the
		// container found, or null when e has no parent type left to try.
		while (true) {
			for (sp = theStack; sp != null; sp = sp.next()) {
				if (sp.canContain(e)) break;
				}
			if (sp != null) break;
			ElementType parentType = e.parent();
			if (parentType == null) break;
			Element parent = new Element(parentType, defaultAttributes);
//			System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
			parent.setNext(e);
			e = parent;
			}
		// No container anywhere: leave the stack and theNewElement untouched
		if (sp == null) return;		// don't know what to do
		// Phase 2: pop until the container is on top, but never pop once
		// the stack is down to two entries (or fewer).
		while (theStack != sp) {
			if (theStack == null || theStack.next() == null ||
				theStack.next().next() == null) break;
			restartablyPop();
			}
		// Phase 3: push the chain built in phase 1, outermost element first,
		// skipping any element named "<pcdata>".  The successor is read
		// before the push -- presumably because push() relinks e via
		// setNext; confirm against push().  restart() is then called with
		// the successor (null after the last element).
		while (e != null) {
			Element nexte = e.next();
			if (!e.name().equals("<pcdata>")) push(e);
			e = nexte;
			restart(e);
			}
		theNewElement = null;
		}
1068
	// Returns the current value of the theEntity field.
	public int getEntity() {
		return theEntity;
		}
1072
1073	// Return the argument as a valid XML name
1074	// This no longer lowercases the result: we depend on Schema to
1075	// canonicalize case.
1076	private String makeName(char[] buff, int offset, int length) {
1077		StringBuffer dst = new StringBuffer(length + 2);
1078		boolean seenColon = false;
1079		boolean start = true;
1080//		String src = new String(buff, offset, length); // DEBUG
1081		for (; length-- > 0; offset++) {
1082			char ch = buff[offset];
1083			if (Character.isLetter(ch) || ch == '_') {
1084				start = false;
1085				dst.append(ch);
1086				}
1087			else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
1088				if (start) dst.append('_');
1089				start = false;
1090				dst.append(ch);
1091				}
1092			else if (ch == ':' && !seenColon) {
1093				seenColon = true;
1094				if (start) dst.append('_');
1095				start = true;
1096				dst.append(translateColons ? '_' : ch);
1097				}
1098			}
1099		int dstLength = dst.length();
1100		if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
1101//		System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
1102		return dst.toString().intern();
1103		}
1104
1105	// Default LexicalHandler implementation
1106
	// Every method below has an empty body.  Parser installs itself as
	// theLexicalHandler by default, so these are the callbacks that run
	// until a different LexicalHandler is supplied.
	public void comment(char[] ch, int start, int length) throws SAXException { }
	public void endCDATA() throws SAXException { }
	public void endDTD() throws SAXException { }
	public void endEntity(String name) throws SAXException { }
	public void startCDATA() throws SAXException { }
	public void startDTD(String name, String publicid, String systemid) throws SAXException { }
	public void startEntity(String name) throws SAXException { }
1114
1115	}
1116