1// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2//
3// TagSoup is licensed under the Apache License,
4// Version 2.0.  You may obtain a copy of this license at
5// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6// additional legal rights not granted by this license.
7//
8// TagSoup is distributed in the hope that it will be useful, but
9// unless required by applicable law or agreed to in writing, TagSoup
10// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11// OF ANY KIND, either express or implied; not even the implied warranty
12// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13//
14//
15// The TagSoup parser
16
17package org.ccil.cowan.tagsoup;
18import java.util.HashMap;
19import java.util.ArrayList;
20import java.io.*;
21import java.net.URL;
22import java.net.URLConnection;
23import org.xml.sax.*;
24import org.xml.sax.helpers.DefaultHandler;
25import org.xml.sax.ext.LexicalHandler;
26
27
28/**
29The SAX parser class.
30**/
31public class Parser extends DefaultHandler implements ScanHandler, XMLReader, LexicalHandler {
32
33	// XMLReader implementation
34
35	private ContentHandler theContentHandler = this;
36	private LexicalHandler theLexicalHandler = this;
37	private DTDHandler theDTDHandler = this;
38	private ErrorHandler theErrorHandler = this;
39	private EntityResolver theEntityResolver = this;
40	private Schema theSchema;
41	private Scanner theScanner;
42	private AutoDetector theAutoDetector;
43
44	// Default values for feature flags
45
46	private static boolean DEFAULT_NAMESPACES = true;
47	private static boolean DEFAULT_IGNORE_BOGONS = false;
48	private static boolean DEFAULT_BOGONS_EMPTY = false;
49        private static boolean DEFAULT_ROOT_BOGONS = true;
50	private static boolean DEFAULT_DEFAULT_ATTRIBUTES = true;
51	private static boolean DEFAULT_TRANSLATE_COLONS = false;
52	private static boolean DEFAULT_RESTART_ELEMENTS = true;
53	private static boolean DEFAULT_IGNORABLE_WHITESPACE = false;
54	private static boolean DEFAULT_CDATA_ELEMENTS = true;
55
56	// Feature flags.
57
58	private boolean namespaces = DEFAULT_NAMESPACES;
59	private boolean ignoreBogons = DEFAULT_IGNORE_BOGONS;
60	private boolean bogonsEmpty = DEFAULT_BOGONS_EMPTY;
61        private boolean rootBogons = DEFAULT_ROOT_BOGONS;
62	private boolean defaultAttributes = DEFAULT_DEFAULT_ATTRIBUTES;
63	private boolean translateColons = DEFAULT_TRANSLATE_COLONS;
64	private boolean restartElements = DEFAULT_RESTART_ELEMENTS;
65	private boolean ignorableWhitespace = DEFAULT_IGNORABLE_WHITESPACE;
66	private boolean CDATAElements = DEFAULT_CDATA_ELEMENTS;
67
68	/**
69	A value of "true" indicates namespace URIs and unprefixed local
70	names for element and attribute names will be available.
71	**/
72	public final static String namespacesFeature =
73		"http://xml.org/sax/features/namespaces";
74
75	/**
76	A value of "true" indicates that XML qualified names (with prefixes)
77	and attributes (including xmlns* attributes) will be available.
78	We don't support this value.
79	**/
80	public final static String namespacePrefixesFeature =
81		"http://xml.org/sax/features/namespace-prefixes";
82
83	/**
84	Reports whether this parser processes external general entities
85	(it doesn't).
86	**/
87	public final static String externalGeneralEntitiesFeature =
88		"http://xml.org/sax/features/external-general-entities";
89
90	/**
91	Reports whether this parser processes external parameter entities
92	(it doesn't).
93	**/
94	public final static String externalParameterEntitiesFeature =
95		"http://xml.org/sax/features/external-parameter-entities";
96
97	/**
98	May be examined only during a parse, after the startDocument()
99	callback has been completed; read-only. The value is true if
100	the document specified standalone="yes" in its XML declaration,
101	and otherwise is false.  (It's always false.)
102	**/
103	public final static String isStandaloneFeature =
104		"http://xml.org/sax/features/is-standalone";
105
106	/**
107	A value of "true" indicates that the LexicalHandler will report
108	the beginning and end of parameter entities (it won't).
109	**/
110	public final static String lexicalHandlerParameterEntitiesFeature =
111		"http://xml.org/sax/features/lexical-handler/parameter-entities";
112
113	/**
114	A value of "true" indicates that system IDs in declarations will
115	be absolutized (relative to their base URIs) before reporting.
116	(This returns true but doesn't actually do anything.)
117	**/
118	public final static String resolveDTDURIsFeature =
119		"http://xml.org/sax/features/resolve-dtd-uris";
120
121	/**
122	Has a value of "true" if all XML names (for elements,
123	prefixes, attributes, entities, notations, and local
124	names), as well as Namespace URIs, will have been interned
125	using java.lang.String.intern. This supports fast testing of
126	equality/inequality against string constants, rather than forcing
127	slower calls to String.equals().  (We always intern.)
128	**/
129	public final static String stringInterningFeature =
130		"http://xml.org/sax/features/string-interning";
131
132	/**
133	Returns "true" if the Attributes objects passed by this
134	parser in ContentHandler.startElement() implement the
135	org.xml.sax.ext.Attributes2 interface.	(They don't.)
136	**/
137
138	public final static String useAttributes2Feature =
139		"http://xml.org/sax/features/use-attributes2";
140
141	/**
142	Returns "true" if the Locator objects passed by this parser
143	in ContentHandler.setDocumentLocator() implement the
144	org.xml.sax.ext.Locator2 interface.  (They don't.)
145	**/
146	public final static String useLocator2Feature =
147		"http://xml.org/sax/features/use-locator2";
148
149	/**
150	Returns "true" if, when setEntityResolver is given an object
151	implementing the org.xml.sax.ext.EntityResolver2 interface,
152	those new methods will be used.  (They won't be.)
153	**/
154	public final static String useEntityResolver2Feature =
155		"http://xml.org/sax/features/use-entity-resolver2";
156
157	/**
158	Controls whether the parser is reporting all validity errors
159	(We don't report any validity errors.)
160	**/
161	public final static String validationFeature =
162		"http://xml.org/sax/features/validation";
163
164	/**
165	Controls whether the parser reports Unicode normalization
166	errors as described in section 2.13 and Appendix B of the XML
167	1.1 Recommendation.  (We don't normalize.)
168	**/
169	public final static String unicodeNormalizationCheckingFeature =
170"http://xml.org/sax/features/unicode-normalization-checking";
171
172	/**
173	Controls whether, when the namespace-prefixes feature is set,
174	the parser treats namespace declaration attributes as being in
175	the http://www.w3.org/2000/xmlns/ namespace.  (It doesn't.)
176	**/
177	public final static String xmlnsURIsFeature =
178		"http://xml.org/sax/features/xmlns-uris";
179
180	/**
181	Returns "true" if the parser supports both XML 1.1 and XML 1.0.
182	(Always false.)
183	**/
184	public final static String XML11Feature =
185		"http://xml.org/sax/features/xml-1.1";
186
187	/**
188	A value of "true" indicates that the parser will ignore
189	unknown elements.
190	**/
191	public final static String ignoreBogonsFeature =
192		"http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons";
193
194	/**
195	A value of "true" indicates that the parser will give unknown
196	elements a content model of EMPTY; a value of "false", a
197	content model of ANY.
198	**/
199	public final static String bogonsEmptyFeature =
200		"http://www.ccil.org/~cowan/tagsoup/features/bogons-empty";
201
202	/**
203	A value of "true" indicates that the parser will allow unknown
204	elements to be the root element.
205	**/
206	public final static String rootBogonsFeature =
207		"http://www.ccil.org/~cowan/tagsoup/features/root-bogons";
208
209	/**
210	A value of "true" indicates that the parser will return default
211	attribute values for missing attributes that have default values.
212	**/
213	public final static String defaultAttributesFeature =
214		"http://www.ccil.org/~cowan/tagsoup/features/default-attributes";
215
216	/**
217	A value of "true" indicates that the parser will
218	translate colons into underscores in names.
219	**/
220	public final static String translateColonsFeature =
221		"http://www.ccil.org/~cowan/tagsoup/features/translate-colons";
222
223	/**
224	A value of "true" indicates that the parser will
225	attempt to restart the restartable elements.
226	**/
227	public final static String restartElementsFeature =
228		"http://www.ccil.org/~cowan/tagsoup/features/restart-elements";
229
230	/**
231	A value of "true" indicates that the parser will
232	transmit whitespace in element-only content via the SAX
233	ignorableWhitespace callback.  Normally this is not done,
234	because HTML is an SGML application and SGML suppresses
235	such whitespace.
236	**/
237	public final static String ignorableWhitespaceFeature =
238		"http://www.ccil.org/~cowan/tagsoup/features/ignorable-whitespace";
239
240	/**
241	A value of "true" indicates that the parser will treat CDATA
242	elements specially.  Normally true, since the input is by
243	default HTML.
244	**/
245	public final static String CDATAElementsFeature =
246		"http://www.ccil.org/~cowan/tagsoup/features/cdata-elements";
247
248	/**
249	Used to see some syntax events that are essential in some
250	applications: comments, CDATA delimiters, selected general
251	entity inclusions, and the start and end of the DTD (and
252	declaration of document element name). The Object must implement
253	org.xml.sax.ext.LexicalHandler.
254	**/
255	public final static String lexicalHandlerProperty =
256		"http://xml.org/sax/properties/lexical-handler";
257
258	/**
259	Specifies the Scanner object this Parser uses.
260	**/
261	public final static String scannerProperty =
262		"http://www.ccil.org/~cowan/tagsoup/properties/scanner";
263
264	/**
265	Specifies the Schema object this Parser uses.
266	**/
267	public final static String schemaProperty =
268		"http://www.ccil.org/~cowan/tagsoup/properties/schema";
269
270	/**
271	Specifies the AutoDetector (for encoding detection) this Parser uses.
272	**/
273	public final static String autoDetectorProperty =
274		"http://www.ccil.org/~cowan/tagsoup/properties/auto-detector";
275
276	// Due to sucky Java order of initialization issues, these
277	// entries are maintained separately from the initial values of
278	// the corresponding instance variables, but care must be taken
279	// to keep them in sync.
280
281	private HashMap theFeatures = new HashMap();
282	{
283		theFeatures.put(namespacesFeature, truthValue(DEFAULT_NAMESPACES));
284		theFeatures.put(namespacePrefixesFeature, Boolean.FALSE);
285		theFeatures.put(externalGeneralEntitiesFeature, Boolean.FALSE);
286		theFeatures.put(externalParameterEntitiesFeature, Boolean.FALSE);
287		theFeatures.put(isStandaloneFeature, Boolean.FALSE);
288		theFeatures.put(lexicalHandlerParameterEntitiesFeature,
289			Boolean.FALSE);
290		theFeatures.put(resolveDTDURIsFeature, Boolean.TRUE);
291		theFeatures.put(stringInterningFeature, Boolean.TRUE);
292		theFeatures.put(useAttributes2Feature, Boolean.FALSE);
293		theFeatures.put(useLocator2Feature, Boolean.FALSE);
294		theFeatures.put(useEntityResolver2Feature, Boolean.FALSE);
295		theFeatures.put(validationFeature, Boolean.FALSE);
296		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
297		theFeatures.put(xmlnsURIsFeature, Boolean.FALSE);
298		theFeatures.put(XML11Feature, Boolean.FALSE);
299		theFeatures.put(ignoreBogonsFeature, truthValue(DEFAULT_IGNORE_BOGONS));
300		theFeatures.put(bogonsEmptyFeature, truthValue(DEFAULT_BOGONS_EMPTY));
301		theFeatures.put(rootBogonsFeature, truthValue(DEFAULT_ROOT_BOGONS));
302		theFeatures.put(defaultAttributesFeature, truthValue(DEFAULT_DEFAULT_ATTRIBUTES));
303		theFeatures.put(translateColonsFeature, truthValue(DEFAULT_TRANSLATE_COLONS));
304		theFeatures.put(restartElementsFeature, truthValue(DEFAULT_RESTART_ELEMENTS));
305		theFeatures.put(ignorableWhitespaceFeature, truthValue(DEFAULT_IGNORABLE_WHITESPACE));
306		theFeatures.put(CDATAElementsFeature, truthValue(DEFAULT_CDATA_ELEMENTS));
307		}
308
309	// Private clone of Boolean.valueOf that is guaranteed to return
310	// Boolean.TRUE or Boolean.FALSE
311	private static Boolean truthValue(boolean b) {
312		return b ? Boolean.TRUE : Boolean.FALSE;
313		}
314
315
316	public boolean getFeature (String name)
317		throws SAXNotRecognizedException, SAXNotSupportedException {
318		Boolean b = (Boolean)theFeatures.get(name);
319		if (b == null) {
320			throw new SAXNotRecognizedException("Unknown feature " + name);
321			}
322		return b.booleanValue();
323		}
324
325	public void setFeature (String name, boolean value)
326	throws SAXNotRecognizedException, SAXNotSupportedException {
327		Boolean b = (Boolean)theFeatures.get(name);
328		if (b == null) {
329			throw new SAXNotRecognizedException("Unknown feature " + name);
330			}
331		if (value) theFeatures.put(name, Boolean.TRUE);
332		else theFeatures.put(name, Boolean.FALSE);
333
334		if (name.equals(namespacesFeature)) namespaces = value;
335		else if (name.equals(ignoreBogonsFeature)) ignoreBogons = value;
336		else if (name.equals(bogonsEmptyFeature)) bogonsEmpty = value;
337		else if (name.equals(rootBogonsFeature)) rootBogons = value;
338		else if (name.equals(defaultAttributesFeature)) defaultAttributes = value;
339		else if (name.equals(translateColonsFeature)) translateColons = value;
340		else if (name.equals(restartElementsFeature)) restartElements = value;
341		else if (name.equals(ignorableWhitespaceFeature)) ignorableWhitespace = value;
342		else if (name.equals(CDATAElementsFeature)) CDATAElements = value;
343		}
344
345	public Object getProperty (String name)
346	throws SAXNotRecognizedException, SAXNotSupportedException {
347		if (name.equals(lexicalHandlerProperty)) {
348			return theLexicalHandler == this ? null : theLexicalHandler;
349			}
350		else if (name.equals(scannerProperty)) {
351			return theScanner;
352			}
353		else if (name.equals(schemaProperty)) {
354			return theSchema;
355			}
356		else if (name.equals(autoDetectorProperty)) {
357			return theAutoDetector;
358			}
359		else {
360			throw new SAXNotRecognizedException("Unknown property " + name);
361			}
362		}
363
364	public void setProperty (String name, Object value)
365	throws SAXNotRecognizedException, SAXNotSupportedException {
366		if (name.equals(lexicalHandlerProperty)) {
367			if (value == null) {
368				theLexicalHandler = this;
369				}
370			else if (value instanceof LexicalHandler) {
371				theLexicalHandler = (LexicalHandler)value;
372				}
373			else {
374				throw new SAXNotSupportedException("Your lexical handler is not a LexicalHandler");
375				}
376			}
377		else if (name.equals(scannerProperty)) {
378			if (value instanceof Scanner) {
379				theScanner = (Scanner)value;
380				}
381			else {
382				throw new SAXNotSupportedException("Your scanner is not a Scanner");
383				}
384			}
385		else if (name.equals(schemaProperty)) {
386			if (value instanceof Schema) {
387				theSchema = (Schema)value;
388				}
389			else {
390				 throw new SAXNotSupportedException("Your schema is not a Schema");
391				}
392			}
393		else if (name.equals(autoDetectorProperty)) {
394			if (value instanceof AutoDetector) {
395				theAutoDetector = (AutoDetector)value;
396				}
397			else {
398				throw new SAXNotSupportedException("Your auto-detector is not an AutoDetector");
399				}
400			}
401		else {
402			throw new SAXNotRecognizedException("Unknown property " + name);
403			}
404		}
405
406	public void setEntityResolver (EntityResolver resolver) {
407		theEntityResolver = (resolver == null) ? this : resolver;
408		}
409
410	public EntityResolver getEntityResolver () {
411		return (theEntityResolver == this) ? null : theEntityResolver;
412		}
413
414	public void setDTDHandler (DTDHandler handler) {
415		theDTDHandler = (handler == null) ? this : handler;
416		}
417
418	public DTDHandler getDTDHandler () {
419		return (theDTDHandler == this) ? null : theDTDHandler;
420		}
421
422	public void setContentHandler (ContentHandler handler) {
423		theContentHandler = (handler == null) ? this : handler;
424		}
425
426	public ContentHandler getContentHandler () {
427		return (theContentHandler == this) ? null : theContentHandler;
428		}
429
430	public void setErrorHandler (ErrorHandler handler) {
431		theErrorHandler = (handler == null) ? this : handler;
432		}
433
434	public ErrorHandler getErrorHandler () {
435		return (theErrorHandler == this) ? null : theErrorHandler;
436		}
437
438	public void parse (InputSource input) throws IOException, SAXException {
439		setup();
440		Reader r = getReader(input);
441		theContentHandler.startDocument();
442		theScanner.resetDocumentLocator(input.getPublicId(), input.getSystemId());
443		if (theScanner instanceof Locator) {
444			theContentHandler.setDocumentLocator((Locator)theScanner);
445			}
446		if (!(theSchema.getURI().equals("")))
447			theContentHandler.startPrefixMapping(theSchema.getPrefix(),
448				theSchema.getURI());
449		theScanner.scan(r, this);
450		}
451
452	public void parse (String systemid) throws IOException, SAXException {
453		parse(new InputSource(systemid));
454		}
455
456	// Sets up instance variables that haven't been set by setFeature
457	private void setup() {
458		if (theSchema == null) theSchema = new HTMLSchema();
459		if (theScanner == null) theScanner = new HTMLScanner();
460		if (theAutoDetector == null) {
461			theAutoDetector = new AutoDetector() {
462				public Reader autoDetectingReader(InputStream i) {
463					return new InputStreamReader(i);
464					}
465				};
466			}
467		theStack = new Element(theSchema.getElementType("<root>"), defaultAttributes);
468		thePCDATA = new Element(theSchema.getElementType("<pcdata>"), defaultAttributes);
469		theNewElement = null;
470		theAttributeName = null;
471		thePITarget = null;
472		theSaved = null;
473		theEntity = 0;
474		virginStack = true;
475                theDoctypeName = theDoctypePublicId = theDoctypeSystemId = null;
476		}
477
478	// Return a Reader based on the contents of an InputSource
479	// Buffer both the InputStream and the Reader
480	private Reader getReader(InputSource s) throws SAXException, IOException {
481		Reader r = s.getCharacterStream();
482		InputStream i = s.getByteStream();
483		String encoding = s.getEncoding();
484		String publicid = s.getPublicId();
485		String systemid = s.getSystemId();
486		if (r == null) {
487			if (i == null) i = getInputStream(publicid, systemid);
488//			i = new BufferedInputStream(i);
489			if (encoding == null) {
490				r = theAutoDetector.autoDetectingReader(i);
491				}
492			else {
493				try {
494					r = new InputStreamReader(i, encoding);
495					}
496				catch (UnsupportedEncodingException e) {
497					r = new InputStreamReader(i);
498					}
499				}
500			}
501//		r = new BufferedReader(r);
502		return r;
503		}
504
505	// Get an InputStream based on a publicid and a systemid
506	private InputStream getInputStream(String publicid, String systemid) throws IOException, SAXException {
507		URL basis = new URL("file", "", System.getProperty("user.dir") + "/.");
508		URL url = new URL(basis, systemid);
509		URLConnection c = url.openConnection();
510		return c.getInputStream();
511		}
512		// We don't process publicids (who uses them anyhow?)
513
514	// ScanHandler implementation
515
516	private Element theNewElement = null;
517	private String theAttributeName = null;
518	private boolean theDoctypeIsPresent = false;
519	private String theDoctypePublicId = null;
520	private String theDoctypeSystemId = null;
521	private String theDoctypeName = null;
522	private String thePITarget = null;
523	private Element theStack = null;
524	private Element theSaved = null;
525	private Element thePCDATA = null;
526	private int theEntity = 0;	// needs to support chars past U+FFFF
527
528	public void adup(char[] buff, int offset, int length) throws SAXException {
529		if (theNewElement == null || theAttributeName == null) return;
530		theNewElement.setAttribute(theAttributeName, null, theAttributeName);
531		theAttributeName = null;
532		}
533
534	public void aname(char[] buff, int offset, int length) throws SAXException {
535		if (theNewElement == null) return;
536		// Currently we don't rely on Schema to canonicalize
537		// attribute names.
538		theAttributeName = makeName(buff, offset, length).toLowerCase();
539//		System.err.println("%% Attribute name " + theAttributeName);
540		}
541
542	public void aval(char[] buff, int offset, int length) throws SAXException {
543		if (theNewElement == null || theAttributeName == null) return;
544		String value = new String(buff, offset, length);
545//		System.err.println("%% Attribute value [" + value + "]");
546		value = expandEntities(value);
547		theNewElement.setAttribute(theAttributeName, null, value);
548		theAttributeName = null;
549//		System.err.println("%% Aval done");
550		}
551
552	// Expand entity references in attribute values selectively.
553	// Currently we expand a reference iff it is properly terminated
554	// with a semicolon.
555	private String expandEntities(String src) {
556		int refStart = -1;
557		int len = src.length();
558		char[] dst = new char[len];
559		int dstlen = 0;
560		for (int i = 0; i < len; i++) {
561			char ch = src.charAt(i);
562			dst[dstlen++] = ch;
563//			System.err.print("i = " + i + ", d = " + dstlen + ", ch = [" + ch + "] ");
564			if (ch == '&' && refStart == -1) {
565				// start of a ref excluding &
566				refStart = dstlen;
567//				System.err.println("start of ref");
568				}
569			else if (refStart == -1) {
570				// not in a ref
571//				System.err.println("not in ref");
572				}
573			else if (Character.isLetter(ch) ||
574					Character.isDigit(ch) ||
575					ch == '#') {
576				// valid entity char
577//				System.err.println("valid");
578				}
579			else if (ch == ';') {
580				// properly terminated ref
581//				System.err.print("got [" + new String(dst, refStart, dstlen-refStart-1) + "]");
582				int ent = lookupEntity(dst, refStart, dstlen - refStart - 1);
583//				System.err.println(" = " + ent);
584				if (ent > 0xFFFF) {
585					ent -= 0x10000;
586					dst[refStart - 1] = (char)((ent>>10) + 0xD800);
587					dst[refStart] = (char)((ent&0x3FF) + 0xDC00);
588					dstlen = refStart + 1;
589					}
590				else if (ent != 0) {
591					dst[refStart - 1] = (char)ent;
592					dstlen = refStart;
593					}
594				refStart = -1;
595				}
596			else {
597				// improperly terminated ref
598//				System.err.println("end of ref");
599				refStart = -1;
600				}
601			}
602		return new String(dst, 0, dstlen);
603		}
604
605	public void entity(char[] buff, int offset, int length) throws SAXException {
606		theEntity = lookupEntity(buff, offset, length);
607		}
608
609	// Process numeric character references,
610	// deferring to the schema for named ones.
611	private int lookupEntity(char[] buff, int offset, int length) {
612		int result = 0;
613		if (length < 1) return result;
614//		System.err.println("%% Entity at " + offset + " " + length);
615//		System.err.println("%% Got entity [" + new String(buff, offset, length) + "]");
616		if (buff[offset] == '#') {
617                        if (length > 1 && (buff[offset+1] == 'x'
618                                        || buff[offset+1] == 'X')) {
619                                try {
620                                        return Integer.parseInt(new String(buff, offset + 2, length - 2), 16);
621                                        }
622                                catch (NumberFormatException e) { return 0; }
623                                }
624                        try {
625                                return Integer.parseInt(new String(buff, offset + 1, length - 1), 10);
626                                }
627                        catch (NumberFormatException e) { return 0; }
628                        }
629		return theSchema.getEntity(new String(buff, offset, length));
630		}
631
632	public void eof(char[] buff, int offset, int length) throws SAXException {
633		if (virginStack) rectify(thePCDATA);
634		while (theStack.next() != null) {
635			pop();
636			}
637		if (!(theSchema.getURI().equals("")))
638			theContentHandler.endPrefixMapping(theSchema.getPrefix());
639		theContentHandler.endDocument();
640		}
641
642	public void etag(char[] buff, int offset, int length) throws SAXException {
643		if (etag_cdata(buff, offset, length)) return;
644		etag_basic(buff, offset, length);
645		}
646
647	private static char[] etagchars = {'<', '/', '>'};
648	public boolean etag_cdata(char[] buff, int offset, int length) throws SAXException {
649		String currentName = theStack.name();
650		// If this is a CDATA element and the tag doesn't match,
651		// or isn't properly formed (junk after the name),
652		// restart CDATA mode and process the tag as characters.
653		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
654			boolean realTag = (length == currentName.length());
655			if (realTag) {
656				for (int i = 0; i < length; i++) {
657					if (Character.toLowerCase(buff[offset + i]) != Character.toLowerCase(currentName.charAt(i))) {
658						realTag = false;
659						break;
660						}
661					}
662				}
663			if (!realTag) {
664				theContentHandler.characters(etagchars, 0, 2);
665				theContentHandler.characters(buff, offset, length);
666				theContentHandler.characters(etagchars, 2, 1);
667				theScanner.startCDATA();
668				return true;
669				}
670			}
671		return false;
672		}
673
674	public void etag_basic(char[] buff, int offset, int length) throws SAXException {
675		theNewElement = null;
676		String name;
677		if (length != 0) {
678			// Canonicalize case of name
679			name = makeName(buff, offset, length);
680//			System.err.println("got etag [" + name + "]");
681			ElementType type = theSchema.getElementType(name);
682			if (type == null) return;	// mysterious end-tag
683			name = type.name();
684			}
685		else {
686			name = theStack.name();
687			}
688//		System.err.println("%% Got end of " + name);
689
690		Element sp;
691		boolean inNoforce = false;
692		for (sp = theStack; sp != null; sp = sp.next()) {
693			if (sp.name().equals(name)) break;
694			if ((sp.flags() & Schema.F_NOFORCE) != 0) inNoforce = true;
695			}
696
697		if (sp == null) return;		// Ignore unknown etags
698		if (sp.next() == null || sp.next().next() == null) return;
699		if (inNoforce) {		// inside an F_NOFORCE element?
700			sp.preclose();		// preclose the matching element
701			}
702		else {			// restartably pop everything above us
703			while (theStack != sp) {
704				restartablyPop();
705				}
706			pop();
707			}
708		// pop any preclosed elements now at the top
709		while (theStack.isPreclosed()) {
710			pop();
711			}
712		restart(null);
713		}
714
715	// Push restartables on the stack if possible
716	// e is the next element to be started, if we know what it is
717	private void restart(Element e) throws SAXException {
718		while (theSaved != null && theStack.canContain(theSaved) &&
719				(e == null || theSaved.canContain(e))) {
720			Element next = theSaved.next();
721			push(theSaved);
722			theSaved = next;
723			}
724		}
725
726	// Pop the stack irrevocably
727	private void pop() throws SAXException {
728		if (theStack == null) return;		// empty stack
729		String name = theStack.name();
730		String localName = theStack.localName();
731		String namespace = theStack.namespace();
732		String prefix = prefixOf(name);
733
734//		System.err.println("%% Popping " + name);
735		if (!namespaces) namespace = localName = "";
736		theContentHandler.endElement(namespace, localName, name);
737		if (foreign(prefix, namespace)) {
738			theContentHandler.endPrefixMapping(prefix);
739//			System.err.println("%% Unmapping [" + prefix + "] for elements to " + namespace);
740			}
741		Attributes atts = theStack.atts();
742		for (int i = atts.getLength() - 1; i >= 0; i--) {
743			String attNamespace = atts.getURI(i);
744			String attPrefix = prefixOf(atts.getQName(i));
745			if (foreign(attPrefix, attNamespace)) {
746				theContentHandler.endPrefixMapping(attPrefix);
747//			System.err.println("%% Unmapping [" + attPrefix + "] for attributes to " + attNamespace);
748				}
749			}
750		theStack = theStack.next();
751		}
752
753	// Pop the stack restartably
754	private void restartablyPop() throws SAXException {
755		Element popped = theStack;
756		pop();
757		if (restartElements && (popped.flags() & Schema.F_RESTART) != 0) {
758			popped.anonymize();
759			popped.setNext(theSaved);
760			theSaved = popped;
761			}
762		}
763
764	// Push element onto stack
765	private boolean virginStack = true;
766	private void push(Element e) throws SAXException {
767		String name = e.name();
768		String localName = e.localName();
769		String namespace = e.namespace();
770		String prefix = prefixOf(name);
771
772//		System.err.println("%% Pushing " + name);
773		e.clean();
774		if (!namespaces) namespace = localName = "";
775                if (virginStack && localName.equalsIgnoreCase(theDoctypeName)) {
776                    try {
777                        theEntityResolver.resolveEntity(theDoctypePublicId, theDoctypeSystemId);
778                    } catch (IOException ew) { }   // Can't be thrown for root I believe.
779                }
780		if (foreign(prefix, namespace)) {
781			theContentHandler.startPrefixMapping(prefix, namespace);
782//			System.err.println("%% Mapping [" + prefix + "] for elements to " + namespace);
783			}
784		Attributes atts = e.atts();
785		int len = atts.getLength();
786		for (int i = 0; i < len; i++) {
787			String attNamespace = atts.getURI(i);
788			String attPrefix = prefixOf(atts.getQName(i));
789			if (foreign(attPrefix, attNamespace)) {
790				theContentHandler.startPrefixMapping(attPrefix, attNamespace);
791//				System.err.println("%% Mapping [" + attPrefix + "] for attributes to " + attNamespace);
792				}
793			}
794		theContentHandler.startElement(namespace, localName, name, e.atts());
795		e.setNext(theStack);
796		theStack = e;
797		virginStack = false;
798		if (CDATAElements && (theStack.flags() & Schema.F_CDATA) != 0) {
799			theScanner.startCDATA();
800			}
801		}
802
803	// Get the prefix from a QName
804	private String prefixOf(String name) {
805		int i = name.indexOf(':');
806		String prefix = "";
807		if (i != -1) prefix = name.substring(0, i);
808//		System.err.println("%% " + prefix + " is prefix of " + name);
809		return prefix;
810		}
811
812	// Return true if we have a foreign name
813	private boolean foreign(String prefix, String namespace) {
814//		System.err.print("%% Testing " + prefix + " and " + namespace + " for foreignness -- ");
815		boolean foreign = !(prefix.equals("") || namespace.equals("") ||
816			namespace.equals(theSchema.getURI()));
817//		System.err.println(foreign);
818		return foreign;
819		}
820
821        /**
822         * Parsing the complete XML Document Type Definition is way too complex,
823         * but for many simple cases we can extract something useful from it.
824         *
825         * doctypedecl  ::= '<!DOCTYPE' S Name (S ExternalID)? S? ('[' intSubset ']' S?)? '>'
826         *  DeclSep     ::= PEReference | S
827         *  intSubset   ::= (markupdecl | DeclSep)*
828         *  markupdecl  ::= elementdecl | AttlistDecl | EntityDecl | NotationDecl | PI | Comment
829         *  ExternalID  ::= 'SYSTEM' S SystemLiteral | 'PUBLIC' S PubidLiteral S SystemLiteral
830         */
831	public void decl(char[] buff, int offset, int length) throws SAXException {
832		String s = new String(buff, offset, length);
833		String name = null;
834		String systemid = null;
835		String publicid = null;
836		String[] v = split(s);
837		if (v.length > 0 && "DOCTYPE".equals(v[0])) {
838			if (theDoctypeIsPresent) return;		// one doctype only!
839			theDoctypeIsPresent = true;
840			if (v.length > 1) {
841				name = v[1];
842				if (v.length>3 && "SYSTEM".equals(v[2])) {
843				systemid = v[3];
844				}
845			else if (v.length > 3 && "PUBLIC".equals(v[2])) {
846				publicid = v[3];
847				if (v.length > 4) {
848					systemid = v[4];
849					}
850				else {
851					systemid = "";
852					}
853                    }
854                }
855            }
856		publicid = trimquotes(publicid);
857		systemid = trimquotes(systemid);
858		if (name != null) {
859			publicid = cleanPublicid(publicid);
860			theLexicalHandler.startDTD(name, publicid, systemid);
861			theLexicalHandler.endDTD();
862			theDoctypeName = name;
863			theDoctypePublicId = publicid;
864		if (theScanner instanceof Locator) {    // Must resolve systemid
865                    theDoctypeSystemId  = ((Locator)theScanner).getSystemId();
866                    try {
867                        theDoctypeSystemId = new URL(new URL(theDoctypeSystemId), systemid).toString();
868                    } catch (Exception e) {}
869                }
870            }
871        }
872
873	// If the String is quoted, trim the quotes.
874	private static String trimquotes(String in) {
875		if (in == null) return in;
876		int length = in.length();
877		if (length == 0) return in;
878		char s = in.charAt(0);
879		char e = in.charAt(length - 1);
880		if (s == e && (s == '\'' || s == '"')) {
881			in = in.substring(1, in.length() - 1);
882			}
883		return in;
884		}
885
	// Split the supplied String into words or phrases separated by whitespace.
887	// Recognises quotes around a phrase and doesn't split it.
888	private static String[] split(String val) throws IllegalArgumentException {
889		val = val.trim();
890		if (val.length() == 0) {
891			return new String[0];
892			}
893		else {
894			ArrayList l = new ArrayList();
895			int s = 0;
896			int e = 0;
897			boolean sq = false;	// single quote
898			boolean dq = false;	// double quote
899			char lastc = 0;
900			int len = val.length();
901			for (e=0; e < len; e++) {
902				char c = val.charAt(e);
903				if (!dq && c == '\'' && lastc != '\\') {
904				sq = !sq;
905				if (s < 0) s = e;
906				}
907			else if (!sq && c == '\"' && lastc != '\\') {
908				dq = !dq;
909				if (s < 0) s = e;
910				}
911			else if (!sq && !dq) {
912				if (Character.isWhitespace(c)) {
913					if (s >= 0) l.add(val.substring(s, e));
914					s = -1;
915					}
916				else if (s < 0 && c != ' ') {
917					s = e;
918					}
919				}
920			lastc = c;
921			}
922		l.add(val.substring(s, e));
923		return (String[])l.toArray(new String[0]);
924		}
925        }
926
927	// Replace junk in publicids with spaces
928	private static String legal =
929		"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789-'()+,./:=?;!*#@$_%";
930
931	private String cleanPublicid(String src) {
932		if (src == null) return null;
933		int len = src.length();
934		StringBuffer dst = new StringBuffer(len);
935		boolean suppressSpace = true;
936		for (int i = 0; i < len; i++) {
937			char ch = src.charAt(i);
938			if (legal.indexOf(ch) != -1) { 	// legal but not whitespace
939				dst.append(ch);
940				suppressSpace = false;
941				}
942			else if (suppressSpace) {	// normalizable whitespace or junk
943				;
944				}
945			else {
946				dst.append(' ');
947				suppressSpace = true;
948				}
949			}
950//		System.err.println("%% Publicid [" + dst.toString().trim() + "]");
951		return dst.toString().trim();	// trim any final junk whitespace
952		}
953
954
955	public void gi(char[] buff, int offset, int length) throws SAXException {
956		if (theNewElement != null) return;
957		String name = makeName(buff, offset, length);
958		if (name == null) return;
959		ElementType type = theSchema.getElementType(name);
960		if (type == null) {
961			// Suppress unknown elements if ignore-bogons is on
962			if (ignoreBogons) return;
963			int bogonModel = bogonsEmpty ? Schema.M_EMPTY : Schema.M_ANY;
964			int bogonMemberOf = rootBogons ? Schema.M_ANY : (Schema.M_ANY & ~ Schema.M_ROOT);
965			theSchema.elementType(name, bogonModel, bogonMemberOf, 0);
966			if (!rootBogons) theSchema.parent(name, theSchema.rootElementType().name());
967			type = theSchema.getElementType(name);
968			}
969
970		theNewElement = new Element(type, defaultAttributes);
971//		System.err.println("%% Got GI " + theNewElement.name());
972		}
973
	// Handles a CDATA section: the content is processed exactly like
	// ordinary character data (via pcdata), bracketed by startCDATA and
	// endCDATA notifications to the lexical handler.
	public void cdsect(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.startCDATA();
		pcdata(buff, offset, length);
		// Not reached if pcdata throws, so endCDATA is skipped in that case
		theLexicalHandler.endCDATA();
		}
979	public void pcdata(char[] buff, int offset, int length) throws SAXException {
980		if (length == 0) return;
981		boolean allWhite = true;
982		for (int i = 0; i < length; i++) {
983			if (!Character.isWhitespace(buff[offset+i])) {
984				allWhite = false;
985				}
986			}
987		if (allWhite && !theStack.canContain(thePCDATA)) {
988			if (ignorableWhitespace) {
989				theContentHandler.ignorableWhitespace(buff, offset, length);
990				}
991			}
992		else {
993			rectify(thePCDATA);
994			theContentHandler.characters(buff, offset, length);
995			}
996		}
997
998	public void pitarget(char[] buff, int offset, int length) throws SAXException {
999		if (theNewElement != null) return;
1000		thePITarget = makeName(buff, offset, length).replace(':', '_');
1001		}
1002
1003	public void pi(char[] buff, int offset, int length) throws SAXException {
1004		if (theNewElement != null || thePITarget == null) return;
1005		if ("xml".equalsIgnoreCase(thePITarget)) return;
1006//		if (length > 0 && buff[length - 1] == '?') System.err.println("%% Removing ? from PI");
1007		if (length > 0 && buff[length - 1] == '?') length--;	// remove trailing ?
1008		theContentHandler.processingInstruction(thePITarget,
1009			new String(buff, offset, length));
1010		thePITarget = null;
1011		}
1012
1013	public void stagc(char[] buff, int offset, int length) throws SAXException {
1014//		System.err.println("%% Start-tag");
1015		if (theNewElement == null) return;
1016		rectify(theNewElement);
1017		if (theStack.model() == Schema.M_EMPTY) {
1018			// Force an immediate end tag
1019			etag_basic(buff, offset, length);
1020			}
1021		}
1022
1023	public void stage(char[] buff, int offset, int length) throws SAXException {
1024//		System.err.println("%% Empty-tag");
1025		if (theNewElement == null) return;
1026		rectify(theNewElement);
1027		// Force an immediate end tag
1028		etag_basic(buff, offset, length);
1029		}
1030
1031	// Comment buffer is twice the size of the output buffer
	// NOTE(review): cmnt below does not use this buffer; whether anything
	// else in the file still does is not visible here -- confirm before removing.
	private char[] theCommentBuffer = new char[2000];
	// Passes a comment straight through to the lexical handler.
	public void cmnt(char[] buff, int offset, int length) throws SAXException {
		theLexicalHandler.comment(buff, offset, length);
		}
1036
1037	// Rectify the stack, pushing and popping as needed
1038	// so that the argument can be safely pushed
	// Works in three phases: find a container on the stack (inventing
	// parent elements if needed), pop down to that container, then push
	// the chain of elements that ends with the argument.
	private void rectify(Element e) throws SAXException {
		Element sp;
		// Phase 1: search from the top of the stack for an element that
		// can contain e.  If none is found, wrap e in a new element of
		// e's parent type and repeat the search for that parent.
		while (true) {
			for (sp = theStack; sp != null; sp = sp.next()) {
				if (sp.canContain(e)) break;
				}
			if (sp != null) break;		// found a container
			ElementType parentType = e.parent();
			if (parentType == null) break;	// no parent type left to try; sp stays null
			Element parent = new Element(parentType, defaultAttributes);
//			System.err.println("%% Ascending from " + e.name() + " to " + parent.name());
			parent.setNext(e);		// chain the new parent in front of e
			e = parent;
			}
		// Note that theNewElement is NOT cleared on this early return.
		if (sp == null) return;		// don't know what to do
		// Phase 2: pop until the container is on top, but stop popping
		// once fewer than three elements remain on the stack.
		while (theStack != sp) {
			if (theStack == null || theStack.next() == null ||
				theStack.next().next() == null) break;
			restartablyPop();
			}
		// Phase 3: walk the chain built in phase 1, pushing every element
		// except the "<pcdata>" pseudo-element.  restart() is called
		// with the NEXT element of the chain, which is null on the last
		// iteration.
		while (e != null) {
			Element nexte = e.next();	// read before push(), which relinks e
			if (!e.name().equals("<pcdata>")) push(e);
			e = nexte;
			restart(e);
			}
		theNewElement = null;
		}
1067
	// Returns the current value of theEntity.
	// NOTE(review): theEntity is assigned outside this part of the file;
	// presumably it holds the character code of the most recently
	// looked-up entity -- confirm against the code that sets it.
	public int getEntity() {
		return theEntity;
		}
1071
1072	// Return the argument as a valid XML name
1073	// This no longer lowercases the result: we depend on Schema to
1074	// canonicalize case.
1075	private String makeName(char[] buff, int offset, int length) {
1076		StringBuffer dst = new StringBuffer(length + 2);
1077		boolean seenColon = false;
1078		boolean start = true;
1079//		String src = new String(buff, offset, length); // DEBUG
1080		for (; length-- > 0; offset++) {
1081			char ch = buff[offset];
1082			if (Character.isLetter(ch) || ch == '_') {
1083				start = false;
1084				dst.append(ch);
1085				}
1086			else if (Character.isDigit(ch) || ch == '-' || ch == '.') {
1087				if (start) dst.append('_');
1088				start = false;
1089				dst.append(ch);
1090				}
1091			else if (ch == ':' && !seenColon) {
1092				seenColon = true;
1093				if (start) dst.append('_');
1094				start = true;
1095				dst.append(translateColons ? '_' : ch);
1096				}
1097			}
1098		int dstLength = dst.length();
1099		if (dstLength == 0 || dst.charAt(dstLength - 1) == ':') dst.append('_');
1100//		System.err.println("Made name \"" + dst + "\" from \"" + src + "\"");
1101		return dst.toString().intern();
1102		}
1103
1104	// Default LexicalHandler implementation
1105
	// All of these are deliberate no-ops.  The parser itself is the
	// initial value of theLexicalHandler, so these bodies run whenever
	// that field still refers to the parser.

	// Ignores comments.
	public void comment(char[] ch, int start, int length) throws SAXException { }
	// Ignores the end of a CDATA section.
	public void endCDATA() throws SAXException { }
	// Ignores the end of a DTD.
	public void endDTD() throws SAXException { }
	// Ignores the end of an entity.
	public void endEntity(String name) throws SAXException { }
	// Ignores the start of a CDATA section.
	public void startCDATA() throws SAXException { }
	// Ignores the start of a DTD.
	public void startDTD(String name, String publicid, String systemid) throws SAXException { }
	// Ignores the start of an entity.
	public void startEntity(String name) throws SAXException { }
1113
1114	}
1115