HTMLScanner.java revision 4bb395b502d0c2495f7a5d226ccf7f06f53dea38
1// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
2//
3// TagSoup is licensed under the Apache License,
4// Version 2.0.  You may obtain a copy of this license at
5// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
6// additional legal rights not granted by this license.
7//
8// TagSoup is distributed in the hope that it will be useful, but
9// unless required by applicable law or agreed to in writing, TagSoup
10// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
11// OF ANY KIND, either express or implied; not even the implied warranty
12// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
13//
14//
15package org.ccil.cowan.tagsoup;
16import java.io.*;
17import org.xml.sax.SAXException;
18import org.xml.sax.Locator;
19
20/**
21This class implements a table-driven scanner for HTML, allowing for lots of
22defects.  It implements the Scanner interface, which accepts a Reader
23object to fetch characters from and a ScanHandler object to report lexical
24events to.
25*/
26
27public class HTMLScanner implements Scanner, Locator {
28
29	// Start of state table
30		private static final int S_ANAME = 1;
31	private static final int S_APOS = 2;
32	private static final int S_AVAL = 3;
33	private static final int S_BB = 4;
34	private static final int S_BBC = 5;
35	private static final int S_BBCD = 6;
36	private static final int S_BBCDA = 7;
37	private static final int S_BBCDAT = 8;
38	private static final int S_BBCDATA = 9;
39	private static final int S_CDATA = 10;
40	private static final int S_CDATA2 = 11;
41	private static final int S_CDSECT = 12;
42	private static final int S_CDSECT1 = 13;
43	private static final int S_CDSECT2 = 14;
44	private static final int S_COM = 15;
45	private static final int S_COM2 = 16;
46	private static final int S_COM3 = 17;
47	private static final int S_COM4 = 18;
48	private static final int S_DECL = 19;
49	private static final int S_DECL2 = 20;
50	private static final int S_DONE = 21;
51	private static final int S_EMPTYTAG = 22;
52	private static final int S_ENT = 23;
53	private static final int S_EQ = 24;
54	private static final int S_ETAG = 25;
55	private static final int S_GI = 26;
56	private static final int S_NCR = 27;
57	private static final int S_PCDATA = 28;
58	private static final int S_PI = 29;
59	private static final int S_PITARGET = 30;
60	private static final int S_QUOT = 31;
61	private static final int S_STAGC = 32;
62	private static final int S_TAG = 33;
63	private static final int S_TAGWS = 34;
64	private static final int S_XNCR = 35;
65	private static final int A_ADUP = 1;
66	private static final int A_ADUP_SAVE = 2;
67	private static final int A_ADUP_STAGC = 3;
68	private static final int A_ANAME = 4;
69	private static final int A_ANAME_ADUP = 5;
70	private static final int A_ANAME_ADUP_STAGC = 6;
71	private static final int A_AVAL = 7;
72	private static final int A_AVAL_STAGC = 8;
73	private static final int A_CDATA = 9;
74	private static final int A_CMNT = 10;
75	private static final int A_DECL = 11;
76	private static final int A_EMPTYTAG = 12;
77	private static final int A_ENTITY = 13;
78	private static final int A_ENTITY_START = 14;
79	private static final int A_ETAG = 15;
80	private static final int A_GI = 16;
81	private static final int A_GI_STAGC = 17;
82	private static final int A_LT = 18;
83	private static final int A_LT_PCDATA = 19;
84	private static final int A_MINUS = 20;
85	private static final int A_MINUS2 = 21;
86	private static final int A_MINUS3 = 22;
87	private static final int A_PCDATA = 23;
88	private static final int A_PI = 24;
89	private static final int A_PITARGET = 25;
90	private static final int A_PITARGET_PI = 26;
91	private static final int A_SAVE = 27;
92	private static final int A_SKIP = 28;
93	private static final int A_SP = 29;
94	private static final int A_STAGC = 30;
95	private static final int A_UNGET = 31;
96	private static final int A_UNSAVE_PCDATA = 32;
97	private static int[] statetable = {
98		S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
99		S_ANAME, '=', A_ANAME, S_AVAL,
100		S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
101		S_ANAME, 0, A_SAVE, S_ANAME,
102		S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
103		S_ANAME, ' ', A_ANAME, S_EQ,
104		S_ANAME, '\n', A_ANAME, S_EQ,
105		S_ANAME, '\t', A_ANAME, S_EQ,
106		S_APOS, '\'', A_AVAL, S_TAGWS,
107		S_APOS, 0, A_SAVE, S_APOS,
108		S_APOS, -1, A_AVAL_STAGC, S_DONE,
109		S_APOS, ' ', A_SP, S_APOS,
110		S_APOS, '\n', A_SP, S_APOS,
111		S_APOS, '\t', A_SP, S_APOS,
112		S_AVAL, '\'', A_SKIP, S_APOS,
113		S_AVAL, '"', A_SKIP, S_QUOT,
114		S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
115		S_AVAL, 0, A_SAVE, S_STAGC,
116		S_AVAL, -1, A_AVAL_STAGC, S_DONE,
117		S_AVAL, ' ', A_SKIP, S_AVAL,
118		S_AVAL, '\n', A_SKIP, S_AVAL,
119		S_AVAL, '\t', A_SKIP, S_AVAL,
120		S_BB, 'C', A_SKIP, S_BBC,
121		S_BB, 0, A_SKIP, S_DECL,
122		S_BB, -1, A_SKIP, S_DONE,
123		S_BBC, 'D', A_SKIP, S_BBCD,
124		S_BBC, 0, A_SKIP, S_DECL,
125		S_BBC, -1, A_SKIP, S_DONE,
126		S_BBCD, 'A', A_SKIP, S_BBCDA,
127		S_BBCD, 0, A_SKIP, S_DECL,
128		S_BBCD, -1, A_SKIP, S_DONE,
129		S_BBCDA, 'T', A_SKIP, S_BBCDAT,
130		S_BBCDA, 0, A_SKIP, S_DECL,
131		S_BBCDA, -1, A_SKIP, S_DONE,
132		S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
133		S_BBCDAT, 0, A_SKIP, S_DECL,
134		S_BBCDAT, -1, A_SKIP, S_DONE,
135		S_BBCDATA, '[', A_SKIP, S_CDSECT,
136		S_BBCDATA, 0, A_SKIP, S_DECL,
137		S_BBCDATA, -1, A_SKIP, S_DONE,
138		S_CDATA, '<', A_SAVE, S_CDATA2,
139		S_CDATA, 0, A_SAVE, S_CDATA,
140		S_CDATA, -1, A_PCDATA, S_DONE,
141		S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
142		S_CDATA2, 0, A_SAVE, S_CDATA,
143		S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
144		S_CDSECT, ']', A_SAVE, S_CDSECT1,
145		S_CDSECT, 0, A_SAVE, S_CDSECT,
146		S_CDSECT, -1, A_SKIP, S_DONE,
147		S_CDSECT1, ']', A_SAVE, S_CDSECT2,
148		S_CDSECT1, 0, A_SAVE, S_CDSECT,
149		S_CDSECT1, -1, A_SKIP, S_DONE,
150		S_CDSECT2, '>', A_CDATA, S_PCDATA,
151		S_CDSECT2, 0, A_SAVE, S_CDSECT,
152		S_CDSECT2, -1, A_SKIP, S_DONE,
153		S_COM, '-', A_SKIP, S_COM2,
154		S_COM, 0, A_SAVE, S_COM2,
155		S_COM, -1, A_CMNT, S_DONE,
156		S_COM2, '-', A_SKIP, S_COM3,
157		S_COM2, 0, A_SAVE, S_COM2,
158		S_COM2, -1, A_CMNT, S_DONE,
159		S_COM3, '-', A_SKIP, S_COM4,
160		S_COM3, 0, A_MINUS, S_COM2,
161		S_COM3, -1, A_CMNT, S_DONE,
162		S_COM4, '-', A_MINUS3, S_COM4,
163		S_COM4, '>', A_CMNT, S_PCDATA,
164		S_COM4, 0, A_MINUS2, S_COM2,
165		S_COM4, -1, A_CMNT, S_DONE,
166		S_DECL, '-', A_SKIP, S_COM,
167		S_DECL, '[', A_SKIP, S_BB,
168		S_DECL, '>', A_SKIP, S_PCDATA,
169		S_DECL, 0, A_SAVE, S_DECL2,
170		S_DECL, -1, A_SKIP, S_DONE,
171		S_DECL2, '>', A_DECL, S_PCDATA,
172		S_DECL2, 0, A_SAVE, S_DECL2,
173		S_DECL2, -1, A_SKIP, S_DONE,
174		S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
175		S_EMPTYTAG, 0, A_SAVE, S_ANAME,
176		S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
177		S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
178		S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
179		S_ENT, 0, A_ENTITY, S_ENT,
180		S_ENT, -1, A_ENTITY, S_DONE,
181		S_EQ, '=', A_SKIP, S_AVAL,
182		S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
183		S_EQ, 0, A_ADUP_SAVE, S_ANAME,
184		S_EQ, -1, A_ADUP_STAGC, S_DONE,
185		S_EQ, ' ', A_SKIP, S_EQ,
186		S_EQ, '\n', A_SKIP, S_EQ,
187		S_EQ, '\t', A_SKIP, S_EQ,
188		S_ETAG, '>', A_ETAG, S_PCDATA,
189		S_ETAG, 0, A_SAVE, S_ETAG,
190		S_ETAG, -1, A_ETAG, S_DONE,
191		S_ETAG, ' ', A_SKIP, S_ETAG,
192		S_ETAG, '\n', A_SKIP, S_ETAG,
193		S_ETAG, '\t', A_SKIP, S_ETAG,
194		S_GI, '/', A_SKIP, S_EMPTYTAG,
195		S_GI, '>', A_GI_STAGC, S_PCDATA,
196		S_GI, 0, A_SAVE, S_GI,
197		S_GI, -1, A_SKIP, S_DONE,
198		S_GI, ' ', A_GI, S_TAGWS,
199		S_GI, '\n', A_GI, S_TAGWS,
200		S_GI, '\t', A_GI, S_TAGWS,
201		S_NCR, 0, A_ENTITY, S_NCR,
202		S_NCR, -1, A_ENTITY, S_DONE,
203		S_PCDATA, '&', A_ENTITY_START, S_ENT,
204		S_PCDATA, '<', A_PCDATA, S_TAG,
205		S_PCDATA, 0, A_SAVE, S_PCDATA,
206		S_PCDATA, -1, A_PCDATA, S_DONE,
207		S_PI, '>', A_PI, S_PCDATA,
208		S_PI, 0, A_SAVE, S_PI,
209		S_PI, -1, A_PI, S_DONE,
210		S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
211		S_PITARGET, 0, A_SAVE, S_PITARGET,
212		S_PITARGET, -1, A_PITARGET_PI, S_DONE,
213		S_PITARGET, ' ', A_PITARGET, S_PI,
214		S_PITARGET, '\n', A_PITARGET, S_PI,
215		S_PITARGET, '\t', A_PITARGET, S_PI,
216		S_QUOT, '"', A_AVAL, S_TAGWS,
217		S_QUOT, 0, A_SAVE, S_QUOT,
218		S_QUOT, -1, A_AVAL_STAGC, S_DONE,
219		S_QUOT, ' ', A_SP, S_QUOT,
220		S_QUOT, '\n', A_SP, S_QUOT,
221		S_QUOT, '\t', A_SP, S_QUOT,
222		S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
223		S_STAGC, 0, A_SAVE, S_STAGC,
224		S_STAGC, -1, A_AVAL_STAGC, S_DONE,
225		S_STAGC, ' ', A_AVAL, S_TAGWS,
226		S_STAGC, '\n', A_AVAL, S_TAGWS,
227		S_STAGC, '\t', A_AVAL, S_TAGWS,
228		S_TAG, '!', A_SKIP, S_DECL,
229		S_TAG, '?', A_SKIP, S_PITARGET,
230		S_TAG, '/', A_SKIP, S_ETAG,
231		S_TAG, '<', A_SAVE, S_TAG,
232		S_TAG, 0, A_SAVE, S_GI,
233		S_TAG, -1, A_LT_PCDATA, S_DONE,
234		S_TAG, ' ', A_LT, S_PCDATA,
235		S_TAG, '\n', A_LT, S_PCDATA,
236		S_TAG, '\t', A_LT, S_PCDATA,
237		S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
238		S_TAGWS, '>', A_STAGC, S_PCDATA,
239		S_TAGWS, 0, A_SAVE, S_ANAME,
240		S_TAGWS, -1, A_STAGC, S_DONE,
241		S_TAGWS, ' ', A_SKIP, S_TAGWS,
242		S_TAGWS, '\n', A_SKIP, S_TAGWS,
243		S_TAGWS, '\t', A_SKIP, S_TAGWS,
244		S_XNCR, 0, A_ENTITY, S_XNCR,
245		S_XNCR, -1, A_ENTITY, S_DONE,
246
247	};
248	private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
249	private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
250
251
252	// End of state table
253
254	private String thePublicid;			// Locator state
255	private String theSystemid;
256	private int theLastLine;
257	private int theLastColumn;
258	private int theCurrentLine;
259	private int theCurrentColumn;
260
261	int theState;					// Current state
262	int theNextState;				// Next state
263	char[] theOutputBuffer = new char[200];	// Output buffer
264	int theSize;					// Current buffer size
265	int[] theWinMap = {				// Windows chars map
266		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
267		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
268		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
269		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
270
271	// Compensate for bug in PushbackReader that allows
272	// pushing back EOF.
273	private void unread(PushbackReader r, int c) throws IOException {
274		if (c != -1) r.unread(c);
275		}
276
277	// Locator implementation
278
279	public int getLineNumber() {
280		return theLastLine;
281		}
282	public int getColumnNumber() {
283		return theLastColumn;
284		}
285	public String getPublicId() {
286		return thePublicid;
287		}
288	public String getSystemId() {
289		return theSystemid;
290		}
291
292
293	// Scanner implementation
294
295	/**
296	Reset document locator, supplying systemid and publicid.
297	@param systemid System id
298	@param publicid Public id
299	*/
300
301	public void resetDocumentLocator(String publicid, String systemid) {
302		thePublicid = publicid;
303		theSystemid = systemid;
304		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
305		}
306
307	/**
308	Scan HTML source, reporting lexical events.
309	@param r0 Reader that provides characters
310	@param h ScanHandler that accepts lexical events.
311	*/
312
313	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
314		theState = S_PCDATA;
315		PushbackReader r;
316		if (r0 instanceof PushbackReader) {
317			r = (PushbackReader)r0;
318			}
319		else if (r0 instanceof BufferedReader) {
320			r = new PushbackReader(r0);
321			}
322		else {
323			r = new PushbackReader(new BufferedReader(r0, 200));
324			}
325
326		int firstChar = r.read();	// Remove any leading BOM
327		if (firstChar != '\uFEFF') unread(r, firstChar);
328
329		while (theState != S_DONE) {
330			int ch = r.read();
331
332			// Process control characters
333			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
334
335			if (ch == '\r') {
336				ch = r.read();		// expect LF next
337				if (ch != '\n') {
338					unread(r, ch);	// nope
339					ch = '\n';
340					}
341				}
342
343			if (ch == '\n') {
344				theCurrentLine++;
345				theCurrentColumn = 0;
346				}
347			else {
348				theCurrentColumn++;
349				}
350
351			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
352
353			// Search state table
354			int action = 0;
355			for (int i = 0; i < statetable.length; i += 4) {
356				if (theState != statetable[i]) {
357					if (action != 0) break;
358					continue;
359					}
360				if (statetable[i+1] == 0) {
361					action = statetable[i+2];
362					theNextState = statetable[i+3];
363					}
364				else if (statetable[i+1] == ch) {
365					action = statetable[i+2];
366					theNextState = statetable[i+3];
367					break;
368					}
369				}
370//			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
371			switch (action) {
372			case 0:
373				throw new Error(
374"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
375Integer.toString(theState));
376        		case A_ADUP:
377				h.adup(theOutputBuffer, 0, theSize);
378				theSize = 0;
379				break;
380        		case A_ADUP_SAVE:
381				h.adup(theOutputBuffer, 0, theSize);
382				theSize = 0;
383				save(ch, h);
384				break;
385        		case A_ADUP_STAGC:
386				h.adup(theOutputBuffer, 0, theSize);
387				theSize = 0;
388				h.stagc(theOutputBuffer, 0, theSize);
389				break;
390        		case A_ANAME:
391				h.aname(theOutputBuffer, 0, theSize);
392				theSize = 0;
393				break;
394        		case A_ANAME_ADUP:
395				h.aname(theOutputBuffer, 0, theSize);
396				theSize = 0;
397				h.adup(theOutputBuffer, 0, theSize);
398				break;
399        		case A_ANAME_ADUP_STAGC:
400				h.aname(theOutputBuffer, 0, theSize);
401				theSize = 0;
402				h.adup(theOutputBuffer, 0, theSize);
403				h.stagc(theOutputBuffer, 0, theSize);
404				break;
405        		case A_AVAL:
406				h.aval(theOutputBuffer, 0, theSize);
407				theSize = 0;
408				break;
409        		case A_AVAL_STAGC:
410				h.aval(theOutputBuffer, 0, theSize);
411				theSize = 0;
412				h.stagc(theOutputBuffer, 0, theSize);
413				break;
414			case A_CDATA:
415				mark();
416				// suppress the final "]]" in the buffer
417				if (theSize > 1) theSize -= 2;
418				h.pcdata(theOutputBuffer, 0, theSize);
419				theSize = 0;
420				break;
421			case A_ENTITY_START:
422				h.pcdata(theOutputBuffer, 0, theSize);
423				theSize = 0;
424				save(ch, h);
425				break;
426			case A_ENTITY:
427				mark();
428				char ch1 = (char)ch;
429//				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
430				if (theState == S_ENT && ch1 == '#') {
431					theNextState = S_NCR;
432					save(ch, h);
433					break;
434					}
435				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
436					theNextState = S_XNCR;
437					save(ch, h);
438					break;
439					}
440				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
441					save(ch, h);
442					break;
443					}
444				else if (theState == S_NCR && Character.isDigit(ch1)) {
445					save(ch, h);
446					break;
447					}
448				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
449					save(ch, h);
450					break;
451					}
452
453				// The whole entity reference has been collected
454//				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
455				h.entity(theOutputBuffer, 1, theSize - 1);
456				int ent = h.getEntity();
457//				System.err.println("%% value = " + ent);
458				if (ent != 0) {
459					theSize = 0;
460					if (ent >= 0x80 && ent <= 0x9F) {
461						ent = theWinMap[ent-0x80];
462						}
463					if (ent < 0x20) {
464						// Control becomes space
465						ent = 0x20;
466						}
467					else if (ent >= 0xD800 && ent <= 0xDFFF) {
468						// Surrogates get dropped
469						ent = 0;
470						}
471					else if (ent <= 0xFFFF) {
472						// BMP character
473						save(ent, h);
474						}
475					else {
476						// Astral converted to two surrogates
477						ent -= 0x10000;
478						save((ent>>10) + 0xD800, h);
479						save((ent&0x3FF) + 0xDC00, h);
480						}
481					if (ch != ';') {
482						unread(r, ch);
483						theCurrentColumn--;
484						}
485					}
486				else {
487					unread(r, ch);
488					theCurrentColumn--;
489					}
490				theNextState = S_PCDATA;
491				break;
492        		case A_ETAG:
493				h.etag(theOutputBuffer, 0, theSize);
494				theSize = 0;
495				break;
496        		case A_DECL:
497				h.decl(theOutputBuffer, 0, theSize);
498				theSize = 0;
499				break;
500        		case A_GI:
501				h.gi(theOutputBuffer, 0, theSize);
502				theSize = 0;
503				break;
504			case A_GI_STAGC:
505				h.gi(theOutputBuffer, 0, theSize);
506				theSize = 0;
507				h.stagc(theOutputBuffer, 0, theSize);
508				break;
509        		case A_LT:
510				mark();
511				save('<', h);
512				save(ch, h);
513				break;
514			case A_LT_PCDATA:
515				mark();
516				save('<', h);
517				h.pcdata(theOutputBuffer, 0, theSize);
518				theSize = 0;
519				break;
520        		case A_PCDATA:
521				mark();
522				h.pcdata(theOutputBuffer, 0, theSize);
523				theSize = 0;
524				break;
525			case A_CMNT:
526				mark();
527				h.cmnt(theOutputBuffer, 0, theSize);
528				theSize = 0;
529				break;
530			case A_MINUS3:
531				save('-', h);
532				save(' ', h);
533				break;
534			case A_MINUS2:
535				save('-', h);
536				save(' ', h);
537				// fall through into A_MINUS
538			case A_MINUS:
539				save('-', h);
540				save(ch, h);
541				break;
542        		case A_PI:
543				mark();
544				h.pi(theOutputBuffer, 0, theSize);
545				theSize = 0;
546				break;
547        		case A_PITARGET:
548				h.pitarget(theOutputBuffer, 0, theSize);
549				theSize = 0;
550				break;
551        		case A_PITARGET_PI:
552				h.pitarget(theOutputBuffer, 0, theSize);
553				theSize = 0;
554				h.pi(theOutputBuffer, 0, theSize);
555				break;
556        		case A_SAVE:
557				save(ch, h);
558				break;
559        		case A_SKIP:
560				break;
561        		case A_SP:
562				save(' ', h);
563				break;
564        		case A_STAGC:
565				h.stagc(theOutputBuffer, 0, theSize);
566				theSize = 0;
567				break;
568			case A_EMPTYTAG:
569				mark();
570//				System.err.println("%%% Empty tag seen");
571				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
572				theSize = 0;
573				h.stage(theOutputBuffer, 0, theSize);
574				break;
575			case A_UNGET:
576				unread(r, ch);
577				theCurrentColumn--;
578				break;
579        		case A_UNSAVE_PCDATA:
580				if (theSize > 0) theSize--;
581				h.pcdata(theOutputBuffer, 0, theSize);
582				theSize = 0;
583				break;
584			default:
585				throw new Error("Can't process state " + action);
586				}
587			theState = theNextState;
588			}
589		h.eof(theOutputBuffer, 0, 0);
590		}
591
592	/**
593	* Mark the current scan position as a "point of interest" - start of a tag,
594	* cdata, processing instruction etc.
595	*/
596
597	private void mark() {
598		theLastColumn = theCurrentColumn;
599		theLastLine = theCurrentLine;
600		}
601
602	/**
603	A callback for the ScanHandler that allows it to force
604	the lexer state to CDATA content (no markup is recognized except
605	the end of element.
606	*/
607
608	public void startCDATA() { theNextState = S_CDATA; }
609
610	private void save(int ch, ScanHandler h) throws IOException, SAXException {
611		if (theSize >= theOutputBuffer.length - 20) {
612			if (theState == S_PCDATA || theState == S_CDATA) {
613				// Return a buffer-sized chunk of PCDATA
614				h.pcdata(theOutputBuffer, 0, theSize);
615				theSize = 0;
616				}
617			else {
618				// Grow the buffer size
619				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
620                                System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
621				theOutputBuffer = newOutputBuffer;
622				}
623			}
624		theOutputBuffer[theSize++] = (char)ch;
625		}
626
627	/**
628	Test procedure.  Reads HTML from the standard input and writes
629	PYX to the standard output.
630	*/
631
632	public static void main(String[] argv) throws IOException, SAXException {
633		Scanner s = new HTMLScanner();
634		Reader r = new InputStreamReader(System.in, "UTF-8");
635		Writer w = new OutputStreamWriter(System.out, "UTF-8");
636		PYXWriter pw = new PYXWriter(w);
637		s.scan(r, pw);
638		w.close();
639		}
640
641
642	private static String nicechar(int in) {
643		if (in == '\n') return "\\n";
644		if (in < 32) return "0x"+Integer.toHexString(in);
645		return "'"+((char)in)+"'";
646		}
647
648	}
649