// This file is part of TagSoup and is Copyright 2002-2008 by John Cowan.
//
// TagSoup is licensed under the Apache License,
// Version 2.0.  You may obtain a copy of this license at
// http://www.apache.org/licenses/LICENSE-2.0 .  You may also have
// additional legal rights not granted by this license.
//
// TagSoup is distributed in the hope that it will be useful, but
// unless required by applicable law or agreed to in writing, TagSoup
// is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
// OF ANY KIND, either express or implied; not even the implied warranty
// of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
//
//
15package org.ccil.cowan.tagsoup;
16import java.io.*;
17import org.xml.sax.SAXException;
18import org.xml.sax.Locator;
19
20/**
21This class implements a table-driven scanner for HTML, allowing for lots of
22defects.  It implements the Scanner interface, which accepts a Reader
23object to fetch characters from and a ScanHandler object to report lexical
24events to.
25*/
26
27public class HTMLScanner implements Scanner, Locator {
28
29	// Start of state table
30		private static final int S_ANAME = 1;
31	private static final int S_APOS = 2;
32	private static final int S_AVAL = 3;
33	private static final int S_BB = 4;
34	private static final int S_BBC = 5;
35	private static final int S_BBCD = 6;
36	private static final int S_BBCDA = 7;
37	private static final int S_BBCDAT = 8;
38	private static final int S_BBCDATA = 9;
39	private static final int S_CDATA = 10;
40	private static final int S_CDATA2 = 11;
41	private static final int S_CDSECT = 12;
42	private static final int S_CDSECT1 = 13;
43	private static final int S_CDSECT2 = 14;
44	private static final int S_COM = 15;
45	private static final int S_COM2 = 16;
46	private static final int S_COM3 = 17;
47	private static final int S_COM4 = 18;
48	private static final int S_DECL = 19;
49	private static final int S_DECL2 = 20;
50	private static final int S_DONE = 21;
51	private static final int S_EMPTYTAG = 22;
52	private static final int S_ENT = 23;
53	private static final int S_EQ = 24;
54	private static final int S_ETAG = 25;
55	private static final int S_GI = 26;
56	private static final int S_NCR = 27;
57	private static final int S_PCDATA = 28;
58	private static final int S_PI = 29;
59	private static final int S_PITARGET = 30;
60	private static final int S_QUOT = 31;
61	private static final int S_STAGC = 32;
62	private static final int S_TAG = 33;
63	private static final int S_TAGWS = 34;
64	private static final int S_XNCR = 35;
65	private static final int A_ADUP = 1;
66	private static final int A_ADUP_SAVE = 2;
67	private static final int A_ADUP_STAGC = 3;
68	private static final int A_ANAME = 4;
69	private static final int A_ANAME_ADUP = 5;
70	private static final int A_ANAME_ADUP_STAGC = 6;
71	private static final int A_AVAL = 7;
72	private static final int A_AVAL_STAGC = 8;
73	private static final int A_CDATA = 9;
74	private static final int A_CMNT = 10;
75	private static final int A_DECL = 11;
76	private static final int A_EMPTYTAG = 12;
77	private static final int A_ENTITY = 13;
78	private static final int A_ENTITY_START = 14;
79	private static final int A_ETAG = 15;
80	private static final int A_GI = 16;
81	private static final int A_GI_STAGC = 17;
82	private static final int A_LT = 18;
83	private static final int A_LT_PCDATA = 19;
84	private static final int A_MINUS = 20;
85	private static final int A_MINUS2 = 21;
86	private static final int A_MINUS3 = 22;
87	private static final int A_PCDATA = 23;
88	private static final int A_PI = 24;
89	private static final int A_PITARGET = 25;
90	private static final int A_PITARGET_PI = 26;
91	private static final int A_SAVE = 27;
92	private static final int A_SKIP = 28;
93	private static final int A_SP = 29;
94	private static final int A_STAGC = 30;
95	private static final int A_UNGET = 31;
96	private static final int A_UNSAVE_PCDATA = 32;
97	private static int[] statetable = {
98		S_ANAME, '/', A_ANAME_ADUP, S_EMPTYTAG,
99		S_ANAME, '=', A_ANAME, S_AVAL,
100		S_ANAME, '>', A_ANAME_ADUP_STAGC, S_PCDATA,
101		S_ANAME, 0, A_SAVE, S_ANAME,
102		S_ANAME, -1, A_ANAME_ADUP_STAGC, S_DONE,
103		S_ANAME, ' ', A_ANAME, S_EQ,
104		S_ANAME, '\n', A_ANAME, S_EQ,
105		S_ANAME, '\t', A_ANAME, S_EQ,
106		S_APOS, '\'', A_AVAL, S_TAGWS,
107		S_APOS, 0, A_SAVE, S_APOS,
108		S_APOS, -1, A_AVAL_STAGC, S_DONE,
109		S_APOS, ' ', A_SP, S_APOS,
110		S_APOS, '\n', A_SP, S_APOS,
111		S_APOS, '\t', A_SP, S_APOS,
112		S_AVAL, '"', A_SKIP, S_QUOT,
113		S_AVAL, '\'', A_SKIP, S_APOS,
114		S_AVAL, '>', A_AVAL_STAGC, S_PCDATA,
115		S_AVAL, 0, A_SAVE, S_STAGC,
116		S_AVAL, -1, A_AVAL_STAGC, S_DONE,
117		S_AVAL, ' ', A_SKIP, S_AVAL,
118		S_AVAL, '\n', A_SKIP, S_AVAL,
119		S_AVAL, '\t', A_SKIP, S_AVAL,
120		S_BB, 'C', A_SKIP, S_BBC,
121		S_BB, 0, A_SKIP, S_DECL,
122		S_BB, -1, A_SKIP, S_DONE,
123		S_BBC, 'D', A_SKIP, S_BBCD,
124		S_BBC, 0, A_SKIP, S_DECL,
125		S_BBC, -1, A_SKIP, S_DONE,
126		S_BBCD, 'A', A_SKIP, S_BBCDA,
127		S_BBCD, 0, A_SKIP, S_DECL,
128		S_BBCD, -1, A_SKIP, S_DONE,
129		S_BBCDA, 'T', A_SKIP, S_BBCDAT,
130		S_BBCDA, 0, A_SKIP, S_DECL,
131		S_BBCDA, -1, A_SKIP, S_DONE,
132		S_BBCDAT, 'A', A_SKIP, S_BBCDATA,
133		S_BBCDAT, 0, A_SKIP, S_DECL,
134		S_BBCDAT, -1, A_SKIP, S_DONE,
135		S_BBCDATA, '[', A_SKIP, S_CDSECT,
136		S_BBCDATA, 0, A_SKIP, S_DECL,
137		S_BBCDATA, -1, A_SKIP, S_DONE,
138		S_CDATA, '<', A_SAVE, S_CDATA2,
139		S_CDATA, 0, A_SAVE, S_CDATA,
140		S_CDATA, -1, A_PCDATA, S_DONE,
141		S_CDATA2, '/', A_UNSAVE_PCDATA, S_ETAG,
142		S_CDATA2, 0, A_SAVE, S_CDATA,
143		S_CDATA2, -1, A_UNSAVE_PCDATA, S_DONE,
144		S_CDSECT, ']', A_SAVE, S_CDSECT1,
145		S_CDSECT, 0, A_SAVE, S_CDSECT,
146		S_CDSECT, -1, A_SKIP, S_DONE,
147		S_CDSECT1, ']', A_SAVE, S_CDSECT2,
148		S_CDSECT1, 0, A_SAVE, S_CDSECT,
149		S_CDSECT1, -1, A_SKIP, S_DONE,
150		S_CDSECT2, '>', A_CDATA, S_PCDATA,
151		S_CDSECT2, 0, A_SAVE, S_CDSECT,
152		S_CDSECT2, -1, A_SKIP, S_DONE,
153		S_COM, '-', A_SKIP, S_COM2,
154		S_COM, 0, A_SAVE, S_COM2,
155		S_COM, -1, A_CMNT, S_DONE,
156		S_COM2, '-', A_SKIP, S_COM3,
157		S_COM2, 0, A_SAVE, S_COM2,
158		S_COM2, -1, A_CMNT, S_DONE,
159		S_COM3, '-', A_SKIP, S_COM4,
160		S_COM3, 0, A_MINUS, S_COM2,
161		S_COM3, -1, A_CMNT, S_DONE,
162		S_COM4, '-', A_MINUS3, S_COM4,
163		S_COM4, '>', A_CMNT, S_PCDATA,
164		S_COM4, 0, A_MINUS2, S_COM2,
165		S_COM4, -1, A_CMNT, S_DONE,
166		S_DECL, '-', A_SKIP, S_COM,
167		S_DECL, '>', A_SKIP, S_PCDATA,
168		S_DECL, '[', A_SKIP, S_BB,
169		S_DECL, 0, A_SAVE, S_DECL2,
170		S_DECL, -1, A_SKIP, S_DONE,
171		S_DECL2, '>', A_DECL, S_PCDATA,
172		S_DECL2, 0, A_SAVE, S_DECL2,
173		S_DECL2, -1, A_SKIP, S_DONE,
174		S_EMPTYTAG, '>', A_EMPTYTAG, S_PCDATA,
175		S_EMPTYTAG, 0, A_SAVE, S_ANAME,
176		S_EMPTYTAG, ' ', A_SKIP, S_TAGWS,
177		S_EMPTYTAG, '\n', A_SKIP, S_TAGWS,
178		S_EMPTYTAG, '\t', A_SKIP, S_TAGWS,
179		S_ENT, 0, A_ENTITY, S_ENT,
180		S_ENT, -1, A_ENTITY, S_DONE,
181		S_EQ, '=', A_SKIP, S_AVAL,
182		S_EQ, '>', A_ADUP_STAGC, S_PCDATA,
183		S_EQ, 0, A_ADUP_SAVE, S_ANAME,
184		S_EQ, -1, A_ADUP_STAGC, S_DONE,
185		S_EQ, ' ', A_SKIP, S_EQ,
186		S_EQ, '\n', A_SKIP, S_EQ,
187		S_EQ, '\t', A_SKIP, S_EQ,
188		S_ETAG, '>', A_ETAG, S_PCDATA,
189		S_ETAG, 0, A_SAVE, S_ETAG,
190		S_ETAG, -1, A_ETAG, S_DONE,
191		S_ETAG, ' ', A_SKIP, S_ETAG,
192		S_ETAG, '\n', A_SKIP, S_ETAG,
193		S_ETAG, '\t', A_SKIP, S_ETAG,
194		S_GI, '/', A_SKIP, S_EMPTYTAG,
195		S_GI, '>', A_GI_STAGC, S_PCDATA,
196		S_GI, 0, A_SAVE, S_GI,
197		S_GI, -1, A_SKIP, S_DONE,
198		S_GI, ' ', A_GI, S_TAGWS,
199		S_GI, '\n', A_GI, S_TAGWS,
200		S_GI, '\t', A_GI, S_TAGWS,
201		S_NCR, 0, A_ENTITY, S_NCR,
202		S_NCR, -1, A_ENTITY, S_DONE,
203		S_PCDATA, '&', A_ENTITY_START, S_ENT,
204		S_PCDATA, '<', A_PCDATA, S_TAG,
205		S_PCDATA, 0, A_SAVE, S_PCDATA,
206		S_PCDATA, -1, A_PCDATA, S_DONE,
207		S_PI, '>', A_PI, S_PCDATA,
208		S_PI, 0, A_SAVE, S_PI,
209		S_PI, -1, A_PI, S_DONE,
210		S_PITARGET, '>', A_PITARGET_PI, S_PCDATA,
211		S_PITARGET, 0, A_SAVE, S_PITARGET,
212		S_PITARGET, -1, A_PITARGET_PI, S_DONE,
213		S_PITARGET, ' ', A_PITARGET, S_PI,
214		S_PITARGET, '\n', A_PITARGET, S_PI,
215		S_PITARGET, '\t', A_PITARGET, S_PI,
216		S_QUOT, '"', A_AVAL, S_TAGWS,
217		S_QUOT, 0, A_SAVE, S_QUOT,
218		S_QUOT, -1, A_AVAL_STAGC, S_DONE,
219		S_QUOT, ' ', A_SP, S_QUOT,
220		S_QUOT, '\n', A_SP, S_QUOT,
221		S_QUOT, '\t', A_SP, S_QUOT,
222		S_STAGC, '>', A_AVAL_STAGC, S_PCDATA,
223		S_STAGC, 0, A_SAVE, S_STAGC,
224		S_STAGC, -1, A_AVAL_STAGC, S_DONE,
225		S_STAGC, ' ', A_AVAL, S_TAGWS,
226		S_STAGC, '\n', A_AVAL, S_TAGWS,
227		S_STAGC, '\t', A_AVAL, S_TAGWS,
228		S_TAG, '!', A_SKIP, S_DECL,
229		S_TAG, '/', A_SKIP, S_ETAG,
230		S_TAG, '<', A_SAVE, S_TAG,
231		S_TAG, '?', A_SKIP, S_PITARGET,
232		S_TAG, 0, A_SAVE, S_GI,
233		S_TAG, -1, A_LT_PCDATA, S_DONE,
234		S_TAG, ' ', A_LT, S_PCDATA,
235		S_TAG, '\n', A_LT, S_PCDATA,
236		S_TAG, '\t', A_LT, S_PCDATA,
237		S_TAGWS, '/', A_SKIP, S_EMPTYTAG,
238		S_TAGWS, '>', A_STAGC, S_PCDATA,
239		S_TAGWS, 0, A_SAVE, S_ANAME,
240		S_TAGWS, -1, A_STAGC, S_DONE,
241		S_TAGWS, ' ', A_SKIP, S_TAGWS,
242		S_TAGWS, '\n', A_SKIP, S_TAGWS,
243		S_TAGWS, '\t', A_SKIP, S_TAGWS,
244		S_XNCR, 0, A_ENTITY, S_XNCR,
245		S_XNCR, -1, A_ENTITY, S_DONE,
246
247	};
248	private static final String[] debug_actionnames = { "", "A_ADUP", "A_ADUP_SAVE", "A_ADUP_STAGC", "A_ANAME", "A_ANAME_ADUP", "A_ANAME_ADUP_STAGC", "A_AVAL", "A_AVAL_STAGC", "A_CDATA", "A_CMNT", "A_DECL", "A_EMPTYTAG", "A_ENTITY", "A_ENTITY_START", "A_ETAG", "A_GI", "A_GI_STAGC", "A_LT", "A_LT_PCDATA", "A_MINUS", "A_MINUS2", "A_MINUS3", "A_PCDATA", "A_PI", "A_PITARGET", "A_PITARGET_PI", "A_SAVE", "A_SKIP", "A_SP", "A_STAGC", "A_UNGET", "A_UNSAVE_PCDATA"};
249	private static final String[] debug_statenames = { "", "S_ANAME", "S_APOS", "S_AVAL", "S_BB", "S_BBC", "S_BBCD", "S_BBCDA", "S_BBCDAT", "S_BBCDATA", "S_CDATA", "S_CDATA2", "S_CDSECT", "S_CDSECT1", "S_CDSECT2", "S_COM", "S_COM2", "S_COM3", "S_COM4", "S_DECL", "S_DECL2", "S_DONE", "S_EMPTYTAG", "S_ENT", "S_EQ", "S_ETAG", "S_GI", "S_NCR", "S_PCDATA", "S_PI", "S_PITARGET", "S_QUOT", "S_STAGC", "S_TAG", "S_TAGWS", "S_XNCR"};
250
251
252	// End of state table
253
254	private String thePublicid;			// Locator state
255	private String theSystemid;
256	private int theLastLine;
257	private int theLastColumn;
258	private int theCurrentLine;
259	private int theCurrentColumn;
260
261	int theState;					// Current state
262	int theNextState;				// Next state
263	char[] theOutputBuffer = new char[200];	// Output buffer
264	int theSize;					// Current buffer size
265	int[] theWinMap = {				// Windows chars map
266		0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
267		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
268		0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
269		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178};
270
271	/**
272	 * Index into the state table for [state][input character - 2].
273	 * The state table consists of 4-entry runs on the form
274	 * { current state, input character, action, next state }.
275	 * We precompute the index into the state table for all possible
276	 * { current state, input character } and store the result in
277	 * the statetableIndex array. Since only some input characters
278	 * are present in the state table, we only do the computation for
279	 * characters 0 to the highest character value in the state table.
280	 * An input character of -2 is used to cover all other characters
281	 * as -2 is guaranteed not to match any input character entry
282	 * in the state table.
283	 *
284	 * <p>When doing lookups, the input character should first be tested
285	 * to be in the range [-1 (inclusive), statetableIndexMaxChar (exclusive)].
286	 * if it isn't use -2 as the input character.
287	 *
288	 * <p>Finally, add 2 to the input character to cover for the fact that
289	 * Java doesn't support negative array indexes. Then look up
290	 * the value in the statetableIndex. If the value is -1, then
291	 * no action or next state was found for the { state, input } that
292	 * you had. If it isn't -1, then action = statetable[value + 2] and
293	 * next state = statetable[value + 3]. That is, the value points
294	 * to the start of the answer 4-tuple in the statetable.
295	 */
296	static short[][] statetableIndex;
297	/**
298	 * The highest character value seen in the statetable.
299	 * See the doc comment for statetableIndex to see how this
300	 * is used.
301	 */
302	static int statetableIndexMaxChar;
303	static {
304		int maxState = -1;
305		int maxChar = -1;
306		for (int i = 0; i < statetable.length; i += 4) {
307			if (statetable[i] > maxState) {
308				maxState = statetable[i];
309				}
310			if (statetable[i + 1] > maxChar) {
311				maxChar = statetable[i + 1];
312				}
313			}
314		statetableIndexMaxChar = maxChar + 1;
315
316		statetableIndex = new short[maxState + 1][maxChar + 3];
317		for (int theState = 0; theState <= maxState; ++theState) {
318			for (int ch = -2; ch <= maxChar; ++ch) {
319				int hit = -1;
320				int action = 0;
321				for (int i = 0; i < statetable.length; i += 4) {
322					if (theState != statetable[i]) {
323						if (action != 0) break;
324						continue;
325						}
326					if (statetable[i+1] == 0) {
327						hit = i;
328						action = statetable[i+2];
329						}
330					else if (statetable[i+1] == ch) {
331						hit = i;
332						action = statetable[i+2];
333						break;
334						}
335					}
336				statetableIndex[theState][ch + 2] = (short) hit;
337				}
338			}
339		}
340
341	// Compensate for bug in PushbackReader that allows
342	// pushing back EOF.
343	private void unread(PushbackReader r, int c) throws IOException {
344		if (c != -1) r.unread(c);
345		}
346
347	// Locator implementation
348
349	public int getLineNumber() {
350		return theLastLine;
351		}
352	public int getColumnNumber() {
353		return theLastColumn;
354		}
355	public String getPublicId() {
356		return thePublicid;
357		}
358	public String getSystemId() {
359		return theSystemid;
360		}
361
362
363	// Scanner implementation
364
365	/**
366	Reset document locator, supplying systemid and publicid.
367	@param systemid System id
368	@param publicid Public id
369	*/
370
371	public void resetDocumentLocator(String publicid, String systemid) {
372		thePublicid = publicid;
373		theSystemid = systemid;
374		theLastLine = theLastColumn = theCurrentLine = theCurrentColumn = 0;
375		}
376
377	/**
378	Scan HTML source, reporting lexical events.
379	@param r0 Reader that provides characters
380	@param h ScanHandler that accepts lexical events.
381	*/
382
383	public void scan(Reader r0, ScanHandler h) throws IOException, SAXException {
384		theState = S_PCDATA;
385		PushbackReader r;
386		if (r0 instanceof BufferedReader) {
387			r = new PushbackReader(r0, 5);
388			}
389		else {
390			r = new PushbackReader(new BufferedReader(r0), 5);
391			}
392
393		int firstChar = r.read();	// Remove any leading BOM
394		if (firstChar != '\uFEFF') unread(r, firstChar);
395
396		while (theState != S_DONE) {
397			int ch = r.read();
398
399			// Process control characters
400			if (ch >= 0x80 && ch <= 0x9F) ch = theWinMap[ch-0x80];
401
402			if (ch == '\r') {
403				ch = r.read();		// expect LF next
404				if (ch != '\n') {
405					unread(r, ch);	// nope
406					ch = '\n';
407					}
408				}
409
410			if (ch == '\n') {
411				theCurrentLine++;
412				theCurrentColumn = 0;
413				}
414			else {
415				theCurrentColumn++;
416				}
417
418			if (!(ch >= 0x20 || ch == '\n' || ch == '\t' || ch == -1)) continue;
419
420			// Search state table
421			int adjCh = (ch >= -1 && ch < statetableIndexMaxChar) ? ch : -2;
422			int statetableRow = statetableIndex[theState][adjCh + 2];
423			int action = 0;
424			if (statetableRow != -1) {
425				action = statetable[statetableRow + 2];
426				theNextState = statetable[statetableRow + 3];
427				}
428
429//			System.err.println("In " + debug_statenames[theState] + " got " + nicechar(ch) + " doing " + debug_actionnames[action] + " then " + debug_statenames[theNextState]);
430			switch (action) {
431			case 0:
432				throw new Error(
433					"HTMLScanner can't cope with " + Integer.toString(ch) + " in state " +
434					Integer.toString(theState));
435			case A_ADUP:
436				h.adup(theOutputBuffer, 0, theSize);
437				theSize = 0;
438				break;
439			case A_ADUP_SAVE:
440				h.adup(theOutputBuffer, 0, theSize);
441				theSize = 0;
442				save(ch, h);
443				break;
444			case A_ADUP_STAGC:
445				h.adup(theOutputBuffer, 0, theSize);
446				theSize = 0;
447				h.stagc(theOutputBuffer, 0, theSize);
448				break;
449			case A_ANAME:
450				h.aname(theOutputBuffer, 0, theSize);
451				theSize = 0;
452				break;
453			case A_ANAME_ADUP:
454				h.aname(theOutputBuffer, 0, theSize);
455				theSize = 0;
456				h.adup(theOutputBuffer, 0, theSize);
457				break;
458			case A_ANAME_ADUP_STAGC:
459				h.aname(theOutputBuffer, 0, theSize);
460				theSize = 0;
461				h.adup(theOutputBuffer, 0, theSize);
462				h.stagc(theOutputBuffer, 0, theSize);
463				break;
464			case A_AVAL:
465				h.aval(theOutputBuffer, 0, theSize);
466				theSize = 0;
467				break;
468			case A_AVAL_STAGC:
469				h.aval(theOutputBuffer, 0, theSize);
470				theSize = 0;
471				h.stagc(theOutputBuffer, 0, theSize);
472				break;
473			case A_CDATA:
474				mark();
475				// suppress the final "]]" in the buffer
476				if (theSize > 1) theSize -= 2;
477				h.pcdata(theOutputBuffer, 0, theSize);
478				theSize = 0;
479				break;
480			case A_ENTITY_START:
481				h.pcdata(theOutputBuffer, 0, theSize);
482				theSize = 0;
483				save(ch, h);
484				break;
485			case A_ENTITY:
486				mark();
487				char ch1 = (char)ch;
488//				System.out.println("Got " + ch1 + " in state " + ((theState == S_ENT) ? "S_ENT" : ((theState == S_NCR) ? "S_NCR" : "UNK")));
489				if (theState == S_ENT && ch1 == '#') {
490					theNextState = S_NCR;
491					save(ch, h);
492					break;
493					}
494				else if (theState == S_NCR && (ch1 == 'x' || ch1 == 'X')) {
495					theNextState = S_XNCR;
496					save(ch, h);
497					break;
498					}
499				else if (theState == S_ENT && Character.isLetterOrDigit(ch1)) {
500					save(ch, h);
501					break;
502					}
503				else if (theState == S_NCR && Character.isDigit(ch1)) {
504					save(ch, h);
505					break;
506					}
507				else if (theState == S_XNCR && (Character.isDigit(ch1) || "abcdefABCDEF".indexOf(ch1) != -1)) {
508					save(ch, h);
509					break;
510					}
511
512				// The whole entity reference has been collected
513//				System.err.println("%%" + new String(theOutputBuffer, 0, theSize));
514				h.entity(theOutputBuffer, 1, theSize - 1);
515				int ent = h.getEntity();
516//				System.err.println("%% value = " + ent);
517				if (ent != 0) {
518					theSize = 0;
519					if (ent >= 0x80 && ent <= 0x9F) {
520						ent = theWinMap[ent-0x80];
521						}
522					if (ent < 0x20) {
523						// Control becomes space
524						ent = 0x20;
525						}
526					else if (ent >= 0xD800 && ent <= 0xDFFF) {
527						// Surrogates get dropped
528						ent = 0;
529						}
530					else if (ent <= 0xFFFF) {
531						// BMP character
532						save(ent, h);
533						}
534					else {
535						// Astral converted to two surrogates
536						ent -= 0x10000;
537						save((ent>>10) + 0xD800, h);
538						save((ent&0x3FF) + 0xDC00, h);
539						}
540					if (ch != ';') {
541						unread(r, ch);
542						theCurrentColumn--;
543						}
544					}
545				else {
546					unread(r, ch);
547					theCurrentColumn--;
548					}
549				theNextState = S_PCDATA;
550				break;
551			case A_ETAG:
552				h.etag(theOutputBuffer, 0, theSize);
553				theSize = 0;
554				break;
555			case A_DECL:
556				h.decl(theOutputBuffer, 0, theSize);
557				theSize = 0;
558				break;
559			case A_GI:
560				h.gi(theOutputBuffer, 0, theSize);
561				theSize = 0;
562				break;
563			case A_GI_STAGC:
564				h.gi(theOutputBuffer, 0, theSize);
565				theSize = 0;
566				h.stagc(theOutputBuffer, 0, theSize);
567				break;
568			case A_LT:
569				mark();
570				save('<', h);
571				save(ch, h);
572				break;
573			case A_LT_PCDATA:
574				mark();
575				save('<', h);
576				h.pcdata(theOutputBuffer, 0, theSize);
577				theSize = 0;
578				break;
579			case A_PCDATA:
580				mark();
581				h.pcdata(theOutputBuffer, 0, theSize);
582				theSize = 0;
583				break;
584			case A_CMNT:
585				mark();
586				h.cmnt(theOutputBuffer, 0, theSize);
587				theSize = 0;
588				break;
589			case A_MINUS3:
590				save('-', h);
591				save(' ', h);
592				break;
593			case A_MINUS2:
594				save('-', h);
595				save(' ', h);
596				// fall through into A_MINUS
597			case A_MINUS:
598				save('-', h);
599				save(ch, h);
600				break;
601			case A_PI:
602				mark();
603				h.pi(theOutputBuffer, 0, theSize);
604				theSize = 0;
605				break;
606			case A_PITARGET:
607				h.pitarget(theOutputBuffer, 0, theSize);
608				theSize = 0;
609				break;
610			case A_PITARGET_PI:
611				h.pitarget(theOutputBuffer, 0, theSize);
612				theSize = 0;
613				h.pi(theOutputBuffer, 0, theSize);
614				break;
615			case A_SAVE:
616				save(ch, h);
617				break;
618			case A_SKIP:
619				break;
620			case A_SP:
621				save(' ', h);
622				break;
623			case A_STAGC:
624				h.stagc(theOutputBuffer, 0, theSize);
625				theSize = 0;
626				break;
627			case A_EMPTYTAG:
628				mark();
629//				System.err.println("%%% Empty tag seen");
630				if (theSize > 0) h.gi(theOutputBuffer, 0, theSize);
631				theSize = 0;
632				h.stage(theOutputBuffer, 0, theSize);
633				break;
634			case A_UNGET:
635				unread(r, ch);
636				theCurrentColumn--;
637				break;
638			case A_UNSAVE_PCDATA:
639				if (theSize > 0) theSize--;
640				h.pcdata(theOutputBuffer, 0, theSize);
641				theSize = 0;
642				break;
643			default:
644				throw new Error("Can't process state " + action);
645				}
646			theState = theNextState;
647			}
648		h.eof(theOutputBuffer, 0, 0);
649		}
650
651	/**
652	* Mark the current scan position as a "point of interest" - start of a tag,
653	* cdata, processing instruction etc.
654	*/
655
656	private void mark() {
657		theLastColumn = theCurrentColumn;
658		theLastLine = theCurrentLine;
659		}
660
661	/**
662	A callback for the ScanHandler that allows it to force
663	the lexer state to CDATA content (no markup is recognized except
664	the end of element.
665	*/
666
667	public void startCDATA() { theNextState = S_CDATA; }
668
669	private void save(int ch, ScanHandler h) throws IOException, SAXException {
670		if (theSize >= theOutputBuffer.length - 20) {
671			if (theState == S_PCDATA || theState == S_CDATA) {
672				// Return a buffer-sized chunk of PCDATA
673				h.pcdata(theOutputBuffer, 0, theSize);
674				theSize = 0;
675				}
676			else {
677				// Grow the buffer size
678				char[] newOutputBuffer = new char[theOutputBuffer.length * 2];
679				System.arraycopy(theOutputBuffer, 0, newOutputBuffer, 0, theSize+1);
680				theOutputBuffer = newOutputBuffer;
681				}
682			}
683		theOutputBuffer[theSize++] = (char)ch;
684		}
685
686	/**
687	Test procedure.  Reads HTML from the standard input and writes
688	PYX to the standard output.
689	*/
690
691	public static void main(String[] argv) throws IOException, SAXException {
692		Scanner s = new HTMLScanner();
693		Reader r = new InputStreamReader(System.in, "UTF-8");
694		Writer w = new OutputStreamWriter(System.out, "UTF-8");
695		PYXWriter pw = new PYXWriter(w);
696		s.scan(r, pw);
697		w.close();
698		}
699
700
701	private static String nicechar(int in) {
702		if (in == '\n') return "\\n";
703		if (in < 32) return "0x"+Integer.toHexString(in);
704		return "'"+((char)in)+"'";
705		}
706
707	}
708