1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
54static int htmlOmittedDefaultValue = 1;
55
56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57			     xmlChar end, xmlChar  end2, xmlChar end3);
58static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 *									*
62 * 		Some factorized error routines				*
63 *									*
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt:  an HTML parser context
69 * @extra:  extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77        (ctxt->instate == XML_PARSER_EOF))
78	return;
79    if (ctxt != NULL) {
80        ctxt->errNo = XML_ERR_NO_MEMORY;
81        ctxt->instate = XML_PARSER_EOF;
82        ctxt->disableSAX = 1;
83    }
84    if (extra)
85        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                        NULL, NULL, 0, 0,
88                        "Memory allocation failed : %s\n", extra);
89    else
90        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt:  an HTML parser context
98 * @error:  the error number
99 * @msg:  the error message
100 * @str1:  string infor
101 * @str2:  string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107             const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110        (ctxt->instate == XML_PARSER_EOF))
111	return;
112    if (ctxt != NULL)
113	ctxt->errNo = error;
114    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                    XML_ERR_ERROR, NULL, 0,
116		    (const char *) str1, (const char *) str2,
117		    NULL, 0, 0,
118		    msg, str1, str2);
119    if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt:  an HTML parser context
126 * @error:  the error number
127 * @msg:  the error message
128 * @val:  integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134             const char *msg, int val)
135{
136    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137        (ctxt->instate == XML_PARSER_EOF))
138	return;
139    if (ctxt != NULL)
140	ctxt->errNo = error;
141    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143		    NULL, val, 0, msg, val);
144    if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 *									*
150 * 		Parser stacks related functions and macros		*
151 *									*
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt:  an HTML parser context
157 * @value:  the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166    if (ctxt->nameNr >= ctxt->nameMax) {
167        ctxt->nameMax *= 2;
168        ctxt->nameTab = (const xmlChar * *)
169                         xmlRealloc((xmlChar * *)ctxt->nameTab,
170                                    ctxt->nameMax *
171                                    sizeof(ctxt->nameTab[0]));
172        if (ctxt->nameTab == NULL) {
173            htmlErrMemory(ctxt, NULL);
174            return (0);
175        }
176    }
177    ctxt->nameTab[ctxt->nameNr] = value;
178    ctxt->name = value;
179    return (ctxt->nameNr++);
180}
181/**
182 * htmlnamePop:
183 * @ctxt: an HTML parser context
184 *
185 * Pops the top element name from the name stack
186 *
187 * Returns the name just removed
188 */
189static const xmlChar *
190htmlnamePop(htmlParserCtxtPtr ctxt)
191{
192    const xmlChar *ret;
193
194    if (ctxt->nameNr <= 0)
195        return (NULL);
196    ctxt->nameNr--;
197    if (ctxt->nameNr < 0)
198        return (NULL);
199    if (ctxt->nameNr > 0)
200        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201    else
202        ctxt->name = NULL;
203    ret = ctxt->nameTab[ctxt->nameNr];
204    ctxt->nameTab[ctxt->nameNr] = NULL;
205    return (ret);
206}
207
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * Dirty macros, i.e. one needs to make assumptions about the context to use
 * them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. an 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR,
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent on an ASCII context, expect UTF-8 encoding
 *
 *   CURRENT Returns the current char value, with the full decoding of
 *           UTF-8 if we are using this mode. It returns an int.
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pops up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
 */
236
/*
 * All the macros below expect a parser context variable named `ctxt' to be
 * in scope at the point of use.
 */

/* Current byte converted to upper case. */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance by val bytes, keeping the character and column counters in step.
 * val is evaluated three times: do not pass expressions with side effects.
 */
#define SKIP(val) (ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val))

/* Byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(), converted to upper case. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Current position in the input (usable as an lvalue). */
#define CUR_PTR ctxt->input->cur

/*
 * SHRINK and GROW are statement macros. They are wrapped in
 * do { } while (0) so that they behave as one statement, in particular so
 * that an `else' following `if (x) GROW;' is not captured by the macro's
 * own `if'.
 */

/* Shrink the input once far enough into it and with little data left. */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

/* Outside progressive mode, ask for more input when little data is left. */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a token is pending in the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * NXT and CUR_PTR used to be defined a second time here with the same
 * replacement text; the definitions above are the only ones needed.
 */

/*
 * Skip the current character, which is l bytes long, updating the line and
 * column counters and clearing any pending token.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Decode the current character; l (an int lvalue) receives its length. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

/*
 * Append the character v, encoded on l bytes, to buffer b at index i and
 * advance i accordingly. Single byte characters are stored directly, the
 * others go through xmlCopyChar().
 */
#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l),&(b)[(i)],(v));				\
  } while (0)
289
290/**
291 * htmlCurrentChar:
292 * @ctxt:  the HTML parser context
293 * @len:  pointer to the length of the char read
294 *
295 * The current char value, if using UTF-8 this may actually span multiple
296 * bytes in the input buffer. Implement the end of line normalization:
297 * 2.11 End-of-Line Handling
298 * If the encoding is unspecified, in the case we find an ISO-Latin-1
299 * char, then the encoding converter is plugged in automatically.
300 *
301 * Returns the current char value and its length
302 */
303
304static int
305htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306    if (ctxt->instate == XML_PARSER_EOF)
307	return(0);
308
309    if (ctxt->token != 0) {
310	*len = 0;
311	return(ctxt->token);
312    }
313    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314	/*
315	 * We are supposed to handle UTF8, check it's valid
316	 * From rfc2044: encoding of the Unicode values on UTF-8:
317	 *
318	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
319	 * 0000 0000-0000 007F   0xxxxxxx
320	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
321	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
322	 *
323	 * Check for the 0x110000 limit too
324	 */
325	const unsigned char *cur = ctxt->input->cur;
326	unsigned char c;
327	unsigned int val;
328
329	c = *cur;
330	if (c & 0x80) {
331	    if (cur[1] == 0)
332		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333	    if ((cur[1] & 0xc0) != 0x80)
334		goto encoding_error;
335	    if ((c & 0xe0) == 0xe0) {
336
337		if (cur[2] == 0)
338		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339		if ((cur[2] & 0xc0) != 0x80)
340		    goto encoding_error;
341		if ((c & 0xf0) == 0xf0) {
342		    if (cur[3] == 0)
343			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344		    if (((c & 0xf8) != 0xf0) ||
345			((cur[3] & 0xc0) != 0x80))
346			goto encoding_error;
347		    /* 4-byte code */
348		    *len = 4;
349		    val = (cur[0] & 0x7) << 18;
350		    val |= (cur[1] & 0x3f) << 12;
351		    val |= (cur[2] & 0x3f) << 6;
352		    val |= cur[3] & 0x3f;
353		} else {
354		  /* 3-byte code */
355		    *len = 3;
356		    val = (cur[0] & 0xf) << 12;
357		    val |= (cur[1] & 0x3f) << 6;
358		    val |= cur[2] & 0x3f;
359		}
360	    } else {
361	      /* 2-byte code */
362		*len = 2;
363		val = (cur[0] & 0x1f) << 6;
364		val |= cur[1] & 0x3f;
365	    }
366	    if (!IS_CHAR(val)) {
367	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368				"Char 0x%X out of allowed range\n", val);
369	    }
370	    return(val);
371	} else {
372	    /* 1-byte code */
373	    *len = 1;
374	    return((int) *ctxt->input->cur);
375	}
376    }
377    /*
378     * Assume it's a fixed length encoding (1) with
379     * a compatible encoding for the ASCII set, since
380     * XML constructs only use < 128 chars
381     */
382    *len = 1;
383    if ((int) *ctxt->input->cur < 0x80)
384	return((int) *ctxt->input->cur);
385
386    /*
387     * Humm this is bad, do an automatic flow conversion
388     */
389    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390    ctxt->charset = XML_CHAR_ENCODING_UTF8;
391    return(xmlCurrentChar(ctxt, len));
392
393encoding_error:
394    /*
395     * If we detect an UTF8 error that probably mean that the
396     * input encoding didn't get properly advertized in the
397     * declaration header. Report the error and switch the encoding
398     * to ISO-Latin-1 (if you don't like this policy, just declare the
399     * encoding !)
400     */
401    {
402        char buffer[150];
403
404	if (ctxt->input->end - ctxt->input->cur >= 4) {
405	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406			    ctxt->input->cur[0], ctxt->input->cur[1],
407			    ctxt->input->cur[2], ctxt->input->cur[3]);
408	} else {
409	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410	}
411	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412		     "Input is not proper UTF-8, indicate encoding !\n",
413		     BAD_CAST buffer, NULL);
414    }
415
416    ctxt->charset = XML_CHAR_ENCODING_8859_1;
417    *len = 1;
418    return((int) *ctxt->input->cur);
419}
420
421/**
422 * htmlSkipBlankChars:
423 * @ctxt:  the HTML parser context
424 *
425 * skip all blanks character found at that point in the input streams.
426 *
427 * Returns the number of space chars skipped
428 */
429
430static int
431htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432    int res = 0;
433
434    while (IS_BLANK_CH(*(ctxt->input->cur))) {
435	if ((*ctxt->input->cur == 0) &&
436	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437		xmlPopInput(ctxt);
438	} else {
439	    if (*(ctxt->input->cur) == '\n') {
440		ctxt->input->line++; ctxt->input->col = 1;
441	    } else ctxt->input->col++;
442	    ctxt->input->cur++;
443	    ctxt->nbChars++;
444	    if (*ctxt->input->cur == 0)
445		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446	}
447	res++;
448    }
449    return(res);
450}
451
452
453
454/************************************************************************
455 *									*
456 * 		The list of HTML elements and their properties		*
457 *									*
458 ************************************************************************/
459
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
472
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each group macro expands to a comma separated list of element names and
 * has a matching NB_xxx macro giving the number of names in the list.
 * Groups must always be joined with commas: two adjacent string literals
 * are concatenated by the compiler into a single, bogus name.
 * The NB_xxx sums are parenthesized so they can be used inside larger
 * expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
/* PCDATA contributes no element name, so it is not part of the list. */
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated lists of the flow (block + inline) and inline elements */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
/*
 * ... and for HTML Attributes
 *
 * Same layout as for the elements: each group macro expands to a comma
 * separated list of attribute names, NB_xxx gives the number of names.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated lists of the common attribute groups */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
527
528
529/* Other declarations that should go inline ... */
530static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
531	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532	"tabindex", "onfocus", "onblur", NULL } ;
533static const char* const target_attr[] = { "target", NULL } ;
534static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535static const char* const alt_attr[] = { "alt", NULL } ;
536static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537static const char* const href_attrs[] = { "href", NULL } ;
538static const char* const clear_attrs[] = { "clear", NULL } ;
539static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541static const char* const flow_param[] = { FLOW, "param", NULL } ;
542static const char* const applet_attrs[] = { COREATTRS , "codebase",
543		"archive", "alt", "name", "height", "width", "align",
544		"hspace", "vspace", NULL } ;
545static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
546	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
547static const char* const basefont_attrs[] =
548	{ "id", "size", "color", "face", NULL } ;
549static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552static const char* const body_depr[] = { "background", "bgcolor", "text",
553	"link", "vlink", "alink", NULL } ;
554static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
555	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
558static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559static const char* const col_elt[] = { "col", NULL } ;
560static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563static const char* const compact_attr[] = { "compact", NULL } ;
564static const char* const label_attr[] = { "label", NULL } ;
565static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575static const char* const version_attr[] = { "version", NULL } ;
576static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
579static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
580static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584static const char* const align_attr[] = { "align", NULL } ;
585static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587static const char* const name_attr[] = { "name", NULL } ;
588static const char* const action_attr[] = { "action", NULL } ;
589static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591static const char* const content_attr[] = { "content", NULL } ;
592static const char* const type_attr[] = { "type", NULL } ;
593static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594static const char* const object_contents[] = { FLOW, "param", NULL } ;
595static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598static const char* const option_elt[] = { "option", NULL } ;
599static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602static const char* const width_attr[] = { "width", NULL } ;
603static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605static const char* const language_attr[] = { "language", NULL } ;
606static const char* const select_content[] = { "optgroup", "option", NULL } ;
607static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612static const char* const tr_elt[] = { "tr", NULL } ;
613static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617static const char* const tr_contents[] = { "th", "td", NULL } ;
618static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619static const char* const li_elt[] = { "li", NULL } ;
620static const char* const ul_depr[] = { "type", "compact", NULL} ;
621static const char* const dir_attr[] = { "dir", NULL} ;
622
623#define DECL (const char**)
624
625static const htmlElemDesc
626html40ElementTable[] = {
627{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
628	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629},
630{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632},
633{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
634	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635},
636{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
637	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
638},
639{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
640	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641},
642{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644},
645{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
646	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647},
648{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
649	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650},
651{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
652	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653},
654{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656},
657{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
658	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659},
660{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
661	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662},
663{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
664	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665},
666{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
667	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668},
669{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
670	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671},
672{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
673	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674},
675{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677},
678{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
679	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680},
681{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683},
684{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
685	EMPTY , NULL , DECL col_attrs , NULL, NULL
686},
687{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
688	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689},
690{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
691	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692},
693{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
694	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695},
696{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
697	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698},
699{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
700	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701},
702{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704},
705{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
706	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
707},
708{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
709	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710},
711{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
712	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713},
714{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
715	EMPTY, NULL, DECL embed_attrs, NULL, NULL
716},
717{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
718	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719},
720{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
721	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722},
723{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
724	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725},
726{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727	EMPTY, NULL, NULL, DECL frame_attrs, NULL
728},
729{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731},
732{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
733	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734},
735{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
736	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737},
738{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
739	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740},
741{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
742	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743},
744{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
745	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746},
747{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
748	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749},
750{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
751	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752},
753{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755},
756{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
757	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758},
759{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
760	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761},
762{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764},
765{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
766	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
767},
768{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
769	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770},
771{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
772	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773},
774{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776},
777{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779},
780{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
781	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782},
783{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785},
786{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
787	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788},
789{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791},
792{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
794},
795{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
796	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797},
798{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800},
801{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803},
804{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805	DECL html_flow, "div", DECL html_attrs, NULL, NULL
806},
807{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809},
810{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
811	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812},
813{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
814	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
815},
816{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818},
819{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
820	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821},
822{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
823	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
824},
825{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827},
828{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830},
831{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833},
834{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836},
837{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
838	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839},
840{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
841	DECL select_content, NULL, DECL select_attrs, NULL, NULL
842},
843{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
844	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845},
846{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848},
849{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
850	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851},
852{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854},
855{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
856	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857},
858{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
859	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860},
861{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
862	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863},
864{ "table",	0, 0, 0, 0, 0, 0, 0, "",
865	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866},
867{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
868	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869},
870{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
871	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872},
873{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875},
876{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
877	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878},
879{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
880	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881},
882{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
883	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884},
885{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
886	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887},
888{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
889	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890},
891{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893},
894{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
895	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896},
897{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
898	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899},
900{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902}
903};
904
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a flat sequence of NULL-terminated groups. The first
 * string of a group is the name of a start tag; the strings following
 * it, up to the group's NULL, name the open elements which that start
 * tag closes implicitly. One extra NULL after the last group ends the
 * table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "ul",
        "ol", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address", "pre",
        "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp", "head",
        "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre", "listing",
        "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing",
        "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tbody",
        "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead", "tfoot",
        "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    NULL
};
965
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly; a p element is implied when characters
 * show up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
978
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * The array has no terminating NULL: users must size it with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1004
/*
 * End tag priorities, used by the HTML parser to cope with broken
 * pages: giving elements different priorities lets the parser decide
 * what to do with an unexpected end tag. An end tag may only close
 * elements whose priority is lower than or equal to its own.
 */

/* One (element name, end tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The last entry has a NULL name: it terminates the table and carries
 * the priority used for every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }	/* Default priority */
};
1032
/*
 * Index over htmlStartClose: each used slot points at the first string
 * of one group of that table, unused slots are NULL.
 * It is filled by htmlInitAutoClose().
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
static int htmlStartCloseIndexinitialized = 0;
1035
1036/************************************************************************
1037 *									*
1038 * 		functions to handle HTML specific data			*
1039 *									*
1040 ************************************************************************/
1041
1042/**
1043 * htmlInitAutoClose:
1044 *
1045 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046 * This is not reentrant. Call xmlInitParser() once before processing in
1047 * case of use in multithreaded programs.
1048 */
1049void
1050htmlInitAutoClose(void) {
1051    int indx, i = 0;
1052
1053    if (htmlStartCloseIndexinitialized) return;
1054
1055    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056    indx = 0;
1057    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059	while (htmlStartClose[i] != NULL) i++;
1060	i++;
1061    }
1062    htmlStartCloseIndexinitialized = 1;
1063}
1064
1065/**
1066 * htmlTagLookup:
1067 * @tag:  The tag name in lowercase
1068 *
1069 * Lookup the HTML tag in the ElementTable
1070 *
1071 * Returns the related htmlElemDescPtr or NULL if not found.
1072 */
1073const htmlElemDesc *
1074htmlTagLookup(const xmlChar *tag) {
1075    unsigned int i;
1076
1077    for (i = 0; i < (sizeof(html40ElementTable) /
1078                     sizeof(html40ElementTable[0]));i++) {
1079        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080	    return((htmlElemDescPtr) &html40ElementTable[i]);
1081    }
1082    return(NULL);
1083}
1084
1085/**
1086 * htmlGetEndPriority:
1087 * @name: The name of the element to look up the priority for.
1088 *
1089 * Return value: The "endtag" priority.
1090 **/
1091static int
1092htmlGetEndPriority (const xmlChar *name) {
1093    int i = 0;
1094
1095    while ((htmlEndPriority[i].name != NULL) &&
1096	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097	i++;
1098
1099    return(htmlEndPriority[i].priority);
1100}
1101
1102
1103/**
1104 * htmlCheckAutoClose:
1105 * @newtag:  The new tag name
1106 * @oldtag:  The old tag name
1107 *
1108 * Checks whether the new tag is one of the registered valid tags for
1109 * closing old.
1110 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111 *
1112 * Returns 0 if no, 1 if yes.
1113 */
1114static int
1115htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116{
1117    int i, indx;
1118    const char **closed = NULL;
1119
1120    if (htmlStartCloseIndexinitialized == 0)
1121        htmlInitAutoClose();
1122
1123    /* inefficient, but not a big deal */
1124    for (indx = 0; indx < 100; indx++) {
1125        closed = htmlStartCloseIndex[indx];
1126        if (closed == NULL)
1127            return (0);
1128        if (xmlStrEqual(BAD_CAST * closed, newtag))
1129            break;
1130    }
1131
1132    i = closed - htmlStartClose;
1133    i++;
1134    while (htmlStartClose[i] != NULL) {
1135        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136            return (1);
1137        }
1138        i++;
1139    }
1140    return (0);
1141}
1142
1143/**
1144 * htmlAutoCloseOnClose:
1145 * @ctxt:  an HTML parser context
1146 * @newtag:  The new tag name
1147 * @force:  force the tag closure
1148 *
1149 * The HTML DTD allows an ending tag to implicitly close other tags.
1150 */
1151static void
1152htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153{
1154    const htmlElemDesc *info;
1155    int i, priority;
1156
1157    priority = htmlGetEndPriority(newtag);
1158
1159    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160
1161        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162            break;
1163        /*
1164         * A missplaced endtag can only close elements with lower
1165         * or equal priority, so if we find an element with higher
1166         * priority before we find an element with
1167         * matching name, we just ignore this endtag
1168         */
1169        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170            return;
1171    }
1172    if (i < 0)
1173        return;
1174
1175    while (!xmlStrEqual(newtag, ctxt->name)) {
1176        info = htmlTagLookup(ctxt->name);
1177        if ((info != NULL) && (info->endTag == 3)) {
1178            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179	                 "Opening and ending tag mismatch: %s and %s\n",
1180			 newtag, ctxt->name);
1181        }
1182        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1184	htmlnamePop(ctxt);
1185    }
1186}
1187
1188/**
1189 * htmlAutoCloseOnEnd:
1190 * @ctxt:  an HTML parser context
1191 *
1192 * Close all remaining tags at the end of the stream
1193 */
1194static void
1195htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196{
1197    int i;
1198
1199    if (ctxt->nameNr == 0)
1200        return;
1201    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1204	htmlnamePop(ctxt);
1205    }
1206}
1207
1208/**
1209 * htmlAutoClose:
1210 * @ctxt:  an HTML parser context
1211 * @newtag:  The new tag name or NULL
1212 *
1213 * The HTML DTD allows a tag to implicitly close other tags.
1214 * The list is kept in htmlStartClose array. This function is
1215 * called when a new tag has been detected and generates the
1216 * appropriates closes if possible/needed.
1217 * If newtag is NULL this mean we are at the end of the resource
1218 * and we should check
1219 */
1220static void
1221htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222{
1223    while ((newtag != NULL) && (ctxt->name != NULL) &&
1224           (htmlCheckAutoClose(newtag, ctxt->name))) {
1225        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227	htmlnamePop(ctxt);
1228    }
1229    if (newtag == NULL) {
1230        htmlAutoCloseOnEnd(ctxt);
1231        return;
1232    }
1233    while ((newtag == NULL) && (ctxt->name != NULL) &&
1234           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1235            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1236            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1239	htmlnamePop(ctxt);
1240    }
1241}
1242
1243/**
1244 * htmlAutoCloseTag:
1245 * @doc:  the HTML document
1246 * @name:  The tag name
1247 * @elem:  the HTML element
1248 *
1249 * The HTML DTD allows a tag to implicitly close other tags.
1250 * The list is kept in htmlStartClose array. This function checks
1251 * if the element or one of it's children would autoclose the
1252 * given tag.
1253 *
1254 * Returns 1 if autoclose, 0 otherwise
1255 */
1256int
1257htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258    htmlNodePtr child;
1259
1260    if (elem == NULL) return(1);
1261    if (xmlStrEqual(name, elem->name)) return(0);
1262    if (htmlCheckAutoClose(elem->name, name)) return(1);
1263    child = elem->children;
1264    while (child != NULL) {
1265        if (htmlAutoCloseTag(doc, name, child)) return(1);
1266	child = child->next;
1267    }
1268    return(0);
1269}
1270
1271/**
1272 * htmlIsAutoClosed:
1273 * @doc:  the HTML document
1274 * @elem:  the HTML element
1275 *
1276 * The HTML DTD allows a tag to implicitly close other tags.
1277 * The list is kept in htmlStartClose array. This function checks
1278 * if a tag is autoclosed by one of it's child
1279 *
1280 * Returns 1 if autoclosed, 0 otherwise
1281 */
1282int
1283htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284    htmlNodePtr child;
1285
1286    if (elem == NULL) return(1);
1287    child = elem->children;
1288    while (child != NULL) {
1289	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290	child = child->next;
1291    }
1292    return(0);
1293}
1294
1295/**
1296 * htmlCheckImplied:
1297 * @ctxt:  an HTML parser context
1298 * @newtag:  The new tag name
1299 *
1300 * The HTML DTD allows a tag to exists only implicitly
1301 * called when a new tag has been detected and generates the
1302 * appropriates implicit tags if missing
1303 */
1304static void
1305htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306    if (!htmlOmittedDefaultValue)
1307	return;
1308    if (xmlStrEqual(newtag, BAD_CAST"html"))
1309	return;
1310    if (ctxt->nameNr <= 0) {
1311	htmlnamePush(ctxt, BAD_CAST"html");
1312	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314    }
1315    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1316        return;
1317    if ((ctxt->nameNr <= 1) &&
1318        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1319	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1320	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1321	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1322	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1323	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1324	    /*
1325	     * dropped OBJECT ... i you put it first BODY will be
1326	     * assumed !
1327	     */
1328	    htmlnamePush(ctxt, BAD_CAST"head");
1329	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330		ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334	int i;
1335	for (i = 0;i < ctxt->nameNr;i++) {
1336	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337		return;
1338	    }
1339	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340		return;
1341	    }
1342	}
1343
1344	htmlnamePush(ctxt, BAD_CAST"body");
1345	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347    }
1348}
1349
1350/**
1351 * htmlCheckParagraph
1352 * @ctxt:  an HTML parser context
1353 *
1354 * Check whether a p element need to be implied before inserting
1355 * characters in the current element.
1356 *
1357 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1358 *         in case of error.
1359 */
1360
1361static int
1362htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363    const xmlChar *tag;
1364    int i;
1365
1366    if (ctxt == NULL)
1367	return(-1);
1368    tag = ctxt->name;
1369    if (tag == NULL) {
1370	htmlAutoClose(ctxt, BAD_CAST"p");
1371	htmlCheckImplied(ctxt, BAD_CAST"p");
1372	htmlnamePush(ctxt, BAD_CAST"p");
1373	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375	return(1);
1376    }
1377    if (!htmlOmittedDefaultValue)
1378	return(0);
1379    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381	    htmlAutoClose(ctxt, BAD_CAST"p");
1382	    htmlCheckImplied(ctxt, BAD_CAST"p");
1383	    htmlnamePush(ctxt, BAD_CAST"p");
1384	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386	    return(1);
1387	}
1388    }
1389    return(0);
1390}
1391
1392/**
1393 * htmlIsScriptAttribute:
1394 * @name:  an attribute name
1395 *
1396 * Check if an attribute is of content type Script
1397 *
1398 * Returns 1 is the attribute is a script 0 otherwise
1399 */
1400int
1401htmlIsScriptAttribute(const xmlChar *name) {
1402    unsigned int i;
1403
1404    if (name == NULL)
1405       	return(0);
1406    /*
1407     * all script attributes start with 'on'
1408     */
1409    if ((name[0] != 'o') || (name[1] != 'n'))
1410       	return(0);
1411    for (i = 0;
1412	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413	 i++) {
1414	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415	    return(1);
1416    }
1417    return(0);
1418}
1419
1420/************************************************************************
1421 *									*
1422 * 		The list of HTML predefined entities			*
1423 *									*
1424 ************************************************************************/
1425
1426
1427static const htmlEntityDesc  html40EntitiesTable[] = {
1428/*
1429 * the 4 absolute ones, plus apostrophe.
1430 */
1431{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1432{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1433{ 39,	"apos",	"single quote" },
1434{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1435{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1436
1437/*
1438 * A bunch still in the 128-255 range
1439 * Replacing them depend really on the charset used.
1440 */
1441{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1442{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1444{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1445{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1446{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1447{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1449{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1451{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1452{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453{ 172,	"not",	"not sign, U+00AC ISOnum" },
1454{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1456{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1458{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1463{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1467{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1468{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1490{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1497{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1517{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1521{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1522{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528{ 247,	"divide","division sign, U+00F7 ISOnum" },
1529{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1534{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1540{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544/*
1545 * Anything below should really be kept as entities references
1546 */
1547{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1550{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1551
1552{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1553{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1554{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1557{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1558{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1559{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1561{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1562{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1563{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1564{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1565{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1566{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1567{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1568{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1569{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1571{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1573{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1574{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1575{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1579{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1581{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1583{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1584{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1585{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1586{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1589{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1590{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1591{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1592{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1593{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1594{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1597{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1599{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1600{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1601{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1602{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1605
1606{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1607{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1608{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1609{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1610{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1611{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1612{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1613{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1614{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1615{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1616{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1617{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1618{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1619{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1620{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1621{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1622{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1623
1624{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1625{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1628
1629{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1630{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1636{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1637
1638{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1639
1640{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1643{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1644{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1646{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1647{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1648{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1649{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1650{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1652{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1653{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1654{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1655{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1656
1657{ 8704,	"forall","for all, U+2200 ISOtech" },
1658{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1659{ 8707,	"exist","there exists, U+2203 ISOtech" },
1660{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1661{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1662{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1663{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1664{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1665{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1666{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1667{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1668{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1669{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1670{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1671{ 8734,	"infin","infinity, U+221E ISOtech" },
1672{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1673{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1674{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1675{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1676{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1677{ 8747,	"int",	"integral, U+222B ISOtech" },
1678{ 8756,	"there4","therefore, U+2234 ISOtech" },
1679{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1680{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1681{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1683{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1684{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1685{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1686{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
1687{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
1688{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1689{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1690{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1691{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1693{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1695{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1697{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
1699{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1700{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1701{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1702
1703{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
1704{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1705{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1706{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1707
1708};
1709
1710/************************************************************************
1711 *									*
1712 *		Commodity functions to handle entities			*
1713 *									*
1714 ************************************************************************/
1715
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer to grow
 *
 * Macro used to grow the current buffer. It doubles the companion size
 * variable (the identifier obtained by pasting "_size" onto @buffer) and
 * reallocates @buffer to that many xmlChar with xmlRealloc().
 *
 * The expansion refers to a variable named ctxt, which must be in scope
 * where the macro is used. When the reallocation fails the macro reports
 * the failure with htmlErrMemory(), releases the old buffer with xmlFree()
 * and executes return(NULL) in the enclosing function, so that function
 * must be able to return NULL. The reallocation may move the buffer:
 * pointers into the old storage have to be recomputed by the caller.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1730
1731/**
1732 * htmlEntityLookup:
1733 * @name: the entity name
1734 *
1735 * Lookup the given entity in EntitiesTable
1736 *
1737 * TODO: the linear scan is really ugly, an hash table is really needed.
1738 *
1739 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740 */
1741const htmlEntityDesc *
1742htmlEntityLookup(const xmlChar *name) {
1743    unsigned int i;
1744
1745    for (i = 0;i < (sizeof(html40EntitiesTable)/
1746                    sizeof(html40EntitiesTable[0]));i++) {
1747        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1749	}
1750    }
1751    return(NULL);
1752}
1753
1754/**
1755 * htmlEntityValueLookup:
1756 * @value: the entity's unicode value
1757 *
1758 * Lookup the given entity in EntitiesTable
1759 *
1760 * TODO: the linear scan is really ugly, an hash table is really needed.
1761 *
1762 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763 */
1764const htmlEntityDesc *
1765htmlEntityValueLookup(unsigned int value) {
1766    unsigned int i;
1767
1768    for (i = 0;i < (sizeof(html40EntitiesTable)/
1769                    sizeof(html40EntitiesTable[0]));i++) {
1770        if (html40EntitiesTable[i].value >= value) {
1771	    if (html40EntitiesTable[i].value > value)
1772		break;
1773            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1774	}
1775    }
1776    return(NULL);
1777}
1778
1779/**
1780 * UTF8ToHtml:
1781 * @out:  a pointer to an array of bytes to store the result
1782 * @outlen:  the length of @out
1783 * @in:  a pointer to an array of UTF-8 chars
1784 * @inlen:  the length of @in
1785 *
1786 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1787 * plus HTML entities block of chars out.
1788 *
1789 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790 * The value of @inlen after return is the number of octets consumed
1791 *     as the return value is positive, else unpredictable.
1792 * The value of @outlen after return is the number of octets consumed.
1793 */
1794int
1795UTF8ToHtml(unsigned char* out, int *outlen,
1796              const unsigned char* in, int *inlen) {
1797    const unsigned char* processed = in;
1798    const unsigned char* outend;
1799    const unsigned char* outstart = out;
1800    const unsigned char* instart = in;
1801    const unsigned char* inend;
1802    unsigned int c, d;
1803    int trailing;
1804
1805    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1806    if (in == NULL) {
1807        /*
1808	 * initialization nothing to do
1809	 */
1810	*outlen = 0;
1811	*inlen = 0;
1812	return(0);
1813    }
1814    inend = in + (*inlen);
1815    outend = out + (*outlen);
1816    while (in < inend) {
1817	d = *in++;
1818	if      (d < 0x80)  { c= d; trailing= 0; }
1819	else if (d < 0xC0) {
1820	    /* trailing byte in leading position */
1821	    *outlen = out - outstart;
1822	    *inlen = processed - instart;
1823	    return(-2);
1824        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1825        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1826        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1827	else {
1828	    /* no chance for this in Ascii */
1829	    *outlen = out - outstart;
1830	    *inlen = processed - instart;
1831	    return(-2);
1832	}
1833
1834	if (inend - in < trailing) {
1835	    break;
1836	}
1837
1838	for ( ; trailing; trailing--) {
1839	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1840		break;
1841	    c <<= 6;
1842	    c |= d & 0x3F;
1843	}
1844
1845	/* assertion: c is a single UTF-4 value */
1846	if (c < 0x80) {
1847	    if (out + 1 >= outend)
1848		break;
1849	    *out++ = c;
1850	} else {
1851	    int len;
1852	    const htmlEntityDesc * ent;
1853	    const char *cp;
1854	    char nbuf[16];
1855
1856	    /*
1857	     * Try to lookup a predefined HTML entity for it
1858	     */
1859
1860	    ent = htmlEntityValueLookup(c);
1861	    if (ent == NULL) {
1862	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863	      cp = nbuf;
1864	    }
1865	    else
1866	      cp = ent->name;
1867	    len = strlen(cp);
1868	    if (out + 2 + len >= outend)
1869		break;
1870	    *out++ = '&';
1871	    memcpy(out, cp, len);
1872	    out += len;
1873	    *out++ = ';';
1874	}
1875	processed = in;
1876    }
1877    *outlen = out - outstart;
1878    *inlen = processed - instart;
1879    return(0);
1880}
1881
1882/**
1883 * htmlEncodeEntities:
1884 * @out:  a pointer to an array of bytes to store the result
1885 * @outlen:  the length of @out
1886 * @in:  a pointer to an array of UTF-8 chars
1887 * @inlen:  the length of @in
1888 * @quoteChar: the quote character to escape (' or ") or zero.
1889 *
1890 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1891 * plus HTML entities block of chars out.
1892 *
1893 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894 * The value of @inlen after return is the number of octets consumed
1895 *     as the return value is positive, else unpredictable.
1896 * The value of @outlen after return is the number of octets consumed.
1897 */
1898int
1899htmlEncodeEntities(unsigned char* out, int *outlen,
1900		   const unsigned char* in, int *inlen, int quoteChar) {
1901    const unsigned char* processed = in;
1902    const unsigned char* outend;
1903    const unsigned char* outstart = out;
1904    const unsigned char* instart = in;
1905    const unsigned char* inend;
1906    unsigned int c, d;
1907    int trailing;
1908
1909    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
1910        return(-1);
1911    outend = out + (*outlen);
1912    inend = in + (*inlen);
1913    while (in < inend) {
1914	d = *in++;
1915	if      (d < 0x80)  { c= d; trailing= 0; }
1916	else if (d < 0xC0) {
1917	    /* trailing byte in leading position */
1918	    *outlen = out - outstart;
1919	    *inlen = processed - instart;
1920	    return(-2);
1921        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1922        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1923        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1924	else {
1925	    /* no chance for this in Ascii */
1926	    *outlen = out - outstart;
1927	    *inlen = processed - instart;
1928	    return(-2);
1929	}
1930
1931	if (inend - in < trailing)
1932	    break;
1933
1934	while (trailing--) {
1935	    if (((d= *in++) & 0xC0) != 0x80) {
1936		*outlen = out - outstart;
1937		*inlen = processed - instart;
1938		return(-2);
1939	    }
1940	    c <<= 6;
1941	    c |= d & 0x3F;
1942	}
1943
1944	/* assertion: c is a single UTF-4 value */
1945	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946	    (c != '&') && (c != '<') && (c != '>')) {
1947	    if (out >= outend)
1948		break;
1949	    *out++ = c;
1950	} else {
1951	    const htmlEntityDesc * ent;
1952	    const char *cp;
1953	    char nbuf[16];
1954	    int len;
1955
1956	    /*
1957	     * Try to lookup a predefined HTML entity for it
1958	     */
1959	    ent = htmlEntityValueLookup(c);
1960	    if (ent == NULL) {
1961		snprintf(nbuf, sizeof(nbuf), "#%u", c);
1962		cp = nbuf;
1963	    }
1964	    else
1965		cp = ent->name;
1966	    len = strlen(cp);
1967	    if (out + 2 + len > outend)
1968		break;
1969	    *out++ = '&';
1970	    memcpy(out, cp, len);
1971	    out += len;
1972	    *out++ = ';';
1973	}
1974	processed = in;
1975    }
1976    *outlen = out - outstart;
1977    *inlen = processed - instart;
1978    return(0);
1979}
1980
1981/************************************************************************
1982 *									*
1983 *		Commodity functions to handle streams			*
1984 *									*
1985 ************************************************************************/
1986
1987/**
1988 * htmlNewInputStream:
1989 * @ctxt:  an HTML parser context
1990 *
1991 * Create a new input stream structure
1992 * Returns the new input stream or NULL
1993 */
1994static htmlParserInputPtr
1995htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996    htmlParserInputPtr input;
1997
1998    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999    if (input == NULL) {
2000        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2001	return(NULL);
2002    }
2003    memset(input, 0, sizeof(htmlParserInput));
2004    input->filename = NULL;
2005    input->directory = NULL;
2006    input->base = NULL;
2007    input->cur = NULL;
2008    input->buf = NULL;
2009    input->line = 1;
2010    input->col = 1;
2011    input->buf = NULL;
2012    input->free = NULL;
2013    input->version = NULL;
2014    input->consumed = 0;
2015    input->length = 0;
2016    return(input);
2017}
2018
2019
2020/************************************************************************
2021 *									*
2022 *		Commodity functions, cleanup needed ?			*
2023 *									*
2024 ************************************************************************/
/*
 * Names of all the tags allowing PCDATA content, from the HTML 4.01
 * loose DTD. The list is consulted by areBlanks() to decide whether a
 * whitespace-only text run has to be kept.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2039
2040/**
2041 * areBlanks:
2042 * @ctxt:  an HTML parser context
2043 * @str:  a xmlChar *
2044 * @len:  the size of @str
2045 *
2046 * Is this a sequence of blank chars that one can ignore ?
2047 *
2048 * Returns 1 if ignorable 0 otherwise.
2049 */
2050
2051static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2052    unsigned int i;
2053    int j;
2054    xmlNodePtr lastChild;
2055    xmlDtdPtr dtd;
2056
2057    for (j = 0;j < len;j++)
2058        if (!(IS_BLANK_CH(str[j]))) return(0);
2059
2060    if (CUR == 0) return(1);
2061    if (CUR != '<') return(0);
2062    if (ctxt->name == NULL)
2063	return(1);
2064    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065	return(1);
2066    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067	return(1);
2068
2069    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2070    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071        dtd = xmlGetIntSubset(ctxt->myDoc);
2072        if (dtd != NULL && dtd->ExternalID != NULL) {
2073            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2074                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075                return(1);
2076        }
2077    }
2078
2079    if (ctxt->node == NULL) return(0);
2080    lastChild = xmlGetLastChild(ctxt->node);
2081    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082	lastChild = lastChild->prev;
2083    if (lastChild == NULL) {
2084        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085            (ctxt->node->content != NULL)) return(0);
2086	/* keep ws in constructs like ...<b> </b>...
2087	   for all tags "b" allowing PCDATA */
2088	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090		return(0);
2091	    }
2092	}
2093    } else if (xmlNodeIsText(lastChild)) {
2094        return(0);
2095    } else {
2096	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097	   for all tags "p" allowing PCDATA */
2098	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100		return(0);
2101	    }
2102	}
2103    }
2104    return(1);
2105}
2106
2107/**
2108 * htmlNewDocNoDtD:
2109 * @URI:  URI for the dtd, or NULL
2110 * @ExternalID:  the external ID of the DTD, or NULL
2111 *
2112 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2113 * are NULL
2114 *
2115 * Returns a new document, do not initialize the DTD if not provided
2116 */
2117htmlDocPtr
2118htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2119    xmlDocPtr cur;
2120
2121    /*
2122     * Allocate a new document and fill the fields.
2123     */
2124    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125    if (cur == NULL) {
2126	htmlErrMemory(NULL, "HTML document creation failed\n");
2127	return(NULL);
2128    }
2129    memset(cur, 0, sizeof(xmlDoc));
2130
2131    cur->type = XML_HTML_DOCUMENT_NODE;
2132    cur->version = NULL;
2133    cur->intSubset = NULL;
2134    cur->doc = cur;
2135    cur->name = NULL;
2136    cur->children = NULL;
2137    cur->extSubset = NULL;
2138    cur->oldNs = NULL;
2139    cur->encoding = NULL;
2140    cur->standalone = 1;
2141    cur->compression = 0;
2142    cur->ids = NULL;
2143    cur->refs = NULL;
2144    cur->_private = NULL;
2145    cur->charset = XML_CHAR_ENCODING_UTF8;
2146    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2147    if ((ExternalID != NULL) ||
2148	(URI != NULL))
2149	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2150    return(cur);
2151}
2152
2153/**
2154 * htmlNewDoc:
2155 * @URI:  URI for the dtd, or NULL
2156 * @ExternalID:  the external ID of the DTD, or NULL
2157 *
2158 * Creates a new HTML document
2159 *
2160 * Returns a new document
2161 */
2162htmlDocPtr
2163htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2164    if ((URI == NULL) && (ExternalID == NULL))
2165	return(htmlNewDocNoDtD(
2166		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2167		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2168
2169    return(htmlNewDocNoDtD(URI, ExternalID));
2170}
2171
2172
2173/************************************************************************
2174 *									*
2175 *			The parser itself				*
2176 *	Relates to http://www.w3.org/TR/html40				*
2177 *									*
2178 ************************************************************************/
2179
2180/************************************************************************
2181 *									*
2182 *			The parser itself				*
2183 *									*
2184 ************************************************************************/
2185
2186static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2187
2188/**
2189 * htmlParseHTMLName:
2190 * @ctxt:  an HTML parser context
2191 *
2192 * parse an HTML tag or attribute name, note that we convert it to lowercase
2193 * since HTML names are not case-sensitive.
2194 *
2195 * Returns the Tag Name parsed or NULL
2196 */
2197
2198static const xmlChar *
2199htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2200    int i = 0;
2201    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2202
2203    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2204        (CUR != ':')) return(NULL);
2205
2206    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2207           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2208	   (CUR == ':') || (CUR == '-') || (CUR == '_'))) {
2209	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2210        else loc[i] = CUR;
2211	i++;
2212
2213	NEXT;
2214    }
2215
2216    return(xmlDictLookup(ctxt->dict, loc, i));
2217}
2218
2219
2220/**
2221 * htmlParseHTMLName_nonInvasive:
2222 * @ctxt:  an HTML parser context
2223 *
2224 * parse an HTML tag or attribute name, note that we convert it to lowercase
2225 * since HTML names are not case-sensitive, this doesn't consume the data
2226 * from the stream, it's a look-ahead
2227 *
2228 * Returns the Tag Name parsed or NULL
2229 */
2230
2231static const xmlChar *
2232htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2233    int i = 0;
2234    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2235
2236    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2237        (NXT(1) != ':')) return(NULL);
2238
2239    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2240           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2241	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2242	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2243        else loc[i] = NXT(1+i);
2244	i++;
2245    }
2246
2247    return(xmlDictLookup(ctxt->dict, loc, i));
2248}
2249
2250
2251/**
2252 * htmlParseName:
2253 * @ctxt:  an HTML parser context
2254 *
2255 * parse an HTML name, this routine is case sensitive.
2256 *
2257 * Returns the Name parsed or NULL
2258 */
2259
2260static const xmlChar *
2261htmlParseName(htmlParserCtxtPtr ctxt) {
2262    const xmlChar *in;
2263    const xmlChar *ret;
2264    int count = 0;
2265
2266    GROW;
2267
2268    /*
2269     * Accelerator for simple ASCII names
2270     */
2271    in = ctxt->input->cur;
2272    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2273	((*in >= 0x41) && (*in <= 0x5A)) ||
2274	(*in == '_') || (*in == ':')) {
2275	in++;
2276	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2277	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2278	       ((*in >= 0x30) && (*in <= 0x39)) ||
2279	       (*in == '_') || (*in == '-') ||
2280	       (*in == ':') || (*in == '.'))
2281	    in++;
2282	if ((*in > 0) && (*in < 0x80)) {
2283	    count = in - ctxt->input->cur;
2284	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2285	    ctxt->input->cur = in;
2286	    ctxt->nbChars += count;
2287	    ctxt->input->col += count;
2288	    return(ret);
2289	}
2290    }
2291    return(htmlParseNameComplex(ctxt));
2292}
2293
2294static const xmlChar *
2295htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2296    int len = 0, l;
2297    int c;
2298    int count = 0;
2299
2300    /*
2301     * Handler for more complex cases
2302     */
2303    GROW;
2304    c = CUR_CHAR(l);
2305    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2306	(!IS_LETTER(c) && (c != '_') &&
2307         (c != ':'))) {
2308	return(NULL);
2309    }
2310
2311    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2312	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2313            (c == '.') || (c == '-') ||
2314	    (c == '_') || (c == ':') ||
2315	    (IS_COMBINING(c)) ||
2316	    (IS_EXTENDER(c)))) {
2317	if (count++ > 100) {
2318	    count = 0;
2319	    GROW;
2320	}
2321	len += l;
2322	NEXTL(l);
2323	c = CUR_CHAR(l);
2324    }
2325    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2326}
2327
2328
2329/**
2330 * htmlParseHTMLAttribute:
2331 * @ctxt:  an HTML parser context
2332 * @stop:  a char stop value
2333 *
2334 * parse an HTML attribute value till the stop (quote), if
2335 * stop is 0 then it stops at the first space
2336 *
2337 * Returns the attribute parsed or NULL
2338 */
2339
2340static xmlChar *
2341htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2342    xmlChar *buffer = NULL;
2343    int buffer_size = 0;
2344    xmlChar *out = NULL;
2345    const xmlChar *name = NULL;
2346    const xmlChar *cur = NULL;
2347    const htmlEntityDesc * ent;
2348
2349    /*
2350     * allocate a translation buffer.
2351     */
2352    buffer_size = HTML_PARSER_BUFFER_SIZE;
2353    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2354    if (buffer == NULL) {
2355	htmlErrMemory(ctxt, "buffer allocation failed\n");
2356	return(NULL);
2357    }
2358    out = buffer;
2359
2360    /*
2361     * Ok loop until we reach one of the ending chars
2362     */
2363    while ((CUR != 0) && (CUR != stop)) {
2364	if ((stop == 0) && (CUR == '>')) break;
2365	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2366        if (CUR == '&') {
2367	    if (NXT(1) == '#') {
2368		unsigned int c;
2369		int bits;
2370
2371		c = htmlParseCharRef(ctxt);
2372		if      (c <    0x80)
2373		        { *out++  = c;                bits= -6; }
2374		else if (c <   0x800)
2375		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2376		else if (c < 0x10000)
2377		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2378		else
2379		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2380
2381		for ( ; bits >= 0; bits-= 6) {
2382		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2383		}
2384
2385		if (out - buffer > buffer_size - 100) {
2386			int indx = out - buffer;
2387
2388			growBuffer(buffer);
2389			out = &buffer[indx];
2390		}
2391	    } else {
2392		ent = htmlParseEntityRef(ctxt, &name);
2393		if (name == NULL) {
2394		    *out++ = '&';
2395		    if (out - buffer > buffer_size - 100) {
2396			int indx = out - buffer;
2397
2398			growBuffer(buffer);
2399			out = &buffer[indx];
2400		    }
2401		} else if (ent == NULL) {
2402		    *out++ = '&';
2403		    cur = name;
2404		    while (*cur != 0) {
2405			if (out - buffer > buffer_size - 100) {
2406			    int indx = out - buffer;
2407
2408			    growBuffer(buffer);
2409			    out = &buffer[indx];
2410			}
2411			*out++ = *cur++;
2412		    }
2413		} else {
2414		    unsigned int c;
2415		    int bits;
2416
2417		    if (out - buffer > buffer_size - 100) {
2418			int indx = out - buffer;
2419
2420			growBuffer(buffer);
2421			out = &buffer[indx];
2422		    }
2423		    c = ent->value;
2424		    if      (c <    0x80)
2425			{ *out++  = c;                bits= -6; }
2426		    else if (c <   0x800)
2427			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2428		    else if (c < 0x10000)
2429			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2430		    else
2431			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2432
2433		    for ( ; bits >= 0; bits-= 6) {
2434			*out++  = ((c >> bits) & 0x3F) | 0x80;
2435		    }
2436		}
2437	    }
2438	} else {
2439	    unsigned int c;
2440	    int bits, l;
2441
2442	    if (out - buffer > buffer_size - 100) {
2443		int indx = out - buffer;
2444
2445		growBuffer(buffer);
2446		out = &buffer[indx];
2447	    }
2448	    c = CUR_CHAR(l);
2449	    if      (c <    0x80)
2450		    { *out++  = c;                bits= -6; }
2451	    else if (c <   0x800)
2452		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2453	    else if (c < 0x10000)
2454		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2455	    else
2456		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2457
2458	    for ( ; bits >= 0; bits-= 6) {
2459		*out++  = ((c >> bits) & 0x3F) | 0x80;
2460	    }
2461	    NEXT;
2462	}
2463    }
2464    *out++ = 0;
2465    return(buffer);
2466}
2467
2468/**
2469 * htmlParseEntityRef:
2470 * @ctxt:  an HTML parser context
2471 * @str:  location to store the entity name
2472 *
2473 * parse an HTML ENTITY references
2474 *
2475 * [68] EntityRef ::= '&' Name ';'
2476 *
2477 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2478 *         if non-NULL *str will have to be freed by the caller.
2479 */
2480const htmlEntityDesc *
2481htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2482    const xmlChar *name;
2483    const htmlEntityDesc * ent = NULL;
2484
2485    if (str != NULL) *str = NULL;
2486    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2487
2488    if (CUR == '&') {
2489        NEXT;
2490        name = htmlParseName(ctxt);
2491	if (name == NULL) {
2492	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2493	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2494	} else {
2495	    GROW;
2496	    if (CUR == ';') {
2497	        if (str != NULL)
2498		    *str = name;
2499
2500		/*
2501		 * Lookup the entity in the table.
2502		 */
2503		ent = htmlEntityLookup(name);
2504		if (ent != NULL) /* OK that's ugly !!! */
2505		    NEXT;
2506	    } else {
2507		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2508		             "htmlParseEntityRef: expecting ';'\n",
2509			     NULL, NULL);
2510	        if (str != NULL)
2511		    *str = name;
2512	    }
2513	}
2514    }
2515    return(ent);
2516}
2517
2518/**
2519 * htmlParseAttValue:
2520 * @ctxt:  an HTML parser context
2521 *
2522 * parse a value for an attribute
2523 * Note: the parser won't do substitution of entities here, this
2524 * will be handled later in xmlStringGetNodeList, unless it was
2525 * asked for ctxt->replaceEntities != 0
2526 *
2527 * Returns the AttValue parsed or NULL.
2528 */
2529
2530static xmlChar *
2531htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2532    xmlChar *ret = NULL;
2533
2534    if (CUR == '"') {
2535        NEXT;
2536	ret = htmlParseHTMLAttribute(ctxt, '"');
2537        if (CUR != '"') {
2538	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2539	                 "AttValue: \" expected\n", NULL, NULL);
2540	} else
2541	    NEXT;
2542    } else if (CUR == '\'') {
2543        NEXT;
2544	ret = htmlParseHTMLAttribute(ctxt, '\'');
2545        if (CUR != '\'') {
2546	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2547	                 "AttValue: ' expected\n", NULL, NULL);
2548	} else
2549	    NEXT;
2550    } else {
2551        /*
2552	 * That's an HTMLism, the attribute value may not be quoted
2553	 */
2554	ret = htmlParseHTMLAttribute(ctxt, 0);
2555	if (ret == NULL) {
2556	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2557	                 "AttValue: no value found\n", NULL, NULL);
2558	}
2559    }
2560    return(ret);
2561}
2562
2563/**
2564 * htmlParseSystemLiteral:
2565 * @ctxt:  an HTML parser context
2566 *
2567 * parse an HTML Literal
2568 *
2569 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2570 *
2571 * Returns the SystemLiteral parsed or NULL
2572 */
2573
2574static xmlChar *
2575htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2576    const xmlChar *q;
2577    xmlChar *ret = NULL;
2578
2579    if (CUR == '"') {
2580        NEXT;
2581	q = CUR_PTR;
2582	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2583	    NEXT;
2584	if (!IS_CHAR_CH(CUR)) {
2585	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2586			 "Unfinished SystemLiteral\n", NULL, NULL);
2587	} else {
2588	    ret = xmlStrndup(q, CUR_PTR - q);
2589	    NEXT;
2590        }
2591    } else if (CUR == '\'') {
2592        NEXT;
2593	q = CUR_PTR;
2594	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2595	    NEXT;
2596	if (!IS_CHAR_CH(CUR)) {
2597	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2598			 "Unfinished SystemLiteral\n", NULL, NULL);
2599	} else {
2600	    ret = xmlStrndup(q, CUR_PTR - q);
2601	    NEXT;
2602        }
2603    } else {
2604	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2605	             " or ' expected\n", NULL, NULL);
2606    }
2607
2608    return(ret);
2609}
2610
2611/**
2612 * htmlParsePubidLiteral:
2613 * @ctxt:  an HTML parser context
2614 *
2615 * parse an HTML public literal
2616 *
2617 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2618 *
2619 * Returns the PubidLiteral parsed or NULL.
2620 */
2621
2622static xmlChar *
2623htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2624    const xmlChar *q;
2625    xmlChar *ret = NULL;
2626    /*
2627     * Name ::= (Letter | '_') (NameChar)*
2628     */
2629    if (CUR == '"') {
2630        NEXT;
2631	q = CUR_PTR;
2632	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2633	if (CUR != '"') {
2634	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2635	                 "Unfinished PubidLiteral\n", NULL, NULL);
2636	} else {
2637	    ret = xmlStrndup(q, CUR_PTR - q);
2638	    NEXT;
2639	}
2640    } else if (CUR == '\'') {
2641        NEXT;
2642	q = CUR_PTR;
2643	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2644	    NEXT;
2645	if (CUR != '\'') {
2646	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2647	                 "Unfinished PubidLiteral\n", NULL, NULL);
2648	} else {
2649	    ret = xmlStrndup(q, CUR_PTR - q);
2650	    NEXT;
2651	}
2652    } else {
2653	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2654	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2655    }
2656
2657    return(ret);
2658}
2659
2660/**
2661 * htmlParseScript:
2662 * @ctxt:  an HTML parser context
2663 *
2664 * parse the content of an HTML SCRIPT or STYLE element
2665 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2666 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2667 * http://www.w3.org/TR/html4/types.html#type-script
2668 * http://www.w3.org/TR/html4/types.html#h-6.15
2669 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2670 *
2671 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2672 * element and the value of intrinsic event attributes. User agents must
2673 * not evaluate script data as HTML markup but instead must pass it on as
2674 * data to a script engine.
2675 * NOTES:
2676 * - The content is passed like CDATA
2677 * - the attributes for style and scripting "onXXX" are also described
2678 *   as CDATA but SGML allows entities references in attributes so their
2679 *   processing is identical as other attributes
2680 */
2681static void
2682htmlParseScript(htmlParserCtxtPtr ctxt) {
2683    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2684    int nbchar = 0;
2685    int cur,l;
2686
2687    SHRINK;
2688    cur = CUR_CHAR(l);
2689    while (IS_CHAR_CH(cur)) {
2690	if ((cur == '<') && (NXT(1) == '/')) {
2691            /*
2692             * One should break here, the specification is clear:
2693             * Authors should therefore escape "</" within the content.
2694             * Escape mechanisms are specific to each scripting or
2695             * style sheet language.
2696             *
2697             * In recovery mode, only break if end tag match the
2698             * current tag, effectively ignoring all tags inside the
2699             * script/style block and treating the entire block as
2700             * CDATA.
2701             */
2702            if (ctxt->recovery) {
2703                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2704				   xmlStrlen(ctxt->name)) == 0)
2705                {
2706                    break; /* while */
2707                } else {
2708		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2709				 "Element %s embeds close tag\n",
2710		                 ctxt->name, NULL);
2711		}
2712            } else {
2713                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2714                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2715                {
2716                    break; /* while */
2717                }
2718            }
2719	}
2720	COPY_BUF(l,buf,nbchar,cur);
2721	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2722	    if (ctxt->sax->cdataBlock!= NULL) {
2723		/*
2724		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2725		 */
2726		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2727	    } else if (ctxt->sax->characters != NULL) {
2728		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2729	    }
2730	    nbchar = 0;
2731	}
2732	GROW;
2733	NEXTL(l);
2734	cur = CUR_CHAR(l);
2735    }
2736
2737    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2738	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2739	                "Invalid char in CDATA 0x%X\n", cur);
2740	NEXT;
2741    }
2742
2743    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2744	if (ctxt->sax->cdataBlock!= NULL) {
2745	    /*
2746	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2747	     */
2748	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2749	} else if (ctxt->sax->characters != NULL) {
2750	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2751	}
2752    }
2753}
2754
2755
2756/**
2757 * htmlParseCharData:
2758 * @ctxt:  an HTML parser context
2759 *
2760 * parse a CharData section.
2761 * if we are within a CDATA section ']]>' marks an end of section.
2762 *
2763 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2764 */
2765
2766static void
2767htmlParseCharData(htmlParserCtxtPtr ctxt) {
2768    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2769    int nbchar = 0;
2770    int cur, l;
2771    int chunk = 0;
2772
2773    SHRINK;
2774    cur = CUR_CHAR(l);
2775    while (((cur != '<') || (ctxt->token == '<')) &&
2776           ((cur != '&') || (ctxt->token == '&')) &&
2777	   (cur != 0)) {
2778	if (!(IS_CHAR(cur))) {
2779	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2780	                "Invalid char in CDATA 0x%X\n", cur);
2781	} else {
2782	    COPY_BUF(l,buf,nbchar,cur);
2783	}
2784	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2785	    /*
2786	     * Ok the segment is to be consumed as chars.
2787	     */
2788	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2789		if (areBlanks(ctxt, buf, nbchar)) {
2790		    if (ctxt->sax->ignorableWhitespace != NULL)
2791			ctxt->sax->ignorableWhitespace(ctxt->userData,
2792			                               buf, nbchar);
2793		} else {
2794		    htmlCheckParagraph(ctxt);
2795		    if (ctxt->sax->characters != NULL)
2796			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2797		}
2798	    }
2799	    nbchar = 0;
2800	}
2801	NEXTL(l);
2802        chunk++;
2803        if (chunk > HTML_PARSER_BUFFER_SIZE) {
2804            chunk = 0;
2805            SHRINK;
2806            GROW;
2807        }
2808	cur = CUR_CHAR(l);
2809	if (cur == 0) {
2810	    SHRINK;
2811	    GROW;
2812	    cur = CUR_CHAR(l);
2813	}
2814    }
2815    if (nbchar != 0) {
2816        buf[nbchar] = 0;
2817
2818	/*
2819	 * Ok the segment is to be consumed as chars.
2820	 */
2821	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2822	    if (areBlanks(ctxt, buf, nbchar)) {
2823		if (ctxt->sax->ignorableWhitespace != NULL)
2824		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2825	    } else {
2826		htmlCheckParagraph(ctxt);
2827		if (ctxt->sax->characters != NULL)
2828		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2829	    }
2830	}
2831    } else {
2832	/*
2833	 * Loop detection
2834	 */
2835	if (cur == 0)
2836	    ctxt->instate = XML_PARSER_EOF;
2837    }
2838}
2839
2840/**
2841 * htmlParseExternalID:
2842 * @ctxt:  an HTML parser context
2843 * @publicID:  a xmlChar** receiving PubidLiteral
2844 *
2845 * Parse an External ID or a Public ID
2846 *
2847 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2848 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
2849 *
2850 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2851 *
2852 * Returns the function returns SystemLiteral and in the second
2853 *                case publicID receives PubidLiteral, is strict is off
2854 *                it is possible to return NULL and have publicID set.
2855 */
2856
2857static xmlChar *
2858htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2859    xmlChar *URI = NULL;
2860
2861    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2862         (UPP(2) == 'S') && (UPP(3) == 'T') &&
2863	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
2864        SKIP(6);
2865	if (!IS_BLANK_CH(CUR)) {
2866	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2867	                 "Space required after 'SYSTEM'\n", NULL, NULL);
2868	}
2869        SKIP_BLANKS;
2870	URI = htmlParseSystemLiteral(ctxt);
2871	if (URI == NULL) {
2872	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2873	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2874        }
2875    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2876	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
2877	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
2878        SKIP(6);
2879	if (!IS_BLANK_CH(CUR)) {
2880	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2881	                 "Space required after 'PUBLIC'\n", NULL, NULL);
2882	}
2883        SKIP_BLANKS;
2884	*publicID = htmlParsePubidLiteral(ctxt);
2885	if (*publicID == NULL) {
2886	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2887	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
2888			 NULL, NULL);
2889	}
2890        SKIP_BLANKS;
2891        if ((CUR == '"') || (CUR == '\'')) {
2892	    URI = htmlParseSystemLiteral(ctxt);
2893	}
2894    }
2895    return(URI);
2896}
2897
2898/**
2899 * xmlParsePI:
2900 * @ctxt:  an XML parser context
2901 *
2902 * parse an XML Processing Instruction.
2903 *
2904 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2905 */
2906static void
2907htmlParsePI(htmlParserCtxtPtr ctxt) {
2908    xmlChar *buf = NULL;
2909    int len = 0;
2910    int size = HTML_PARSER_BUFFER_SIZE;
2911    int cur, l;
2912    const xmlChar *target;
2913    xmlParserInputState state;
2914    int count = 0;
2915
2916    if ((RAW == '<') && (NXT(1) == '?')) {
2917	state = ctxt->instate;
2918        ctxt->instate = XML_PARSER_PI;
2919	/*
2920	 * this is a Processing Instruction.
2921	 */
2922	SKIP(2);
2923	SHRINK;
2924
2925	/*
2926	 * Parse the target name and check for special support like
2927	 * namespace.
2928	 */
2929        target = htmlParseName(ctxt);
2930	if (target != NULL) {
2931	    if (RAW == '>') {
2932		SKIP(1);
2933
2934		/*
2935		 * SAX: PI detected.
2936		 */
2937		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2938		    (ctxt->sax->processingInstruction != NULL))
2939		    ctxt->sax->processingInstruction(ctxt->userData,
2940		                                     target, NULL);
2941		ctxt->instate = state;
2942		return;
2943	    }
2944	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
2945	    if (buf == NULL) {
2946		htmlErrMemory(ctxt, NULL);
2947		ctxt->instate = state;
2948		return;
2949	    }
2950	    cur = CUR;
2951	    if (!IS_BLANK(cur)) {
2952		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2953			  "ParsePI: PI %s space expected\n", target, NULL);
2954	    }
2955            SKIP_BLANKS;
2956	    cur = CUR_CHAR(l);
2957	    while (IS_CHAR(cur) && (cur != '>')) {
2958		if (len + 5 >= size) {
2959		    xmlChar *tmp;
2960
2961		    size *= 2;
2962		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
2963		    if (tmp == NULL) {
2964			htmlErrMemory(ctxt, NULL);
2965			xmlFree(buf);
2966			ctxt->instate = state;
2967			return;
2968		    }
2969		    buf = tmp;
2970		}
2971		count++;
2972		if (count > 50) {
2973		    GROW;
2974		    count = 0;
2975		}
2976		COPY_BUF(l,buf,len,cur);
2977		NEXTL(l);
2978		cur = CUR_CHAR(l);
2979		if (cur == 0) {
2980		    SHRINK;
2981		    GROW;
2982		    cur = CUR_CHAR(l);
2983		}
2984	    }
2985	    buf[len] = 0;
2986	    if (cur != '>') {
2987		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2988		      "ParsePI: PI %s never end ...\n", target, NULL);
2989	    } else {
2990		SKIP(1);
2991
2992		/*
2993		 * SAX: PI detected.
2994		 */
2995		if ((ctxt->sax) && (!ctxt->disableSAX) &&
2996		    (ctxt->sax->processingInstruction != NULL))
2997		    ctxt->sax->processingInstruction(ctxt->userData,
2998		                                     target, buf);
2999	    }
3000	    xmlFree(buf);
3001	} else {
3002	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3003                         "PI is not started correctly", NULL, NULL);
3004	}
3005	ctxt->instate = state;
3006    }
3007}
3008
3009/**
3010 * htmlParseComment:
3011 * @ctxt:  an HTML parser context
3012 *
3013 * Parse an XML (SGML) comment <!-- .... -->
3014 *
3015 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3016 */
3017static void
3018htmlParseComment(htmlParserCtxtPtr ctxt) {
3019    xmlChar *buf = NULL;
3020    int len;
3021    int size = HTML_PARSER_BUFFER_SIZE;
3022    int q, ql;
3023    int r, rl;
3024    int cur, l;
3025    xmlParserInputState state;
3026
3027    /*
3028     * Check that there is a comment right here.
3029     */
3030    if ((RAW != '<') || (NXT(1) != '!') ||
3031        (NXT(2) != '-') || (NXT(3) != '-')) return;
3032
3033    state = ctxt->instate;
3034    ctxt->instate = XML_PARSER_COMMENT;
3035    SHRINK;
3036    SKIP(4);
3037    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3038    if (buf == NULL) {
3039        htmlErrMemory(ctxt, "buffer allocation failed\n");
3040	ctxt->instate = state;
3041	return;
3042    }
3043    q = CUR_CHAR(ql);
3044    NEXTL(ql);
3045    r = CUR_CHAR(rl);
3046    NEXTL(rl);
3047    cur = CUR_CHAR(l);
3048    len = 0;
3049    while (IS_CHAR(cur) &&
3050           ((cur != '>') ||
3051	    (r != '-') || (q != '-'))) {
3052	if (len + 5 >= size) {
3053	    xmlChar *tmp;
3054
3055	    size *= 2;
3056	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3057	    if (tmp == NULL) {
3058	        xmlFree(buf);
3059	        htmlErrMemory(ctxt, "growing buffer failed\n");
3060		ctxt->instate = state;
3061		return;
3062	    }
3063	    buf = tmp;
3064	}
3065	COPY_BUF(ql,buf,len,q);
3066	q = r;
3067	ql = rl;
3068	r = cur;
3069	rl = l;
3070	NEXTL(l);
3071	cur = CUR_CHAR(l);
3072	if (cur == 0) {
3073	    SHRINK;
3074	    GROW;
3075	    cur = CUR_CHAR(l);
3076	}
3077    }
3078    buf[len] = 0;
3079    if (!IS_CHAR(cur)) {
3080	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3081	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3082	xmlFree(buf);
3083    } else {
3084        NEXT;
3085	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3086	    (!ctxt->disableSAX))
3087	    ctxt->sax->comment(ctxt->userData, buf);
3088	xmlFree(buf);
3089    }
3090    ctxt->instate = state;
3091}
3092
3093/**
3094 * htmlParseCharRef:
3095 * @ctxt:  an HTML parser context
3096 *
3097 * parse Reference declarations
3098 *
3099 * [66] CharRef ::= '&#' [0-9]+ ';' |
3100 *                  '&#x' [0-9a-fA-F]+ ';'
3101 *
3102 * Returns the value parsed (as an int)
3103 */
3104int
3105htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3106    int val = 0;
3107
3108    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3109	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3110		     "htmlParseCharRef: context error\n",
3111		     NULL, NULL);
3112        return(0);
3113    }
3114    if ((CUR == '&') && (NXT(1) == '#') &&
3115        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3116	SKIP(3);
3117	while (CUR != ';') {
3118	    if ((CUR >= '0') && (CUR <= '9'))
3119	        val = val * 16 + (CUR - '0');
3120	    else if ((CUR >= 'a') && (CUR <= 'f'))
3121	        val = val * 16 + (CUR - 'a') + 10;
3122	    else if ((CUR >= 'A') && (CUR <= 'F'))
3123	        val = val * 16 + (CUR - 'A') + 10;
3124	    else {
3125	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3126		             "htmlParseCharRef: missing semicolumn\n",
3127			     NULL, NULL);
3128		break;
3129	    }
3130	    NEXT;
3131	}
3132	if (CUR == ';')
3133	    NEXT;
3134    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3135	SKIP(2);
3136	while (CUR != ';') {
3137	    if ((CUR >= '0') && (CUR <= '9'))
3138	        val = val * 10 + (CUR - '0');
3139	    else {
3140	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3141		             "htmlParseCharRef: missing semicolumn\n",
3142			     NULL, NULL);
3143		break;
3144	    }
3145	    NEXT;
3146	}
3147	if (CUR == ';')
3148	    NEXT;
3149    } else {
3150	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3151	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3152    }
3153    /*
3154     * Check the value IS_CHAR ...
3155     */
3156    if (IS_CHAR(val)) {
3157        return(val);
3158    } else {
3159	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3160			"htmlParseCharRef: invalid xmlChar value %d\n",
3161			val);
3162    }
3163    return(0);
3164}
3165
3166
3167/**
3168 * htmlParseDocTypeDecl:
3169 * @ctxt:  an HTML parser context
3170 *
3171 * parse a DOCTYPE declaration
3172 *
3173 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3174 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3175 */
3176
3177static void
3178htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3179    const xmlChar *name;
3180    xmlChar *ExternalID = NULL;
3181    xmlChar *URI = NULL;
3182
3183    /*
3184     * We know that '<!DOCTYPE' has been detected.
3185     */
3186    SKIP(9);
3187
3188    SKIP_BLANKS;
3189
3190    /*
3191     * Parse the DOCTYPE name.
3192     */
3193    name = htmlParseName(ctxt);
3194    if (name == NULL) {
3195	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3196	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3197		     NULL, NULL);
3198    }
3199    /*
3200     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3201     */
3202
3203    SKIP_BLANKS;
3204
3205    /*
3206     * Check for SystemID and ExternalID
3207     */
3208    URI = htmlParseExternalID(ctxt, &ExternalID);
3209    SKIP_BLANKS;
3210
3211    /*
3212     * We should be at the end of the DOCTYPE declaration.
3213     */
3214    if (CUR != '>') {
3215	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3216	             "DOCTYPE improperly terminated\n", NULL, NULL);
3217        /* We shouldn't try to resynchronize ... */
3218    }
3219    NEXT;
3220
3221    /*
3222     * Create or update the document accordingly to the DOCTYPE
3223     */
3224    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3225	(!ctxt->disableSAX))
3226	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3227
3228    /*
3229     * Cleanup, since we don't use all those identifiers
3230     */
3231    if (URI != NULL) xmlFree(URI);
3232    if (ExternalID != NULL) xmlFree(ExternalID);
3233}
3234
3235/**
3236 * htmlParseAttribute:
3237 * @ctxt:  an HTML parser context
3238 * @value:  a xmlChar ** used to store the value of the attribute
3239 *
3240 * parse an attribute
3241 *
3242 * [41] Attribute ::= Name Eq AttValue
3243 *
3244 * [25] Eq ::= S? '=' S?
3245 *
3246 * With namespace:
3247 *
3248 * [NS 11] Attribute ::= QName Eq AttValue
3249 *
3250 * Also the case QName == xmlns:??? is handled independently as a namespace
3251 * definition.
3252 *
3253 * Returns the attribute name, and the value in *value.
3254 */
3255
3256static const xmlChar *
3257htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3258    const xmlChar *name;
3259    xmlChar *val = NULL;
3260
3261    *value = NULL;
3262    name = htmlParseHTMLName(ctxt);
3263    if (name == NULL) {
3264	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3265	             "error parsing attribute name\n", NULL, NULL);
3266        return(NULL);
3267    }
3268
3269    /*
3270     * read the value
3271     */
3272    SKIP_BLANKS;
3273    if (CUR == '=') {
3274        NEXT;
3275	SKIP_BLANKS;
3276	val = htmlParseAttValue(ctxt);
3277    } else if (htmlIsBooleanAttr(name)) {
3278        /*
3279	 * assume a minimized attribute
3280	 */
3281	val = xmlStrdup(name);
3282    }
3283
3284    *value = val;
3285    return(name);
3286}
3287
3288/**
3289 * htmlCheckEncoding:
3290 * @ctxt:  an HTML parser context
3291 * @attvalue: the attribute value
3292 *
3293 * Checks an http-equiv attribute from a Meta tag to detect
3294 * the encoding
3295 * If a new encoding is detected the parser is switched to decode
3296 * it and pass UTF8
3297 */
3298static void
3299htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3300    const xmlChar *encoding;
3301
3302    if ((ctxt == NULL) || (attvalue == NULL))
3303	return;
3304
3305    /* do not change encoding */
3306    if (ctxt->input->encoding != NULL)
3307        return;
3308
3309    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3310    if (encoding != NULL) {
3311	encoding += 8;
3312    } else {
3313	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3314	if (encoding != NULL)
3315	    encoding += 9;
3316    }
3317    if (encoding != NULL) {
3318	xmlCharEncoding enc;
3319	xmlCharEncodingHandlerPtr handler;
3320
3321	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3322
3323	if (ctxt->input->encoding != NULL)
3324	    xmlFree((xmlChar *) ctxt->input->encoding);
3325	ctxt->input->encoding = xmlStrdup(encoding);
3326
3327	enc = xmlParseCharEncoding((const char *) encoding);
3328	/*
3329	 * registered set of known encodings
3330	 */
3331	if (enc != XML_CHAR_ENCODING_ERROR) {
3332	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3333	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3334		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3335		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3336		(ctxt->input->buf != NULL) &&
3337		(ctxt->input->buf->encoder == NULL)) {
3338		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3339		             "htmlCheckEncoding: wrong encoding meta\n",
3340			     NULL, NULL);
3341	    } else {
3342		xmlSwitchEncoding(ctxt, enc);
3343	    }
3344	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345	} else {
3346	    /*
3347	     * fallback for unknown encodings
3348	     */
3349	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3350	    if (handler != NULL) {
3351		xmlSwitchToEncoding(ctxt, handler);
3352		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3353	    } else {
3354		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3355	    }
3356	}
3357
3358	if ((ctxt->input->buf != NULL) &&
3359	    (ctxt->input->buf->encoder != NULL) &&
3360	    (ctxt->input->buf->raw != NULL) &&
3361	    (ctxt->input->buf->buffer != NULL)) {
3362	    int nbchars;
3363	    int processed;
3364
3365	    /*
3366	     * convert as much as possible to the parser reading buffer.
3367	     */
3368	    processed = ctxt->input->cur - ctxt->input->base;
3369	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3370	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3371		                       ctxt->input->buf->buffer,
3372				       ctxt->input->buf->raw);
3373	    if (nbchars < 0) {
3374		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3375		             "htmlCheckEncoding: encoder error\n",
3376			     NULL, NULL);
3377	    }
3378	    ctxt->input->base =
3379	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3380	}
3381    }
3382}
3383
3384/**
3385 * htmlCheckMeta:
3386 * @ctxt:  an HTML parser context
3387 * @atts:  the attributes values
3388 *
3389 * Checks an attributes from a Meta tag
3390 */
3391static void
3392htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3393    int i;
3394    const xmlChar *att, *value;
3395    int http = 0;
3396    const xmlChar *content = NULL;
3397
3398    if ((ctxt == NULL) || (atts == NULL))
3399	return;
3400
3401    i = 0;
3402    att = atts[i++];
3403    while (att != NULL) {
3404	value = atts[i++];
3405	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3406	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3407	    http = 1;
3408	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3409	    content = value;
3410	att = atts[i++];
3411    }
3412    if ((http) && (content != NULL))
3413	htmlCheckEncoding(ctxt, content);
3414
3415}
3416
/**
 * htmlParseStartTag:
 * @ctxt:  an HTML parser context
 *
 * parse a start of tag either for rule element or
 * EmptyElement. In both case we don't parse the tag closing chars.
 *
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
 *
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
 *
 * With namespace:
 *
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
 *
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
 *
 * The element name is pushed on the name stack and reported through the
 * SAX startElement callback, unless the tag is a misplaced html, head
 * or body tag, in which case it is discarded and ctxt->depth is
 * incremented. The attribute array is kept in ctxt->atts/ctxt->maxatts
 * so it can be reused; the attribute values stored in it are freed
 * before returning.
 *
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
 */

static int
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
    const xmlChar *name;
    const xmlChar *attname;
    xmlChar *attvalue;
    const xmlChar **atts;
    int nbatts = 0;
    int maxatts;
    int meta = 0;
    int i;
    int discardtag = 0;

    if ((ctxt == NULL) || (ctxt->input == NULL)) {
	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
		     "htmlParseStartTag: context error\n", NULL, NULL);
	return -1;
    }
    if (CUR != '<') return -1;
    NEXT;

    /* Work on the attribute array cached in the context (may be NULL). */
    atts = ctxt->atts;
    maxatts = ctxt->maxatts;

    GROW;
    name = htmlParseHTMLName(ctxt);
    if (name == NULL) {
	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
	             "htmlParseStartTag: invalid element name\n",
		     NULL, NULL);
	/* Dump the bogus tag like browsers do */
	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
	    NEXT;
        return -1;
    }
    /* Remember META tags: their attributes are inspected further down. */
    if (xmlStrEqual(name, BAD_CAST"meta"))
	meta = 1;

    /*
     * Check for auto-closure of HTML elements.
     */
    htmlAutoClose(ctxt, name);

    /*
     * Check for implied HTML elements.
     */
    htmlCheckImplied(ctxt, name);

    /*
     * Avoid html at any level > 0, head at any level != 1
     * or any attempt to recurse body
     *
     * Each discarded tag bumps ctxt->depth; htmlParseEndTag consults
     * and decrements it so that the matching end tag is ignored too.
     */
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <html> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if ((ctxt->nameNr != 1) &&
	(xmlStrEqual(name, BAD_CAST"head"))) {
	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
	             "htmlParseStartTag: misplaced <head> tag\n",
		     name, NULL);
	discardtag = 1;
	ctxt->depth++;
    }
    if (xmlStrEqual(name, BAD_CAST"body")) {
	int indx;
	/*
	 * Look for a body already open on the name stack.
	 * NOTE(review): the scan does not stop at the first match, so
	 * depth is bumped once per "body" entry found - confirm this is
	 * intended.
	 */
	for (indx = 0;indx < ctxt->nameNr;indx++) {
	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
		             "htmlParseStartTag: misplaced <body> tag\n",
			     name, NULL);
		discardtag = 1;
		ctxt->depth++;
	    }
	}
    }

    /*
     * Now parse the attributes, it ends up with the ending
     *
     * (S Attribute)* S?
     */
    SKIP_BLANKS;
    while ((IS_CHAR_CH(CUR)) &&
           (CUR != '>') &&
	   ((CUR != '/') || (NXT(1) != '>'))) {
	/* Snapshot of the consumed-char counter, to detect no progress. */
	long cons = ctxt->nbChars;

	GROW;
	attname = htmlParseAttribute(ctxt, &attvalue);
        if (attname != NULL) {

	    /*
	     * Well formedness requires at most one declaration of an attribute
	     */
	    for (i = 0; i < nbatts;i += 2) {
	        if (xmlStrEqual(atts[i], attname)) {
		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
		                 "Attribute %s redefined\n", attname, NULL);
		    /* The duplicate is dropped: release its value. */
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
	    }

	    /*
	     * Add the pair to atts
	     */
	    if (atts == NULL) {
	        maxatts = 22; /* allow for 10 attrs by default */
	        atts = (const xmlChar **)
		       xmlMalloc(maxatts * sizeof(xmlChar *));
		if (atts == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    } else if (nbatts + 4 > maxatts) {
		/*
		 * Need room for the new pair plus the two NULL
		 * terminators written below: double the array.
		 */
	        const xmlChar **n;

	        maxatts *= 2;
	        n = (const xmlChar **) xmlRealloc((void *) atts,
					     maxatts * sizeof(const xmlChar *));
		if (n == NULL) {
		    htmlErrMemory(ctxt, NULL);
		    if (attvalue != NULL)
			xmlFree(attvalue);
		    goto failed;
		}
		atts = n;
		ctxt->atts = atts;
		ctxt->maxatts = maxatts;
	    }
	    /* Store name/value and keep the array NULL-terminated. */
	    atts[nbatts++] = attname;
	    atts[nbatts++] = attvalue;
	    atts[nbatts] = NULL;
	    atts[nbatts + 1] = NULL;
	}
	else {
	    if (attvalue != NULL)
	        xmlFree(attvalue);
	    /* Dump the bogus attribute string up to the next blank or
	     * the end of the tag. */
	    while ((IS_CHAR_CH(CUR)) &&
	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
		   ((CUR != '/') || (NXT(1) != '>')))
		NEXT;
	}

failed:
	SKIP_BLANKS;
	/* Nothing was consumed in this iteration: bail out of the loop. */
        if (cons == ctxt->nbChars) {
	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
	                 "htmlParseStartTag: problem parsing attributes\n",
			 NULL, NULL);
	    break;
	}
    }

    /*
     * Handle specific association to the META tag
     */
    if (meta && (nbatts != 0))
	htmlCheckMeta(ctxt, atts);

    /*
     * SAX: Start of Element !
     */
    if (!discardtag) {
	htmlnamePush(ctxt, name);
	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
	    if (nbatts != 0)
		ctxt->sax->startElement(ctxt->userData, name, atts);
	    else
		ctxt->sax->startElement(ctxt->userData, name, NULL);
	}
    }

    /*
     * Release the attribute values (odd slots); the name slots are not
     * freed here and the array itself stays cached in the context.
     */
    if (atts != NULL) {
        for (i = 1;i < nbatts;i += 2) {
	    if (atts[i] != NULL)
		xmlFree((xmlChar *) atts[i]);
	}
    }

    return(discardtag);
}
3629
3630/**
3631 * htmlParseEndTag:
3632 * @ctxt:  an HTML parser context
3633 *
3634 * parse an end of tag
3635 *
3636 * [42] ETag ::= '</' Name S? '>'
3637 *
3638 * With namespace
3639 *
3640 * [NS 9] ETag ::= '</' QName S? '>'
3641 *
3642 * Returns 1 if the current level should be closed.
3643 */
3644
3645static int
3646htmlParseEndTag(htmlParserCtxtPtr ctxt)
3647{
3648    const xmlChar *name;
3649    const xmlChar *oldname;
3650    int i, ret;
3651
3652    if ((CUR != '<') || (NXT(1) != '/')) {
3653        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3654	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3655        return (0);
3656    }
3657    SKIP(2);
3658
3659    name = htmlParseHTMLName(ctxt);
3660    if (name == NULL)
3661        return (0);
3662    /*
3663     * We should definitely be at the ending "S? '>'" part
3664     */
3665    SKIP_BLANKS;
3666    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3667        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3668	             "End tag : expected '>'\n", NULL, NULL);
3669	if (ctxt->recovery) {
3670	    /*
3671	     * We're not at the ending > !!
3672	     * Error, unless in recover mode where we search forwards
3673	     * until we find a >
3674	     */
3675	    while (CUR != '\0' && CUR != '>') NEXT;
3676	    NEXT;
3677	}
3678    } else
3679        NEXT;
3680
3681    /*
3682     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3683     * out now.
3684     */
3685    if ((ctxt->depth > 0) &&
3686        (xmlStrEqual(name, BAD_CAST "html") ||
3687         xmlStrEqual(name, BAD_CAST "body") ||
3688	 xmlStrEqual(name, BAD_CAST "head"))) {
3689	ctxt->depth--;
3690	return (0);
3691    }
3692
3693    /*
3694     * If the name read is not one of the element in the parsing stack
3695     * then return, it's just an error.
3696     */
3697    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3698        if (xmlStrEqual(name, ctxt->nameTab[i]))
3699            break;
3700    }
3701    if (i < 0) {
3702        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3703	             "Unexpected end tag : %s\n", name, NULL);
3704        return (0);
3705    }
3706
3707
3708    /*
3709     * Check for auto-closure of HTML elements.
3710     */
3711
3712    htmlAutoCloseOnClose(ctxt, name);
3713
3714    /*
3715     * Well formedness constraints, opening and closing must match.
3716     * With the exception that the autoclose may have popped stuff out
3717     * of the stack.
3718     */
3719    if (!xmlStrEqual(name, ctxt->name)) {
3720        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3721            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3722	                 "Opening and ending tag mismatch: %s and %s\n",
3723			 name, ctxt->name);
3724        }
3725    }
3726
3727    /*
3728     * SAX: End of Tag
3729     */
3730    oldname = ctxt->name;
3731    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3732        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3733            ctxt->sax->endElement(ctxt->userData, name);
3734        htmlnamePop(ctxt);
3735        ret = 1;
3736    } else {
3737        ret = 0;
3738    }
3739
3740    return (ret);
3741}
3742
3743
3744/**
3745 * htmlParseReference:
3746 * @ctxt:  an HTML parser context
3747 *
3748 * parse and handle entity references in content,
3749 * this will end-up in a call to character() since this is either a
3750 * CharRef, or a predefined entity.
3751 */
3752static void
3753htmlParseReference(htmlParserCtxtPtr ctxt) {
3754    const htmlEntityDesc * ent;
3755    xmlChar out[6];
3756    const xmlChar *name;
3757    if (CUR != '&') return;
3758
3759    if (NXT(1) == '#') {
3760	unsigned int c;
3761	int bits, i = 0;
3762
3763	c = htmlParseCharRef(ctxt);
3764	if (c == 0)
3765	    return;
3766
3767        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3768        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3769        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3770        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3771
3772        for ( ; bits >= 0; bits-= 6) {
3773            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3774        }
3775	out[i] = 0;
3776
3777	htmlCheckParagraph(ctxt);
3778	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3779	    ctxt->sax->characters(ctxt->userData, out, i);
3780    } else {
3781	ent = htmlParseEntityRef(ctxt, &name);
3782	if (name == NULL) {
3783	    htmlCheckParagraph(ctxt);
3784	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3785	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3786	    return;
3787	}
3788	if ((ent == NULL) || !(ent->value > 0)) {
3789	    htmlCheckParagraph(ctxt);
3790	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3791		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3792		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3793		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3794	    }
3795	} else {
3796	    unsigned int c;
3797	    int bits, i = 0;
3798
3799	    c = ent->value;
3800	    if      (c <    0x80)
3801	            { out[i++]= c;                bits= -6; }
3802	    else if (c <   0x800)
3803	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3804	    else if (c < 0x10000)
3805	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3806	    else
3807	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3808
3809	    for ( ; bits >= 0; bits-= 6) {
3810		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3811	    }
3812	    out[i] = 0;
3813
3814	    htmlCheckParagraph(ctxt);
3815	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3816		ctxt->sax->characters(ctxt->userData, out, i);
3817	}
3818    }
3819}
3820
3821/**
3822 * htmlParseContent:
3823 * @ctxt:  an HTML parser context
3824 *
3825 * Parse a content: comment, sub-element, reference or text.
3826 */
3827
3828static void
3829htmlParseContent(htmlParserCtxtPtr ctxt) {
3830    xmlChar *currentNode;
3831    int depth;
3832    const xmlChar *name;
3833
3834    currentNode = xmlStrdup(ctxt->name);
3835    depth = ctxt->nameNr;
3836    while (1) {
3837	long cons = ctxt->nbChars;
3838
3839        GROW;
3840	/*
3841	 * Our tag or one of it's parent or children is ending.
3842	 */
3843        if ((CUR == '<') && (NXT(1) == '/')) {
3844	    if (htmlParseEndTag(ctxt) &&
3845		((currentNode != NULL) || (ctxt->nameNr == 0))) {
3846		if (currentNode != NULL)
3847		    xmlFree(currentNode);
3848		return;
3849	    }
3850	    continue; /* while */
3851        }
3852
3853	else if ((CUR == '<') &&
3854	         ((IS_ASCII_LETTER(NXT(1))) ||
3855		  (NXT(1) == '_') || (NXT(1) == ':'))) {
3856	    name = htmlParseHTMLName_nonInvasive(ctxt);
3857	    if (name == NULL) {
3858	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3859			 "htmlParseStartTag: invalid element name\n",
3860			 NULL, NULL);
3861	        /* Dump the bogus tag like browsers do */
3862 	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3863	            NEXT;
3864
3865	        if (currentNode != NULL)
3866	            xmlFree(currentNode);
3867	        return;
3868	    }
3869
3870	    if (ctxt->name != NULL) {
3871	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3872	            htmlAutoClose(ctxt, name);
3873	            continue;
3874	        }
3875	    }
3876	}
3877
3878	/*
3879	 * Has this node been popped out during parsing of
3880	 * the next element
3881	 */
3882        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3883	    (!xmlStrEqual(currentNode, ctxt->name)))
3884	     {
3885	    if (currentNode != NULL) xmlFree(currentNode);
3886	    return;
3887	}
3888
3889	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
3890	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
3891	    /*
3892	     * Handle SCRIPT/STYLE separately
3893	     */
3894	    htmlParseScript(ctxt);
3895	} else {
3896	    /*
3897	     * Sometimes DOCTYPE arrives in the middle of the document
3898	     */
3899	    if ((CUR == '<') && (NXT(1) == '!') &&
3900		(UPP(2) == 'D') && (UPP(3) == 'O') &&
3901		(UPP(4) == 'C') && (UPP(5) == 'T') &&
3902		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3903		(UPP(8) == 'E')) {
3904		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3905		             "Misplaced DOCTYPE declaration\n",
3906			     BAD_CAST "DOCTYPE" , NULL);
3907		htmlParseDocTypeDecl(ctxt);
3908	    }
3909
3910	    /*
3911	     * First case :  a comment
3912	     */
3913	    if ((CUR == '<') && (NXT(1) == '!') &&
3914		(NXT(2) == '-') && (NXT(3) == '-')) {
3915		htmlParseComment(ctxt);
3916	    }
3917
3918	    /*
3919	     * Second case : a Processing Instruction.
3920	     */
3921	    else if ((CUR == '<') && (NXT(1) == '?')) {
3922		htmlParsePI(ctxt);
3923	    }
3924
3925	    /*
3926	     * Third case :  a sub-element.
3927	     */
3928	    else if (CUR == '<') {
3929		htmlParseElement(ctxt);
3930	    }
3931
3932	    /*
3933	     * Fourth case : a reference. If if has not been resolved,
3934	     *    parsing returns it's Name, create the node
3935	     */
3936	    else if (CUR == '&') {
3937		htmlParseReference(ctxt);
3938	    }
3939
3940	    /*
3941	     * Fifth case : end of the resource
3942	     */
3943	    else if (CUR == 0) {
3944		htmlAutoCloseOnEnd(ctxt);
3945		break;
3946	    }
3947
3948	    /*
3949	     * Last case, text. Note that References are handled directly.
3950	     */
3951	    else {
3952		htmlParseCharData(ctxt);
3953	    }
3954
3955	    if (cons == ctxt->nbChars) {
3956		if (ctxt->node != NULL) {
3957		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3958		                 "detected an error in element content\n",
3959				 NULL, NULL);
3960		}
3961		break;
3962	    }
3963	}
3964        GROW;
3965    }
3966    if (currentNode != NULL) xmlFree(currentNode);
3967}
3968
3969/**
3970 * htmlParseContent:
3971 * @ctxt:  an HTML parser context
3972 *
3973 * Parse a content: comment, sub-element, reference or text.
3974 */
3975
3976void
3977__htmlParseContent(void *ctxt) {
3978    if (ctxt != NULL)
3979	htmlParseContent((htmlParserCtxtPtr) ctxt);
3980}
3981
3982/**
3983 * htmlParseElement:
3984 * @ctxt:  an HTML parser context
3985 *
3986 * parse an HTML element, this is highly recursive
3987 *
3988 * [39] element ::= EmptyElemTag | STag content ETag
3989 *
3990 * [41] Attribute ::= Name Eq AttValue
3991 */
3992
3993void
3994htmlParseElement(htmlParserCtxtPtr ctxt) {
3995    const xmlChar *name;
3996    xmlChar *currentNode = NULL;
3997    const htmlElemDesc * info;
3998    htmlParserNodeInfo node_info;
3999    int failed;
4000    int depth;
4001    const xmlChar *oldptr;
4002
4003    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4004	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4005		     "htmlParseElement: context error\n", NULL, NULL);
4006	return;
4007    }
4008    /* Capture start position */
4009    if (ctxt->record_info) {
4010        node_info.begin_pos = ctxt->input->consumed +
4011                          (CUR_PTR - ctxt->input->base);
4012	node_info.begin_line = ctxt->input->line;
4013    }
4014
4015    failed = htmlParseStartTag(ctxt);
4016    name = ctxt->name;
4017    if ((failed == -1) || (name == NULL)) {
4018	if (CUR == '>')
4019	    NEXT;
4020        return;
4021    }
4022
4023    /*
4024     * Lookup the info for that element.
4025     */
4026    info = htmlTagLookup(name);
4027    if (info == NULL) {
4028	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4029	             "Tag %s invalid\n", name, NULL);
4030    }
4031
4032    /*
4033     * Check for an Empty Element labeled the XML/SGML way
4034     */
4035    if ((CUR == '/') && (NXT(1) == '>')) {
4036        SKIP(2);
4037	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4038	    ctxt->sax->endElement(ctxt->userData, name);
4039	htmlnamePop(ctxt);
4040	return;
4041    }
4042
4043    if (CUR == '>') {
4044        NEXT;
4045    } else {
4046	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4047	             "Couldn't find end of Start Tag %s\n", name, NULL);
4048
4049	/*
4050	 * end of parsing of this node.
4051	 */
4052	if (xmlStrEqual(name, ctxt->name)) {
4053	    nodePop(ctxt);
4054	    htmlnamePop(ctxt);
4055	}
4056
4057	/*
4058	 * Capture end position and add node
4059	 */
4060	if (ctxt->record_info) {
4061	   node_info.end_pos = ctxt->input->consumed +
4062			      (CUR_PTR - ctxt->input->base);
4063	   node_info.end_line = ctxt->input->line;
4064	   node_info.node = ctxt->node;
4065	   xmlParserAddNodeInfo(ctxt, &node_info);
4066	}
4067	return;
4068    }
4069
4070    /*
4071     * Check for an Empty Element from DTD definition
4072     */
4073    if ((info != NULL) && (info->empty)) {
4074	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4075	    ctxt->sax->endElement(ctxt->userData, name);
4076	htmlnamePop(ctxt);
4077	return;
4078    }
4079
4080    /*
4081     * Parse the content of the element:
4082     */
4083    currentNode = xmlStrdup(ctxt->name);
4084    depth = ctxt->nameNr;
4085    while (IS_CHAR_CH(CUR)) {
4086	oldptr = ctxt->input->cur;
4087	htmlParseContent(ctxt);
4088	if (oldptr==ctxt->input->cur) break;
4089	if (ctxt->nameNr < depth) break;
4090    }
4091
4092    /*
4093     * Capture end position and add node
4094     */
4095    if ( currentNode != NULL && ctxt->record_info ) {
4096       node_info.end_pos = ctxt->input->consumed +
4097                          (CUR_PTR - ctxt->input->base);
4098       node_info.end_line = ctxt->input->line;
4099       node_info.node = ctxt->node;
4100       xmlParserAddNodeInfo(ctxt, &node_info);
4101    }
4102    if (!IS_CHAR_CH(CUR)) {
4103	htmlAutoCloseOnEnd(ctxt);
4104    }
4105
4106    if (currentNode != NULL)
4107	xmlFree(currentNode);
4108}
4109
4110/**
4111 * htmlParseDocument:
4112 * @ctxt:  an HTML parser context
4113 *
4114 * parse an HTML document (and build a tree if using the standard SAX
4115 * interface).
4116 *
4117 * Returns 0, -1 in case of error. the parser context is augmented
4118 *                as a result of the parsing.
4119 */
4120
4121int
4122htmlParseDocument(htmlParserCtxtPtr ctxt) {
4123    xmlChar start[4];
4124    xmlCharEncoding enc;
4125    xmlDtdPtr dtd;
4126
4127    xmlInitParser();
4128
4129    htmlDefaultSAXHandlerInit();
4130
4131    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4132	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4133		     "htmlParseDocument: context error\n", NULL, NULL);
4134	return(XML_ERR_INTERNAL_ERROR);
4135    }
4136    ctxt->html = 1;
4137    GROW;
4138    /*
4139     * SAX: beginning of the document processing.
4140     */
4141    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4142        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4143
4144    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4145        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4146	/*
4147	 * Get the 4 first bytes and decode the charset
4148	 * if enc != XML_CHAR_ENCODING_NONE
4149	 * plug some encoding conversion routines.
4150	 */
4151	start[0] = RAW;
4152	start[1] = NXT(1);
4153	start[2] = NXT(2);
4154	start[3] = NXT(3);
4155	enc = xmlDetectCharEncoding(&start[0], 4);
4156	if (enc != XML_CHAR_ENCODING_NONE) {
4157	    xmlSwitchEncoding(ctxt, enc);
4158	}
4159    }
4160
4161    /*
4162     * Wipe out everything which is before the first '<'
4163     */
4164    SKIP_BLANKS;
4165    if (CUR == 0) {
4166	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4167	             "Document is empty\n", NULL, NULL);
4168    }
4169
4170    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4171	ctxt->sax->startDocument(ctxt->userData);
4172
4173
4174    /*
4175     * Parse possible comments and PIs before any content
4176     */
4177    while (((CUR == '<') && (NXT(1) == '!') &&
4178            (NXT(2) == '-') && (NXT(3) == '-')) ||
4179	   ((CUR == '<') && (NXT(1) == '?'))) {
4180        htmlParseComment(ctxt);
4181        htmlParsePI(ctxt);
4182	SKIP_BLANKS;
4183    }
4184
4185
4186    /*
4187     * Then possibly doc type declaration(s) and more Misc
4188     * (doctypedecl Misc*)?
4189     */
4190    if ((CUR == '<') && (NXT(1) == '!') &&
4191	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4192	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4193	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4194	(UPP(8) == 'E')) {
4195	htmlParseDocTypeDecl(ctxt);
4196    }
4197    SKIP_BLANKS;
4198
4199    /*
4200     * Parse possible comments and PIs before any content
4201     */
4202    while (((CUR == '<') && (NXT(1) == '!') &&
4203            (NXT(2) == '-') && (NXT(3) == '-')) ||
4204	   ((CUR == '<') && (NXT(1) == '?'))) {
4205        htmlParseComment(ctxt);
4206        htmlParsePI(ctxt);
4207	SKIP_BLANKS;
4208    }
4209
4210    /*
4211     * Time to start parsing the tree itself
4212     */
4213    htmlParseContent(ctxt);
4214
4215    /*
4216     * autoclose
4217     */
4218    if (CUR == 0)
4219	htmlAutoCloseOnEnd(ctxt);
4220
4221
4222    /*
4223     * SAX: end of the document processing.
4224     */
4225    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4226        ctxt->sax->endDocument(ctxt->userData);
4227
4228    if (ctxt->myDoc != NULL) {
4229	dtd = xmlGetIntSubset(ctxt->myDoc);
4230	if (dtd == NULL)
4231	    ctxt->myDoc->intSubset =
4232		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4233		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4234		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4235    }
4236    if (! ctxt->wellFormed) return(-1);
4237    return(0);
4238}
4239
4240
4241/************************************************************************
4242 *									*
4243 *			Parser contexts handling			*
4244 *									*
4245 ************************************************************************/
4246
4247/**
4248 * htmlInitParserCtxt:
4249 * @ctxt:  an HTML parser context
4250 *
4251 * Initialize a parser context
4252 *
4253 * Returns 0 in case of success and -1 in case of error
4254 */
4255
4256static int
4257htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4258{
4259    htmlSAXHandler *sax;
4260
4261    if (ctxt == NULL) return(-1);
4262    memset(ctxt, 0, sizeof(htmlParserCtxt));
4263
4264    ctxt->dict = xmlDictCreate();
4265    if (ctxt->dict == NULL) {
4266        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4267	return(-1);
4268    }
4269    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4270    if (sax == NULL) {
4271        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4272	return(-1);
4273    }
4274    else
4275        memset(sax, 0, sizeof(htmlSAXHandler));
4276
4277    /* Allocate the Input stack */
4278    ctxt->inputTab = (htmlParserInputPtr *)
4279                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4280    if (ctxt->inputTab == NULL) {
4281        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4282	ctxt->inputNr = 0;
4283	ctxt->inputMax = 0;
4284	ctxt->input = NULL;
4285	return(-1);
4286    }
4287    ctxt->inputNr = 0;
4288    ctxt->inputMax = 5;
4289    ctxt->input = NULL;
4290    ctxt->version = NULL;
4291    ctxt->encoding = NULL;
4292    ctxt->standalone = -1;
4293    ctxt->instate = XML_PARSER_START;
4294
4295    /* Allocate the Node stack */
4296    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4297    if (ctxt->nodeTab == NULL) {
4298        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4299	ctxt->nodeNr = 0;
4300	ctxt->nodeMax = 0;
4301	ctxt->node = NULL;
4302	ctxt->inputNr = 0;
4303	ctxt->inputMax = 0;
4304	ctxt->input = NULL;
4305	return(-1);
4306    }
4307    ctxt->nodeNr = 0;
4308    ctxt->nodeMax = 10;
4309    ctxt->node = NULL;
4310
4311    /* Allocate the Name stack */
4312    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4313    if (ctxt->nameTab == NULL) {
4314        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4315	ctxt->nameNr = 0;
4316	ctxt->nameMax = 10;
4317	ctxt->name = NULL;
4318	ctxt->nodeNr = 0;
4319	ctxt->nodeMax = 0;
4320	ctxt->node = NULL;
4321	ctxt->inputNr = 0;
4322	ctxt->inputMax = 0;
4323	ctxt->input = NULL;
4324	return(-1);
4325    }
4326    ctxt->nameNr = 0;
4327    ctxt->nameMax = 10;
4328    ctxt->name = NULL;
4329
4330    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4331    else {
4332        ctxt->sax = sax;
4333	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4334    }
4335    ctxt->userData = ctxt;
4336    ctxt->myDoc = NULL;
4337    ctxt->wellFormed = 1;
4338    ctxt->replaceEntities = 0;
4339    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4340    ctxt->html = 1;
4341    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4342    ctxt->vctxt.userData = ctxt;
4343    ctxt->vctxt.error = xmlParserValidityError;
4344    ctxt->vctxt.warning = xmlParserValidityWarning;
4345    ctxt->record_info = 0;
4346    ctxt->validate = 0;
4347    ctxt->nbChars = 0;
4348    ctxt->checkIndex = 0;
4349    ctxt->catalogs = NULL;
4350    xmlInitNodeInfoSeq(&ctxt->node_seq);
4351    return(0);
4352}
4353
4354/**
4355 * htmlFreeParserCtxt:
4356 * @ctxt:  an HTML parser context
4357 *
4358 * Free all the memory used by a parser context. However the parsed
4359 * document in ctxt->myDoc is not freed.
4360 */
4361
4362void
4363htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4364{
4365    xmlFreeParserCtxt(ctxt);
4366}
4367
4368/**
4369 * htmlNewParserCtxt:
4370 *
4371 * Allocate and initialize a new parser context.
4372 *
4373 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4374 */
4375
4376htmlParserCtxtPtr
4377htmlNewParserCtxt(void)
4378{
4379    xmlParserCtxtPtr ctxt;
4380
4381    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4382    if (ctxt == NULL) {
4383        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4384	return(NULL);
4385    }
4386    memset(ctxt, 0, sizeof(xmlParserCtxt));
4387    if (htmlInitParserCtxt(ctxt) < 0) {
4388        htmlFreeParserCtxt(ctxt);
4389	return(NULL);
4390    }
4391    return(ctxt);
4392}
4393
4394/**
4395 * htmlCreateMemoryParserCtxt:
4396 * @buffer:  a pointer to a char array
4397 * @size:  the size of the array
4398 *
4399 * Create a parser context for an HTML in-memory document.
4400 *
4401 * Returns the new parser context or NULL
4402 */
4403htmlParserCtxtPtr
4404htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4405    xmlParserCtxtPtr ctxt;
4406    xmlParserInputPtr input;
4407    xmlParserInputBufferPtr buf;
4408
4409    if (buffer == NULL)
4410	return(NULL);
4411    if (size <= 0)
4412	return(NULL);
4413
4414    ctxt = htmlNewParserCtxt();
4415    if (ctxt == NULL)
4416	return(NULL);
4417
4418    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4419    if (buf == NULL) return(NULL);
4420
4421    input = xmlNewInputStream(ctxt);
4422    if (input == NULL) {
4423	xmlFreeParserCtxt(ctxt);
4424	return(NULL);
4425    }
4426
4427    input->filename = NULL;
4428    input->buf = buf;
4429    input->base = input->buf->buffer->content;
4430    input->cur = input->buf->buffer->content;
4431    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4432
4433    inputPush(ctxt, input);
4434    return(ctxt);
4435}
4436
4437/**
4438 * htmlCreateDocParserCtxt:
4439 * @cur:  a pointer to an array of xmlChar
4440 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4441 *
4442 * Create a parser context for an HTML document.
4443 *
4444 * TODO: check the need to add encoding handling there
4445 *
4446 * Returns the new parser context or NULL
4447 */
4448static htmlParserCtxtPtr
4449htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4450    int len;
4451    htmlParserCtxtPtr ctxt;
4452
4453    if (cur == NULL)
4454	return(NULL);
4455    len = xmlStrlen(cur);
4456    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4457    if (ctxt == NULL)
4458	return(NULL);
4459
4460    if (encoding != NULL) {
4461	xmlCharEncoding enc;
4462	xmlCharEncodingHandlerPtr handler;
4463
4464	if (ctxt->input->encoding != NULL)
4465	    xmlFree((xmlChar *) ctxt->input->encoding);
4466	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4467
4468	enc = xmlParseCharEncoding(encoding);
4469	/*
4470	 * registered set of known encodings
4471	 */
4472	if (enc != XML_CHAR_ENCODING_ERROR) {
4473	    xmlSwitchEncoding(ctxt, enc);
4474	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4475		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4476		             "Unsupported encoding %s\n",
4477			     (const xmlChar *) encoding, NULL);
4478	    }
4479	} else {
4480	    /*
4481	     * fallback for unknown encodings
4482	     */
4483	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4484	    if (handler != NULL) {
4485		xmlSwitchToEncoding(ctxt, handler);
4486	    } else {
4487		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4488		             "Unsupported encoding %s\n",
4489			     (const xmlChar *) encoding, NULL);
4490	    }
4491	}
4492    }
4493    return(ctxt);
4494}
4495
4496#ifdef LIBXML_PUSH_ENABLED
4497/************************************************************************
4498 *									*
4499 * 		Progressive parsing interfaces				*
4500 *									*
4501 ************************************************************************/
4502
4503/**
4504 * htmlParseLookupSequence:
4505 * @ctxt:  an HTML parser context
4506 * @first:  the first char to lookup
4507 * @next:  the next char to lookup or zero
4508 * @third:  the next char to lookup or zero
4509 * @comment: flag to force checking inside comments
4510 *
4511 * Try to find if a sequence (first, next, third) or  just (first next) or
4512 * (first) is available in the input stream.
4513 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4514 * to avoid rescanning sequences of bytes, it DOES change the state of the
4515 * parser, do not use liberally.
4516 * This is basically similar to xmlParseLookupSequence()
4517 *
4518 * Returns the index to the current parsing point if the full sequence
4519 *      is available, -1 otherwise.
4520 */
4521static int
4522htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4523                        xmlChar next, xmlChar third, int iscomment) {
4524    int base, len;
4525    htmlParserInputPtr in;
4526    const xmlChar *buf;
4527    int incomment = 0;
4528
4529    in = ctxt->input;
4530    if (in == NULL) return(-1);
4531    base = in->cur - in->base;
4532    if (base < 0) return(-1);
4533    if (ctxt->checkIndex > base)
4534        base = ctxt->checkIndex;
4535    if (in->buf == NULL) {
4536	buf = in->base;
4537	len = in->length;
4538    } else {
4539	buf = in->buf->buffer->content;
4540	len = in->buf->buffer->use;
4541    }
4542    /* take into account the sequence length */
4543    if (third) len -= 2;
4544    else if (next) len --;
4545    for (;base < len;base++) {
4546	if (!incomment && (base + 4 < len) && !iscomment) {
4547	    if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4548		(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4549		incomment = 1;
4550		/* do not increment past <! - some people use <!--> */
4551		base += 2;
4552	    }
4553	}
4554	if (incomment) {
4555	    if (base + 3 > len)
4556		return(-1);
4557	    if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4558		(buf[base + 2] == '>')) {
4559		incomment = 0;
4560		base += 2;
4561	    }
4562	    continue;
4563	}
4564        if (buf[base] == first) {
4565	    if (third != 0) {
4566		if ((buf[base + 1] != next) ||
4567		    (buf[base + 2] != third)) continue;
4568	    } else if (next != 0) {
4569		if (buf[base + 1] != next) continue;
4570	    }
4571	    ctxt->checkIndex = 0;
4572#ifdef DEBUG_PUSH
4573	    if (next == 0)
4574		xmlGenericError(xmlGenericErrorContext,
4575			"HPP: lookup '%c' found at %d\n",
4576			first, base);
4577	    else if (third == 0)
4578		xmlGenericError(xmlGenericErrorContext,
4579			"HPP: lookup '%c%c' found at %d\n",
4580			first, next, base);
4581	    else
4582		xmlGenericError(xmlGenericErrorContext,
4583			"HPP: lookup '%c%c%c' found at %d\n",
4584			first, next, third, base);
4585#endif
4586	    return(base - (in->cur - in->base));
4587	}
4588    }
4589    ctxt->checkIndex = base;
4590#ifdef DEBUG_PUSH
4591    if (next == 0)
4592	xmlGenericError(xmlGenericErrorContext,
4593		"HPP: lookup '%c' failed\n", first);
4594    else if (third == 0)
4595	xmlGenericError(xmlGenericErrorContext,
4596		"HPP: lookup '%c%c' failed\n", first, next);
4597    else
4598	xmlGenericError(xmlGenericErrorContext,
4599		"HPP: lookup '%c%c%c' failed\n", first, next, third);
4600#endif
4601    return(-1);
4602}
4603
4604/**
4605 * htmlParseTryOrFinish:
4606 * @ctxt:  an HTML parser context
4607 * @terminate:  last chunk indicator
4608 *
4609 * Try to progress on parsing
4610 *
4611 * Returns zero if no parsing was possible
4612 */
4613static int
4614htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4615    int ret = 0;
4616    htmlParserInputPtr in;
4617    int avail = 0;
4618    xmlChar cur, next;
4619
4620#ifdef DEBUG_PUSH
4621    switch (ctxt->instate) {
4622	case XML_PARSER_EOF:
4623	    xmlGenericError(xmlGenericErrorContext,
4624		    "HPP: try EOF\n"); break;
4625	case XML_PARSER_START:
4626	    xmlGenericError(xmlGenericErrorContext,
4627		    "HPP: try START\n"); break;
4628	case XML_PARSER_MISC:
4629	    xmlGenericError(xmlGenericErrorContext,
4630		    "HPP: try MISC\n");break;
4631	case XML_PARSER_COMMENT:
4632	    xmlGenericError(xmlGenericErrorContext,
4633		    "HPP: try COMMENT\n");break;
4634	case XML_PARSER_PROLOG:
4635	    xmlGenericError(xmlGenericErrorContext,
4636		    "HPP: try PROLOG\n");break;
4637	case XML_PARSER_START_TAG:
4638	    xmlGenericError(xmlGenericErrorContext,
4639		    "HPP: try START_TAG\n");break;
4640	case XML_PARSER_CONTENT:
4641	    xmlGenericError(xmlGenericErrorContext,
4642		    "HPP: try CONTENT\n");break;
4643	case XML_PARSER_CDATA_SECTION:
4644	    xmlGenericError(xmlGenericErrorContext,
4645		    "HPP: try CDATA_SECTION\n");break;
4646	case XML_PARSER_END_TAG:
4647	    xmlGenericError(xmlGenericErrorContext,
4648		    "HPP: try END_TAG\n");break;
4649	case XML_PARSER_ENTITY_DECL:
4650	    xmlGenericError(xmlGenericErrorContext,
4651		    "HPP: try ENTITY_DECL\n");break;
4652	case XML_PARSER_ENTITY_VALUE:
4653	    xmlGenericError(xmlGenericErrorContext,
4654		    "HPP: try ENTITY_VALUE\n");break;
4655	case XML_PARSER_ATTRIBUTE_VALUE:
4656	    xmlGenericError(xmlGenericErrorContext,
4657		    "HPP: try ATTRIBUTE_VALUE\n");break;
4658	case XML_PARSER_DTD:
4659	    xmlGenericError(xmlGenericErrorContext,
4660		    "HPP: try DTD\n");break;
4661	case XML_PARSER_EPILOG:
4662	    xmlGenericError(xmlGenericErrorContext,
4663		    "HPP: try EPILOG\n");break;
4664	case XML_PARSER_PI:
4665	    xmlGenericError(xmlGenericErrorContext,
4666		    "HPP: try PI\n");break;
4667	case XML_PARSER_SYSTEM_LITERAL:
4668	    xmlGenericError(xmlGenericErrorContext,
4669		    "HPP: try SYSTEM_LITERAL\n");break;
4670    }
4671#endif
4672
4673    while (1) {
4674
4675	in = ctxt->input;
4676	if (in == NULL) break;
4677	if (in->buf == NULL)
4678	    avail = in->length - (in->cur - in->base);
4679	else
4680	    avail = in->buf->buffer->use - (in->cur - in->base);
4681	if ((avail == 0) && (terminate)) {
4682	    htmlAutoCloseOnEnd(ctxt);
4683	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4684		/*
4685		 * SAX: end of the document processing.
4686		 */
4687		ctxt->instate = XML_PARSER_EOF;
4688		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4689		    ctxt->sax->endDocument(ctxt->userData);
4690	    }
4691	}
4692        if (avail < 1)
4693	    goto done;
4694	cur = in->cur[0];
4695	if (cur == 0) {
4696	    SKIP(1);
4697	    continue;
4698	}
4699
4700        switch (ctxt->instate) {
4701            case XML_PARSER_EOF:
4702	        /*
4703		 * Document parsing is done !
4704		 */
4705	        goto done;
4706            case XML_PARSER_START:
4707	        /*
4708		 * Very first chars read from the document flow.
4709		 */
4710		cur = in->cur[0];
4711		if (IS_BLANK_CH(cur)) {
4712		    SKIP_BLANKS;
4713		    if (in->buf == NULL)
4714			avail = in->length - (in->cur - in->base);
4715		    else
4716			avail = in->buf->buffer->use - (in->cur - in->base);
4717		}
4718		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4719		    ctxt->sax->setDocumentLocator(ctxt->userData,
4720						  &xmlDefaultSAXLocator);
4721		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4722	            (!ctxt->disableSAX))
4723		    ctxt->sax->startDocument(ctxt->userData);
4724
4725		cur = in->cur[0];
4726		next = in->cur[1];
4727		if ((cur == '<') && (next == '!') &&
4728		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4729		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4730		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4731		    (UPP(8) == 'E')) {
4732		    if ((!terminate) &&
4733		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4734			goto done;
4735#ifdef DEBUG_PUSH
4736		    xmlGenericError(xmlGenericErrorContext,
4737			    "HPP: Parsing internal subset\n");
4738#endif
4739		    htmlParseDocTypeDecl(ctxt);
4740		    ctxt->instate = XML_PARSER_PROLOG;
4741#ifdef DEBUG_PUSH
4742		    xmlGenericError(xmlGenericErrorContext,
4743			    "HPP: entering PROLOG\n");
4744#endif
4745                } else {
4746		    ctxt->instate = XML_PARSER_MISC;
4747#ifdef DEBUG_PUSH
4748		    xmlGenericError(xmlGenericErrorContext,
4749			    "HPP: entering MISC\n");
4750#endif
4751		}
4752		break;
4753            case XML_PARSER_MISC:
4754		SKIP_BLANKS;
4755		if (in->buf == NULL)
4756		    avail = in->length - (in->cur - in->base);
4757		else
4758		    avail = in->buf->buffer->use - (in->cur - in->base);
4759		if (avail < 2)
4760		    goto done;
4761		cur = in->cur[0];
4762		next = in->cur[1];
4763	        if ((cur == '<') && (next == '!') &&
4764		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4765		    if ((!terminate) &&
4766		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4767			goto done;
4768#ifdef DEBUG_PUSH
4769		    xmlGenericError(xmlGenericErrorContext,
4770			    "HPP: Parsing Comment\n");
4771#endif
4772		    htmlParseComment(ctxt);
4773		    ctxt->instate = XML_PARSER_MISC;
4774	        } else if ((cur == '<') && (next == '?')) {
4775		    if ((!terminate) &&
4776		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4777			goto done;
4778#ifdef DEBUG_PUSH
4779		    xmlGenericError(xmlGenericErrorContext,
4780			    "HPP: Parsing PI\n");
4781#endif
4782		    htmlParsePI(ctxt);
4783		    ctxt->instate = XML_PARSER_MISC;
4784		} else if ((cur == '<') && (next == '!') &&
4785		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4786		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4787		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4788		    (UPP(8) == 'E')) {
4789		    if ((!terminate) &&
4790		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4791			goto done;
4792#ifdef DEBUG_PUSH
4793		    xmlGenericError(xmlGenericErrorContext,
4794			    "HPP: Parsing internal subset\n");
4795#endif
4796		    htmlParseDocTypeDecl(ctxt);
4797		    ctxt->instate = XML_PARSER_PROLOG;
4798#ifdef DEBUG_PUSH
4799		    xmlGenericError(xmlGenericErrorContext,
4800			    "HPP: entering PROLOG\n");
4801#endif
4802		} else if ((cur == '<') && (next == '!') &&
4803		           (avail < 9)) {
4804		    goto done;
4805		} else {
4806		    ctxt->instate = XML_PARSER_START_TAG;
4807#ifdef DEBUG_PUSH
4808		    xmlGenericError(xmlGenericErrorContext,
4809			    "HPP: entering START_TAG\n");
4810#endif
4811		}
4812		break;
4813            case XML_PARSER_PROLOG:
4814		SKIP_BLANKS;
4815		if (in->buf == NULL)
4816		    avail = in->length - (in->cur - in->base);
4817		else
4818		    avail = in->buf->buffer->use - (in->cur - in->base);
4819		if (avail < 2)
4820		    goto done;
4821		cur = in->cur[0];
4822		next = in->cur[1];
4823		if ((cur == '<') && (next == '!') &&
4824		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4825		    if ((!terminate) &&
4826		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4827			goto done;
4828#ifdef DEBUG_PUSH
4829		    xmlGenericError(xmlGenericErrorContext,
4830			    "HPP: Parsing Comment\n");
4831#endif
4832		    htmlParseComment(ctxt);
4833		    ctxt->instate = XML_PARSER_PROLOG;
4834	        } else if ((cur == '<') && (next == '?')) {
4835		    if ((!terminate) &&
4836		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4837			goto done;
4838#ifdef DEBUG_PUSH
4839		    xmlGenericError(xmlGenericErrorContext,
4840			    "HPP: Parsing PI\n");
4841#endif
4842		    htmlParsePI(ctxt);
4843		    ctxt->instate = XML_PARSER_PROLOG;
4844		} else if ((cur == '<') && (next == '!') &&
4845		           (avail < 4)) {
4846		    goto done;
4847		} else {
4848		    ctxt->instate = XML_PARSER_START_TAG;
4849#ifdef DEBUG_PUSH
4850		    xmlGenericError(xmlGenericErrorContext,
4851			    "HPP: entering START_TAG\n");
4852#endif
4853		}
4854		break;
4855            case XML_PARSER_EPILOG:
4856		if (in->buf == NULL)
4857		    avail = in->length - (in->cur - in->base);
4858		else
4859		    avail = in->buf->buffer->use - (in->cur - in->base);
4860		if (avail < 1)
4861		    goto done;
4862		cur = in->cur[0];
4863		if (IS_BLANK_CH(cur)) {
4864		    htmlParseCharData(ctxt);
4865		    goto done;
4866		}
4867		if (avail < 2)
4868		    goto done;
4869		next = in->cur[1];
4870	        if ((cur == '<') && (next == '!') &&
4871		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
4872		    if ((!terminate) &&
4873		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4874			goto done;
4875#ifdef DEBUG_PUSH
4876		    xmlGenericError(xmlGenericErrorContext,
4877			    "HPP: Parsing Comment\n");
4878#endif
4879		    htmlParseComment(ctxt);
4880		    ctxt->instate = XML_PARSER_EPILOG;
4881	        } else if ((cur == '<') && (next == '?')) {
4882		    if ((!terminate) &&
4883		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4884			goto done;
4885#ifdef DEBUG_PUSH
4886		    xmlGenericError(xmlGenericErrorContext,
4887			    "HPP: Parsing PI\n");
4888#endif
4889		    htmlParsePI(ctxt);
4890		    ctxt->instate = XML_PARSER_EPILOG;
4891		} else if ((cur == '<') && (next == '!') &&
4892		           (avail < 4)) {
4893		    goto done;
4894		} else {
4895		    ctxt->errNo = XML_ERR_DOCUMENT_END;
4896		    ctxt->wellFormed = 0;
4897		    ctxt->instate = XML_PARSER_EOF;
4898#ifdef DEBUG_PUSH
4899		    xmlGenericError(xmlGenericErrorContext,
4900			    "HPP: entering EOF\n");
4901#endif
4902		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4903			ctxt->sax->endDocument(ctxt->userData);
4904		    goto done;
4905		}
4906		break;
4907            case XML_PARSER_START_TAG: {
4908	        const xmlChar *name;
4909		int failed;
4910		const htmlElemDesc * info;
4911
4912		if (avail < 2)
4913		    goto done;
4914		cur = in->cur[0];
4915	        if (cur != '<') {
4916		    ctxt->instate = XML_PARSER_CONTENT;
4917#ifdef DEBUG_PUSH
4918		    xmlGenericError(xmlGenericErrorContext,
4919			    "HPP: entering CONTENT\n");
4920#endif
4921		    break;
4922		}
4923		if (in->cur[1] == '/') {
4924		    ctxt->instate = XML_PARSER_END_TAG;
4925		    ctxt->checkIndex = 0;
4926#ifdef DEBUG_PUSH
4927		    xmlGenericError(xmlGenericErrorContext,
4928			    "HPP: entering END_TAG\n");
4929#endif
4930		    break;
4931		}
4932		if ((!terminate) &&
4933		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4934		    goto done;
4935
4936		failed = htmlParseStartTag(ctxt);
4937		name = ctxt->name;
4938		if ((failed == -1) ||
4939		    (name == NULL)) {
4940		    if (CUR == '>')
4941			NEXT;
4942		    break;
4943		}
4944
4945		/*
4946		 * Lookup the info for that element.
4947		 */
4948		info = htmlTagLookup(name);
4949		if (info == NULL) {
4950		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4951		                 "Tag %s invalid\n", name, NULL);
4952		}
4953
4954		/*
4955		 * Check for an Empty Element labeled the XML/SGML way
4956		 */
4957		if ((CUR == '/') && (NXT(1) == '>')) {
4958		    SKIP(2);
4959		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4960			ctxt->sax->endElement(ctxt->userData, name);
4961		    htmlnamePop(ctxt);
4962		    ctxt->instate = XML_PARSER_CONTENT;
4963#ifdef DEBUG_PUSH
4964		    xmlGenericError(xmlGenericErrorContext,
4965			    "HPP: entering CONTENT\n");
4966#endif
4967		    break;
4968		}
4969
4970		if (CUR == '>') {
4971		    NEXT;
4972		} else {
4973		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4974		                 "Couldn't find end of Start Tag %s\n",
4975				 name, NULL);
4976
4977		    /*
4978		     * end of parsing of this node.
4979		     */
4980		    if (xmlStrEqual(name, ctxt->name)) {
4981			nodePop(ctxt);
4982			htmlnamePop(ctxt);
4983		    }
4984
4985		    ctxt->instate = XML_PARSER_CONTENT;
4986#ifdef DEBUG_PUSH
4987		    xmlGenericError(xmlGenericErrorContext,
4988			    "HPP: entering CONTENT\n");
4989#endif
4990		    break;
4991		}
4992
4993		/*
4994		 * Check for an Empty Element from DTD definition
4995		 */
4996		if ((info != NULL) && (info->empty)) {
4997		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4998			ctxt->sax->endElement(ctxt->userData, name);
4999		    htmlnamePop(ctxt);
5000		}
5001		ctxt->instate = XML_PARSER_CONTENT;
5002#ifdef DEBUG_PUSH
5003		xmlGenericError(xmlGenericErrorContext,
5004			"HPP: entering CONTENT\n");
5005#endif
5006                break;
5007	    }
5008            case XML_PARSER_CONTENT: {
5009		long cons;
5010                /*
5011		 * Handle preparsed entities and charRef
5012		 */
5013		if (ctxt->token != 0) {
5014		    xmlChar chr[2] = { 0 , 0 } ;
5015
5016		    chr[0] = (xmlChar) ctxt->token;
5017		    htmlCheckParagraph(ctxt);
5018		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5019			ctxt->sax->characters(ctxt->userData, chr, 1);
5020		    ctxt->token = 0;
5021		    ctxt->checkIndex = 0;
5022		}
5023		if ((avail == 1) && (terminate)) {
5024		    cur = in->cur[0];
5025		    if ((cur != '<') && (cur != '&')) {
5026			if (ctxt->sax != NULL) {
5027			    if (IS_BLANK_CH(cur)) {
5028				if (ctxt->sax->ignorableWhitespace != NULL)
5029				    ctxt->sax->ignorableWhitespace(
5030					    ctxt->userData, &cur, 1);
5031			    } else {
5032				htmlCheckParagraph(ctxt);
5033				if (ctxt->sax->characters != NULL)
5034				    ctxt->sax->characters(
5035					    ctxt->userData, &cur, 1);
5036			    }
5037			}
5038			ctxt->token = 0;
5039			ctxt->checkIndex = 0;
5040			in->cur++;
5041			break;
5042		    }
5043		}
5044		if (avail < 2)
5045		    goto done;
5046		cur = in->cur[0];
5047		next = in->cur[1];
5048		cons = ctxt->nbChars;
5049		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5050		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5051		    /*
5052		     * Handle SCRIPT/STYLE separately
5053		     */
5054		    if (!terminate) {
5055		        int idx;
5056			xmlChar val;
5057
5058			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5059			if (idx < 0)
5060			    goto done;
5061		        val = in->cur[idx + 2];
5062			if (val == 0) /* bad cut of input */
5063			    goto done;
5064		    }
5065		    htmlParseScript(ctxt);
5066		    if ((cur == '<') && (next == '/')) {
5067			ctxt->instate = XML_PARSER_END_TAG;
5068			ctxt->checkIndex = 0;
5069#ifdef DEBUG_PUSH
5070			xmlGenericError(xmlGenericErrorContext,
5071				"HPP: entering END_TAG\n");
5072#endif
5073			break;
5074		    }
5075		} else {
5076		    /*
5077		     * Sometimes DOCTYPE arrives in the middle of the document
5078		     */
5079		    if ((cur == '<') && (next == '!') &&
5080			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5081			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5082			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5083			(UPP(8) == 'E')) {
5084			if ((!terminate) &&
5085			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5086			    goto done;
5087			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5088			             "Misplaced DOCTYPE declaration\n",
5089				     BAD_CAST "DOCTYPE" , NULL);
5090			htmlParseDocTypeDecl(ctxt);
5091		    } else if ((cur == '<') && (next == '!') &&
5092			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5093			if ((!terminate) &&
5094			    (htmlParseLookupSequence(
5095			    		ctxt, '-', '-', '>', 1) < 0))
5096			    goto done;
5097#ifdef DEBUG_PUSH
5098			xmlGenericError(xmlGenericErrorContext,
5099				"HPP: Parsing Comment\n");
5100#endif
5101			htmlParseComment(ctxt);
5102			ctxt->instate = XML_PARSER_CONTENT;
5103		    } else if ((cur == '<') && (next == '?')) {
5104			if ((!terminate) &&
5105			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5106			    goto done;
5107#ifdef DEBUG_PUSH
5108			xmlGenericError(xmlGenericErrorContext,
5109				"HPP: Parsing PI\n");
5110#endif
5111			htmlParsePI(ctxt);
5112			ctxt->instate = XML_PARSER_CONTENT;
5113		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5114			goto done;
5115		    } else if ((cur == '<') && (next == '/')) {
5116			ctxt->instate = XML_PARSER_END_TAG;
5117			ctxt->checkIndex = 0;
5118#ifdef DEBUG_PUSH
5119			xmlGenericError(xmlGenericErrorContext,
5120				"HPP: entering END_TAG\n");
5121#endif
5122			break;
5123		    } else if (cur == '<') {
5124			ctxt->instate = XML_PARSER_START_TAG;
5125			ctxt->checkIndex = 0;
5126#ifdef DEBUG_PUSH
5127			xmlGenericError(xmlGenericErrorContext,
5128				"HPP: entering START_TAG\n");
5129#endif
5130			break;
5131		    } else if (cur == '&') {
5132			if ((!terminate) &&
5133			    (htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5134			    goto done;
5135#ifdef DEBUG_PUSH
5136			xmlGenericError(xmlGenericErrorContext,
5137				"HPP: Parsing Reference\n");
5138#endif
5139			/* TODO: check generation of subtrees if noent !!! */
5140			htmlParseReference(ctxt);
5141		    } else {
5142		        /*
5143			 * check that the text sequence is complete
5144			 * before handing out the data to the parser
5145			 * to avoid problems with erroneous end of
5146			 * data detection.
5147			 */
5148			if ((!terminate) &&
5149			    (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5150			    goto done;
5151			ctxt->checkIndex = 0;
5152#ifdef DEBUG_PUSH
5153			xmlGenericError(xmlGenericErrorContext,
5154				"HPP: Parsing char data\n");
5155#endif
5156			htmlParseCharData(ctxt);
5157		    }
5158		}
5159		if (cons == ctxt->nbChars) {
5160		    if (ctxt->node != NULL) {
5161			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5162			             "detected an error in element content\n",
5163				     NULL, NULL);
5164		    }
5165		    NEXT;
5166		    break;
5167		}
5168
5169		break;
5170	    }
5171            case XML_PARSER_END_TAG:
5172		if (avail < 2)
5173		    goto done;
5174		if ((!terminate) &&
5175		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5176		    goto done;
5177		htmlParseEndTag(ctxt);
5178		if (ctxt->nameNr == 0) {
5179		    ctxt->instate = XML_PARSER_EPILOG;
5180		} else {
5181		    ctxt->instate = XML_PARSER_CONTENT;
5182		}
5183		ctxt->checkIndex = 0;
5184#ifdef DEBUG_PUSH
5185		xmlGenericError(xmlGenericErrorContext,
5186			"HPP: entering CONTENT\n");
5187#endif
5188	        break;
5189            case XML_PARSER_CDATA_SECTION:
5190		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5191			"HPP: internal error, state == CDATA\n",
5192			     NULL, NULL);
5193		ctxt->instate = XML_PARSER_CONTENT;
5194		ctxt->checkIndex = 0;
5195#ifdef DEBUG_PUSH
5196		xmlGenericError(xmlGenericErrorContext,
5197			"HPP: entering CONTENT\n");
5198#endif
5199		break;
5200            case XML_PARSER_DTD:
5201		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5202			"HPP: internal error, state == DTD\n",
5203			     NULL, NULL);
5204		ctxt->instate = XML_PARSER_CONTENT;
5205		ctxt->checkIndex = 0;
5206#ifdef DEBUG_PUSH
5207		xmlGenericError(xmlGenericErrorContext,
5208			"HPP: entering CONTENT\n");
5209#endif
5210		break;
5211            case XML_PARSER_COMMENT:
5212		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5213			"HPP: internal error, state == COMMENT\n",
5214			     NULL, NULL);
5215		ctxt->instate = XML_PARSER_CONTENT;
5216		ctxt->checkIndex = 0;
5217#ifdef DEBUG_PUSH
5218		xmlGenericError(xmlGenericErrorContext,
5219			"HPP: entering CONTENT\n");
5220#endif
5221		break;
5222            case XML_PARSER_PI:
5223		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5224			"HPP: internal error, state == PI\n",
5225			     NULL, NULL);
5226		ctxt->instate = XML_PARSER_CONTENT;
5227		ctxt->checkIndex = 0;
5228#ifdef DEBUG_PUSH
5229		xmlGenericError(xmlGenericErrorContext,
5230			"HPP: entering CONTENT\n");
5231#endif
5232		break;
5233            case XML_PARSER_ENTITY_DECL:
5234		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5235			"HPP: internal error, state == ENTITY_DECL\n",
5236			     NULL, NULL);
5237		ctxt->instate = XML_PARSER_CONTENT;
5238		ctxt->checkIndex = 0;
5239#ifdef DEBUG_PUSH
5240		xmlGenericError(xmlGenericErrorContext,
5241			"HPP: entering CONTENT\n");
5242#endif
5243		break;
5244            case XML_PARSER_ENTITY_VALUE:
5245		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5246			"HPP: internal error, state == ENTITY_VALUE\n",
5247			     NULL, NULL);
5248		ctxt->instate = XML_PARSER_CONTENT;
5249		ctxt->checkIndex = 0;
5250#ifdef DEBUG_PUSH
5251		xmlGenericError(xmlGenericErrorContext,
5252			"HPP: entering DTD\n");
5253#endif
5254		break;
5255            case XML_PARSER_ATTRIBUTE_VALUE:
5256		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5257			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5258			     NULL, NULL);
5259		ctxt->instate = XML_PARSER_START_TAG;
5260		ctxt->checkIndex = 0;
5261#ifdef DEBUG_PUSH
5262		xmlGenericError(xmlGenericErrorContext,
5263			"HPP: entering START_TAG\n");
5264#endif
5265		break;
5266	    case XML_PARSER_SYSTEM_LITERAL:
5267		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5268		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5269			     NULL, NULL);
5270		ctxt->instate = XML_PARSER_CONTENT;
5271		ctxt->checkIndex = 0;
5272#ifdef DEBUG_PUSH
5273		xmlGenericError(xmlGenericErrorContext,
5274			"HPP: entering CONTENT\n");
5275#endif
5276		break;
5277	    case XML_PARSER_IGNORE:
5278		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5279			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5280			     NULL, NULL);
5281		ctxt->instate = XML_PARSER_CONTENT;
5282		ctxt->checkIndex = 0;
5283#ifdef DEBUG_PUSH
5284		xmlGenericError(xmlGenericErrorContext,
5285			"HPP: entering CONTENT\n");
5286#endif
5287		break;
5288	    case XML_PARSER_PUBLIC_LITERAL:
5289		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5290			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5291			     NULL, NULL);
5292		ctxt->instate = XML_PARSER_CONTENT;
5293		ctxt->checkIndex = 0;
5294#ifdef DEBUG_PUSH
5295		xmlGenericError(xmlGenericErrorContext,
5296			"HPP: entering CONTENT\n");
5297#endif
5298		break;
5299
5300	}
5301    }
5302done:
5303    if ((avail == 0) && (terminate)) {
5304	htmlAutoCloseOnEnd(ctxt);
5305	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5306	    /*
5307	     * SAX: end of the document processing.
5308	     */
5309	    ctxt->instate = XML_PARSER_EOF;
5310	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5311		ctxt->sax->endDocument(ctxt->userData);
5312	}
5313    }
5314    if ((ctxt->myDoc != NULL) &&
5315	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5316	 (ctxt->instate == XML_PARSER_EPILOG))) {
5317	xmlDtdPtr dtd;
5318	dtd = xmlGetIntSubset(ctxt->myDoc);
5319	if (dtd == NULL)
5320	    ctxt->myDoc->intSubset =
5321		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5322		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5323		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5324    }
5325#ifdef DEBUG_PUSH
5326    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5327#endif
5328    return(ret);
5329}
5330
5331/**
5332 * htmlParseChunk:
5333 * @ctxt:  an HTML parser context
5334 * @chunk:  an char array
5335 * @size:  the size in byte of the chunk
5336 * @terminate:  last chunk indicator
5337 *
5338 * Parse a Chunk of memory
5339 *
5340 * Returns zero if no error, the xmlParserErrors otherwise.
5341 */
5342int
5343htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5344              int terminate) {
5345    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5346	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5347		     "htmlParseChunk: context error\n", NULL, NULL);
5348	return(XML_ERR_INTERNAL_ERROR);
5349    }
5350    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5351        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5352	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5353	int cur = ctxt->input->cur - ctxt->input->base;
5354	int res;
5355
5356	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5357	if (res < 0) {
5358	    ctxt->errNo = XML_PARSER_EOF;
5359	    ctxt->disableSAX = 1;
5360	    return (XML_PARSER_EOF);
5361	}
5362	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5363	ctxt->input->cur = ctxt->input->base + cur;
5364	ctxt->input->end =
5365	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5366#ifdef DEBUG_PUSH
5367	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5368#endif
5369
5370#if 0
5371	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5372	    htmlParseTryOrFinish(ctxt, terminate);
5373#endif
5374    } else if (ctxt->instate != XML_PARSER_EOF) {
5375	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5376	    xmlParserInputBufferPtr in = ctxt->input->buf;
5377	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5378		    (in->raw != NULL)) {
5379		int nbchars;
5380
5381		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5382		if (nbchars < 0) {
5383		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5384			         "encoder error\n", NULL, NULL);
5385		    return(XML_ERR_INVALID_ENCODING);
5386		}
5387	    }
5388	}
5389    }
5390    htmlParseTryOrFinish(ctxt, terminate);
5391    if (terminate) {
5392	if ((ctxt->instate != XML_PARSER_EOF) &&
5393	    (ctxt->instate != XML_PARSER_EPILOG) &&
5394	    (ctxt->instate != XML_PARSER_MISC)) {
5395	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5396	    ctxt->wellFormed = 0;
5397	}
5398	if (ctxt->instate != XML_PARSER_EOF) {
5399	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5400		ctxt->sax->endDocument(ctxt->userData);
5401	}
5402	ctxt->instate = XML_PARSER_EOF;
5403    }
5404    return((xmlParserErrors) ctxt->errNo);
5405}
5406
5407/************************************************************************
5408 *									*
5409 *			User entry points				*
5410 *									*
5411 ************************************************************************/
5412
5413/**
5414 * htmlCreatePushParserCtxt:
5415 * @sax:  a SAX handler
5416 * @user_data:  The user data returned on SAX callbacks
5417 * @chunk:  a pointer to an array of chars
5418 * @size:  number of chars in the array
5419 * @filename:  an optional file name or URI
5420 * @enc:  an optional encoding
5421 *
5422 * Create a parser context for using the HTML parser in push mode
5423 * The value of @filename is used for fetching external entities
5424 * and error/warning reports.
5425 *
5426 * Returns the new parser context or NULL
5427 */
5428htmlParserCtxtPtr
5429htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5430                         const char *chunk, int size, const char *filename,
5431			 xmlCharEncoding enc) {
5432    htmlParserCtxtPtr ctxt;
5433    htmlParserInputPtr inputStream;
5434    xmlParserInputBufferPtr buf;
5435
5436    xmlInitParser();
5437
5438    buf = xmlAllocParserInputBuffer(enc);
5439    if (buf == NULL) return(NULL);
5440
5441    ctxt = htmlNewParserCtxt();
5442    if (ctxt == NULL) {
5443	xmlFreeParserInputBuffer(buf);
5444	return(NULL);
5445    }
5446    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
5447	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5448    if (sax != NULL) {
5449	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5450	    xmlFree(ctxt->sax);
5451	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5452	if (ctxt->sax == NULL) {
5453	    xmlFree(buf);
5454	    xmlFree(ctxt);
5455	    return(NULL);
5456	}
5457	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5458	if (user_data != NULL)
5459	    ctxt->userData = user_data;
5460    }
5461    if (filename == NULL) {
5462	ctxt->directory = NULL;
5463    } else {
5464        ctxt->directory = xmlParserGetDirectory(filename);
5465    }
5466
5467    inputStream = htmlNewInputStream(ctxt);
5468    if (inputStream == NULL) {
5469	xmlFreeParserCtxt(ctxt);
5470	xmlFree(buf);
5471	return(NULL);
5472    }
5473
5474    if (filename == NULL)
5475	inputStream->filename = NULL;
5476    else
5477	inputStream->filename = (char *)
5478	    xmlCanonicPath((const xmlChar *) filename);
5479    inputStream->buf = buf;
5480    inputStream->base = inputStream->buf->buffer->content;
5481    inputStream->cur = inputStream->buf->buffer->content;
5482    inputStream->end =
5483	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5484
5485    inputPush(ctxt, inputStream);
5486
5487    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5488        (ctxt->input->buf != NULL))  {
5489	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5490	int cur = ctxt->input->cur - ctxt->input->base;
5491
5492	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5493
5494	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5495	ctxt->input->cur = ctxt->input->base + cur;
5496	ctxt->input->end =
5497	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5498#ifdef DEBUG_PUSH
5499	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5500#endif
5501    }
5502    ctxt->progressive = 1;
5503
5504    return(ctxt);
5505}
5506#endif /* LIBXML_PUSH_ENABLED */
5507
5508/**
5509 * htmlSAXParseDoc:
5510 * @cur:  a pointer to an array of xmlChar
5511 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5512 * @sax:  the SAX handler block
5513 * @userData: if using SAX, this pointer will be provided on callbacks.
5514 *
5515 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5516 * to handle parse events. If sax is NULL, fallback to the default DOM
5517 * behavior and return a tree.
5518 *
5519 * Returns the resulting document tree unless SAX is NULL or the document is
5520 *     not well formed.
5521 */
5522
5523htmlDocPtr
5524htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
5525    htmlDocPtr ret;
5526    htmlParserCtxtPtr ctxt;
5527
5528    xmlInitParser();
5529
5530    if (cur == NULL) return(NULL);
5531
5532
5533    ctxt = htmlCreateDocParserCtxt(cur, encoding);
5534    if (ctxt == NULL) return(NULL);
5535    if (sax != NULL) {
5536        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5537        ctxt->sax = sax;
5538        ctxt->userData = userData;
5539    }
5540
5541    htmlParseDocument(ctxt);
5542    ret = ctxt->myDoc;
5543    if (sax != NULL) {
5544	ctxt->sax = NULL;
5545	ctxt->userData = NULL;
5546    }
5547    htmlFreeParserCtxt(ctxt);
5548
5549    return(ret);
5550}
5551
5552/**
5553 * htmlParseDoc:
5554 * @cur:  a pointer to an array of xmlChar
5555 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5556 *
5557 * parse an HTML in-memory document and build a tree.
5558 *
5559 * Returns the resulting document tree
5560 */
5561
5562htmlDocPtr
5563htmlParseDoc(xmlChar *cur, const char *encoding) {
5564    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5565}
5566
5567
5568/**
5569 * htmlCreateFileParserCtxt:
5570 * @filename:  the filename
5571 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5572 *
5573 * Create a parser context for a file content.
5574 * Automatic support for ZLIB/Compress compressed document is provided
5575 * by default if found at compile-time.
5576 *
5577 * Returns the new parser context or NULL
5578 */
5579htmlParserCtxtPtr
5580htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5581{
5582    htmlParserCtxtPtr ctxt;
5583    htmlParserInputPtr inputStream;
5584    char *canonicFilename;
5585    /* htmlCharEncoding enc; */
5586    xmlChar *content, *content_line = (xmlChar *) "charset=";
5587
5588    if (filename == NULL)
5589        return(NULL);
5590
5591    ctxt = htmlNewParserCtxt();
5592    if (ctxt == NULL) {
5593	return(NULL);
5594    }
5595    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
5596    if (canonicFilename == NULL) {
5597#ifdef LIBXML_SAX1_ENABLED
5598	if (xmlDefaultSAXHandler.error != NULL) {
5599	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5600	}
5601#endif
5602	xmlFreeParserCtxt(ctxt);
5603	return(NULL);
5604    }
5605
5606    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5607    xmlFree(canonicFilename);
5608    if (inputStream == NULL) {
5609	xmlFreeParserCtxt(ctxt);
5610	return(NULL);
5611    }
5612
5613    inputPush(ctxt, inputStream);
5614
5615    /* set encoding */
5616    if (encoding) {
5617        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5618	if (content) {
5619	    strcpy ((char *)content, (char *)content_line);
5620            strcat ((char *)content, (char *)encoding);
5621            htmlCheckEncoding (ctxt, content);
5622	    xmlFree (content);
5623	}
5624    }
5625
5626    return(ctxt);
5627}
5628
5629/**
5630 * htmlSAXParseFile:
5631 * @filename:  the filename
5632 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5633 * @sax:  the SAX handler block
5634 * @userData: if using SAX, this pointer will be provided on callbacks.
5635 *
5636 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5637 * compressed document is provided by default if found at compile-time.
5638 * It use the given SAX function block to handle the parsing callback.
5639 * If sax is NULL, fallback to the default DOM tree building routines.
5640 *
5641 * Returns the resulting document tree unless SAX is NULL or the document is
5642 *     not well formed.
5643 */
5644
5645htmlDocPtr
5646htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5647                 void *userData) {
5648    htmlDocPtr ret;
5649    htmlParserCtxtPtr ctxt;
5650    htmlSAXHandlerPtr oldsax = NULL;
5651
5652    xmlInitParser();
5653
5654    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5655    if (ctxt == NULL) return(NULL);
5656    if (sax != NULL) {
5657	oldsax = ctxt->sax;
5658        ctxt->sax = sax;
5659        ctxt->userData = userData;
5660    }
5661
5662    htmlParseDocument(ctxt);
5663
5664    ret = ctxt->myDoc;
5665    if (sax != NULL) {
5666        ctxt->sax = oldsax;
5667        ctxt->userData = NULL;
5668    }
5669    htmlFreeParserCtxt(ctxt);
5670
5671    return(ret);
5672}
5673
5674/**
5675 * htmlParseFile:
5676 * @filename:  the filename
5677 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5678 *
5679 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5680 * compressed document is provided by default if found at compile-time.
5681 *
5682 * Returns the resulting document tree
5683 */
5684
5685htmlDocPtr
5686htmlParseFile(const char *filename, const char *encoding) {
5687    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5688}
5689
5690/**
5691 * htmlHandleOmittedElem:
5692 * @val:  int 0 or 1
5693 *
5694 * Set and return the previous value for handling HTML omitted tags.
5695 *
5696 * Returns the last value for 0 for no handling, 1 for auto insertion.
5697 */
5698
5699int
5700htmlHandleOmittedElem(int val) {
5701    int old = htmlOmittedDefaultValue;
5702
5703    htmlOmittedDefaultValue = val;
5704    return(old);
5705}
5706
5707/**
5708 * htmlElementAllowedHere:
5709 * @parent: HTML parent element
5710 * @elt: HTML element
5711 *
5712 * Checks whether an HTML element may be a direct child of a parent element.
5713 * Note - doesn't check for deprecated elements
5714 *
5715 * Returns 1 if allowed; 0 otherwise.
5716 */
5717int
5718htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5719  const char** p ;
5720
5721  if ( ! elt || ! parent || ! parent->subelts )
5722	return 0 ;
5723
5724  for ( p = parent->subelts; *p; ++p )
5725    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
5726      return 1 ;
5727
5728  return 0 ;
5729}
5730/**
5731 * htmlElementStatusHere:
5732 * @parent: HTML parent element
5733 * @elt: HTML element
5734 *
5735 * Checks whether an HTML element may be a direct child of a parent element.
5736 * and if so whether it is valid or deprecated.
5737 *
5738 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5739 */
5740htmlStatus
5741htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5742  if ( ! parent || ! elt )
5743    return HTML_INVALID ;
5744  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5745    return HTML_INVALID ;
5746
5747  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5748}
5749/**
5750 * htmlAttrAllowed:
5751 * @elt: HTML element
5752 * @attr: HTML attribute
5753 * @legacy: whether to allow deprecated attributes
5754 *
5755 * Checks whether an attribute is valid for an element
5756 * Has full knowledge of Required and Deprecated attributes
5757 *
5758 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5759 */
5760htmlStatus
5761htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5762  const char** p ;
5763
5764  if ( !elt || ! attr )
5765	return HTML_INVALID ;
5766
5767  if ( elt->attrs_req )
5768    for ( p = elt->attrs_req; *p; ++p)
5769      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5770        return HTML_REQUIRED ;
5771
5772  if ( elt->attrs_opt )
5773    for ( p = elt->attrs_opt; *p; ++p)
5774      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5775        return HTML_VALID ;
5776
5777  if ( legacy && elt->attrs_depr )
5778    for ( p = elt->attrs_depr; *p; ++p)
5779      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
5780        return HTML_DEPRECATED ;
5781
5782  return HTML_INVALID ;
5783}
5784/**
5785 * htmlNodeStatus:
5786 * @node: an htmlNodePtr in a tree
5787 * @legacy: whether to allow deprecated elements (YES is faster here
5788 *	for Element nodes)
5789 *
5790 * Checks whether the tree node is valid.  Experimental (the author
5791 *     only uses the HTML enhancements in a SAX parser)
5792 *
5793 * Return: for Element nodes, a return from htmlElementAllowedHere (if
5794 *	legacy allowed) or htmlElementStatusHere (otherwise).
5795 *	for Attribute nodes, a return from htmlAttrAllowed
5796 *	for other nodes, HTML_NA (no checks performed)
5797 */
5798htmlStatus
5799htmlNodeStatus(const htmlNodePtr node, int legacy) {
5800  if ( ! node )
5801    return HTML_INVALID ;
5802
5803  switch ( node->type ) {
5804    case XML_ELEMENT_NODE:
5805      return legacy
5806	? ( htmlElementAllowedHere (
5807		htmlTagLookup(node->parent->name) , node->name
5808		) ? HTML_VALID : HTML_INVALID )
5809	: htmlElementStatusHere(
5810		htmlTagLookup(node->parent->name) ,
5811		htmlTagLookup(node->name) )
5812	;
5813    case XML_ATTRIBUTE_NODE:
5814      return htmlAttrAllowed(
5815	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5816    default: return HTML_NA ;
5817  }
5818}
5819/************************************************************************
5820 *									*
5821 *	New set (2.6.0) of simpler and more flexible APIs		*
5822 *									*
5823 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded; a NULL @str is ignored, and a NULL dict means the string
 * is always freed.
 *
 * The expansion is a single statement (do/while(0)) and carries no
 * trailing semicolon, so it can be used safely in an unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) || 				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5835
5836/**
5837 * htmlCtxtReset:
5838 * @ctxt: an HTML parser context
5839 *
5840 * Reset a parser context
5841 */
5842void
5843htmlCtxtReset(htmlParserCtxtPtr ctxt)
5844{
5845    xmlParserInputPtr input;
5846    xmlDictPtr dict;
5847
5848    if (ctxt == NULL)
5849        return;
5850
5851    xmlInitParser();
5852    dict = ctxt->dict;
5853
5854    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5855        xmlFreeInputStream(input);
5856    }
5857    ctxt->inputNr = 0;
5858    ctxt->input = NULL;
5859
5860    ctxt->spaceNr = 0;
5861    if (ctxt->spaceTab != NULL) {
5862	ctxt->spaceTab[0] = -1;
5863	ctxt->space = &ctxt->spaceTab[0];
5864    } else {
5865	ctxt->space = NULL;
5866    }
5867
5868
5869    ctxt->nodeNr = 0;
5870    ctxt->node = NULL;
5871
5872    ctxt->nameNr = 0;
5873    ctxt->name = NULL;
5874
5875    DICT_FREE(ctxt->version);
5876    ctxt->version = NULL;
5877    DICT_FREE(ctxt->encoding);
5878    ctxt->encoding = NULL;
5879    DICT_FREE(ctxt->directory);
5880    ctxt->directory = NULL;
5881    DICT_FREE(ctxt->extSubURI);
5882    ctxt->extSubURI = NULL;
5883    DICT_FREE(ctxt->extSubSystem);
5884    ctxt->extSubSystem = NULL;
5885    if (ctxt->myDoc != NULL)
5886        xmlFreeDoc(ctxt->myDoc);
5887    ctxt->myDoc = NULL;
5888
5889    ctxt->standalone = -1;
5890    ctxt->hasExternalSubset = 0;
5891    ctxt->hasPErefs = 0;
5892    ctxt->html = 1;
5893    ctxt->external = 0;
5894    ctxt->instate = XML_PARSER_START;
5895    ctxt->token = 0;
5896
5897    ctxt->wellFormed = 1;
5898    ctxt->nsWellFormed = 1;
5899    ctxt->valid = 1;
5900    ctxt->vctxt.userData = ctxt;
5901    ctxt->vctxt.error = xmlParserValidityError;
5902    ctxt->vctxt.warning = xmlParserValidityWarning;
5903    ctxt->record_info = 0;
5904    ctxt->nbChars = 0;
5905    ctxt->checkIndex = 0;
5906    ctxt->inSubset = 0;
5907    ctxt->errNo = XML_ERR_OK;
5908    ctxt->depth = 0;
5909    ctxt->charset = XML_CHAR_ENCODING_NONE;
5910    ctxt->catalogs = NULL;
5911    xmlInitNodeInfoSeq(&ctxt->node_seq);
5912
5913    if (ctxt->attsDefault != NULL) {
5914        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5915        ctxt->attsDefault = NULL;
5916    }
5917    if (ctxt->attsSpecial != NULL) {
5918        xmlHashFree(ctxt->attsSpecial, NULL);
5919        ctxt->attsSpecial = NULL;
5920    }
5921}
5922
5923/**
5924 * htmlCtxtUseOptions:
5925 * @ctxt: an HTML parser context
5926 * @options:  a combination of htmlParserOption(s)
5927 *
5928 * Applies the options to the parser context
5929 *
5930 * Returns 0 in case of success, the set of unknown or unimplemented options
5931 *         in case of error.
5932 */
5933int
5934htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5935{
5936    if (ctxt == NULL)
5937        return(-1);
5938
5939    if (options & HTML_PARSE_NOWARNING) {
5940        ctxt->sax->warning = NULL;
5941        ctxt->vctxt.warning = NULL;
5942        options -= XML_PARSE_NOWARNING;
5943	ctxt->options |= XML_PARSE_NOWARNING;
5944    }
5945    if (options & HTML_PARSE_NOERROR) {
5946        ctxt->sax->error = NULL;
5947        ctxt->vctxt.error = NULL;
5948        ctxt->sax->fatalError = NULL;
5949        options -= XML_PARSE_NOERROR;
5950	ctxt->options |= XML_PARSE_NOERROR;
5951    }
5952    if (options & HTML_PARSE_PEDANTIC) {
5953        ctxt->pedantic = 1;
5954        options -= XML_PARSE_PEDANTIC;
5955	ctxt->options |= XML_PARSE_PEDANTIC;
5956    } else
5957        ctxt->pedantic = 0;
5958    if (options & XML_PARSE_NOBLANKS) {
5959        ctxt->keepBlanks = 0;
5960        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5961        options -= XML_PARSE_NOBLANKS;
5962	ctxt->options |= XML_PARSE_NOBLANKS;
5963    } else
5964        ctxt->keepBlanks = 1;
5965    if (options & HTML_PARSE_RECOVER) {
5966        ctxt->recovery = 1;
5967	options -= HTML_PARSE_RECOVER;
5968    } else
5969        ctxt->recovery = 0;
5970    if (options & HTML_PARSE_COMPACT) {
5971	ctxt->options |= HTML_PARSE_COMPACT;
5972        options -= HTML_PARSE_COMPACT;
5973    }
5974    ctxt->dictNames = 0;
5975    return (options);
5976}
5977
5978/**
5979 * htmlDoRead:
5980 * @ctxt:  an HTML parser context
5981 * @URL:  the base URL to use for the document
5982 * @encoding:  the document encoding, or NULL
5983 * @options:  a combination of htmlParserOption(s)
5984 * @reuse:  keep the context for reuse
5985 *
5986 * Common front-end for the htmlRead functions
5987 *
5988 * Returns the resulting document tree or NULL
5989 */
5990static htmlDocPtr
5991htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
5992          int options, int reuse)
5993{
5994    htmlDocPtr ret;
5995
5996    htmlCtxtUseOptions(ctxt, options);
5997    ctxt->html = 1;
5998    if (encoding != NULL) {
5999        xmlCharEncodingHandlerPtr hdlr;
6000
6001	hdlr = xmlFindCharEncodingHandler(encoding);
6002	if (hdlr != NULL) {
6003	    xmlSwitchToEncoding(ctxt, hdlr);
6004	    if (ctxt->input->encoding != NULL)
6005	      xmlFree((xmlChar *) ctxt->input->encoding);
6006            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6007        }
6008    }
6009    if ((URL != NULL) && (ctxt->input != NULL) &&
6010        (ctxt->input->filename == NULL))
6011        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6012    htmlParseDocument(ctxt);
6013    ret = ctxt->myDoc;
6014    ctxt->myDoc = NULL;
6015    if (!reuse) {
6016        if ((ctxt->dictNames) &&
6017	    (ret != NULL) &&
6018	    (ret->dict == ctxt->dict))
6019	    ctxt->dict = NULL;
6020	xmlFreeParserCtxt(ctxt);
6021    }
6022    return (ret);
6023}
6024
6025/**
6026 * htmlReadDoc:
6027 * @cur:  a pointer to a zero terminated string
6028 * @URL:  the base URL to use for the document
6029 * @encoding:  the document encoding, or NULL
6030 * @options:  a combination of htmlParserOption(s)
6031 *
6032 * parse an XML in-memory document and build a tree.
6033 *
6034 * Returns the resulting document tree
6035 */
6036htmlDocPtr
6037htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6038{
6039    htmlParserCtxtPtr ctxt;
6040
6041    if (cur == NULL)
6042        return (NULL);
6043
6044    xmlInitParser();
6045    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6046    if (ctxt == NULL)
6047        return (NULL);
6048    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6049}
6050
6051/**
6052 * htmlReadFile:
6053 * @filename:  a file or URL
6054 * @encoding:  the document encoding, or NULL
6055 * @options:  a combination of htmlParserOption(s)
6056 *
6057 * parse an XML file from the filesystem or the network.
6058 *
6059 * Returns the resulting document tree
6060 */
6061htmlDocPtr
6062htmlReadFile(const char *filename, const char *encoding, int options)
6063{
6064    htmlParserCtxtPtr ctxt;
6065
6066    xmlInitParser();
6067    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6068    if (ctxt == NULL)
6069        return (NULL);
6070    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6071}
6072
6073/**
6074 * htmlReadMemory:
6075 * @buffer:  a pointer to a char array
6076 * @size:  the size of the array
6077 * @URL:  the base URL to use for the document
6078 * @encoding:  the document encoding, or NULL
6079 * @options:  a combination of htmlParserOption(s)
6080 *
6081 * parse an XML in-memory document and build a tree.
6082 *
6083 * Returns the resulting document tree
6084 */
6085htmlDocPtr
6086htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6087{
6088    htmlParserCtxtPtr ctxt;
6089
6090    xmlInitParser();
6091    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6092    if (ctxt == NULL)
6093        return (NULL);
6094    htmlDefaultSAXHandlerInit();
6095    if (ctxt->sax != NULL)
6096        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6097    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6098}
6099
6100/**
6101 * htmlReadFd:
6102 * @fd:  an open file descriptor
6103 * @URL:  the base URL to use for the document
6104 * @encoding:  the document encoding, or NULL
6105 * @options:  a combination of htmlParserOption(s)
6106 *
6107 * parse an XML from a file descriptor and build a tree.
6108 *
6109 * Returns the resulting document tree
6110 */
6111htmlDocPtr
6112htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6113{
6114    htmlParserCtxtPtr ctxt;
6115    xmlParserInputBufferPtr input;
6116    xmlParserInputPtr stream;
6117
6118    if (fd < 0)
6119        return (NULL);
6120
6121    xmlInitParser();
6122    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6123    if (input == NULL)
6124        return (NULL);
6125    ctxt = xmlNewParserCtxt();
6126    if (ctxt == NULL) {
6127        xmlFreeParserInputBuffer(input);
6128        return (NULL);
6129    }
6130    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6131    if (stream == NULL) {
6132        xmlFreeParserInputBuffer(input);
6133	xmlFreeParserCtxt(ctxt);
6134        return (NULL);
6135    }
6136    inputPush(ctxt, stream);
6137    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6138}
6139
6140/**
6141 * htmlReadIO:
6142 * @ioread:  an I/O read function
6143 * @ioclose:  an I/O close function
6144 * @ioctx:  an I/O handler
6145 * @URL:  the base URL to use for the document
6146 * @encoding:  the document encoding, or NULL
6147 * @options:  a combination of htmlParserOption(s)
6148 *
6149 * parse an HTML document from I/O functions and source and build a tree.
6150 *
6151 * Returns the resulting document tree
6152 */
6153htmlDocPtr
6154htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6155          void *ioctx, const char *URL, const char *encoding, int options)
6156{
6157    htmlParserCtxtPtr ctxt;
6158    xmlParserInputBufferPtr input;
6159    xmlParserInputPtr stream;
6160
6161    if (ioread == NULL)
6162        return (NULL);
6163    xmlInitParser();
6164
6165    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6166                                         XML_CHAR_ENCODING_NONE);
6167    if (input == NULL)
6168        return (NULL);
6169    ctxt = htmlNewParserCtxt();
6170    if (ctxt == NULL) {
6171        xmlFreeParserInputBuffer(input);
6172        return (NULL);
6173    }
6174    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6175    if (stream == NULL) {
6176        xmlFreeParserInputBuffer(input);
6177	xmlFreeParserCtxt(ctxt);
6178        return (NULL);
6179    }
6180    inputPush(ctxt, stream);
6181    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6182}
6183
6184/**
6185 * htmlCtxtReadDoc:
6186 * @ctxt:  an HTML parser context
6187 * @cur:  a pointer to a zero terminated string
6188 * @URL:  the base URL to use for the document
6189 * @encoding:  the document encoding, or NULL
6190 * @options:  a combination of htmlParserOption(s)
6191 *
6192 * parse an XML in-memory document and build a tree.
6193 * This reuses the existing @ctxt parser context
6194 *
6195 * Returns the resulting document tree
6196 */
6197htmlDocPtr
6198htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6199               const char *URL, const char *encoding, int options)
6200{
6201    xmlParserInputPtr stream;
6202
6203    if (cur == NULL)
6204        return (NULL);
6205    if (ctxt == NULL)
6206        return (NULL);
6207
6208    htmlCtxtReset(ctxt);
6209
6210    stream = xmlNewStringInputStream(ctxt, cur);
6211    if (stream == NULL) {
6212        return (NULL);
6213    }
6214    inputPush(ctxt, stream);
6215    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6216}
6217
6218/**
6219 * htmlCtxtReadFile:
6220 * @ctxt:  an HTML parser context
6221 * @filename:  a file or URL
6222 * @encoding:  the document encoding, or NULL
6223 * @options:  a combination of htmlParserOption(s)
6224 *
6225 * parse an XML file from the filesystem or the network.
6226 * This reuses the existing @ctxt parser context
6227 *
6228 * Returns the resulting document tree
6229 */
6230htmlDocPtr
6231htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6232                const char *encoding, int options)
6233{
6234    xmlParserInputPtr stream;
6235
6236    if (filename == NULL)
6237        return (NULL);
6238    if (ctxt == NULL)
6239        return (NULL);
6240
6241    htmlCtxtReset(ctxt);
6242
6243    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6244    if (stream == NULL) {
6245        return (NULL);
6246    }
6247    inputPush(ctxt, stream);
6248    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6249}
6250
6251/**
6252 * htmlCtxtReadMemory:
6253 * @ctxt:  an HTML parser context
6254 * @buffer:  a pointer to a char array
6255 * @size:  the size of the array
6256 * @URL:  the base URL to use for the document
6257 * @encoding:  the document encoding, or NULL
6258 * @options:  a combination of htmlParserOption(s)
6259 *
6260 * parse an XML in-memory document and build a tree.
6261 * This reuses the existing @ctxt parser context
6262 *
6263 * Returns the resulting document tree
6264 */
6265htmlDocPtr
6266htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6267                  const char *URL, const char *encoding, int options)
6268{
6269    xmlParserInputBufferPtr input;
6270    xmlParserInputPtr stream;
6271
6272    if (ctxt == NULL)
6273        return (NULL);
6274    if (buffer == NULL)
6275        return (NULL);
6276
6277    htmlCtxtReset(ctxt);
6278
6279    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6280    if (input == NULL) {
6281	return(NULL);
6282    }
6283
6284    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6285    if (stream == NULL) {
6286	xmlFreeParserInputBuffer(input);
6287	return(NULL);
6288    }
6289
6290    inputPush(ctxt, stream);
6291    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6292}
6293
6294/**
6295 * htmlCtxtReadFd:
6296 * @ctxt:  an HTML parser context
6297 * @fd:  an open file descriptor
6298 * @URL:  the base URL to use for the document
6299 * @encoding:  the document encoding, or NULL
6300 * @options:  a combination of htmlParserOption(s)
6301 *
6302 * parse an XML from a file descriptor and build a tree.
6303 * This reuses the existing @ctxt parser context
6304 *
6305 * Returns the resulting document tree
6306 */
6307htmlDocPtr
6308htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6309              const char *URL, const char *encoding, int options)
6310{
6311    xmlParserInputBufferPtr input;
6312    xmlParserInputPtr stream;
6313
6314    if (fd < 0)
6315        return (NULL);
6316    if (ctxt == NULL)
6317        return (NULL);
6318
6319    htmlCtxtReset(ctxt);
6320
6321
6322    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6323    if (input == NULL)
6324        return (NULL);
6325    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6326    if (stream == NULL) {
6327        xmlFreeParserInputBuffer(input);
6328        return (NULL);
6329    }
6330    inputPush(ctxt, stream);
6331    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6332}
6333
6334/**
6335 * htmlCtxtReadIO:
6336 * @ctxt:  an HTML parser context
6337 * @ioread:  an I/O read function
6338 * @ioclose:  an I/O close function
6339 * @ioctx:  an I/O handler
6340 * @URL:  the base URL to use for the document
6341 * @encoding:  the document encoding, or NULL
6342 * @options:  a combination of htmlParserOption(s)
6343 *
6344 * parse an HTML document from I/O functions and source and build a tree.
6345 * This reuses the existing @ctxt parser context
6346 *
6347 * Returns the resulting document tree
6348 */
6349htmlDocPtr
6350htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6351              xmlInputCloseCallback ioclose, void *ioctx,
6352	      const char *URL,
6353              const char *encoding, int options)
6354{
6355    xmlParserInputBufferPtr input;
6356    xmlParserInputPtr stream;
6357
6358    if (ioread == NULL)
6359        return (NULL);
6360    if (ctxt == NULL)
6361        return (NULL);
6362
6363    htmlCtxtReset(ctxt);
6364
6365    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6366                                         XML_CHAR_ENCODING_NONE);
6367    if (input == NULL)
6368        return (NULL);
6369    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6370    if (stream == NULL) {
6371        xmlFreeParserInputBuffer(input);
6372        return (NULL);
6373    }
6374    inputPush(ctxt, stream);
6375    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6376}
6377
6378#define bottom_HTMLparser
6379#include "elfgcchack.h"
6380#endif /* LIBXML_HTML_ENABLED */
6381