1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
47#define HTML_MAX_NAMELEN 1000
48#define HTML_PARSER_BIG_BUFFER_SIZE 1000
49#define HTML_PARSER_BUFFER_SIZE 100
50
51/* #define DEBUG */
52/* #define DEBUG_PUSH */
53
54static int htmlOmittedDefaultValue = 1;
55
56xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57			     xmlChar end, xmlChar  end2, xmlChar end3);
58static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 *									*
62 *		Some factorized error routines				*
63 *									*
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt:  an HTML parser context
69 * @extra:  extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77        (ctxt->instate == XML_PARSER_EOF))
78	return;
79    if (ctxt != NULL) {
80        ctxt->errNo = XML_ERR_NO_MEMORY;
81        ctxt->instate = XML_PARSER_EOF;
82        ctxt->disableSAX = 1;
83    }
84    if (extra)
85        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                        NULL, NULL, 0, 0,
88                        "Memory allocation failed : %s\n", extra);
89    else
90        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt:  an HTML parser context
98 * @error:  the error number
99 * @msg:  the error message
100 * @str1:  string infor
101 * @str2:  string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107             const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110        (ctxt->instate == XML_PARSER_EOF))
111	return;
112    if (ctxt != NULL)
113	ctxt->errNo = error;
114    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                    XML_ERR_ERROR, NULL, 0,
116		    (const char *) str1, (const char *) str2,
117		    NULL, 0, 0,
118		    msg, str1, str2);
119    if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt:  an HTML parser context
126 * @error:  the error number
127 * @msg:  the error message
128 * @val:  integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134             const char *msg, int val)
135{
136    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137        (ctxt->instate == XML_PARSER_EOF))
138	return;
139    if (ctxt != NULL)
140	ctxt->errNo = error;
141    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143		    NULL, val, 0, msg, val);
144    if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 *									*
150 *	Parser stacks related functions and macros		*
151 *									*
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt:  an HTML parser context
157 * @value:  the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167        ctxt->html = 3;
168    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169        ctxt->html = 10;
170    if (ctxt->nameNr >= ctxt->nameMax) {
171        ctxt->nameMax *= 2;
172        ctxt->nameTab = (const xmlChar * *)
173                         xmlRealloc((xmlChar * *)ctxt->nameTab,
174                                    ctxt->nameMax *
175                                    sizeof(ctxt->nameTab[0]));
176        if (ctxt->nameTab == NULL) {
177            htmlErrMemory(ctxt, NULL);
178            return (0);
179        }
180    }
181    ctxt->nameTab[ctxt->nameNr] = value;
182    ctxt->name = value;
183    return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
193static const xmlChar *
194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
196    const xmlChar *ret;
197
198    if (ctxt->nameNr <= 0)
199        return (NULL);
200    ctxt->nameNr--;
201    if (ctxt->nameNr < 0)
202        return (NULL);
203    if (ctxt->nameNr > 0)
204        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205    else
206        ctxt->name = NULL;
207    ret = ctxt->nameTab[ctxt->nameNr];
208    ctxt->nameTab[ctxt->nameNr] = NULL;
209    return (ret);
210}
211
212/**
213 * htmlNodeInfoPush:
214 * @ctxt:  an HTML parser context
215 * @value:  the node info
216 *
217 * Pushes a new element name on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221static int
222htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223{
224    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225        if (ctxt->nodeInfoMax == 0)
226                ctxt->nodeInfoMax = 5;
227        ctxt->nodeInfoMax *= 2;
228        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230                                    ctxt->nodeInfoMax *
231                                    sizeof(ctxt->nodeInfoTab[0]));
232        if (ctxt->nodeInfoTab == NULL) {
233            htmlErrMemory(ctxt, NULL);
234            return (0);
235        }
236    }
237    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239    return (ctxt->nodeInfoNr++);
240}
241
242/**
243 * htmlNodeInfoPop:
244 * @ctxt:  an HTML parser context
245 *
246 * Pops the top element name from the node info stack
247 *
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
249 */
250static htmlParserNodeInfo *
251htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252{
253    if (ctxt->nodeInfoNr <= 0)
254        return (NULL);
255    ctxt->nodeInfoNr--;
256    if (ctxt->nodeInfoNr < 0)
257        return (NULL);
258    if (ctxt->nodeInfoNr > 0)
259        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260    else
261        ctxt->nodeInfo = NULL;
262    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263}
264
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 *
 * Byte-level macros, to be used only when comparing against ASCII values,
 * otherwise they would break on multi-byte UTF-8 sequences:
 *
 *   CUR_PTR  the current pointer to the xmlChar to be parsed (an lvalue).
 *   CUR      the current xmlChar value, as an int.
 *   CURRENT  same as CUR.
 *   RAW      the current xmlChar value, or -1 while ctxt->token is set.
 *   NXT(n)   the n'th next xmlChar.
 *   UPPER    the current xmlChar converted to uppercase.
 *   UPP(n)   the n'th next xmlChar converted to uppercase.
 *   SKIP(n)  skip n xmlChar, adding n to the byte and column counters.
 *            It does no line accounting, so it must only be used to skip
 *            ASCII strings without newlines. Evaluates n three times.
 *
 * Character-level macros:
 *
 *   NEXT        skip to the next character through xmlNextChar().
 *   NEXTL(l)    skip the current character of l xmlChars long, updating
 *               the line/column position and clearing ctxt->token.
 *   CUR_CHAR(l) the current character through htmlCurrentChar(), its
 *               length being stored in l.
 *   CUR_SCHAR(s, l) the current character of string s through
 *               xmlStringCurrentChar(), its length being stored in l.
 *   COPY_BUF(l,b,i,v) append character v, l xmlChars long, to buffer b at
 *               index i and advance i.
 *
 * Buffer management:
 *
 *   SHRINK      call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *               bytes were consumed and fewer than 2 * INPUT_CHUNK remain.
 *   GROW        call xmlParserInputGrow() when not in progressive mode and
 *               fewer than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS skip blanks through htmlSkipBlankChars().
 *
 * The statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, in particular in front of an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val)							\
    (ctxt->nbChars += (val), ctxt->input->cur += (val),			\
     ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR (ctxt->input->cur)

#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
	(ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
	xmlParserInputShrink(ctxt->input);				\
  } while (0)

#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
	(ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++;		\
  } while (0)

/************
    Disabled tail of NEXTL, kept for reference:
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {						\
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);				\
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));			\
  } while (0)
344
345/**
346 * htmlFindEncoding:
347 * @the HTML parser context
348 *
349 * Ty to find and encoding in the current data available in the input
350 * buffer this is needed to try to switch to the proper encoding when
351 * one face a character error.
352 * That's an heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found, the string need to
357 *   be freed
358 */
359static xmlChar *
360htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361    const xmlChar *start, *cur, *end;
362
363    if ((ctxt == NULL) || (ctxt->input == NULL) ||
364        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365        (ctxt->input->buf->encoder != NULL))
366        return(NULL);
367    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368        return(NULL);
369
370    start = ctxt->input->cur;
371    end = ctxt->input->end;
372    /* we also expect the input buffer to be zero terminated */
373    if (*end != 0)
374        return(NULL);
375
376    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377    if (cur == NULL)
378        return(NULL);
379    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
380    if (cur == NULL)
381        return(NULL);
382    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
383    if (cur == NULL)
384        return(NULL);
385    cur += 8;
386    start = cur;
387    while (((*cur >= 'A') && (*cur <= 'Z')) ||
388           ((*cur >= 'a') && (*cur <= 'z')) ||
389           ((*cur >= '0') && (*cur <= '9')) ||
390           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391           cur++;
392    if (cur == start)
393        return(NULL);
394    return(xmlStrndup(start, cur - start));
395}
396
397/**
398 * htmlCurrentChar:
399 * @ctxt:  the HTML parser context
400 * @len:  pointer to the length of the char read
401 *
402 * The current char value, if using UTF-8 this may actually span multiple
403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
407 *
408 * Returns the current char value and its length
409 */
410
411static int
412htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413    if (ctxt->instate == XML_PARSER_EOF)
414	return(0);
415
416    if (ctxt->token != 0) {
417	*len = 0;
418	return(ctxt->token);
419    }
420    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421	/*
422	 * We are supposed to handle UTF8, check it's valid
423	 * From rfc2044: encoding of the Unicode values on UTF-8:
424	 *
425	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
426	 * 0000 0000-0000 007F   0xxxxxxx
427	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
428	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
429	 *
430	 * Check for the 0x110000 limit too
431	 */
432	const unsigned char *cur = ctxt->input->cur;
433	unsigned char c;
434	unsigned int val;
435
436	c = *cur;
437	if (c & 0x80) {
438	    if (cur[1] == 0) {
439		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440                cur = ctxt->input->cur;
441            }
442	    if ((cur[1] & 0xc0) != 0x80)
443		goto encoding_error;
444	    if ((c & 0xe0) == 0xe0) {
445
446		if (cur[2] == 0) {
447		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448                    cur = ctxt->input->cur;
449                }
450		if ((cur[2] & 0xc0) != 0x80)
451		    goto encoding_error;
452		if ((c & 0xf0) == 0xf0) {
453		    if (cur[3] == 0) {
454			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455                        cur = ctxt->input->cur;
456                    }
457		    if (((c & 0xf8) != 0xf0) ||
458			((cur[3] & 0xc0) != 0x80))
459			goto encoding_error;
460		    /* 4-byte code */
461		    *len = 4;
462		    val = (cur[0] & 0x7) << 18;
463		    val |= (cur[1] & 0x3f) << 12;
464		    val |= (cur[2] & 0x3f) << 6;
465		    val |= cur[3] & 0x3f;
466		} else {
467		  /* 3-byte code */
468		    *len = 3;
469		    val = (cur[0] & 0xf) << 12;
470		    val |= (cur[1] & 0x3f) << 6;
471		    val |= cur[2] & 0x3f;
472		}
473	    } else {
474	      /* 2-byte code */
475		*len = 2;
476		val = (cur[0] & 0x1f) << 6;
477		val |= cur[1] & 0x3f;
478	    }
479	    if (!IS_CHAR(val)) {
480	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481				"Char 0x%X out of allowed range\n", val);
482	    }
483	    return(val);
484	} else {
485            if ((*ctxt->input->cur == 0) &&
486                (ctxt->input->cur < ctxt->input->end)) {
487                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488				"Char 0x%X out of allowed range\n", 0);
489                *len = 1;
490                return(' ');
491            }
492	    /* 1-byte code */
493	    *len = 1;
494	    return((int) *ctxt->input->cur);
495	}
496    }
497    /*
498     * Assume it's a fixed length encoding (1) with
499     * a compatible encoding for the ASCII set, since
500     * XML constructs only use < 128 chars
501     */
502    *len = 1;
503    if ((int) *ctxt->input->cur < 0x80)
504	return((int) *ctxt->input->cur);
505
506    /*
507     * Humm this is bad, do an automatic flow conversion
508     */
509    {
510        xmlChar * guess;
511        xmlCharEncodingHandlerPtr handler;
512
513        guess = htmlFindEncoding(ctxt);
514        if (guess == NULL) {
515            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516        } else {
517            if (ctxt->input->encoding != NULL)
518                xmlFree((xmlChar *) ctxt->input->encoding);
519            ctxt->input->encoding = guess;
520            handler = xmlFindCharEncodingHandler((const char *) guess);
521            if (handler != NULL) {
522                xmlSwitchToEncoding(ctxt, handler);
523            } else {
524                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525                             "Unsupported encoding %s", guess, NULL);
526            }
527        }
528        ctxt->charset = XML_CHAR_ENCODING_UTF8;
529    }
530
531    return(xmlCurrentChar(ctxt, len));
532
533encoding_error:
534    /*
535     * If we detect an UTF8 error that probably mean that the
536     * input encoding didn't get properly advertized in the
537     * declaration header. Report the error and switch the encoding
538     * to ISO-Latin-1 (if you don't like this policy, just declare the
539     * encoding !)
540     */
541    {
542        char buffer[150];
543
544	if (ctxt->input->end - ctxt->input->cur >= 4) {
545	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546			    ctxt->input->cur[0], ctxt->input->cur[1],
547			    ctxt->input->cur[2], ctxt->input->cur[3]);
548	} else {
549	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550	}
551	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552		     "Input is not proper UTF-8, indicate encoding !\n",
553		     BAD_CAST buffer, NULL);
554    }
555
556    ctxt->charset = XML_CHAR_ENCODING_8859_1;
557    *len = 1;
558    return((int) *ctxt->input->cur);
559}
560
561/**
562 * htmlSkipBlankChars:
563 * @ctxt:  the HTML parser context
564 *
565 * skip all blanks character found at that point in the input streams.
566 *
567 * Returns the number of space chars skipped
568 */
569
570static int
571htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572    int res = 0;
573
574    while (IS_BLANK_CH(*(ctxt->input->cur))) {
575	if ((*ctxt->input->cur == 0) &&
576	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577		xmlPopInput(ctxt);
578	} else {
579	    if (*(ctxt->input->cur) == '\n') {
580		ctxt->input->line++; ctxt->input->col = 1;
581	    } else ctxt->input->col++;
582	    ctxt->input->cur++;
583	    ctxt->nbChars++;
584	    if (*ctxt->input->cur == 0)
585		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586	}
587	res++;
588    }
589    return(res);
590}
591
592
593
594/************************************************************************
595 *									*
596 *	The list of HTML elements and their properties		*
597 *									*
598 ************************************************************************/
599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
612
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element class is a comma-separated list of names meant to be
 * expanded inside an array initializer, paired with an NB_<class> macro
 * giving the number of names in the list. The composed NB_ values are
 * parenthesized so they can be used safely inside larger expressions.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER expand to nothing: they only act as markers. */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL
638
639
640static const char* const html_flow[] = { FLOW, NULL } ;
641static const char* const html_inline[] = { INLINE, NULL } ;
642
643/* placeholders: elts with content but no subelements */
644static const char* const html_pcdata[] = { NULL } ;
645#define html_cdata html_pcdata
646
647
/*
 * ... and for HTML Attributes
 *
 * Same scheme as for the element classes: a comma-separated list of
 * attribute names plus an NB_<class> macro giving the number of names.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
/* Sum of the three classes making up ATTRS (4 + 2 + 9). */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1
662
663static const char* const html_attrs[] = { ATTRS, NULL } ;
664static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
665static const char* const core_attrs[] = { COREATTRS, NULL } ;
666static const char* const i18n_attrs[] = { I18N, NULL } ;
667
668
669/* Other declarations that should go inline ... */
670static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
671	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
672	"tabindex", "onfocus", "onblur", NULL } ;
673static const char* const target_attr[] = { "target", NULL } ;
674static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
675static const char* const alt_attr[] = { "alt", NULL } ;
676static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
677static const char* const href_attrs[] = { "href", NULL } ;
678static const char* const clear_attrs[] = { "clear", NULL } ;
679static const char* const inline_p[] = { INLINE, "p", NULL } ;
680
681static const char* const flow_param[] = { FLOW, "param", NULL } ;
682static const char* const applet_attrs[] = { COREATTRS , "codebase",
683		"archive", "alt", "name", "height", "width", "align",
684		"hspace", "vspace", NULL } ;
685static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
686	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
687static const char* const basefont_attrs[] =
688	{ "id", "size", "color", "face", NULL } ;
689static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
690static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
691static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
692static const char* const body_depr[] = { "background", "bgcolor", "text",
693	"link", "vlink", "alink", NULL } ;
694static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
695	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
696
697
698static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
699static const char* const col_elt[] = { "col", NULL } ;
700static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
701static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
702static const char* const dl_contents[] = { "dt", "dd", NULL } ;
703static const char* const compact_attr[] = { "compact", NULL } ;
704static const char* const label_attr[] = { "label", NULL } ;
705static const char* const fieldset_contents[] = { FLOW, "legend" } ;
706static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
707static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
708static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
709static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
710static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
711static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
712static const char* const head_attrs[] = { I18N, "profile", NULL } ;
713static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
714static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
715static const char* const version_attr[] = { "version", NULL } ;
716static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
717static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
718static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
719static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
720static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
721static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
722static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
723static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
724static const char* const align_attr[] = { "align", NULL } ;
725static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
726static const char* const map_contents[] = { BLOCK, "area", NULL } ;
727static const char* const name_attr[] = { "name", NULL } ;
728static const char* const action_attr[] = { "action", NULL } ;
729static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
730static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
731static const char* const content_attr[] = { "content", NULL } ;
732static const char* const type_attr[] = { "type", NULL } ;
733static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
734static const char* const object_contents[] = { FLOW, "param", NULL } ;
735static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
736static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
737static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
738static const char* const option_elt[] = { "option", NULL } ;
739static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
740static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
741static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
742static const char* const width_attr[] = { "width", NULL } ;
743static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
744static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
745static const char* const language_attr[] = { "language", NULL } ;
746static const char* const select_content[] = { "optgroup", "option", NULL } ;
747static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
748static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
749static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
750static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
751static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
752static const char* const tr_elt[] = { "tr", NULL } ;
753static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
754static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
755static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
756static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
757static const char* const tr_contents[] = { "th", "td", NULL } ;
758static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
759static const char* const li_elt[] = { "li", NULL } ;
760static const char* const ul_depr[] = { "type", "compact", NULL} ;
761static const char* const dir_attr[] = { "dir", NULL} ;
762
/*
 * Cast applied to the name lists when they are stored in the table below:
 * it drops the const qualifier of the array elements.
 * NOTE(review): presumably needed because the matching htmlElemDesc fields
 * are declared as plain "const char **" - confirm in HTMLparser.h.
 */
#define DECL (const char**)

/*
 * Static description of the known HTML 4.0 elements; htmlTagLookup()
 * searches it linearly by name.
 *
 * Every entry holds, in order: the element name, seven integer flags,
 * a description string and five trailing fields (name lists or a default
 * child name, NULL when absent).
 * NOTE(review): the meaning of each column comes from the htmlElemDesc
 * declaration, which is outside this file - confirm it there before
 * editing an entry. Within this file htmlAutoCloseOnClose() reads the
 * endTag field and reports a tag mismatch error when it is 3.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1044
/*
 * start tags that imply the end of current element
 *
 * Layout: a flat sequence of NULL-terminated groups. The first string of
 * a group is the start tag being opened; the strings after it, up to the
 * group's NULL, are the currently open elements which that start tag
 * closes implicitly. One extra NULL after the last group ends the table.
 * htmlInitAutoClose() records the start of every group and
 * htmlCheckAutoClose() scans a group for the old tag.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
1105
/*
 * The list of HTML elements which are not supposed to hold character
 * data directly: when one of them is the current element,
 * htmlCheckParagraph() implies a p element first.
 * The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1118
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 *
 * The table has no NULL terminator: htmlIsScriptAttribute() derives the
 * number of entries with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so onreset was never matched */
    "onchange",
    "onselect"
};
1144
/*
 * This table is used by the HTML parser to know what to do with
 * broken HTML pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra end tags.
 * End tags are only allowed to close elements with lower or equal
 * priority.
 *
 * htmlGetEndPriority() walks the table in order; a name that is not
 * listed gets the priority stored in the terminating NULL entry.
 */

typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* end tag priority associated with that name */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1172
/*
 * Index over htmlStartClose filled by htmlInitAutoClose(): each used slot
 * points at the first string of one group, unused slots are NULL.
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once the index above has been built. */
static int htmlStartCloseIndexinitialized = 0;
1175
1176/************************************************************************
1177 *									*
1178 *	functions to handle HTML specific data			*
1179 *									*
1180 ************************************************************************/
1181
1182/**
1183 * htmlInitAutoClose:
1184 *
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
1188 */
1189void
1190htmlInitAutoClose(void) {
1191    int indx, i = 0;
1192
1193    if (htmlStartCloseIndexinitialized) return;
1194
1195    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196    indx = 0;
1197    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199	while (htmlStartClose[i] != NULL) i++;
1200	i++;
1201    }
1202    htmlStartCloseIndexinitialized = 1;
1203}
1204
1205/**
1206 * htmlTagLookup:
1207 * @tag:  The tag name in lowercase
1208 *
1209 * Lookup the HTML tag in the ElementTable
1210 *
1211 * Returns the related htmlElemDescPtr or NULL if not found.
1212 */
1213const htmlElemDesc *
1214htmlTagLookup(const xmlChar *tag) {
1215    unsigned int i;
1216
1217    for (i = 0; i < (sizeof(html40ElementTable) /
1218                     sizeof(html40ElementTable[0]));i++) {
1219        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220	    return((htmlElemDescPtr) &html40ElementTable[i]);
1221    }
1222    return(NULL);
1223}
1224
1225/**
1226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
1228 *
1229 * Return value: The "endtag" priority.
1230 **/
1231static int
1232htmlGetEndPriority (const xmlChar *name) {
1233    int i = 0;
1234
1235    while ((htmlEndPriority[i].name != NULL) &&
1236	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237	i++;
1238
1239    return(htmlEndPriority[i].priority);
1240}
1241
1242
1243/**
1244 * htmlCheckAutoClose:
1245 * @newtag:  The new tag name
1246 * @oldtag:  The old tag name
1247 *
1248 * Checks whether the new tag is one of the registered valid tags for
1249 * closing old.
1250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251 *
1252 * Returns 0 if no, 1 if yes.
1253 */
1254static int
1255htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256{
1257    int i, indx;
1258    const char **closed = NULL;
1259
1260    if (htmlStartCloseIndexinitialized == 0)
1261        htmlInitAutoClose();
1262
1263    /* inefficient, but not a big deal */
1264    for (indx = 0; indx < 100; indx++) {
1265        closed = htmlStartCloseIndex[indx];
1266        if (closed == NULL)
1267            return (0);
1268        if (xmlStrEqual(BAD_CAST * closed, newtag))
1269            break;
1270    }
1271
1272    i = closed - htmlStartClose;
1273    i++;
1274    while (htmlStartClose[i] != NULL) {
1275        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276            return (1);
1277        }
1278        i++;
1279    }
1280    return (0);
1281}
1282
1283/**
1284 * htmlAutoCloseOnClose:
1285 * @ctxt:  an HTML parser context
1286 * @newtag:  The new tag name
1287 * @force:  force the tag closure
1288 *
1289 * The HTML DTD allows an ending tag to implicitly close other tags.
1290 */
1291static void
1292htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293{
1294    const htmlElemDesc *info;
1295    int i, priority;
1296
1297    priority = htmlGetEndPriority(newtag);
1298
1299    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300
1301        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302            break;
1303        /*
1304         * A missplaced endtag can only close elements with lower
1305         * or equal priority, so if we find an element with higher
1306         * priority before we find an element with
1307         * matching name, we just ignore this endtag
1308         */
1309        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310            return;
1311    }
1312    if (i < 0)
1313        return;
1314
1315    while (!xmlStrEqual(newtag, ctxt->name)) {
1316        info = htmlTagLookup(ctxt->name);
1317        if ((info != NULL) && (info->endTag == 3)) {
1318            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319	                 "Opening and ending tag mismatch: %s and %s\n",
1320			 newtag, ctxt->name);
1321        }
1322        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324	htmlnamePop(ctxt);
1325    }
1326}
1327
1328/**
1329 * htmlAutoCloseOnEnd:
1330 * @ctxt:  an HTML parser context
1331 *
1332 * Close all remaining tags at the end of the stream
1333 */
1334static void
1335htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336{
1337    int i;
1338
1339    if (ctxt->nameNr == 0)
1340        return;
1341    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344	htmlnamePop(ctxt);
1345    }
1346}
1347
1348/**
1349 * htmlAutoClose:
1350 * @ctxt:  an HTML parser context
1351 * @newtag:  The new tag name or NULL
1352 *
1353 * The HTML DTD allows a tag to implicitly close other tags.
1354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriates closes if possible/needed.
1357 * If newtag is NULL this mean we are at the end of the resource
1358 * and we should check
1359 */
1360static void
1361htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362{
1363    while ((newtag != NULL) && (ctxt->name != NULL) &&
1364           (htmlCheckAutoClose(newtag, ctxt->name))) {
1365        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367	htmlnamePop(ctxt);
1368    }
1369    if (newtag == NULL) {
1370        htmlAutoCloseOnEnd(ctxt);
1371        return;
1372    }
1373    while ((newtag == NULL) && (ctxt->name != NULL) &&
1374           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379	htmlnamePop(ctxt);
1380    }
1381}
1382
1383/**
1384 * htmlAutoCloseTag:
1385 * @doc:  the HTML document
1386 * @name:  The tag name
1387 * @elem:  the HTML element
1388 *
1389 * The HTML DTD allows a tag to implicitly close other tags.
1390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of it's children would autoclose the
1392 * given tag.
1393 *
1394 * Returns 1 if autoclose, 0 otherwise
1395 */
1396int
1397htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398    htmlNodePtr child;
1399
1400    if (elem == NULL) return(1);
1401    if (xmlStrEqual(name, elem->name)) return(0);
1402    if (htmlCheckAutoClose(elem->name, name)) return(1);
1403    child = elem->children;
1404    while (child != NULL) {
1405        if (htmlAutoCloseTag(doc, name, child)) return(1);
1406	child = child->next;
1407    }
1408    return(0);
1409}
1410
1411/**
1412 * htmlIsAutoClosed:
1413 * @doc:  the HTML document
1414 * @elem:  the HTML element
1415 *
1416 * The HTML DTD allows a tag to implicitly close other tags.
1417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of it's child
1419 *
1420 * Returns 1 if autoclosed, 0 otherwise
1421 */
1422int
1423htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424    htmlNodePtr child;
1425
1426    if (elem == NULL) return(1);
1427    child = elem->children;
1428    while (child != NULL) {
1429	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430	child = child->next;
1431    }
1432    return(0);
1433}
1434
1435/**
1436 * htmlCheckImplied:
1437 * @ctxt:  an HTML parser context
1438 * @newtag:  The new tag name
1439 *
1440 * The HTML DTD allows a tag to exists only implicitly
1441 * called when a new tag has been detected and generates the
1442 * appropriates implicit tags if missing
1443 */
1444static void
1445htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446    int i;
1447
1448    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449        return;
1450    if (!htmlOmittedDefaultValue)
1451	return;
1452    if (xmlStrEqual(newtag, BAD_CAST"html"))
1453	return;
1454    if (ctxt->nameNr <= 0) {
1455	htmlnamePush(ctxt, BAD_CAST"html");
1456	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458    }
1459    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460        return;
1461    if ((ctxt->nameNr <= 1) &&
1462        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468        if (ctxt->html >= 3) {
1469            /* we already saw or generated an <head> before */
1470            return;
1471        }
1472        /*
1473         * dropped OBJECT ... i you put it first BODY will be
1474         * assumed !
1475         */
1476        htmlnamePush(ctxt, BAD_CAST"head");
1477        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482        if (ctxt->html >= 10) {
1483            /* we already saw or generated a <body> before */
1484            return;
1485        }
1486	for (i = 0;i < ctxt->nameNr;i++) {
1487	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488		return;
1489	    }
1490	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491		return;
1492	    }
1493	}
1494
1495	htmlnamePush(ctxt, BAD_CAST"body");
1496	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498    }
1499}
1500
1501/**
1502 * htmlCheckParagraph
1503 * @ctxt:  an HTML parser context
1504 *
1505 * Check whether a p element need to be implied before inserting
1506 * characters in the current element.
1507 *
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509 *         in case of error.
1510 */
1511
1512static int
1513htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514    const xmlChar *tag;
1515    int i;
1516
1517    if (ctxt == NULL)
1518	return(-1);
1519    tag = ctxt->name;
1520    if (tag == NULL) {
1521	htmlAutoClose(ctxt, BAD_CAST"p");
1522	htmlCheckImplied(ctxt, BAD_CAST"p");
1523	htmlnamePush(ctxt, BAD_CAST"p");
1524	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526	return(1);
1527    }
1528    if (!htmlOmittedDefaultValue)
1529	return(0);
1530    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532	    htmlAutoClose(ctxt, BAD_CAST"p");
1533	    htmlCheckImplied(ctxt, BAD_CAST"p");
1534	    htmlnamePush(ctxt, BAD_CAST"p");
1535	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537	    return(1);
1538	}
1539    }
1540    return(0);
1541}
1542
1543/**
1544 * htmlIsScriptAttribute:
1545 * @name:  an attribute name
1546 *
1547 * Check if an attribute is of content type Script
1548 *
1549 * Returns 1 is the attribute is a script 0 otherwise
1550 */
1551int
1552htmlIsScriptAttribute(const xmlChar *name) {
1553    unsigned int i;
1554
1555    if (name == NULL)
1556      return(0);
1557    /*
1558     * all script attributes start with 'on'
1559     */
1560    if ((name[0] != 'o') || (name[1] != 'n'))
1561      return(0);
1562    for (i = 0;
1563	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564	 i++) {
1565	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566	    return(1);
1567    }
1568    return(0);
1569}
1570
1571/************************************************************************
1572 *									*
1573 *	The list of HTML predefined entities			*
1574 *									*
1575 ************************************************************************/
1576
1577
1578static const htmlEntityDesc  html40EntitiesTable[] = {
1579/*
1580 * the 4 absolute ones, plus apostrophe.
1581 */
1582{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1583{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1584{ 39,	"apos",	"single quote" },
1585{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1586{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1587
1588/*
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1591 */
1592{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1593{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1595{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1596{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1597{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1598{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1600{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1602{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1603{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604{ 172,	"not",	"not sign, U+00AC ISOnum" },
1605{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1607{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1609{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1614{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1618{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1619{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1641{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1648{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1668{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1672{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1673{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679{ 247,	"divide","division sign, U+00F7 ISOnum" },
1680{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1685{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1691{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695/*
1696 * Anything below should really be kept as entities references
1697 */
1698{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1701{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1702
1703{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1704{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1705{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1708{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1709{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1710{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1712{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1713{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1715{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1716{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1717{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1718{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1719{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1720{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1722{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1724{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1725{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1726{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1730{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1732{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1734{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1735{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1736{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1737{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1740{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1741{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1742{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1743{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1744{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1745{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1748{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1750{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1751{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1752{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1753{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1756
1757{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1758{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1759{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1760{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1761{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1762{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1763{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1764{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1765{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1766{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1767{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1768{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1769{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1770{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1771{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1772{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1773{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1774
1775{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1776{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1779
1780{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1781{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1787{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1788
1789{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1790
1791{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1794{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1795{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1797{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1798{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1799{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1800{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1801{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1803{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1804{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1805{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1806{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1807
1808{ 8704,	"forall","for all, U+2200 ISOtech" },
1809{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1810{ 8707,	"exist","there exists, U+2203 ISOtech" },
1811{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1812{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1813{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1814{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1815{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1816{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1817{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1818{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1819{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1820{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1821{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1822{ 8734,	"infin","infinity, U+221E ISOtech" },
1823{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1824{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1825{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1826{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1827{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1828{ 8747,	"int",	"integral, U+222B ISOtech" },
1829{ 8756,	"there4","therefore, U+2234 ISOtech" },
1830{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1831{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1832{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1834{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1835{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1836{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1837{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
1838{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
1839{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1840{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1841{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1842{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1844{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1846{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1848{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
1850{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1851{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1852{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1853
1854{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
1855{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1856{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1857{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1858
1859};
1860
1861/************************************************************************
1862 *									*
1863 *		Commodity functions to handle entities			*
1864 *									*
1865 ************************************************************************/
1866
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable to enlarge; a companion
 *           variable called <buffer>_size must be in scope as well
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to the new size with xmlRealloc().
 *
 * The expansion refers to a variable named ctxt, which therefore has to
 * be visible wherever the macro is used. When the reallocation fails the
 * error is reported through htmlErrMemory(), the old buffer is freed and
 * the *enclosing function* returns NULL, so the macro is only usable
 * inside functions returning a pointer. Only @buffer itself is updated:
 * any other pointer into the old storage must be recomputed by the
 * caller after the expansion.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1881
1882/**
1883 * htmlEntityLookup:
1884 * @name: the entity name
1885 *
1886 * Lookup the given entity in EntitiesTable
1887 *
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1889 *
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891 */
1892const htmlEntityDesc *
1893htmlEntityLookup(const xmlChar *name) {
1894    unsigned int i;
1895
1896    for (i = 0;i < (sizeof(html40EntitiesTable)/
1897                    sizeof(html40EntitiesTable[0]));i++) {
1898        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900	}
1901    }
1902    return(NULL);
1903}
1904
1905/**
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1908 *
1909 * Lookup the given entity in EntitiesTable
1910 *
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1912 *
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914 */
1915const htmlEntityDesc *
1916htmlEntityValueLookup(unsigned int value) {
1917    unsigned int i;
1918
1919    for (i = 0;i < (sizeof(html40EntitiesTable)/
1920                    sizeof(html40EntitiesTable[0]));i++) {
1921        if (html40EntitiesTable[i].value >= value) {
1922	    if (html40EntitiesTable[i].value > value)
1923		break;
1924            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925	}
1926    }
1927    return(NULL);
1928}
1929
1930/**
1931 * UTF8ToHtml:
1932 * @out:  a pointer to an array of bytes to store the result
1933 * @outlen:  the length of @out
1934 * @in:  a pointer to an array of UTF-8 chars
1935 * @inlen:  the length of @in
1936 *
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1939 *
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
1942 *     as the return value is positive, else unpredictable.
1943 * The value of @outlen after return is the number of octets consumed.
1944 */
1945int
1946UTF8ToHtml(unsigned char* out, int *outlen,
1947              const unsigned char* in, int *inlen) {
1948    const unsigned char* processed = in;
1949    const unsigned char* outend;
1950    const unsigned char* outstart = out;
1951    const unsigned char* instart = in;
1952    const unsigned char* inend;
1953    unsigned int c, d;
1954    int trailing;
1955
1956    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957    if (in == NULL) {
1958        /*
1959	 * initialization nothing to do
1960	 */
1961	*outlen = 0;
1962	*inlen = 0;
1963	return(0);
1964    }
1965    inend = in + (*inlen);
1966    outend = out + (*outlen);
1967    while (in < inend) {
1968	d = *in++;
1969	if      (d < 0x80)  { c= d; trailing= 0; }
1970	else if (d < 0xC0) {
1971	    /* trailing byte in leading position */
1972	    *outlen = out - outstart;
1973	    *inlen = processed - instart;
1974	    return(-2);
1975        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1976        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1977        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1978	else {
1979	    /* no chance for this in Ascii */
1980	    *outlen = out - outstart;
1981	    *inlen = processed - instart;
1982	    return(-2);
1983	}
1984
1985	if (inend - in < trailing) {
1986	    break;
1987	}
1988
1989	for ( ; trailing; trailing--) {
1990	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991		break;
1992	    c <<= 6;
1993	    c |= d & 0x3F;
1994	}
1995
1996	/* assertion: c is a single UTF-4 value */
1997	if (c < 0x80) {
1998	    if (out + 1 >= outend)
1999		break;
2000	    *out++ = c;
2001	} else {
2002	    int len;
2003	    const htmlEntityDesc * ent;
2004	    const char *cp;
2005	    char nbuf[16];
2006
2007	    /*
2008	     * Try to lookup a predefined HTML entity for it
2009	     */
2010
2011	    ent = htmlEntityValueLookup(c);
2012	    if (ent == NULL) {
2013	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014	      cp = nbuf;
2015	    }
2016	    else
2017	      cp = ent->name;
2018	    len = strlen(cp);
2019	    if (out + 2 + len >= outend)
2020		break;
2021	    *out++ = '&';
2022	    memcpy(out, cp, len);
2023	    out += len;
2024	    *out++ = ';';
2025	}
2026	processed = in;
2027    }
2028    *outlen = out - outstart;
2029    *inlen = processed - instart;
2030    return(0);
2031}
2032
2033/**
2034 * htmlEncodeEntities:
2035 * @out:  a pointer to an array of bytes to store the result
2036 * @outlen:  the length of @out
2037 * @in:  a pointer to an array of UTF-8 chars
2038 * @inlen:  the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2040 *
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2043 *
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
2046 *     as the return value is positive, else unpredictable.
2047 * The value of @outlen after return is the number of octets consumed.
2048 */
2049int
2050htmlEncodeEntities(unsigned char* out, int *outlen,
2051		   const unsigned char* in, int *inlen, int quoteChar) {
2052    const unsigned char* processed = in;
2053    const unsigned char* outend;
2054    const unsigned char* outstart = out;
2055    const unsigned char* instart = in;
2056    const unsigned char* inend;
2057    unsigned int c, d;
2058    int trailing;
2059
2060    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061        return(-1);
2062    outend = out + (*outlen);
2063    inend = in + (*inlen);
2064    while (in < inend) {
2065	d = *in++;
2066	if      (d < 0x80)  { c= d; trailing= 0; }
2067	else if (d < 0xC0) {
2068	    /* trailing byte in leading position */
2069	    *outlen = out - outstart;
2070	    *inlen = processed - instart;
2071	    return(-2);
2072        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2073        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2074        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2075	else {
2076	    /* no chance for this in Ascii */
2077	    *outlen = out - outstart;
2078	    *inlen = processed - instart;
2079	    return(-2);
2080	}
2081
2082	if (inend - in < trailing)
2083	    break;
2084
2085	while (trailing--) {
2086	    if (((d= *in++) & 0xC0) != 0x80) {
2087		*outlen = out - outstart;
2088		*inlen = processed - instart;
2089		return(-2);
2090	    }
2091	    c <<= 6;
2092	    c |= d & 0x3F;
2093	}
2094
2095	/* assertion: c is a single UTF-4 value */
2096	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097	    (c != '&') && (c != '<') && (c != '>')) {
2098	    if (out >= outend)
2099		break;
2100	    *out++ = c;
2101	} else {
2102	    const htmlEntityDesc * ent;
2103	    const char *cp;
2104	    char nbuf[16];
2105	    int len;
2106
2107	    /*
2108	     * Try to lookup a predefined HTML entity for it
2109	     */
2110	    ent = htmlEntityValueLookup(c);
2111	    if (ent == NULL) {
2112		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113		cp = nbuf;
2114	    }
2115	    else
2116		cp = ent->name;
2117	    len = strlen(cp);
2118	    if (out + 2 + len > outend)
2119		break;
2120	    *out++ = '&';
2121	    memcpy(out, cp, len);
2122	    out += len;
2123	    *out++ = ';';
2124	}
2125	processed = in;
2126    }
2127    *outlen = out - outstart;
2128    *inlen = processed - instart;
2129    return(0);
2130}
2131
2132/************************************************************************
2133 *									*
2134 *		Commodity functions to handle streams			*
2135 *									*
2136 ************************************************************************/
2137
2138/**
2139 * htmlNewInputStream:
2140 * @ctxt:  an HTML parser context
2141 *
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2144 */
2145static htmlParserInputPtr
2146htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147    htmlParserInputPtr input;
2148
2149    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150    if (input == NULL) {
2151        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152	return(NULL);
2153    }
2154    memset(input, 0, sizeof(htmlParserInput));
2155    input->filename = NULL;
2156    input->directory = NULL;
2157    input->base = NULL;
2158    input->cur = NULL;
2159    input->buf = NULL;
2160    input->line = 1;
2161    input->col = 1;
2162    input->buf = NULL;
2163    input->free = NULL;
2164    input->version = NULL;
2165    input->consumed = 0;
2166    input->length = 0;
2167    return(input);
2168}
2169
2170
2171/************************************************************************
2172 *									*
2173 *		Commodity functions, cleanup needed ?			*
2174 *									*
2175 ************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but I don't want to risk any
 * binary incompatibility.
 * The list is scanned linearly by areBlanks().
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2190
2191/**
2192 * areBlanks:
2193 * @ctxt:  an HTML parser context
2194 * @str:  a xmlChar *
2195 * @len:  the size of @str
2196 *
2197 * Is this a sequence of blank chars that one can ignore ?
2198 *
2199 * Returns 1 if ignorable 0 otherwise.
2200 */
2201
2202static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203    unsigned int i;
2204    int j;
2205    xmlNodePtr lastChild;
2206    xmlDtdPtr dtd;
2207
2208    for (j = 0;j < len;j++)
2209        if (!(IS_BLANK_CH(str[j]))) return(0);
2210
2211    if (CUR == 0) return(1);
2212    if (CUR != '<') return(0);
2213    if (ctxt->name == NULL)
2214	return(1);
2215    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216	return(1);
2217    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218	return(1);
2219
2220    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222        dtd = xmlGetIntSubset(ctxt->myDoc);
2223        if (dtd != NULL && dtd->ExternalID != NULL) {
2224            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226                return(1);
2227        }
2228    }
2229
2230    if (ctxt->node == NULL) return(0);
2231    lastChild = xmlGetLastChild(ctxt->node);
2232    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233	lastChild = lastChild->prev;
2234    if (lastChild == NULL) {
2235        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236            (ctxt->node->content != NULL)) return(0);
2237	/* keep ws in constructs like ...<b> </b>...
2238	   for all tags "b" allowing PCDATA */
2239	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241		return(0);
2242	    }
2243	}
2244    } else if (xmlNodeIsText(lastChild)) {
2245        return(0);
2246    } else {
2247	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248	   for all tags "p" allowing PCDATA */
2249	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251		return(0);
2252	    }
2253	}
2254    }
2255    return(1);
2256}
2257
2258/**
2259 * htmlNewDocNoDtD:
2260 * @URI:  URI for the dtd, or NULL
2261 * @ExternalID:  the external ID of the DTD, or NULL
2262 *
2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264 * are NULL
2265 *
2266 * Returns a new document, do not initialize the DTD if not provided
2267 */
2268htmlDocPtr
2269htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270    xmlDocPtr cur;
2271
2272    /*
2273     * Allocate a new document and fill the fields.
2274     */
2275    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276    if (cur == NULL) {
2277	htmlErrMemory(NULL, "HTML document creation failed\n");
2278	return(NULL);
2279    }
2280    memset(cur, 0, sizeof(xmlDoc));
2281
2282    cur->type = XML_HTML_DOCUMENT_NODE;
2283    cur->version = NULL;
2284    cur->intSubset = NULL;
2285    cur->doc = cur;
2286    cur->name = NULL;
2287    cur->children = NULL;
2288    cur->extSubset = NULL;
2289    cur->oldNs = NULL;
2290    cur->encoding = NULL;
2291    cur->standalone = 1;
2292    cur->compression = 0;
2293    cur->ids = NULL;
2294    cur->refs = NULL;
2295    cur->_private = NULL;
2296    cur->charset = XML_CHAR_ENCODING_UTF8;
2297    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298    if ((ExternalID != NULL) ||
2299	(URI != NULL))
2300	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301    return(cur);
2302}
2303
2304/**
2305 * htmlNewDoc:
2306 * @URI:  URI for the dtd, or NULL
2307 * @ExternalID:  the external ID of the DTD, or NULL
2308 *
2309 * Creates a new HTML document
2310 *
2311 * Returns a new document
2312 */
2313htmlDocPtr
2314htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315    if ((URI == NULL) && (ExternalID == NULL))
2316	return(htmlNewDocNoDtD(
2317		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319
2320    return(htmlNewDocNoDtD(URI, ExternalID));
2321}
2322
2323
2324/************************************************************************
2325 *									*
2326 *			The parser itself				*
2327 *	Relates to http://www.w3.org/TR/html40				*
2328 *									*
2329 ************************************************************************/
2330
2331/************************************************************************
2332 *									*
2333 *			The parser itself				*
2334 *									*
2335 ************************************************************************/
2336
/*
 * Forward declaration: htmlParseName() falls back to this routine, which
 * is defined after it.
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338
2339/**
2340 * htmlParseHTMLName:
2341 * @ctxt:  an HTML parser context
2342 *
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2345 *
2346 * Returns the Tag Name parsed or NULL
2347 */
2348
2349static const xmlChar *
2350htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351    int i = 0;
2352    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
2354    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355        (CUR != ':') && (CUR != '.')) return(NULL);
2356
2357    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360           (CUR == '.'))) {
2361	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362        else loc[i] = CUR;
2363	i++;
2364
2365	NEXT;
2366    }
2367
2368    return(xmlDictLookup(ctxt->dict, loc, i));
2369}
2370
2371
2372/**
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt:  an HTML parser context
2375 *
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2379 *
2380 * Returns the Tag Name parsed or NULL
2381 */
2382
2383static const xmlChar *
2384htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385    int i = 0;
2386    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389        (NXT(1) != ':')) return(NULL);
2390
2391    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395        else loc[i] = NXT(1+i);
2396	i++;
2397    }
2398
2399    return(xmlDictLookup(ctxt->dict, loc, i));
2400}
2401
2402
2403/**
2404 * htmlParseName:
2405 * @ctxt:  an HTML parser context
2406 *
2407 * parse an HTML name, this routine is case sensitive.
2408 *
2409 * Returns the Name parsed or NULL
2410 */
2411
2412static const xmlChar *
2413htmlParseName(htmlParserCtxtPtr ctxt) {
2414    const xmlChar *in;
2415    const xmlChar *ret;
2416    int count = 0;
2417
2418    GROW;
2419
2420    /*
2421     * Accelerator for simple ASCII names
2422     */
2423    in = ctxt->input->cur;
2424    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425	((*in >= 0x41) && (*in <= 0x5A)) ||
2426	(*in == '_') || (*in == ':')) {
2427	in++;
2428	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2430	       ((*in >= 0x30) && (*in <= 0x39)) ||
2431	       (*in == '_') || (*in == '-') ||
2432	       (*in == ':') || (*in == '.'))
2433	    in++;
2434	if ((*in > 0) && (*in < 0x80)) {
2435	    count = in - ctxt->input->cur;
2436	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437	    ctxt->input->cur = in;
2438	    ctxt->nbChars += count;
2439	    ctxt->input->col += count;
2440	    return(ret);
2441	}
2442    }
2443    return(htmlParseNameComplex(ctxt));
2444}
2445
2446static const xmlChar *
2447htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448    int len = 0, l;
2449    int c;
2450    int count = 0;
2451
2452    /*
2453     * Handler for more complex cases
2454     */
2455    GROW;
2456    c = CUR_CHAR(l);
2457    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458	(!IS_LETTER(c) && (c != '_') &&
2459         (c != ':'))) {
2460	return(NULL);
2461    }
2462
2463    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465            (c == '.') || (c == '-') ||
2466	    (c == '_') || (c == ':') ||
2467	    (IS_COMBINING(c)) ||
2468	    (IS_EXTENDER(c)))) {
2469	if (count++ > 100) {
2470	    count = 0;
2471	    GROW;
2472	}
2473	len += l;
2474	NEXTL(l);
2475	c = CUR_CHAR(l);
2476    }
2477    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478}
2479
2480
2481/**
2482 * htmlParseHTMLAttribute:
2483 * @ctxt:  an HTML parser context
2484 * @stop:  a char stop value
2485 *
2486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space
2488 *
2489 * Returns the attribute parsed or NULL
2490 */
2491
2492static xmlChar *
2493htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494    xmlChar *buffer = NULL;
2495    int buffer_size = 0;
2496    xmlChar *out = NULL;
2497    const xmlChar *name = NULL;
2498    const xmlChar *cur = NULL;
2499    const htmlEntityDesc * ent;
2500
2501    /*
2502     * allocate a translation buffer.
2503     */
2504    buffer_size = HTML_PARSER_BUFFER_SIZE;
2505    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506    if (buffer == NULL) {
2507	htmlErrMemory(ctxt, "buffer allocation failed\n");
2508	return(NULL);
2509    }
2510    out = buffer;
2511
2512    /*
2513     * Ok loop until we reach one of the ending chars
2514     */
2515    while ((CUR != 0) && (CUR != stop)) {
2516	if ((stop == 0) && (CUR == '>')) break;
2517	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2518        if (CUR == '&') {
2519	    if (NXT(1) == '#') {
2520		unsigned int c;
2521		int bits;
2522
2523		c = htmlParseCharRef(ctxt);
2524		if      (c <    0x80)
2525		        { *out++  = c;                bits= -6; }
2526		else if (c <   0x800)
2527		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2528		else if (c < 0x10000)
2529		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2530		else
2531		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2532
2533		for ( ; bits >= 0; bits-= 6) {
2534		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2535		}
2536
2537		if (out - buffer > buffer_size - 100) {
2538			int indx = out - buffer;
2539
2540			growBuffer(buffer);
2541			out = &buffer[indx];
2542		}
2543	    } else {
2544		ent = htmlParseEntityRef(ctxt, &name);
2545		if (name == NULL) {
2546		    *out++ = '&';
2547		    if (out - buffer > buffer_size - 100) {
2548			int indx = out - buffer;
2549
2550			growBuffer(buffer);
2551			out = &buffer[indx];
2552		    }
2553		} else if (ent == NULL) {
2554		    *out++ = '&';
2555		    cur = name;
2556		    while (*cur != 0) {
2557			if (out - buffer > buffer_size - 100) {
2558			    int indx = out - buffer;
2559
2560			    growBuffer(buffer);
2561			    out = &buffer[indx];
2562			}
2563			*out++ = *cur++;
2564		    }
2565		} else {
2566		    unsigned int c;
2567		    int bits;
2568
2569		    if (out - buffer > buffer_size - 100) {
2570			int indx = out - buffer;
2571
2572			growBuffer(buffer);
2573			out = &buffer[indx];
2574		    }
2575		    c = ent->value;
2576		    if      (c <    0x80)
2577			{ *out++  = c;                bits= -6; }
2578		    else if (c <   0x800)
2579			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2580		    else if (c < 0x10000)
2581			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2582		    else
2583			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2584
2585		    for ( ; bits >= 0; bits-= 6) {
2586			*out++  = ((c >> bits) & 0x3F) | 0x80;
2587		    }
2588		}
2589	    }
2590	} else {
2591	    unsigned int c;
2592	    int bits, l;
2593
2594	    if (out - buffer > buffer_size - 100) {
2595		int indx = out - buffer;
2596
2597		growBuffer(buffer);
2598		out = &buffer[indx];
2599	    }
2600	    c = CUR_CHAR(l);
2601	    if      (c <    0x80)
2602		    { *out++  = c;                bits= -6; }
2603	    else if (c <   0x800)
2604		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2605	    else if (c < 0x10000)
2606		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2607	    else
2608		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2609
2610	    for ( ; bits >= 0; bits-= 6) {
2611		*out++  = ((c >> bits) & 0x3F) | 0x80;
2612	    }
2613	    NEXT;
2614	}
2615    }
2616    *out = 0;
2617    return(buffer);
2618}
2619
2620/**
2621 * htmlParseEntityRef:
2622 * @ctxt:  an HTML parser context
2623 * @str:  location to store the entity name
2624 *
2625 * parse an HTML ENTITY references
2626 *
2627 * [68] EntityRef ::= '&' Name ';'
2628 *
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 *         if non-NULL *str will have to be freed by the caller.
2631 */
2632const htmlEntityDesc *
2633htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634    const xmlChar *name;
2635    const htmlEntityDesc * ent = NULL;
2636
2637    if (str != NULL) *str = NULL;
2638    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639
2640    if (CUR == '&') {
2641        NEXT;
2642        name = htmlParseName(ctxt);
2643	if (name == NULL) {
2644	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2646	} else {
2647	    GROW;
2648	    if (CUR == ';') {
2649	        if (str != NULL)
2650		    *str = name;
2651
2652		/*
2653		 * Lookup the entity in the table.
2654		 */
2655		ent = htmlEntityLookup(name);
2656		if (ent != NULL) /* OK that's ugly !!! */
2657		    NEXT;
2658	    } else {
2659		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660		             "htmlParseEntityRef: expecting ';'\n",
2661			     NULL, NULL);
2662	        if (str != NULL)
2663		    *str = name;
2664	    }
2665	}
2666    }
2667    return(ent);
2668}
2669
2670/**
2671 * htmlParseAttValue:
2672 * @ctxt:  an HTML parser context
2673 *
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
2677 * asked for ctxt->replaceEntities != 0
2678 *
2679 * Returns the AttValue parsed or NULL.
2680 */
2681
2682static xmlChar *
2683htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684    xmlChar *ret = NULL;
2685
2686    if (CUR == '"') {
2687        NEXT;
2688	ret = htmlParseHTMLAttribute(ctxt, '"');
2689        if (CUR != '"') {
2690	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691	                 "AttValue: \" expected\n", NULL, NULL);
2692	} else
2693	    NEXT;
2694    } else if (CUR == '\'') {
2695        NEXT;
2696	ret = htmlParseHTMLAttribute(ctxt, '\'');
2697        if (CUR != '\'') {
2698	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699	                 "AttValue: ' expected\n", NULL, NULL);
2700	} else
2701	    NEXT;
2702    } else {
2703        /*
2704	 * That's an HTMLism, the attribute value may not be quoted
2705	 */
2706	ret = htmlParseHTMLAttribute(ctxt, 0);
2707	if (ret == NULL) {
2708	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709	                 "AttValue: no value found\n", NULL, NULL);
2710	}
2711    }
2712    return(ret);
2713}
2714
2715/**
2716 * htmlParseSystemLiteral:
2717 * @ctxt:  an HTML parser context
2718 *
2719 * parse an HTML Literal
2720 *
2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722 *
2723 * Returns the SystemLiteral parsed or NULL
2724 */
2725
2726static xmlChar *
2727htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728    const xmlChar *q;
2729    xmlChar *ret = NULL;
2730
2731    if (CUR == '"') {
2732        NEXT;
2733	q = CUR_PTR;
2734	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2735	    NEXT;
2736	if (!IS_CHAR_CH(CUR)) {
2737	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738			 "Unfinished SystemLiteral\n", NULL, NULL);
2739	} else {
2740	    ret = xmlStrndup(q, CUR_PTR - q);
2741	    NEXT;
2742        }
2743    } else if (CUR == '\'') {
2744        NEXT;
2745	q = CUR_PTR;
2746	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2747	    NEXT;
2748	if (!IS_CHAR_CH(CUR)) {
2749	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750			 "Unfinished SystemLiteral\n", NULL, NULL);
2751	} else {
2752	    ret = xmlStrndup(q, CUR_PTR - q);
2753	    NEXT;
2754        }
2755    } else {
2756	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757	             " or ' expected\n", NULL, NULL);
2758    }
2759
2760    return(ret);
2761}
2762
2763/**
2764 * htmlParsePubidLiteral:
2765 * @ctxt:  an HTML parser context
2766 *
2767 * parse an HTML public literal
2768 *
2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770 *
2771 * Returns the PubidLiteral parsed or NULL.
2772 */
2773
2774static xmlChar *
2775htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776    const xmlChar *q;
2777    xmlChar *ret = NULL;
2778    /*
2779     * Name ::= (Letter | '_') (NameChar)*
2780     */
2781    if (CUR == '"') {
2782        NEXT;
2783	q = CUR_PTR;
2784	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2785	if (CUR != '"') {
2786	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787	                 "Unfinished PubidLiteral\n", NULL, NULL);
2788	} else {
2789	    ret = xmlStrndup(q, CUR_PTR - q);
2790	    NEXT;
2791	}
2792    } else if (CUR == '\'') {
2793        NEXT;
2794	q = CUR_PTR;
2795	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2796	    NEXT;
2797	if (CUR != '\'') {
2798	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799	                 "Unfinished PubidLiteral\n", NULL, NULL);
2800	} else {
2801	    ret = xmlStrndup(q, CUR_PTR - q);
2802	    NEXT;
2803	}
2804    } else {
2805	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2807    }
2808
2809    return(ret);
2810}
2811
2812/**
2813 * htmlParseScript:
2814 * @ctxt:  an HTML parser context
2815 *
2816 * parse the content of an HTML SCRIPT or STYLE element
2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819 * http://www.w3.org/TR/html4/types.html#type-script
2820 * http://www.w3.org/TR/html4/types.html#h-6.15
2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822 *
2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824 * element and the value of intrinsic event attributes. User agents must
2825 * not evaluate script data as HTML markup but instead must pass it on as
2826 * data to a script engine.
2827 * NOTES:
2828 * - The content is passed like CDATA
2829 * - the attributes for style and scripting "onXXX" are also described
2830 *   as CDATA but SGML allows entities references in attributes so their
2831 *   processing is identical as other attributes
2832 */
2833static void
2834htmlParseScript(htmlParserCtxtPtr ctxt) {
2835    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2836    int nbchar = 0;
2837    int cur,l;
2838
2839    SHRINK;
2840    cur = CUR_CHAR(l);
2841    while (IS_CHAR_CH(cur)) {
2842	if ((cur == '<') && (NXT(1) == '/')) {
2843            /*
2844             * One should break here, the specification is clear:
2845             * Authors should therefore escape "</" within the content.
2846             * Escape mechanisms are specific to each scripting or
2847             * style sheet language.
2848             *
2849             * In recovery mode, only break if end tag match the
2850             * current tag, effectively ignoring all tags inside the
2851             * script/style block and treating the entire block as
2852             * CDATA.
2853             */
2854            if (ctxt->recovery) {
2855                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856				   xmlStrlen(ctxt->name)) == 0)
2857                {
2858                    break; /* while */
2859                } else {
2860		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861				 "Element %s embeds close tag\n",
2862		                 ctxt->name, NULL);
2863		}
2864            } else {
2865                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2867                {
2868                    break; /* while */
2869                }
2870            }
2871	}
2872	COPY_BUF(l,buf,nbchar,cur);
2873	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874	    if (ctxt->sax->cdataBlock!= NULL) {
2875		/*
2876		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877		 */
2878		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879	    } else if (ctxt->sax->characters != NULL) {
2880		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2881	    }
2882	    nbchar = 0;
2883	}
2884	GROW;
2885	NEXTL(l);
2886	cur = CUR_CHAR(l);
2887    }
2888
2889    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891	                "Invalid char in CDATA 0x%X\n", cur);
2892	NEXT;
2893    }
2894
2895    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2896	if (ctxt->sax->cdataBlock!= NULL) {
2897	    /*
2898	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2899	     */
2900	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2901	} else if (ctxt->sax->characters != NULL) {
2902	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2903	}
2904    }
2905}
2906
2907
2908/**
2909 * htmlParseCharData:
2910 * @ctxt:  an HTML parser context
2911 *
2912 * parse a CharData section.
2913 * if we are within a CDATA section ']]>' marks an end of section.
2914 *
2915 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2916 */
2917
2918static void
2919htmlParseCharData(htmlParserCtxtPtr ctxt) {
2920    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2921    int nbchar = 0;
2922    int cur, l;
2923    int chunk = 0;
2924
2925    SHRINK;
2926    cur = CUR_CHAR(l);
2927    while (((cur != '<') || (ctxt->token == '<')) &&
2928           ((cur != '&') || (ctxt->token == '&')) &&
2929	   (cur != 0)) {
2930	if (!(IS_CHAR(cur))) {
2931	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2932	                "Invalid char in CDATA 0x%X\n", cur);
2933	} else {
2934	    COPY_BUF(l,buf,nbchar,cur);
2935	}
2936	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2937	    /*
2938	     * Ok the segment is to be consumed as chars.
2939	     */
2940	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2941		if (areBlanks(ctxt, buf, nbchar)) {
2942		    if (ctxt->sax->ignorableWhitespace != NULL)
2943			ctxt->sax->ignorableWhitespace(ctxt->userData,
2944			                               buf, nbchar);
2945		} else {
2946		    htmlCheckParagraph(ctxt);
2947		    if (ctxt->sax->characters != NULL)
2948			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2949		}
2950	    }
2951	    nbchar = 0;
2952	}
2953	NEXTL(l);
2954        chunk++;
2955        if (chunk > HTML_PARSER_BUFFER_SIZE) {
2956            chunk = 0;
2957            SHRINK;
2958            GROW;
2959        }
2960	cur = CUR_CHAR(l);
2961	if (cur == 0) {
2962	    SHRINK;
2963	    GROW;
2964	    cur = CUR_CHAR(l);
2965	}
2966    }
2967    if (nbchar != 0) {
2968        buf[nbchar] = 0;
2969
2970	/*
2971	 * Ok the segment is to be consumed as chars.
2972	 */
2973	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2974	    if (areBlanks(ctxt, buf, nbchar)) {
2975		if (ctxt->sax->ignorableWhitespace != NULL)
2976		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2977	    } else {
2978		htmlCheckParagraph(ctxt);
2979		if (ctxt->sax->characters != NULL)
2980		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2981	    }
2982	}
2983    } else {
2984	/*
2985	 * Loop detection
2986	 */
2987	if (cur == 0)
2988	    ctxt->instate = XML_PARSER_EOF;
2989    }
2990}
2991
2992/**
2993 * htmlParseExternalID:
2994 * @ctxt:  an HTML parser context
2995 * @publicID:  a xmlChar** receiving PubidLiteral
2996 *
2997 * Parse an External ID or a Public ID
2998 *
2999 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3000 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3001 *
3002 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3003 *
3004 * Returns the function returns SystemLiteral and in the second
3005 *                case publicID receives PubidLiteral, is strict is off
3006 *                it is possible to return NULL and have publicID set.
3007 */
3008
3009static xmlChar *
3010htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3011    xmlChar *URI = NULL;
3012
3013    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3014         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3015	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3016        SKIP(6);
3017	if (!IS_BLANK_CH(CUR)) {
3018	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3019	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3020	}
3021        SKIP_BLANKS;
3022	URI = htmlParseSystemLiteral(ctxt);
3023	if (URI == NULL) {
3024	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3025	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3026        }
3027    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3028	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3029	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3030        SKIP(6);
3031	if (!IS_BLANK_CH(CUR)) {
3032	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3033	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3034	}
3035        SKIP_BLANKS;
3036	*publicID = htmlParsePubidLiteral(ctxt);
3037	if (*publicID == NULL) {
3038	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3039	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3040			 NULL, NULL);
3041	}
3042        SKIP_BLANKS;
3043        if ((CUR == '"') || (CUR == '\'')) {
3044	    URI = htmlParseSystemLiteral(ctxt);
3045	}
3046    }
3047    return(URI);
3048}
3049
3050/**
3051 * xmlParsePI:
3052 * @ctxt:  an XML parser context
3053 *
3054 * parse an XML Processing Instruction.
3055 *
3056 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3057 */
3058static void
3059htmlParsePI(htmlParserCtxtPtr ctxt) {
3060    xmlChar *buf = NULL;
3061    int len = 0;
3062    int size = HTML_PARSER_BUFFER_SIZE;
3063    int cur, l;
3064    const xmlChar *target;
3065    xmlParserInputState state;
3066    int count = 0;
3067
3068    if ((RAW == '<') && (NXT(1) == '?')) {
3069	state = ctxt->instate;
3070        ctxt->instate = XML_PARSER_PI;
3071	/*
3072	 * this is a Processing Instruction.
3073	 */
3074	SKIP(2);
3075	SHRINK;
3076
3077	/*
3078	 * Parse the target name and check for special support like
3079	 * namespace.
3080	 */
3081        target = htmlParseName(ctxt);
3082	if (target != NULL) {
3083	    if (RAW == '>') {
3084		SKIP(1);
3085
3086		/*
3087		 * SAX: PI detected.
3088		 */
3089		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3090		    (ctxt->sax->processingInstruction != NULL))
3091		    ctxt->sax->processingInstruction(ctxt->userData,
3092		                                     target, NULL);
3093		ctxt->instate = state;
3094		return;
3095	    }
3096	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3097	    if (buf == NULL) {
3098		htmlErrMemory(ctxt, NULL);
3099		ctxt->instate = state;
3100		return;
3101	    }
3102	    cur = CUR;
3103	    if (!IS_BLANK(cur)) {
3104		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3105			  "ParsePI: PI %s space expected\n", target, NULL);
3106	    }
3107            SKIP_BLANKS;
3108	    cur = CUR_CHAR(l);
3109	    while (IS_CHAR(cur) && (cur != '>')) {
3110		if (len + 5 >= size) {
3111		    xmlChar *tmp;
3112
3113		    size *= 2;
3114		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3115		    if (tmp == NULL) {
3116			htmlErrMemory(ctxt, NULL);
3117			xmlFree(buf);
3118			ctxt->instate = state;
3119			return;
3120		    }
3121		    buf = tmp;
3122		}
3123		count++;
3124		if (count > 50) {
3125		    GROW;
3126		    count = 0;
3127		}
3128		COPY_BUF(l,buf,len,cur);
3129		NEXTL(l);
3130		cur = CUR_CHAR(l);
3131		if (cur == 0) {
3132		    SHRINK;
3133		    GROW;
3134		    cur = CUR_CHAR(l);
3135		}
3136	    }
3137	    buf[len] = 0;
3138	    if (cur != '>') {
3139		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3140		      "ParsePI: PI %s never end ...\n", target, NULL);
3141	    } else {
3142		SKIP(1);
3143
3144		/*
3145		 * SAX: PI detected.
3146		 */
3147		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3148		    (ctxt->sax->processingInstruction != NULL))
3149		    ctxt->sax->processingInstruction(ctxt->userData,
3150		                                     target, buf);
3151	    }
3152	    xmlFree(buf);
3153	} else {
3154	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3155                         "PI is not started correctly", NULL, NULL);
3156	}
3157	ctxt->instate = state;
3158    }
3159}
3160
3161/**
3162 * htmlParseComment:
3163 * @ctxt:  an HTML parser context
3164 *
3165 * Parse an XML (SGML) comment <!-- .... -->
3166 *
3167 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3168 */
3169static void
3170htmlParseComment(htmlParserCtxtPtr ctxt) {
3171    xmlChar *buf = NULL;
3172    int len;
3173    int size = HTML_PARSER_BUFFER_SIZE;
3174    int q, ql;
3175    int r, rl;
3176    int cur, l;
3177    xmlParserInputState state;
3178
3179    /*
3180     * Check that there is a comment right here.
3181     */
3182    if ((RAW != '<') || (NXT(1) != '!') ||
3183        (NXT(2) != '-') || (NXT(3) != '-')) return;
3184
3185    state = ctxt->instate;
3186    ctxt->instate = XML_PARSER_COMMENT;
3187    SHRINK;
3188    SKIP(4);
3189    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3190    if (buf == NULL) {
3191        htmlErrMemory(ctxt, "buffer allocation failed\n");
3192	ctxt->instate = state;
3193	return;
3194    }
3195    q = CUR_CHAR(ql);
3196    NEXTL(ql);
3197    r = CUR_CHAR(rl);
3198    NEXTL(rl);
3199    cur = CUR_CHAR(l);
3200    len = 0;
3201    while (IS_CHAR(cur) &&
3202           ((cur != '>') ||
3203	    (r != '-') || (q != '-'))) {
3204	if (len + 5 >= size) {
3205	    xmlChar *tmp;
3206
3207	    size *= 2;
3208	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3209	    if (tmp == NULL) {
3210	        xmlFree(buf);
3211	        htmlErrMemory(ctxt, "growing buffer failed\n");
3212		ctxt->instate = state;
3213		return;
3214	    }
3215	    buf = tmp;
3216	}
3217	COPY_BUF(ql,buf,len,q);
3218	q = r;
3219	ql = rl;
3220	r = cur;
3221	rl = l;
3222	NEXTL(l);
3223	cur = CUR_CHAR(l);
3224	if (cur == 0) {
3225	    SHRINK;
3226	    GROW;
3227	    cur = CUR_CHAR(l);
3228	}
3229    }
3230    buf[len] = 0;
3231    if (!IS_CHAR(cur)) {
3232	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3233	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
3234	xmlFree(buf);
3235    } else {
3236        NEXT;
3237	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3238	    (!ctxt->disableSAX))
3239	    ctxt->sax->comment(ctxt->userData, buf);
3240	xmlFree(buf);
3241    }
3242    ctxt->instate = state;
3243}
3244
3245/**
3246 * htmlParseCharRef:
3247 * @ctxt:  an HTML parser context
3248 *
3249 * parse Reference declarations
3250 *
3251 * [66] CharRef ::= '&#' [0-9]+ ';' |
3252 *                  '&#x' [0-9a-fA-F]+ ';'
3253 *
3254 * Returns the value parsed (as an int)
3255 */
3256int
3257htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3258    int val = 0;
3259
3260    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3261	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3262		     "htmlParseCharRef: context error\n",
3263		     NULL, NULL);
3264        return(0);
3265    }
3266    if ((CUR == '&') && (NXT(1) == '#') &&
3267        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3268	SKIP(3);
3269	while (CUR != ';') {
3270	    if ((CUR >= '0') && (CUR <= '9'))
3271	        val = val * 16 + (CUR - '0');
3272	    else if ((CUR >= 'a') && (CUR <= 'f'))
3273	        val = val * 16 + (CUR - 'a') + 10;
3274	    else if ((CUR >= 'A') && (CUR <= 'F'))
3275	        val = val * 16 + (CUR - 'A') + 10;
3276	    else {
3277	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3278		             "htmlParseCharRef: missing semicolumn\n",
3279			     NULL, NULL);
3280		break;
3281	    }
3282	    NEXT;
3283	}
3284	if (CUR == ';')
3285	    NEXT;
3286    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3287	SKIP(2);
3288	while (CUR != ';') {
3289	    if ((CUR >= '0') && (CUR <= '9'))
3290	        val = val * 10 + (CUR - '0');
3291	    else {
3292	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3293		             "htmlParseCharRef: missing semicolumn\n",
3294			     NULL, NULL);
3295		break;
3296	    }
3297	    NEXT;
3298	}
3299	if (CUR == ';')
3300	    NEXT;
3301    } else {
3302	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3303	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3304    }
3305    /*
3306     * Check the value IS_CHAR ...
3307     */
3308    if (IS_CHAR(val)) {
3309        return(val);
3310    } else {
3311	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3312			"htmlParseCharRef: invalid xmlChar value %d\n",
3313			val);
3314    }
3315    return(0);
3316}
3317
3318
3319/**
3320 * htmlParseDocTypeDecl:
3321 * @ctxt:  an HTML parser context
3322 *
3323 * parse a DOCTYPE declaration
3324 *
3325 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3326 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3327 */
3328
3329static void
3330htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3331    const xmlChar *name;
3332    xmlChar *ExternalID = NULL;
3333    xmlChar *URI = NULL;
3334
3335    /*
3336     * We know that '<!DOCTYPE' has been detected.
3337     */
3338    SKIP(9);
3339
3340    SKIP_BLANKS;
3341
3342    /*
3343     * Parse the DOCTYPE name.
3344     */
3345    name = htmlParseName(ctxt);
3346    if (name == NULL) {
3347	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3348	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3349		     NULL, NULL);
3350    }
3351    /*
3352     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3353     */
3354
3355    SKIP_BLANKS;
3356
3357    /*
3358     * Check for SystemID and ExternalID
3359     */
3360    URI = htmlParseExternalID(ctxt, &ExternalID);
3361    SKIP_BLANKS;
3362
3363    /*
3364     * We should be at the end of the DOCTYPE declaration.
3365     */
3366    if (CUR != '>') {
3367	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3368	             "DOCTYPE improperly terminated\n", NULL, NULL);
3369        /* We shouldn't try to resynchronize ... */
3370    }
3371    NEXT;
3372
3373    /*
3374     * Create or update the document accordingly to the DOCTYPE
3375     */
3376    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3377	(!ctxt->disableSAX))
3378	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3379
3380    /*
3381     * Cleanup, since we don't use all those identifiers
3382     */
3383    if (URI != NULL) xmlFree(URI);
3384    if (ExternalID != NULL) xmlFree(ExternalID);
3385}
3386
3387/**
3388 * htmlParseAttribute:
3389 * @ctxt:  an HTML parser context
3390 * @value:  a xmlChar ** used to store the value of the attribute
3391 *
3392 * parse an attribute
3393 *
3394 * [41] Attribute ::= Name Eq AttValue
3395 *
3396 * [25] Eq ::= S? '=' S?
3397 *
3398 * With namespace:
3399 *
3400 * [NS 11] Attribute ::= QName Eq AttValue
3401 *
3402 * Also the case QName == xmlns:??? is handled independently as a namespace
3403 * definition.
3404 *
3405 * Returns the attribute name, and the value in *value.
3406 */
3407
3408static const xmlChar *
3409htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3410    const xmlChar *name;
3411    xmlChar *val = NULL;
3412
3413    *value = NULL;
3414    name = htmlParseHTMLName(ctxt);
3415    if (name == NULL) {
3416	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3417	             "error parsing attribute name\n", NULL, NULL);
3418        return(NULL);
3419    }
3420
3421    /*
3422     * read the value
3423     */
3424    SKIP_BLANKS;
3425    if (CUR == '=') {
3426        NEXT;
3427	SKIP_BLANKS;
3428	val = htmlParseAttValue(ctxt);
3429    }
3430
3431    *value = val;
3432    return(name);
3433}
3434
3435/**
3436 * htmlCheckEncoding:
3437 * @ctxt:  an HTML parser context
3438 * @attvalue: the attribute value
3439 *
3440 * Checks an http-equiv attribute from a Meta tag to detect
3441 * the encoding
3442 * If a new encoding is detected the parser is switched to decode
3443 * it and pass UTF8
3444 */
3445static void
3446htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3447    const xmlChar *encoding;
3448
3449    if ((ctxt == NULL) || (attvalue == NULL))
3450	return;
3451
3452    /* do not change encoding */
3453    if (ctxt->input->encoding != NULL)
3454        return;
3455
3456    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3457    if (encoding != NULL) {
3458	encoding += 8;
3459    } else {
3460	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3461	if (encoding != NULL)
3462	    encoding += 9;
3463    }
3464    if (encoding != NULL) {
3465	xmlCharEncoding enc;
3466	xmlCharEncodingHandlerPtr handler;
3467
3468	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3469
3470	if (ctxt->input->encoding != NULL)
3471	    xmlFree((xmlChar *) ctxt->input->encoding);
3472	ctxt->input->encoding = xmlStrdup(encoding);
3473
3474	enc = xmlParseCharEncoding((const char *) encoding);
3475	/*
3476	 * registered set of known encodings
3477	 */
3478	if (enc != XML_CHAR_ENCODING_ERROR) {
3479	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3480	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3481		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3482		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3483		(ctxt->input->buf != NULL) &&
3484		(ctxt->input->buf->encoder == NULL)) {
3485		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3486		             "htmlCheckEncoding: wrong encoding meta\n",
3487			     NULL, NULL);
3488	    } else {
3489		xmlSwitchEncoding(ctxt, enc);
3490	    }
3491	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3492	} else {
3493	    /*
3494	     * fallback for unknown encodings
3495	     */
3496	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3497	    if (handler != NULL) {
3498		xmlSwitchToEncoding(ctxt, handler);
3499		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3500	    } else {
3501		ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3502	    }
3503	}
3504
3505	if ((ctxt->input->buf != NULL) &&
3506	    (ctxt->input->buf->encoder != NULL) &&
3507	    (ctxt->input->buf->raw != NULL) &&
3508	    (ctxt->input->buf->buffer != NULL)) {
3509	    int nbchars;
3510	    int processed;
3511
3512	    /*
3513	     * convert as much as possible to the parser reading buffer.
3514	     */
3515	    processed = ctxt->input->cur - ctxt->input->base;
3516	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3517	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3518		                       ctxt->input->buf->buffer,
3519				       ctxt->input->buf->raw);
3520	    if (nbchars < 0) {
3521		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3522		             "htmlCheckEncoding: encoder error\n",
3523			     NULL, NULL);
3524	    }
3525	    ctxt->input->base =
3526	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3527            ctxt->input->end =
3528                          &ctxt->input->base[ctxt->input->buf->buffer->use];
3529	}
3530    }
3531}
3532
3533/**
3534 * htmlCheckMeta:
3535 * @ctxt:  an HTML parser context
3536 * @atts:  the attributes values
3537 *
3538 * Checks an attributes from a Meta tag
3539 */
3540static void
3541htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3542    int i;
3543    const xmlChar *att, *value;
3544    int http = 0;
3545    const xmlChar *content = NULL;
3546
3547    if ((ctxt == NULL) || (atts == NULL))
3548	return;
3549
3550    i = 0;
3551    att = atts[i++];
3552    while (att != NULL) {
3553	value = atts[i++];
3554	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3555	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3556	    http = 1;
3557	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3558	    content = value;
3559	att = atts[i++];
3560    }
3561    if ((http) && (content != NULL))
3562	htmlCheckEncoding(ctxt, content);
3563
3564}
3565
3566/**
3567 * htmlParseStartTag:
3568 * @ctxt:  an HTML parser context
3569 *
3570 * parse a start of tag either for rule element or
3571 * EmptyElement. In both case we don't parse the tag closing chars.
3572 *
3573 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3574 *
3575 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3576 *
3577 * With namespace:
3578 *
3579 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3580 *
3581 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3582 *
3583 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3584 */
3585
3586static int
3587htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3588    const xmlChar *name;
3589    const xmlChar *attname;
3590    xmlChar *attvalue;
3591    const xmlChar **atts;
3592    int nbatts = 0;
3593    int maxatts;
3594    int meta = 0;
3595    int i;
3596    int discardtag = 0;
3597
3598    if (ctxt->instate == XML_PARSER_EOF)
3599        return(-1);
3600    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3601	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3602		     "htmlParseStartTag: context error\n", NULL, NULL);
3603	return -1;
3604    }
3605    if (CUR != '<') return -1;
3606    NEXT;
3607
3608    atts = ctxt->atts;
3609    maxatts = ctxt->maxatts;
3610
3611    GROW;
3612    name = htmlParseHTMLName(ctxt);
3613    if (name == NULL) {
3614	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3615	             "htmlParseStartTag: invalid element name\n",
3616		     NULL, NULL);
3617	/* Dump the bogus tag like browsers do */
3618	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3619               (ctxt->instate != XML_PARSER_EOF))
3620	    NEXT;
3621        return -1;
3622    }
3623    if (xmlStrEqual(name, BAD_CAST"meta"))
3624	meta = 1;
3625
3626    /*
3627     * Check for auto-closure of HTML elements.
3628     */
3629    htmlAutoClose(ctxt, name);
3630
3631    /*
3632     * Check for implied HTML elements.
3633     */
3634    htmlCheckImplied(ctxt, name);
3635
3636    /*
3637     * Avoid html at any level > 0, head at any level != 1
3638     * or any attempt to recurse body
3639     */
3640    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3641	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3642	             "htmlParseStartTag: misplaced <html> tag\n",
3643		     name, NULL);
3644	discardtag = 1;
3645	ctxt->depth++;
3646    }
3647    if ((ctxt->nameNr != 1) &&
3648	(xmlStrEqual(name, BAD_CAST"head"))) {
3649	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3650	             "htmlParseStartTag: misplaced <head> tag\n",
3651		     name, NULL);
3652	discardtag = 1;
3653	ctxt->depth++;
3654    }
3655    if (xmlStrEqual(name, BAD_CAST"body")) {
3656	int indx;
3657	for (indx = 0;indx < ctxt->nameNr;indx++) {
3658	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3659		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3660		             "htmlParseStartTag: misplaced <body> tag\n",
3661			     name, NULL);
3662		discardtag = 1;
3663		ctxt->depth++;
3664	    }
3665	}
3666    }
3667
3668    /*
3669     * Now parse the attributes, it ends up with the ending
3670     *
3671     * (S Attribute)* S?
3672     */
3673    SKIP_BLANKS;
3674    while ((IS_CHAR_CH(CUR)) &&
3675           (CUR != '>') &&
3676	   ((CUR != '/') || (NXT(1) != '>'))) {
3677	long cons = ctxt->nbChars;
3678
3679	GROW;
3680	attname = htmlParseAttribute(ctxt, &attvalue);
3681        if (attname != NULL) {
3682
3683	    /*
3684	     * Well formedness requires at most one declaration of an attribute
3685	     */
3686	    for (i = 0; i < nbatts;i += 2) {
3687	        if (xmlStrEqual(atts[i], attname)) {
3688		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3689		                 "Attribute %s redefined\n", attname, NULL);
3690		    if (attvalue != NULL)
3691			xmlFree(attvalue);
3692		    goto failed;
3693		}
3694	    }
3695
3696	    /*
3697	     * Add the pair to atts
3698	     */
3699	    if (atts == NULL) {
3700	        maxatts = 22; /* allow for 10 attrs by default */
3701	        atts = (const xmlChar **)
3702		       xmlMalloc(maxatts * sizeof(xmlChar *));
3703		if (atts == NULL) {
3704		    htmlErrMemory(ctxt, NULL);
3705		    if (attvalue != NULL)
3706			xmlFree(attvalue);
3707		    goto failed;
3708		}
3709		ctxt->atts = atts;
3710		ctxt->maxatts = maxatts;
3711	    } else if (nbatts + 4 > maxatts) {
3712	        const xmlChar **n;
3713
3714	        maxatts *= 2;
3715	        n = (const xmlChar **) xmlRealloc((void *) atts,
3716					     maxatts * sizeof(const xmlChar *));
3717		if (n == NULL) {
3718		    htmlErrMemory(ctxt, NULL);
3719		    if (attvalue != NULL)
3720			xmlFree(attvalue);
3721		    goto failed;
3722		}
3723		atts = n;
3724		ctxt->atts = atts;
3725		ctxt->maxatts = maxatts;
3726	    }
3727	    atts[nbatts++] = attname;
3728	    atts[nbatts++] = attvalue;
3729	    atts[nbatts] = NULL;
3730	    atts[nbatts + 1] = NULL;
3731	}
3732	else {
3733	    if (attvalue != NULL)
3734	        xmlFree(attvalue);
3735	    /* Dump the bogus attribute string up to the next blank or
3736	     * the end of the tag. */
3737	    while ((IS_CHAR_CH(CUR)) &&
3738	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3739		   ((CUR != '/') || (NXT(1) != '>')))
3740		NEXT;
3741	}
3742
3743failed:
3744	SKIP_BLANKS;
3745        if (cons == ctxt->nbChars) {
3746	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3747	                 "htmlParseStartTag: problem parsing attributes\n",
3748			 NULL, NULL);
3749	    break;
3750	}
3751    }
3752
3753    /*
3754     * Handle specific association to the META tag
3755     */
3756    if (meta && (nbatts != 0))
3757	htmlCheckMeta(ctxt, atts);
3758
3759    /*
3760     * SAX: Start of Element !
3761     */
3762    if (!discardtag) {
3763	htmlnamePush(ctxt, name);
3764	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3765	    if (nbatts != 0)
3766		ctxt->sax->startElement(ctxt->userData, name, atts);
3767	    else
3768		ctxt->sax->startElement(ctxt->userData, name, NULL);
3769	}
3770    }
3771
3772    if (atts != NULL) {
3773        for (i = 1;i < nbatts;i += 2) {
3774	    if (atts[i] != NULL)
3775		xmlFree((xmlChar *) atts[i]);
3776	}
3777    }
3778
3779    return(discardtag);
3780}
3781
3782/**
3783 * htmlParseEndTag:
3784 * @ctxt:  an HTML parser context
3785 *
3786 * parse an end of tag
3787 *
3788 * [42] ETag ::= '</' Name S? '>'
3789 *
3790 * With namespace
3791 *
3792 * [NS 9] ETag ::= '</' QName S? '>'
3793 *
3794 * Returns 1 if the current level should be closed.
3795 */
3796
3797static int
3798htmlParseEndTag(htmlParserCtxtPtr ctxt)
3799{
3800    const xmlChar *name;
3801    const xmlChar *oldname;
3802    int i, ret;
3803
3804    if ((CUR != '<') || (NXT(1) != '/')) {
3805        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3806	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3807        return (0);
3808    }
3809    SKIP(2);
3810
3811    name = htmlParseHTMLName(ctxt);
3812    if (name == NULL)
3813        return (0);
3814    /*
3815     * We should definitely be at the ending "S? '>'" part
3816     */
3817    SKIP_BLANKS;
3818    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3819        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3820	             "End tag : expected '>'\n", NULL, NULL);
3821	if (ctxt->recovery) {
3822	    /*
3823	     * We're not at the ending > !!
3824	     * Error, unless in recover mode where we search forwards
3825	     * until we find a >
3826	     */
3827	    while (CUR != '\0' && CUR != '>') NEXT;
3828	    NEXT;
3829	}
3830    } else
3831        NEXT;
3832
3833    /*
3834     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3835     * out now.
3836     */
3837    if ((ctxt->depth > 0) &&
3838        (xmlStrEqual(name, BAD_CAST "html") ||
3839         xmlStrEqual(name, BAD_CAST "body") ||
3840	 xmlStrEqual(name, BAD_CAST "head"))) {
3841	ctxt->depth--;
3842	return (0);
3843    }
3844
3845    /*
3846     * If the name read is not one of the element in the parsing stack
3847     * then return, it's just an error.
3848     */
3849    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3850        if (xmlStrEqual(name, ctxt->nameTab[i]))
3851            break;
3852    }
3853    if (i < 0) {
3854        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3855	             "Unexpected end tag : %s\n", name, NULL);
3856        return (0);
3857    }
3858
3859
3860    /*
3861     * Check for auto-closure of HTML elements.
3862     */
3863
3864    htmlAutoCloseOnClose(ctxt, name);
3865
3866    /*
3867     * Well formedness constraints, opening and closing must match.
3868     * With the exception that the autoclose may have popped stuff out
3869     * of the stack.
3870     */
3871    if (!xmlStrEqual(name, ctxt->name)) {
3872        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3873            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3874	                 "Opening and ending tag mismatch: %s and %s\n",
3875			 name, ctxt->name);
3876        }
3877    }
3878
3879    /*
3880     * SAX: End of Tag
3881     */
3882    oldname = ctxt->name;
3883    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3884        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3885            ctxt->sax->endElement(ctxt->userData, name);
3886        htmlnamePop(ctxt);
3887        ret = 1;
3888    } else {
3889        ret = 0;
3890    }
3891
3892    return (ret);
3893}
3894
3895
3896/**
3897 * htmlParseReference:
3898 * @ctxt:  an HTML parser context
3899 *
3900 * parse and handle entity references in content,
3901 * this will end-up in a call to character() since this is either a
3902 * CharRef, or a predefined entity.
3903 */
3904static void
3905htmlParseReference(htmlParserCtxtPtr ctxt) {
3906    const htmlEntityDesc * ent;
3907    xmlChar out[6];
3908    const xmlChar *name;
3909    if (CUR != '&') return;
3910
3911    if (NXT(1) == '#') {
3912	unsigned int c;
3913	int bits, i = 0;
3914
3915	c = htmlParseCharRef(ctxt);
3916	if (c == 0)
3917	    return;
3918
3919        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3920        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3921        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3922        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3923
3924        for ( ; bits >= 0; bits-= 6) {
3925            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3926        }
3927	out[i] = 0;
3928
3929	htmlCheckParagraph(ctxt);
3930	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3931	    ctxt->sax->characters(ctxt->userData, out, i);
3932    } else {
3933	ent = htmlParseEntityRef(ctxt, &name);
3934	if (name == NULL) {
3935	    htmlCheckParagraph(ctxt);
3936	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3937	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3938	    return;
3939	}
3940	if ((ent == NULL) || !(ent->value > 0)) {
3941	    htmlCheckParagraph(ctxt);
3942	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3943		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3944		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3945		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3946	    }
3947	} else {
3948	    unsigned int c;
3949	    int bits, i = 0;
3950
3951	    c = ent->value;
3952	    if      (c <    0x80)
3953	            { out[i++]= c;                bits= -6; }
3954	    else if (c <   0x800)
3955	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3956	    else if (c < 0x10000)
3957	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3958	    else
3959	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3960
3961	    for ( ; bits >= 0; bits-= 6) {
3962		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3963	    }
3964	    out[i] = 0;
3965
3966	    htmlCheckParagraph(ctxt);
3967	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3968		ctxt->sax->characters(ctxt->userData, out, i);
3969	}
3970    }
3971}
3972
3973/**
3974 * htmlParseContent:
3975 * @ctxt:  an HTML parser context
3976 *
3977 * Parse a content: comment, sub-element, reference or text.
3978 * Kept for compatibility with old code
3979 */
3980
3981static void
3982htmlParseContent(htmlParserCtxtPtr ctxt) {
3983    xmlChar *currentNode;
3984    int depth;
3985    const xmlChar *name;
3986
3987    currentNode = xmlStrdup(ctxt->name);
3988    depth = ctxt->nameNr;
3989    while (1) {
3990	long cons = ctxt->nbChars;
3991
3992        GROW;
3993
3994        if (ctxt->instate == XML_PARSER_EOF)
3995            break;
3996
3997	/*
3998	 * Our tag or one of it's parent or children is ending.
3999	 */
4000        if ((CUR == '<') && (NXT(1) == '/')) {
4001	    if (htmlParseEndTag(ctxt) &&
4002		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4003		if (currentNode != NULL)
4004		    xmlFree(currentNode);
4005		return;
4006	    }
4007	    continue; /* while */
4008        }
4009
4010	else if ((CUR == '<') &&
4011	         ((IS_ASCII_LETTER(NXT(1))) ||
4012		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4013	    name = htmlParseHTMLName_nonInvasive(ctxt);
4014	    if (name == NULL) {
4015	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4016			 "htmlParseStartTag: invalid element name\n",
4017			 NULL, NULL);
4018	        /* Dump the bogus tag like browsers do */
4019        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4020	            NEXT;
4021
4022	        if (currentNode != NULL)
4023	            xmlFree(currentNode);
4024	        return;
4025	    }
4026
4027	    if (ctxt->name != NULL) {
4028	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4029	            htmlAutoClose(ctxt, name);
4030	            continue;
4031	        }
4032	    }
4033	}
4034
4035	/*
4036	 * Has this node been popped out during parsing of
4037	 * the next element
4038	 */
4039        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4040	    (!xmlStrEqual(currentNode, ctxt->name)))
4041	     {
4042	    if (currentNode != NULL) xmlFree(currentNode);
4043	    return;
4044	}
4045
4046	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4047	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4048	    /*
4049	     * Handle SCRIPT/STYLE separately
4050	     */
4051	    htmlParseScript(ctxt);
4052	} else {
4053	    /*
4054	     * Sometimes DOCTYPE arrives in the middle of the document
4055	     */
4056	    if ((CUR == '<') && (NXT(1) == '!') &&
4057		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4058		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4059		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4060		(UPP(8) == 'E')) {
4061		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4062		             "Misplaced DOCTYPE declaration\n",
4063			     BAD_CAST "DOCTYPE" , NULL);
4064		htmlParseDocTypeDecl(ctxt);
4065	    }
4066
4067	    /*
4068	     * First case :  a comment
4069	     */
4070	    if ((CUR == '<') && (NXT(1) == '!') &&
4071		(NXT(2) == '-') && (NXT(3) == '-')) {
4072		htmlParseComment(ctxt);
4073	    }
4074
4075	    /*
4076	     * Second case : a Processing Instruction.
4077	     */
4078	    else if ((CUR == '<') && (NXT(1) == '?')) {
4079		htmlParsePI(ctxt);
4080	    }
4081
4082	    /*
4083	     * Third case :  a sub-element.
4084	     */
4085	    else if (CUR == '<') {
4086		htmlParseElement(ctxt);
4087	    }
4088
4089	    /*
4090	     * Fourth case : a reference. If if has not been resolved,
4091	     *    parsing returns it's Name, create the node
4092	     */
4093	    else if (CUR == '&') {
4094		htmlParseReference(ctxt);
4095	    }
4096
4097	    /*
4098	     * Fifth case : end of the resource
4099	     */
4100	    else if (CUR == 0) {
4101		htmlAutoCloseOnEnd(ctxt);
4102		break;
4103	    }
4104
4105	    /*
4106	     * Last case, text. Note that References are handled directly.
4107	     */
4108	    else {
4109		htmlParseCharData(ctxt);
4110	    }
4111
4112	    if (cons == ctxt->nbChars) {
4113		if (ctxt->node != NULL) {
4114		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4115		                 "detected an error in element content\n",
4116				 NULL, NULL);
4117		}
4118		break;
4119	    }
4120	}
4121        GROW;
4122    }
4123    if (currentNode != NULL) xmlFree(currentNode);
4124}
4125
4126/**
4127 * htmlParseElement:
4128 * @ctxt:  an HTML parser context
4129 *
4130 * parse an HTML element, this is highly recursive
4131 * this is kept for compatibility with previous code versions
4132 *
4133 * [39] element ::= EmptyElemTag | STag content ETag
4134 *
4135 * [41] Attribute ::= Name Eq AttValue
4136 */
4137
4138void
4139htmlParseElement(htmlParserCtxtPtr ctxt) {
4140    const xmlChar *name;
4141    xmlChar *currentNode = NULL;
4142    const htmlElemDesc * info;
4143    htmlParserNodeInfo node_info;
4144    int failed;
4145    int depth;
4146    const xmlChar *oldptr;
4147
4148    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4149	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4150		     "htmlParseElement: context error\n", NULL, NULL);
4151	return;
4152    }
4153
4154    if (ctxt->instate == XML_PARSER_EOF)
4155        return;
4156
4157    /* Capture start position */
4158    if (ctxt->record_info) {
4159        node_info.begin_pos = ctxt->input->consumed +
4160                          (CUR_PTR - ctxt->input->base);
4161	node_info.begin_line = ctxt->input->line;
4162    }
4163
4164    failed = htmlParseStartTag(ctxt);
4165    name = ctxt->name;
4166    if ((failed == -1) || (name == NULL)) {
4167	if (CUR == '>')
4168	    NEXT;
4169        return;
4170    }
4171
4172    /*
4173     * Lookup the info for that element.
4174     */
4175    info = htmlTagLookup(name);
4176    if (info == NULL) {
4177	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4178	             "Tag %s invalid\n", name, NULL);
4179    }
4180
4181    /*
4182     * Check for an Empty Element labeled the XML/SGML way
4183     */
4184    if ((CUR == '/') && (NXT(1) == '>')) {
4185        SKIP(2);
4186	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4187	    ctxt->sax->endElement(ctxt->userData, name);
4188	htmlnamePop(ctxt);
4189	return;
4190    }
4191
4192    if (CUR == '>') {
4193        NEXT;
4194    } else {
4195	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4196	             "Couldn't find end of Start Tag %s\n", name, NULL);
4197
4198	/*
4199	 * end of parsing of this node.
4200	 */
4201	if (xmlStrEqual(name, ctxt->name)) {
4202	    nodePop(ctxt);
4203	    htmlnamePop(ctxt);
4204	}
4205
4206	/*
4207	 * Capture end position and add node
4208	 */
4209	if (ctxt->record_info) {
4210	   node_info.end_pos = ctxt->input->consumed +
4211			      (CUR_PTR - ctxt->input->base);
4212	   node_info.end_line = ctxt->input->line;
4213	   node_info.node = ctxt->node;
4214	   xmlParserAddNodeInfo(ctxt, &node_info);
4215	}
4216	return;
4217    }
4218
4219    /*
4220     * Check for an Empty Element from DTD definition
4221     */
4222    if ((info != NULL) && (info->empty)) {
4223	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4224	    ctxt->sax->endElement(ctxt->userData, name);
4225	htmlnamePop(ctxt);
4226	return;
4227    }
4228
4229    /*
4230     * Parse the content of the element:
4231     */
4232    currentNode = xmlStrdup(ctxt->name);
4233    depth = ctxt->nameNr;
4234    while (IS_CHAR_CH(CUR)) {
4235	oldptr = ctxt->input->cur;
4236	htmlParseContent(ctxt);
4237	if (oldptr==ctxt->input->cur) break;
4238	if (ctxt->nameNr < depth) break;
4239    }
4240
4241    /*
4242     * Capture end position and add node
4243     */
4244    if ( currentNode != NULL && ctxt->record_info ) {
4245       node_info.end_pos = ctxt->input->consumed +
4246                          (CUR_PTR - ctxt->input->base);
4247       node_info.end_line = ctxt->input->line;
4248       node_info.node = ctxt->node;
4249       xmlParserAddNodeInfo(ctxt, &node_info);
4250    }
4251    if (!IS_CHAR_CH(CUR)) {
4252	htmlAutoCloseOnEnd(ctxt);
4253    }
4254
4255    if (currentNode != NULL)
4256	xmlFree(currentNode);
4257}
4258
4259static void
4260htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4261    /*
4262     * Capture end position and add node
4263     */
4264    if ( ctxt->node != NULL && ctxt->record_info ) {
4265       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4266                                (CUR_PTR - ctxt->input->base);
4267       ctxt->nodeInfo->end_line = ctxt->input->line;
4268       ctxt->nodeInfo->node = ctxt->node;
4269       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4270       htmlNodeInfoPop(ctxt);
4271    }
4272    if (!IS_CHAR_CH(CUR)) {
4273       htmlAutoCloseOnEnd(ctxt);
4274    }
4275}
4276
4277/**
4278 * htmlParseElementInternal:
4279 * @ctxt:  an HTML parser context
4280 *
4281 * parse an HTML element, new version, non recursive
4282 *
4283 * [39] element ::= EmptyElemTag | STag content ETag
4284 *
4285 * [41] Attribute ::= Name Eq AttValue
4286 */
4287
4288static void
4289htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4290    const xmlChar *name;
4291    const htmlElemDesc * info;
4292    htmlParserNodeInfo node_info;
4293    int failed;
4294
4295    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4296	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4297		     "htmlParseElementInternal: context error\n", NULL, NULL);
4298	return;
4299    }
4300
4301    if (ctxt->instate == XML_PARSER_EOF)
4302        return;
4303
4304    /* Capture start position */
4305    if (ctxt->record_info) {
4306        node_info.begin_pos = ctxt->input->consumed +
4307                          (CUR_PTR - ctxt->input->base);
4308	node_info.begin_line = ctxt->input->line;
4309    }
4310
4311    failed = htmlParseStartTag(ctxt);
4312    name = ctxt->name;
4313    if ((failed == -1) || (name == NULL)) {
4314	if (CUR == '>')
4315	    NEXT;
4316        return;
4317    }
4318
4319    /*
4320     * Lookup the info for that element.
4321     */
4322    info = htmlTagLookup(name);
4323    if (info == NULL) {
4324	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4325	             "Tag %s invalid\n", name, NULL);
4326    }
4327
4328    /*
4329     * Check for an Empty Element labeled the XML/SGML way
4330     */
4331    if ((CUR == '/') && (NXT(1) == '>')) {
4332        SKIP(2);
4333	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4334	    ctxt->sax->endElement(ctxt->userData, name);
4335	htmlnamePop(ctxt);
4336	return;
4337    }
4338
4339    if (CUR == '>') {
4340        NEXT;
4341    } else {
4342	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4343	             "Couldn't find end of Start Tag %s\n", name, NULL);
4344
4345	/*
4346	 * end of parsing of this node.
4347	 */
4348	if (xmlStrEqual(name, ctxt->name)) {
4349	    nodePop(ctxt);
4350	    htmlnamePop(ctxt);
4351	}
4352
4353        if (ctxt->record_info)
4354            htmlNodeInfoPush(ctxt, &node_info);
4355        htmlParserFinishElementParsing(ctxt);
4356	return;
4357    }
4358
4359    /*
4360     * Check for an Empty Element from DTD definition
4361     */
4362    if ((info != NULL) && (info->empty)) {
4363	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4364	    ctxt->sax->endElement(ctxt->userData, name);
4365	htmlnamePop(ctxt);
4366	return;
4367    }
4368
4369    if (ctxt->record_info)
4370        htmlNodeInfoPush(ctxt, &node_info);
4371}
4372
4373/**
4374 * htmlParseContentInternal:
4375 * @ctxt:  an HTML parser context
4376 *
4377 * Parse a content: comment, sub-element, reference or text.
4378 * New version for non recursive htmlParseElementInternal
4379 */
4380
4381static void
4382htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4383    xmlChar *currentNode;
4384    int depth;
4385    const xmlChar *name;
4386
4387    currentNode = xmlStrdup(ctxt->name);
4388    depth = ctxt->nameNr;
4389    while (1) {
4390	long cons = ctxt->nbChars;
4391
4392        GROW;
4393
4394        if (ctxt->instate == XML_PARSER_EOF)
4395            break;
4396
4397	/*
4398	 * Our tag or one of it's parent or children is ending.
4399	 */
4400        if ((CUR == '<') && (NXT(1) == '/')) {
4401	    if (htmlParseEndTag(ctxt) &&
4402		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4403		if (currentNode != NULL)
4404		    xmlFree(currentNode);
4405
4406	        currentNode = xmlStrdup(ctxt->name);
4407	        depth = ctxt->nameNr;
4408	    }
4409	    continue; /* while */
4410        }
4411
4412	else if ((CUR == '<') &&
4413	         ((IS_ASCII_LETTER(NXT(1))) ||
4414		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4415	    name = htmlParseHTMLName_nonInvasive(ctxt);
4416	    if (name == NULL) {
4417	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4418			 "htmlParseStartTag: invalid element name\n",
4419			 NULL, NULL);
4420	        /* Dump the bogus tag like browsers do */
4421	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4422	            NEXT;
4423
4424	        htmlParserFinishElementParsing(ctxt);
4425	        if (currentNode != NULL)
4426	            xmlFree(currentNode);
4427
4428	        currentNode = xmlStrdup(ctxt->name);
4429	        depth = ctxt->nameNr;
4430	        continue;
4431	    }
4432
4433	    if (ctxt->name != NULL) {
4434	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4435	            htmlAutoClose(ctxt, name);
4436	            continue;
4437	        }
4438	    }
4439	}
4440
4441	/*
4442	 * Has this node been popped out during parsing of
4443	 * the next element
4444	 */
4445        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4446	    (!xmlStrEqual(currentNode, ctxt->name)))
4447	     {
4448	    htmlParserFinishElementParsing(ctxt);
4449	    if (currentNode != NULL) xmlFree(currentNode);
4450
4451	    currentNode = xmlStrdup(ctxt->name);
4452	    depth = ctxt->nameNr;
4453	    continue;
4454	}
4455
4456	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4457	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4458	    /*
4459	     * Handle SCRIPT/STYLE separately
4460	     */
4461	    htmlParseScript(ctxt);
4462	} else {
4463	    /*
4464	     * Sometimes DOCTYPE arrives in the middle of the document
4465	     */
4466	    if ((CUR == '<') && (NXT(1) == '!') &&
4467		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4468		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4469		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4470		(UPP(8) == 'E')) {
4471		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4472		             "Misplaced DOCTYPE declaration\n",
4473			     BAD_CAST "DOCTYPE" , NULL);
4474		htmlParseDocTypeDecl(ctxt);
4475	    }
4476
4477	    /*
4478	     * First case :  a comment
4479	     */
4480	    if ((CUR == '<') && (NXT(1) == '!') &&
4481		(NXT(2) == '-') && (NXT(3) == '-')) {
4482		htmlParseComment(ctxt);
4483	    }
4484
4485	    /*
4486	     * Second case : a Processing Instruction.
4487	     */
4488	    else if ((CUR == '<') && (NXT(1) == '?')) {
4489		htmlParsePI(ctxt);
4490	    }
4491
4492	    /*
4493	     * Third case :  a sub-element.
4494	     */
4495	    else if (CUR == '<') {
4496		htmlParseElementInternal(ctxt);
4497		if (currentNode != NULL) xmlFree(currentNode);
4498
4499		currentNode = xmlStrdup(ctxt->name);
4500		depth = ctxt->nameNr;
4501	    }
4502
4503	    /*
4504	     * Fourth case : a reference. If if has not been resolved,
4505	     *    parsing returns it's Name, create the node
4506	     */
4507	    else if (CUR == '&') {
4508		htmlParseReference(ctxt);
4509	    }
4510
4511	    /*
4512	     * Fifth case : end of the resource
4513	     */
4514	    else if (CUR == 0) {
4515		htmlAutoCloseOnEnd(ctxt);
4516		break;
4517	    }
4518
4519	    /*
4520	     * Last case, text. Note that References are handled directly.
4521	     */
4522	    else {
4523		htmlParseCharData(ctxt);
4524	    }
4525
4526	    if (cons == ctxt->nbChars) {
4527		if (ctxt->node != NULL) {
4528		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4529		                 "detected an error in element content\n",
4530				 NULL, NULL);
4531		}
4532		break;
4533	    }
4534	}
4535        GROW;
4536    }
4537    if (currentNode != NULL) xmlFree(currentNode);
4538}
4539
4540/**
4541 * htmlParseContent:
4542 * @ctxt:  an HTML parser context
4543 *
4544 * Parse a content: comment, sub-element, reference or text.
4545 * This is the entry point when called from parser.c
4546 */
4547
4548void
4549__htmlParseContent(void *ctxt) {
4550    if (ctxt != NULL)
4551	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4552}
4553
4554/**
4555 * htmlParseDocument:
4556 * @ctxt:  an HTML parser context
4557 *
4558 * parse an HTML document (and build a tree if using the standard SAX
4559 * interface).
4560 *
4561 * Returns 0, -1 in case of error. the parser context is augmented
4562 *                as a result of the parsing.
4563 */
4564
4565int
4566htmlParseDocument(htmlParserCtxtPtr ctxt) {
4567    xmlChar start[4];
4568    xmlCharEncoding enc;
4569    xmlDtdPtr dtd;
4570
4571    xmlInitParser();
4572
4573    htmlDefaultSAXHandlerInit();
4574
4575    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4576	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4577		     "htmlParseDocument: context error\n", NULL, NULL);
4578	return(XML_ERR_INTERNAL_ERROR);
4579    }
4580    ctxt->html = 1;
4581    ctxt->linenumbers = 1;
4582    GROW;
4583    /*
4584     * SAX: beginning of the document processing.
4585     */
4586    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4587        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4588
4589    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4590        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4591	/*
4592	 * Get the 4 first bytes and decode the charset
4593	 * if enc != XML_CHAR_ENCODING_NONE
4594	 * plug some encoding conversion routines.
4595	 */
4596	start[0] = RAW;
4597	start[1] = NXT(1);
4598	start[2] = NXT(2);
4599	start[3] = NXT(3);
4600	enc = xmlDetectCharEncoding(&start[0], 4);
4601	if (enc != XML_CHAR_ENCODING_NONE) {
4602	    xmlSwitchEncoding(ctxt, enc);
4603	}
4604    }
4605
4606    /*
4607     * Wipe out everything which is before the first '<'
4608     */
4609    SKIP_BLANKS;
4610    if (CUR == 0) {
4611	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4612	             "Document is empty\n", NULL, NULL);
4613    }
4614
4615    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4616	ctxt->sax->startDocument(ctxt->userData);
4617
4618
4619    /*
4620     * Parse possible comments and PIs before any content
4621     */
4622    while (((CUR == '<') && (NXT(1) == '!') &&
4623            (NXT(2) == '-') && (NXT(3) == '-')) ||
4624	   ((CUR == '<') && (NXT(1) == '?'))) {
4625        htmlParseComment(ctxt);
4626        htmlParsePI(ctxt);
4627	SKIP_BLANKS;
4628    }
4629
4630
4631    /*
4632     * Then possibly doc type declaration(s) and more Misc
4633     * (doctypedecl Misc*)?
4634     */
4635    if ((CUR == '<') && (NXT(1) == '!') &&
4636	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4637	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4638	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4639	(UPP(8) == 'E')) {
4640	htmlParseDocTypeDecl(ctxt);
4641    }
4642    SKIP_BLANKS;
4643
4644    /*
4645     * Parse possible comments and PIs before any content
4646     */
4647    while (((CUR == '<') && (NXT(1) == '!') &&
4648            (NXT(2) == '-') && (NXT(3) == '-')) ||
4649	   ((CUR == '<') && (NXT(1) == '?'))) {
4650        htmlParseComment(ctxt);
4651        htmlParsePI(ctxt);
4652	SKIP_BLANKS;
4653    }
4654
4655    /*
4656     * Time to start parsing the tree itself
4657     */
4658    htmlParseContentInternal(ctxt);
4659
4660    /*
4661     * autoclose
4662     */
4663    if (CUR == 0)
4664	htmlAutoCloseOnEnd(ctxt);
4665
4666
4667    /*
4668     * SAX: end of the document processing.
4669     */
4670    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4671        ctxt->sax->endDocument(ctxt->userData);
4672
4673    if (ctxt->myDoc != NULL) {
4674	dtd = xmlGetIntSubset(ctxt->myDoc);
4675	if (dtd == NULL)
4676	    ctxt->myDoc->intSubset =
4677		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4678		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4679		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4680    }
4681    if (! ctxt->wellFormed) return(-1);
4682    return(0);
4683}
4684
4685
4686/************************************************************************
4687 *									*
4688 *			Parser contexts handling			*
4689 *									*
4690 ************************************************************************/
4691
4692/**
4693 * htmlInitParserCtxt:
4694 * @ctxt:  an HTML parser context
4695 *
4696 * Initialize a parser context
4697 *
4698 * Returns 0 in case of success and -1 in case of error
4699 */
4700
4701static int
4702htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4703{
4704    htmlSAXHandler *sax;
4705
4706    if (ctxt == NULL) return(-1);
4707    memset(ctxt, 0, sizeof(htmlParserCtxt));
4708
4709    ctxt->dict = xmlDictCreate();
4710    if (ctxt->dict == NULL) {
4711        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4712	return(-1);
4713    }
4714    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4715    if (sax == NULL) {
4716        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4717	return(-1);
4718    }
4719    else
4720        memset(sax, 0, sizeof(htmlSAXHandler));
4721
4722    /* Allocate the Input stack */
4723    ctxt->inputTab = (htmlParserInputPtr *)
4724                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4725    if (ctxt->inputTab == NULL) {
4726        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4727	ctxt->inputNr = 0;
4728	ctxt->inputMax = 0;
4729	ctxt->input = NULL;
4730	return(-1);
4731    }
4732    ctxt->inputNr = 0;
4733    ctxt->inputMax = 5;
4734    ctxt->input = NULL;
4735    ctxt->version = NULL;
4736    ctxt->encoding = NULL;
4737    ctxt->standalone = -1;
4738    ctxt->instate = XML_PARSER_START;
4739
4740    /* Allocate the Node stack */
4741    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4742    if (ctxt->nodeTab == NULL) {
4743        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4744	ctxt->nodeNr = 0;
4745	ctxt->nodeMax = 0;
4746	ctxt->node = NULL;
4747	ctxt->inputNr = 0;
4748	ctxt->inputMax = 0;
4749	ctxt->input = NULL;
4750	return(-1);
4751    }
4752    ctxt->nodeNr = 0;
4753    ctxt->nodeMax = 10;
4754    ctxt->node = NULL;
4755
4756    /* Allocate the Name stack */
4757    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4758    if (ctxt->nameTab == NULL) {
4759        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4760	ctxt->nameNr = 0;
4761	ctxt->nameMax = 0;
4762	ctxt->name = NULL;
4763	ctxt->nodeNr = 0;
4764	ctxt->nodeMax = 0;
4765	ctxt->node = NULL;
4766	ctxt->inputNr = 0;
4767	ctxt->inputMax = 0;
4768	ctxt->input = NULL;
4769	return(-1);
4770    }
4771    ctxt->nameNr = 0;
4772    ctxt->nameMax = 10;
4773    ctxt->name = NULL;
4774
4775    ctxt->nodeInfoTab = NULL;
4776    ctxt->nodeInfoNr  = 0;
4777    ctxt->nodeInfoMax = 0;
4778
4779    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4780    else {
4781        ctxt->sax = sax;
4782	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4783    }
4784    ctxt->userData = ctxt;
4785    ctxt->myDoc = NULL;
4786    ctxt->wellFormed = 1;
4787    ctxt->replaceEntities = 0;
4788    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4789    ctxt->html = 1;
4790    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4791    ctxt->vctxt.userData = ctxt;
4792    ctxt->vctxt.error = xmlParserValidityError;
4793    ctxt->vctxt.warning = xmlParserValidityWarning;
4794    ctxt->record_info = 0;
4795    ctxt->validate = 0;
4796    ctxt->nbChars = 0;
4797    ctxt->checkIndex = 0;
4798    ctxt->catalogs = NULL;
4799    xmlInitNodeInfoSeq(&ctxt->node_seq);
4800    return(0);
4801}
4802
4803/**
4804 * htmlFreeParserCtxt:
4805 * @ctxt:  an HTML parser context
4806 *
4807 * Free all the memory used by a parser context. However the parsed
4808 * document in ctxt->myDoc is not freed.
4809 */
4810
4811void
4812htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4813{
4814    xmlFreeParserCtxt(ctxt);
4815}
4816
4817/**
4818 * htmlNewParserCtxt:
4819 *
4820 * Allocate and initialize a new parser context.
4821 *
4822 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4823 */
4824
4825htmlParserCtxtPtr
4826htmlNewParserCtxt(void)
4827{
4828    xmlParserCtxtPtr ctxt;
4829
4830    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4831    if (ctxt == NULL) {
4832        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4833	return(NULL);
4834    }
4835    memset(ctxt, 0, sizeof(xmlParserCtxt));
4836    if (htmlInitParserCtxt(ctxt) < 0) {
4837        htmlFreeParserCtxt(ctxt);
4838	return(NULL);
4839    }
4840    return(ctxt);
4841}
4842
4843/**
4844 * htmlCreateMemoryParserCtxt:
4845 * @buffer:  a pointer to a char array
4846 * @size:  the size of the array
4847 *
4848 * Create a parser context for an HTML in-memory document.
4849 *
4850 * Returns the new parser context or NULL
4851 */
4852htmlParserCtxtPtr
4853htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4854    xmlParserCtxtPtr ctxt;
4855    xmlParserInputPtr input;
4856    xmlParserInputBufferPtr buf;
4857
4858    if (buffer == NULL)
4859	return(NULL);
4860    if (size <= 0)
4861	return(NULL);
4862
4863    ctxt = htmlNewParserCtxt();
4864    if (ctxt == NULL)
4865	return(NULL);
4866
4867    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4868    if (buf == NULL) return(NULL);
4869
4870    input = xmlNewInputStream(ctxt);
4871    if (input == NULL) {
4872	xmlFreeParserCtxt(ctxt);
4873	return(NULL);
4874    }
4875
4876    input->filename = NULL;
4877    input->buf = buf;
4878    input->base = input->buf->buffer->content;
4879    input->cur = input->buf->buffer->content;
4880    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4881
4882    inputPush(ctxt, input);
4883    return(ctxt);
4884}
4885
4886/**
4887 * htmlCreateDocParserCtxt:
4888 * @cur:  a pointer to an array of xmlChar
4889 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4890 *
4891 * Create a parser context for an HTML document.
4892 *
4893 * TODO: check the need to add encoding handling there
4894 *
4895 * Returns the new parser context or NULL
4896 */
4897static htmlParserCtxtPtr
4898htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4899    int len;
4900    htmlParserCtxtPtr ctxt;
4901
4902    if (cur == NULL)
4903	return(NULL);
4904    len = xmlStrlen(cur);
4905    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4906    if (ctxt == NULL)
4907	return(NULL);
4908
4909    if (encoding != NULL) {
4910	xmlCharEncoding enc;
4911	xmlCharEncodingHandlerPtr handler;
4912
4913	if (ctxt->input->encoding != NULL)
4914	    xmlFree((xmlChar *) ctxt->input->encoding);
4915	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4916
4917	enc = xmlParseCharEncoding(encoding);
4918	/*
4919	 * registered set of known encodings
4920	 */
4921	if (enc != XML_CHAR_ENCODING_ERROR) {
4922	    xmlSwitchEncoding(ctxt, enc);
4923	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4924		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4925		             "Unsupported encoding %s\n",
4926			     (const xmlChar *) encoding, NULL);
4927	    }
4928	} else {
4929	    /*
4930	     * fallback for unknown encodings
4931	     */
4932	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4933	    if (handler != NULL) {
4934		xmlSwitchToEncoding(ctxt, handler);
4935	    } else {
4936		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4937		             "Unsupported encoding %s\n",
4938			     (const xmlChar *) encoding, NULL);
4939	    }
4940	}
4941    }
4942    return(ctxt);
4943}
4944
4945#ifdef LIBXML_PUSH_ENABLED
4946/************************************************************************
4947 *									*
4948 *	Progressive parsing interfaces				*
4949 *									*
4950 ************************************************************************/
4951
4952/**
4953 * htmlParseLookupSequence:
4954 * @ctxt:  an HTML parser context
4955 * @first:  the first char to lookup
4956 * @next:  the next char to lookup or zero
4957 * @third:  the next char to lookup or zero
4958 * @comment: flag to force checking inside comments
4959 *
4960 * Try to find if a sequence (first, next, third) or  just (first next) or
4961 * (first) is available in the input stream.
4962 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4963 * to avoid rescanning sequences of bytes, it DOES change the state of the
4964 * parser, do not use liberally.
4965 * This is basically similar to xmlParseLookupSequence()
4966 *
4967 * Returns the index to the current parsing point if the full sequence
4968 *      is available, -1 otherwise.
4969 */
4970static int
4971htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4972                        xmlChar next, xmlChar third, int iscomment,
4973                        int ignoreattrval)
4974{
4975    int base, len;
4976    htmlParserInputPtr in;
4977    const xmlChar *buf;
4978    int incomment = 0;
4979    int invalue = 0;
4980    char valdellim = 0x0;
4981
4982    in = ctxt->input;
4983    if (in == NULL)
4984        return (-1);
4985
4986    base = in->cur - in->base;
4987    if (base < 0)
4988        return (-1);
4989
4990    if (ctxt->checkIndex > base)
4991        base = ctxt->checkIndex;
4992
4993    if (in->buf == NULL) {
4994        buf = in->base;
4995        len = in->length;
4996    } else {
4997        buf = in->buf->buffer->content;
4998        len = in->buf->buffer->use;
4999    }
5000
5001    /* take into account the sequence length */
5002    if (third)
5003        len -= 2;
5004    else if (next)
5005        len--;
5006    for (; base < len; base++) {
5007        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5008            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5009                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5010                incomment = 1;
5011                /* do not increment past <! - some people use <!--> */
5012                base += 2;
5013            }
5014        }
5015        if (ignoreattrval) {
5016            if (buf[base] == '"' || buf[base] == '\'') {
5017                if (invalue) {
5018                    if (buf[base] == valdellim) {
5019                        invalue = 0;
5020                        continue;
5021                    }
5022                } else {
5023                    valdellim = buf[base];
5024                    invalue = 1;
5025                    continue;
5026                }
5027            } else if (invalue) {
5028                continue;
5029            }
5030        }
5031        if (incomment) {
5032            if (base + 3 > len)
5033                return (-1);
5034            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5035                (buf[base + 2] == '>')) {
5036                incomment = 0;
5037                base += 2;
5038            }
5039            continue;
5040        }
5041        if (buf[base] == first) {
5042            if (third != 0) {
5043                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5044                    continue;
5045            } else if (next != 0) {
5046                if (buf[base + 1] != next)
5047                    continue;
5048            }
5049            ctxt->checkIndex = 0;
5050#ifdef DEBUG_PUSH
5051            if (next == 0)
5052                xmlGenericError(xmlGenericErrorContext,
5053                                "HPP: lookup '%c' found at %d\n",
5054                                first, base);
5055            else if (third == 0)
5056                xmlGenericError(xmlGenericErrorContext,
5057                                "HPP: lookup '%c%c' found at %d\n",
5058                                first, next, base);
5059            else
5060                xmlGenericError(xmlGenericErrorContext,
5061                                "HPP: lookup '%c%c%c' found at %d\n",
5062                                first, next, third, base);
5063#endif
5064            return (base - (in->cur - in->base));
5065        }
5066    }
5067    if ((!incomment) && (!invalue))
5068        ctxt->checkIndex = base;
5069#ifdef DEBUG_PUSH
5070    if (next == 0)
5071        xmlGenericError(xmlGenericErrorContext,
5072                        "HPP: lookup '%c' failed\n", first);
5073    else if (third == 0)
5074        xmlGenericError(xmlGenericErrorContext,
5075                        "HPP: lookup '%c%c' failed\n", first, next);
5076    else
5077        xmlGenericError(xmlGenericErrorContext,
5078                        "HPP: lookup '%c%c%c' failed\n", first, next,
5079                        third);
5080#endif
5081    return (-1);
5082}
5083
5084/**
5085 * htmlParseLookupChars:
5086 * @ctxt: an HTML parser context
5087 * @stop: Array of chars, which stop the lookup.
5088 * @stopLen: Length of stop-Array
5089 *
5090 * Try to find if any char of the stop-Array is available in the input
5091 * stream.
5092 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5093 * to avoid rescanning sequences of bytes, it DOES change the state of the
5094 * parser, do not use liberally.
5095 *
5096 * Returns the index to the current parsing point if a stopChar
5097 *      is available, -1 otherwise.
5098 */
5099static int
5100htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5101                     int stopLen)
5102{
5103    int base, len;
5104    htmlParserInputPtr in;
5105    const xmlChar *buf;
5106    int incomment = 0;
5107    int i;
5108
5109    in = ctxt->input;
5110    if (in == NULL)
5111        return (-1);
5112
5113    base = in->cur - in->base;
5114    if (base < 0)
5115        return (-1);
5116
5117    if (ctxt->checkIndex > base)
5118        base = ctxt->checkIndex;
5119
5120    if (in->buf == NULL) {
5121        buf = in->base;
5122        len = in->length;
5123    } else {
5124        buf = in->buf->buffer->content;
5125        len = in->buf->buffer->use;
5126    }
5127
5128    for (; base < len; base++) {
5129        if (!incomment && (base + 4 < len)) {
5130            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5131                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5132                incomment = 1;
5133                /* do not increment past <! - some people use <!--> */
5134                base += 2;
5135            }
5136        }
5137        if (incomment) {
5138            if (base + 3 > len)
5139                return (-1);
5140            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5141                (buf[base + 2] == '>')) {
5142                incomment = 0;
5143                base += 2;
5144            }
5145            continue;
5146        }
5147        for (i = 0; i < stopLen; ++i) {
5148            if (buf[base] == stop[i]) {
5149                ctxt->checkIndex = 0;
5150                return (base - (in->cur - in->base));
5151            }
5152        }
5153    }
5154    ctxt->checkIndex = base;
5155    return (-1);
5156}
5157
5158/**
5159 * htmlParseTryOrFinish:
5160 * @ctxt:  an HTML parser context
5161 * @terminate:  last chunk indicator
5162 *
5163 * Try to progress on parsing
5164 *
5165 * Returns zero if no parsing was possible
5166 */
5167static int
5168htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5169    int ret = 0;
5170    htmlParserInputPtr in;
5171    int avail = 0;
5172    xmlChar cur, next;
5173
5174#ifdef DEBUG_PUSH
5175    switch (ctxt->instate) {
5176	case XML_PARSER_EOF:
5177	    xmlGenericError(xmlGenericErrorContext,
5178		    "HPP: try EOF\n"); break;
5179	case XML_PARSER_START:
5180	    xmlGenericError(xmlGenericErrorContext,
5181		    "HPP: try START\n"); break;
5182	case XML_PARSER_MISC:
5183	    xmlGenericError(xmlGenericErrorContext,
5184		    "HPP: try MISC\n");break;
5185	case XML_PARSER_COMMENT:
5186	    xmlGenericError(xmlGenericErrorContext,
5187		    "HPP: try COMMENT\n");break;
5188	case XML_PARSER_PROLOG:
5189	    xmlGenericError(xmlGenericErrorContext,
5190		    "HPP: try PROLOG\n");break;
5191	case XML_PARSER_START_TAG:
5192	    xmlGenericError(xmlGenericErrorContext,
5193		    "HPP: try START_TAG\n");break;
5194	case XML_PARSER_CONTENT:
5195	    xmlGenericError(xmlGenericErrorContext,
5196		    "HPP: try CONTENT\n");break;
5197	case XML_PARSER_CDATA_SECTION:
5198	    xmlGenericError(xmlGenericErrorContext,
5199		    "HPP: try CDATA_SECTION\n");break;
5200	case XML_PARSER_END_TAG:
5201	    xmlGenericError(xmlGenericErrorContext,
5202		    "HPP: try END_TAG\n");break;
5203	case XML_PARSER_ENTITY_DECL:
5204	    xmlGenericError(xmlGenericErrorContext,
5205		    "HPP: try ENTITY_DECL\n");break;
5206	case XML_PARSER_ENTITY_VALUE:
5207	    xmlGenericError(xmlGenericErrorContext,
5208		    "HPP: try ENTITY_VALUE\n");break;
5209	case XML_PARSER_ATTRIBUTE_VALUE:
5210	    xmlGenericError(xmlGenericErrorContext,
5211		    "HPP: try ATTRIBUTE_VALUE\n");break;
5212	case XML_PARSER_DTD:
5213	    xmlGenericError(xmlGenericErrorContext,
5214		    "HPP: try DTD\n");break;
5215	case XML_PARSER_EPILOG:
5216	    xmlGenericError(xmlGenericErrorContext,
5217		    "HPP: try EPILOG\n");break;
5218	case XML_PARSER_PI:
5219	    xmlGenericError(xmlGenericErrorContext,
5220		    "HPP: try PI\n");break;
5221	case XML_PARSER_SYSTEM_LITERAL:
5222	    xmlGenericError(xmlGenericErrorContext,
5223		    "HPP: try SYSTEM_LITERAL\n");break;
5224    }
5225#endif
5226
5227    while (1) {
5228
5229	in = ctxt->input;
5230	if (in == NULL) break;
5231	if (in->buf == NULL)
5232	    avail = in->length - (in->cur - in->base);
5233	else
5234	    avail = in->buf->buffer->use - (in->cur - in->base);
5235	if ((avail == 0) && (terminate)) {
5236	    htmlAutoCloseOnEnd(ctxt);
5237	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5238		/*
5239		 * SAX: end of the document processing.
5240		 */
5241		ctxt->instate = XML_PARSER_EOF;
5242		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5243		    ctxt->sax->endDocument(ctxt->userData);
5244	    }
5245	}
5246        if (avail < 1)
5247	    goto done;
5248	cur = in->cur[0];
5249	if (cur == 0) {
5250	    SKIP(1);
5251	    continue;
5252	}
5253
5254        switch (ctxt->instate) {
5255            case XML_PARSER_EOF:
5256	        /*
5257		 * Document parsing is done !
5258		 */
5259	        goto done;
5260            case XML_PARSER_START:
5261	        /*
5262		 * Very first chars read from the document flow.
5263		 */
5264		cur = in->cur[0];
5265		if (IS_BLANK_CH(cur)) {
5266		    SKIP_BLANKS;
5267		    if (in->buf == NULL)
5268			avail = in->length - (in->cur - in->base);
5269		    else
5270			avail = in->buf->buffer->use - (in->cur - in->base);
5271		}
5272		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5273		    ctxt->sax->setDocumentLocator(ctxt->userData,
5274						  &xmlDefaultSAXLocator);
5275		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5276	            (!ctxt->disableSAX))
5277		    ctxt->sax->startDocument(ctxt->userData);
5278
5279		cur = in->cur[0];
5280		next = in->cur[1];
5281		if ((cur == '<') && (next == '!') &&
5282		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5283		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5284		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5285		    (UPP(8) == 'E')) {
5286		    if ((!terminate) &&
5287		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5288			goto done;
5289#ifdef DEBUG_PUSH
5290		    xmlGenericError(xmlGenericErrorContext,
5291			    "HPP: Parsing internal subset\n");
5292#endif
5293		    htmlParseDocTypeDecl(ctxt);
5294		    ctxt->instate = XML_PARSER_PROLOG;
5295#ifdef DEBUG_PUSH
5296		    xmlGenericError(xmlGenericErrorContext,
5297			    "HPP: entering PROLOG\n");
5298#endif
5299                } else {
5300		    ctxt->instate = XML_PARSER_MISC;
5301#ifdef DEBUG_PUSH
5302		    xmlGenericError(xmlGenericErrorContext,
5303			    "HPP: entering MISC\n");
5304#endif
5305		}
5306		break;
5307            case XML_PARSER_MISC:
5308		SKIP_BLANKS;
5309		if (in->buf == NULL)
5310		    avail = in->length - (in->cur - in->base);
5311		else
5312		    avail = in->buf->buffer->use - (in->cur - in->base);
5313		if (avail < 2)
5314		    goto done;
5315		cur = in->cur[0];
5316		next = in->cur[1];
5317	        if ((cur == '<') && (next == '!') &&
5318		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5319		    if ((!terminate) &&
5320		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5321			goto done;
5322#ifdef DEBUG_PUSH
5323		    xmlGenericError(xmlGenericErrorContext,
5324			    "HPP: Parsing Comment\n");
5325#endif
5326		    htmlParseComment(ctxt);
5327		    ctxt->instate = XML_PARSER_MISC;
5328	        } else if ((cur == '<') && (next == '?')) {
5329		    if ((!terminate) &&
5330		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5331			goto done;
5332#ifdef DEBUG_PUSH
5333		    xmlGenericError(xmlGenericErrorContext,
5334			    "HPP: Parsing PI\n");
5335#endif
5336		    htmlParsePI(ctxt);
5337		    ctxt->instate = XML_PARSER_MISC;
5338		} else if ((cur == '<') && (next == '!') &&
5339		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5340		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5341		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5342		    (UPP(8) == 'E')) {
5343		    if ((!terminate) &&
5344		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5345			goto done;
5346#ifdef DEBUG_PUSH
5347		    xmlGenericError(xmlGenericErrorContext,
5348			    "HPP: Parsing internal subset\n");
5349#endif
5350		    htmlParseDocTypeDecl(ctxt);
5351		    ctxt->instate = XML_PARSER_PROLOG;
5352#ifdef DEBUG_PUSH
5353		    xmlGenericError(xmlGenericErrorContext,
5354			    "HPP: entering PROLOG\n");
5355#endif
5356		} else if ((cur == '<') && (next == '!') &&
5357		           (avail < 9)) {
5358		    goto done;
5359		} else {
5360		    ctxt->instate = XML_PARSER_START_TAG;
5361#ifdef DEBUG_PUSH
5362		    xmlGenericError(xmlGenericErrorContext,
5363			    "HPP: entering START_TAG\n");
5364#endif
5365		}
5366		break;
5367            case XML_PARSER_PROLOG:
5368		SKIP_BLANKS;
5369		if (in->buf == NULL)
5370		    avail = in->length - (in->cur - in->base);
5371		else
5372		    avail = in->buf->buffer->use - (in->cur - in->base);
5373		if (avail < 2)
5374		    goto done;
5375		cur = in->cur[0];
5376		next = in->cur[1];
5377		if ((cur == '<') && (next == '!') &&
5378		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5379		    if ((!terminate) &&
5380		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5381			goto done;
5382#ifdef DEBUG_PUSH
5383		    xmlGenericError(xmlGenericErrorContext,
5384			    "HPP: Parsing Comment\n");
5385#endif
5386		    htmlParseComment(ctxt);
5387		    ctxt->instate = XML_PARSER_PROLOG;
5388	        } else if ((cur == '<') && (next == '?')) {
5389		    if ((!terminate) &&
5390		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5391			goto done;
5392#ifdef DEBUG_PUSH
5393		    xmlGenericError(xmlGenericErrorContext,
5394			    "HPP: Parsing PI\n");
5395#endif
5396		    htmlParsePI(ctxt);
5397		    ctxt->instate = XML_PARSER_PROLOG;
5398		} else if ((cur == '<') && (next == '!') &&
5399		           (avail < 4)) {
5400		    goto done;
5401		} else {
5402		    ctxt->instate = XML_PARSER_START_TAG;
5403#ifdef DEBUG_PUSH
5404		    xmlGenericError(xmlGenericErrorContext,
5405			    "HPP: entering START_TAG\n");
5406#endif
5407		}
5408		break;
5409            case XML_PARSER_EPILOG:
5410		if (in->buf == NULL)
5411		    avail = in->length - (in->cur - in->base);
5412		else
5413		    avail = in->buf->buffer->use - (in->cur - in->base);
5414		if (avail < 1)
5415		    goto done;
5416		cur = in->cur[0];
5417		if (IS_BLANK_CH(cur)) {
5418		    htmlParseCharData(ctxt);
5419		    goto done;
5420		}
5421		if (avail < 2)
5422		    goto done;
5423		next = in->cur[1];
5424	        if ((cur == '<') && (next == '!') &&
5425		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5426		    if ((!terminate) &&
5427		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5428			goto done;
5429#ifdef DEBUG_PUSH
5430		    xmlGenericError(xmlGenericErrorContext,
5431			    "HPP: Parsing Comment\n");
5432#endif
5433		    htmlParseComment(ctxt);
5434		    ctxt->instate = XML_PARSER_EPILOG;
5435	        } else if ((cur == '<') && (next == '?')) {
5436		    if ((!terminate) &&
5437		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5438			goto done;
5439#ifdef DEBUG_PUSH
5440		    xmlGenericError(xmlGenericErrorContext,
5441			    "HPP: Parsing PI\n");
5442#endif
5443		    htmlParsePI(ctxt);
5444		    ctxt->instate = XML_PARSER_EPILOG;
5445		} else if ((cur == '<') && (next == '!') &&
5446		           (avail < 4)) {
5447		    goto done;
5448		} else {
5449		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5450		    ctxt->wellFormed = 0;
5451		    ctxt->instate = XML_PARSER_EOF;
5452#ifdef DEBUG_PUSH
5453		    xmlGenericError(xmlGenericErrorContext,
5454			    "HPP: entering EOF\n");
5455#endif
5456		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5457			ctxt->sax->endDocument(ctxt->userData);
5458		    goto done;
5459		}
5460		break;
5461            case XML_PARSER_START_TAG: {
5462	        const xmlChar *name;
5463		int failed;
5464		const htmlElemDesc * info;
5465
5466		if (avail < 2)
5467		    goto done;
5468		cur = in->cur[0];
5469	        if (cur != '<') {
5470		    ctxt->instate = XML_PARSER_CONTENT;
5471#ifdef DEBUG_PUSH
5472		    xmlGenericError(xmlGenericErrorContext,
5473			    "HPP: entering CONTENT\n");
5474#endif
5475		    break;
5476		}
5477		if (in->cur[1] == '/') {
5478		    ctxt->instate = XML_PARSER_END_TAG;
5479		    ctxt->checkIndex = 0;
5480#ifdef DEBUG_PUSH
5481		    xmlGenericError(xmlGenericErrorContext,
5482			    "HPP: entering END_TAG\n");
5483#endif
5484		    break;
5485		}
5486		if ((!terminate) &&
5487		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5488		    goto done;
5489
5490		failed = htmlParseStartTag(ctxt);
5491		name = ctxt->name;
5492		if ((failed == -1) ||
5493		    (name == NULL)) {
5494		    if (CUR == '>')
5495			NEXT;
5496		    break;
5497		}
5498
5499		/*
5500		 * Lookup the info for that element.
5501		 */
5502		info = htmlTagLookup(name);
5503		if (info == NULL) {
5504		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5505		                 "Tag %s invalid\n", name, NULL);
5506		}
5507
5508		/*
5509		 * Check for an Empty Element labeled the XML/SGML way
5510		 */
5511		if ((CUR == '/') && (NXT(1) == '>')) {
5512		    SKIP(2);
5513		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5514			ctxt->sax->endElement(ctxt->userData, name);
5515		    htmlnamePop(ctxt);
5516		    ctxt->instate = XML_PARSER_CONTENT;
5517#ifdef DEBUG_PUSH
5518		    xmlGenericError(xmlGenericErrorContext,
5519			    "HPP: entering CONTENT\n");
5520#endif
5521		    break;
5522		}
5523
5524		if (CUR == '>') {
5525		    NEXT;
5526		} else {
5527		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5528		                 "Couldn't find end of Start Tag %s\n",
5529				 name, NULL);
5530
5531		    /*
5532		     * end of parsing of this node.
5533		     */
5534		    if (xmlStrEqual(name, ctxt->name)) {
5535			nodePop(ctxt);
5536			htmlnamePop(ctxt);
5537		    }
5538
5539		    ctxt->instate = XML_PARSER_CONTENT;
5540#ifdef DEBUG_PUSH
5541		    xmlGenericError(xmlGenericErrorContext,
5542			    "HPP: entering CONTENT\n");
5543#endif
5544		    break;
5545		}
5546
5547		/*
5548		 * Check for an Empty Element from DTD definition
5549		 */
5550		if ((info != NULL) && (info->empty)) {
5551		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5552			ctxt->sax->endElement(ctxt->userData, name);
5553		    htmlnamePop(ctxt);
5554		}
5555		ctxt->instate = XML_PARSER_CONTENT;
5556#ifdef DEBUG_PUSH
5557		xmlGenericError(xmlGenericErrorContext,
5558			"HPP: entering CONTENT\n");
5559#endif
5560                break;
5561	    }
5562            case XML_PARSER_CONTENT: {
5563		long cons;
5564                /*
5565		 * Handle preparsed entities and charRef
5566		 */
5567		if (ctxt->token != 0) {
5568		    xmlChar chr[2] = { 0 , 0 } ;
5569
5570		    chr[0] = (xmlChar) ctxt->token;
5571		    htmlCheckParagraph(ctxt);
5572		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5573			ctxt->sax->characters(ctxt->userData, chr, 1);
5574		    ctxt->token = 0;
5575		    ctxt->checkIndex = 0;
5576		}
5577		if ((avail == 1) && (terminate)) {
5578		    cur = in->cur[0];
5579		    if ((cur != '<') && (cur != '&')) {
5580			if (ctxt->sax != NULL) {
5581			    if (IS_BLANK_CH(cur)) {
5582				if (ctxt->sax->ignorableWhitespace != NULL)
5583				    ctxt->sax->ignorableWhitespace(
5584					    ctxt->userData, &cur, 1);
5585			    } else {
5586				htmlCheckParagraph(ctxt);
5587				if (ctxt->sax->characters != NULL)
5588				    ctxt->sax->characters(
5589					    ctxt->userData, &cur, 1);
5590			    }
5591			}
5592			ctxt->token = 0;
5593			ctxt->checkIndex = 0;
5594			in->cur++;
5595			break;
5596		    }
5597		}
5598		if (avail < 2)
5599		    goto done;
5600		cur = in->cur[0];
5601		next = in->cur[1];
5602		cons = ctxt->nbChars;
5603		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5604		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5605		    /*
5606		     * Handle SCRIPT/STYLE separately
5607		     */
5608		    if (!terminate) {
5609		        int idx;
5610			xmlChar val;
5611
5612			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 1);
5613			if (idx < 0)
5614			    goto done;
5615		        val = in->cur[idx + 2];
5616			if (val == 0) /* bad cut of input */
5617			    goto done;
5618		    }
5619		    htmlParseScript(ctxt);
5620		    if ((cur == '<') && (next == '/')) {
5621			ctxt->instate = XML_PARSER_END_TAG;
5622			ctxt->checkIndex = 0;
5623#ifdef DEBUG_PUSH
5624			xmlGenericError(xmlGenericErrorContext,
5625				"HPP: entering END_TAG\n");
5626#endif
5627			break;
5628		    }
5629		} else {
5630		    /*
5631		     * Sometimes DOCTYPE arrives in the middle of the document
5632		     */
5633		    if ((cur == '<') && (next == '!') &&
5634			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5635			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5636			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5637			(UPP(8) == 'E')) {
5638			if ((!terminate) &&
5639			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5640			    goto done;
5641			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5642			             "Misplaced DOCTYPE declaration\n",
5643				     BAD_CAST "DOCTYPE" , NULL);
5644			htmlParseDocTypeDecl(ctxt);
5645		    } else if ((cur == '<') && (next == '!') &&
5646			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5647			if ((!terminate) &&
5648			    (htmlParseLookupSequence(
5649				ctxt, '-', '-', '>', 1, 1) < 0))
5650			    goto done;
5651#ifdef DEBUG_PUSH
5652			xmlGenericError(xmlGenericErrorContext,
5653				"HPP: Parsing Comment\n");
5654#endif
5655			htmlParseComment(ctxt);
5656			ctxt->instate = XML_PARSER_CONTENT;
5657		    } else if ((cur == '<') && (next == '?')) {
5658			if ((!terminate) &&
5659			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5660			    goto done;
5661#ifdef DEBUG_PUSH
5662			xmlGenericError(xmlGenericErrorContext,
5663				"HPP: Parsing PI\n");
5664#endif
5665			htmlParsePI(ctxt);
5666			ctxt->instate = XML_PARSER_CONTENT;
5667		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5668			goto done;
5669		    } else if ((cur == '<') && (next == '/')) {
5670			ctxt->instate = XML_PARSER_END_TAG;
5671			ctxt->checkIndex = 0;
5672#ifdef DEBUG_PUSH
5673			xmlGenericError(xmlGenericErrorContext,
5674				"HPP: entering END_TAG\n");
5675#endif
5676			break;
5677		    } else if (cur == '<') {
5678			ctxt->instate = XML_PARSER_START_TAG;
5679			ctxt->checkIndex = 0;
5680#ifdef DEBUG_PUSH
5681			xmlGenericError(xmlGenericErrorContext,
5682				"HPP: entering START_TAG\n");
5683#endif
5684			break;
5685		    } else if (cur == '&') {
5686			if ((!terminate) &&
5687			    (htmlParseLookupChars(ctxt,
5688                                                  BAD_CAST "; >/", 4) < 0))
5689			    goto done;
5690#ifdef DEBUG_PUSH
5691			xmlGenericError(xmlGenericErrorContext,
5692				"HPP: Parsing Reference\n");
5693#endif
5694			/* TODO: check generation of subtrees if noent !!! */
5695			htmlParseReference(ctxt);
5696		    } else {
5697		        /*
5698			 * check that the text sequence is complete
5699			 * before handing out the data to the parser
5700			 * to avoid problems with erroneous end of
5701			 * data detection.
5702			 */
5703			if ((!terminate) &&
5704                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5705			    goto done;
5706			ctxt->checkIndex = 0;
5707#ifdef DEBUG_PUSH
5708			xmlGenericError(xmlGenericErrorContext,
5709				"HPP: Parsing char data\n");
5710#endif
5711			htmlParseCharData(ctxt);
5712		    }
5713		}
5714		if (cons == ctxt->nbChars) {
5715		    if (ctxt->node != NULL) {
5716			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5717			             "detected an error in element content\n",
5718				     NULL, NULL);
5719		    }
5720		    NEXT;
5721		    break;
5722		}
5723
5724		break;
5725	    }
5726            case XML_PARSER_END_TAG:
5727		if (avail < 2)
5728		    goto done;
5729		if ((!terminate) &&
5730		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5731		    goto done;
5732		htmlParseEndTag(ctxt);
5733		if (ctxt->nameNr == 0) {
5734		    ctxt->instate = XML_PARSER_EPILOG;
5735		} else {
5736		    ctxt->instate = XML_PARSER_CONTENT;
5737		}
5738		ctxt->checkIndex = 0;
5739#ifdef DEBUG_PUSH
5740		xmlGenericError(xmlGenericErrorContext,
5741			"HPP: entering CONTENT\n");
5742#endif
5743	        break;
5744            case XML_PARSER_CDATA_SECTION:
5745		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5746			"HPP: internal error, state == CDATA\n",
5747			     NULL, NULL);
5748		ctxt->instate = XML_PARSER_CONTENT;
5749		ctxt->checkIndex = 0;
5750#ifdef DEBUG_PUSH
5751		xmlGenericError(xmlGenericErrorContext,
5752			"HPP: entering CONTENT\n");
5753#endif
5754		break;
5755            case XML_PARSER_DTD:
5756		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5757			"HPP: internal error, state == DTD\n",
5758			     NULL, NULL);
5759		ctxt->instate = XML_PARSER_CONTENT;
5760		ctxt->checkIndex = 0;
5761#ifdef DEBUG_PUSH
5762		xmlGenericError(xmlGenericErrorContext,
5763			"HPP: entering CONTENT\n");
5764#endif
5765		break;
5766            case XML_PARSER_COMMENT:
5767		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5768			"HPP: internal error, state == COMMENT\n",
5769			     NULL, NULL);
5770		ctxt->instate = XML_PARSER_CONTENT;
5771		ctxt->checkIndex = 0;
5772#ifdef DEBUG_PUSH
5773		xmlGenericError(xmlGenericErrorContext,
5774			"HPP: entering CONTENT\n");
5775#endif
5776		break;
5777            case XML_PARSER_PI:
5778		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5779			"HPP: internal error, state == PI\n",
5780			     NULL, NULL);
5781		ctxt->instate = XML_PARSER_CONTENT;
5782		ctxt->checkIndex = 0;
5783#ifdef DEBUG_PUSH
5784		xmlGenericError(xmlGenericErrorContext,
5785			"HPP: entering CONTENT\n");
5786#endif
5787		break;
5788            case XML_PARSER_ENTITY_DECL:
5789		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5790			"HPP: internal error, state == ENTITY_DECL\n",
5791			     NULL, NULL);
5792		ctxt->instate = XML_PARSER_CONTENT;
5793		ctxt->checkIndex = 0;
5794#ifdef DEBUG_PUSH
5795		xmlGenericError(xmlGenericErrorContext,
5796			"HPP: entering CONTENT\n");
5797#endif
5798		break;
5799            case XML_PARSER_ENTITY_VALUE:
5800		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5801			"HPP: internal error, state == ENTITY_VALUE\n",
5802			     NULL, NULL);
5803		ctxt->instate = XML_PARSER_CONTENT;
5804		ctxt->checkIndex = 0;
5805#ifdef DEBUG_PUSH
5806		xmlGenericError(xmlGenericErrorContext,
5807			"HPP: entering DTD\n");
5808#endif
5809		break;
5810            case XML_PARSER_ATTRIBUTE_VALUE:
5811		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5812			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5813			     NULL, NULL);
5814		ctxt->instate = XML_PARSER_START_TAG;
5815		ctxt->checkIndex = 0;
5816#ifdef DEBUG_PUSH
5817		xmlGenericError(xmlGenericErrorContext,
5818			"HPP: entering START_TAG\n");
5819#endif
5820		break;
5821	    case XML_PARSER_SYSTEM_LITERAL:
5822		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5823		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5824			     NULL, NULL);
5825		ctxt->instate = XML_PARSER_CONTENT;
5826		ctxt->checkIndex = 0;
5827#ifdef DEBUG_PUSH
5828		xmlGenericError(xmlGenericErrorContext,
5829			"HPP: entering CONTENT\n");
5830#endif
5831		break;
5832	    case XML_PARSER_IGNORE:
5833		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5834			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5835			     NULL, NULL);
5836		ctxt->instate = XML_PARSER_CONTENT;
5837		ctxt->checkIndex = 0;
5838#ifdef DEBUG_PUSH
5839		xmlGenericError(xmlGenericErrorContext,
5840			"HPP: entering CONTENT\n");
5841#endif
5842		break;
5843	    case XML_PARSER_PUBLIC_LITERAL:
5844		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5845			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5846			     NULL, NULL);
5847		ctxt->instate = XML_PARSER_CONTENT;
5848		ctxt->checkIndex = 0;
5849#ifdef DEBUG_PUSH
5850		xmlGenericError(xmlGenericErrorContext,
5851			"HPP: entering CONTENT\n");
5852#endif
5853		break;
5854
5855	}
5856    }
5857done:
5858    if ((avail == 0) && (terminate)) {
5859	htmlAutoCloseOnEnd(ctxt);
5860	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5861	    /*
5862	     * SAX: end of the document processing.
5863	     */
5864	    ctxt->instate = XML_PARSER_EOF;
5865	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5866		ctxt->sax->endDocument(ctxt->userData);
5867	}
5868    }
5869    if ((ctxt->myDoc != NULL) &&
5870	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5871	 (ctxt->instate == XML_PARSER_EPILOG))) {
5872	xmlDtdPtr dtd;
5873	dtd = xmlGetIntSubset(ctxt->myDoc);
5874	if (dtd == NULL)
5875	    ctxt->myDoc->intSubset =
5876		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5877		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5878		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5879    }
5880#ifdef DEBUG_PUSH
5881    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5882#endif
5883    return(ret);
5884}
5885
5886/**
5887 * htmlParseChunk:
5888 * @ctxt:  an HTML parser context
5889 * @chunk:  an char array
5890 * @size:  the size in byte of the chunk
5891 * @terminate:  last chunk indicator
5892 *
5893 * Parse a Chunk of memory
5894 *
5895 * Returns zero if no error, the xmlParserErrors otherwise.
5896 */
5897int
5898htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5899              int terminate) {
5900    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5901	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5902		     "htmlParseChunk: context error\n", NULL, NULL);
5903	return(XML_ERR_INTERNAL_ERROR);
5904    }
5905    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5906        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5907	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5908	int cur = ctxt->input->cur - ctxt->input->base;
5909	int res;
5910
5911	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5912	if (res < 0) {
5913	    ctxt->errNo = XML_PARSER_EOF;
5914	    ctxt->disableSAX = 1;
5915	    return (XML_PARSER_EOF);
5916	}
5917	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5918	ctxt->input->cur = ctxt->input->base + cur;
5919	ctxt->input->end =
5920	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5921#ifdef DEBUG_PUSH
5922	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5923#endif
5924
5925#if 0
5926	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5927	    htmlParseTryOrFinish(ctxt, terminate);
5928#endif
5929    } else if (ctxt->instate != XML_PARSER_EOF) {
5930	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5931	    xmlParserInputBufferPtr in = ctxt->input->buf;
5932	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5933		    (in->raw != NULL)) {
5934		int nbchars;
5935
5936		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5937		if (nbchars < 0) {
5938		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5939			         "encoder error\n", NULL, NULL);
5940		    return(XML_ERR_INVALID_ENCODING);
5941		}
5942	    }
5943	}
5944    }
5945    htmlParseTryOrFinish(ctxt, terminate);
5946    if (terminate) {
5947	if ((ctxt->instate != XML_PARSER_EOF) &&
5948	    (ctxt->instate != XML_PARSER_EPILOG) &&
5949	    (ctxt->instate != XML_PARSER_MISC)) {
5950	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5951	    ctxt->wellFormed = 0;
5952	}
5953	if (ctxt->instate != XML_PARSER_EOF) {
5954	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5955		ctxt->sax->endDocument(ctxt->userData);
5956	}
5957	ctxt->instate = XML_PARSER_EOF;
5958    }
5959    return((xmlParserErrors) ctxt->errNo);
5960}
5961
5962/************************************************************************
5963 *									*
5964 *			User entry points				*
5965 *									*
5966 ************************************************************************/
5967
5968/**
5969 * htmlCreatePushParserCtxt:
5970 * @sax:  a SAX handler
5971 * @user_data:  The user data returned on SAX callbacks
5972 * @chunk:  a pointer to an array of chars
5973 * @size:  number of chars in the array
5974 * @filename:  an optional file name or URI
5975 * @enc:  an optional encoding
5976 *
5977 * Create a parser context for using the HTML parser in push mode
5978 * The value of @filename is used for fetching external entities
5979 * and error/warning reports.
5980 *
5981 * Returns the new parser context or NULL
5982 */
5983htmlParserCtxtPtr
5984htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5985                         const char *chunk, int size, const char *filename,
5986			 xmlCharEncoding enc) {
5987    htmlParserCtxtPtr ctxt;
5988    htmlParserInputPtr inputStream;
5989    xmlParserInputBufferPtr buf;
5990
5991    xmlInitParser();
5992
5993    buf = xmlAllocParserInputBuffer(enc);
5994    if (buf == NULL) return(NULL);
5995
5996    ctxt = htmlNewParserCtxt();
5997    if (ctxt == NULL) {
5998	xmlFreeParserInputBuffer(buf);
5999	return(NULL);
6000    }
6001    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6002	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6003    if (sax != NULL) {
6004	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6005	    xmlFree(ctxt->sax);
6006	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6007	if (ctxt->sax == NULL) {
6008	    xmlFree(buf);
6009	    xmlFree(ctxt);
6010	    return(NULL);
6011	}
6012	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6013	if (user_data != NULL)
6014	    ctxt->userData = user_data;
6015    }
6016    if (filename == NULL) {
6017	ctxt->directory = NULL;
6018    } else {
6019        ctxt->directory = xmlParserGetDirectory(filename);
6020    }
6021
6022    inputStream = htmlNewInputStream(ctxt);
6023    if (inputStream == NULL) {
6024	xmlFreeParserCtxt(ctxt);
6025	xmlFree(buf);
6026	return(NULL);
6027    }
6028
6029    if (filename == NULL)
6030	inputStream->filename = NULL;
6031    else
6032	inputStream->filename = (char *)
6033	    xmlCanonicPath((const xmlChar *) filename);
6034    inputStream->buf = buf;
6035    inputStream->base = inputStream->buf->buffer->content;
6036    inputStream->cur = inputStream->buf->buffer->content;
6037    inputStream->end =
6038	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6039
6040    inputPush(ctxt, inputStream);
6041
6042    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6043        (ctxt->input->buf != NULL))  {
6044	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6045	int cur = ctxt->input->cur - ctxt->input->base;
6046
6047	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6048
6049	ctxt->input->base = ctxt->input->buf->buffer->content + base;
6050	ctxt->input->cur = ctxt->input->base + cur;
6051	ctxt->input->end =
6052	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6053#ifdef DEBUG_PUSH
6054	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6055#endif
6056    }
6057    ctxt->progressive = 1;
6058
6059    return(ctxt);
6060}
6061#endif /* LIBXML_PUSH_ENABLED */
6062
6063/**
6064 * htmlSAXParseDoc:
6065 * @cur:  a pointer to an array of xmlChar
6066 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6067 * @sax:  the SAX handler block
6068 * @userData: if using SAX, this pointer will be provided on callbacks.
6069 *
6070 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6071 * to handle parse events. If sax is NULL, fallback to the default DOM
6072 * behavior and return a tree.
6073 *
6074 * Returns the resulting document tree unless SAX is NULL or the document is
6075 *     not well formed.
6076 */
6077
6078htmlDocPtr
6079htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6080    htmlDocPtr ret;
6081    htmlParserCtxtPtr ctxt;
6082
6083    xmlInitParser();
6084
6085    if (cur == NULL) return(NULL);
6086
6087
6088    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6089    if (ctxt == NULL) return(NULL);
6090    if (sax != NULL) {
6091        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6092        ctxt->sax = sax;
6093        ctxt->userData = userData;
6094    }
6095
6096    htmlParseDocument(ctxt);
6097    ret = ctxt->myDoc;
6098    if (sax != NULL) {
6099	ctxt->sax = NULL;
6100	ctxt->userData = NULL;
6101    }
6102    htmlFreeParserCtxt(ctxt);
6103
6104    return(ret);
6105}
6106
6107/**
6108 * htmlParseDoc:
6109 * @cur:  a pointer to an array of xmlChar
6110 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6111 *
6112 * parse an HTML in-memory document and build a tree.
6113 *
6114 * Returns the resulting document tree
6115 */
6116
6117htmlDocPtr
6118htmlParseDoc(xmlChar *cur, const char *encoding) {
6119    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6120}
6121
6122
6123/**
6124 * htmlCreateFileParserCtxt:
6125 * @filename:  the filename
6126 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6127 *
6128 * Create a parser context for a file content.
6129 * Automatic support for ZLIB/Compress compressed document is provided
6130 * by default if found at compile-time.
6131 *
6132 * Returns the new parser context or NULL
6133 */
6134htmlParserCtxtPtr
6135htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6136{
6137    htmlParserCtxtPtr ctxt;
6138    htmlParserInputPtr inputStream;
6139    char *canonicFilename;
6140    /* htmlCharEncoding enc; */
6141    xmlChar *content, *content_line = (xmlChar *) "charset=";
6142
6143    if (filename == NULL)
6144        return(NULL);
6145
6146    ctxt = htmlNewParserCtxt();
6147    if (ctxt == NULL) {
6148	return(NULL);
6149    }
6150    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6151    if (canonicFilename == NULL) {
6152#ifdef LIBXML_SAX1_ENABLED
6153	if (xmlDefaultSAXHandler.error != NULL) {
6154	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6155	}
6156#endif
6157	xmlFreeParserCtxt(ctxt);
6158	return(NULL);
6159    }
6160
6161    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6162    xmlFree(canonicFilename);
6163    if (inputStream == NULL) {
6164	xmlFreeParserCtxt(ctxt);
6165	return(NULL);
6166    }
6167
6168    inputPush(ctxt, inputStream);
6169
6170    /* set encoding */
6171    if (encoding) {
6172        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6173	if (content) {
6174	    strcpy ((char *)content, (char *)content_line);
6175            strcat ((char *)content, (char *)encoding);
6176            htmlCheckEncoding (ctxt, content);
6177	    xmlFree (content);
6178	}
6179    }
6180
6181    return(ctxt);
6182}
6183
6184/**
6185 * htmlSAXParseFile:
6186 * @filename:  the filename
6187 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6188 * @sax:  the SAX handler block
6189 * @userData: if using SAX, this pointer will be provided on callbacks.
6190 *
6191 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6192 * compressed document is provided by default if found at compile-time.
6193 * It use the given SAX function block to handle the parsing callback.
6194 * If sax is NULL, fallback to the default DOM tree building routines.
6195 *
6196 * Returns the resulting document tree unless SAX is NULL or the document is
6197 *     not well formed.
6198 */
6199
6200htmlDocPtr
6201htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6202                 void *userData) {
6203    htmlDocPtr ret;
6204    htmlParserCtxtPtr ctxt;
6205    htmlSAXHandlerPtr oldsax = NULL;
6206
6207    xmlInitParser();
6208
6209    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6210    if (ctxt == NULL) return(NULL);
6211    if (sax != NULL) {
6212	oldsax = ctxt->sax;
6213        ctxt->sax = sax;
6214        ctxt->userData = userData;
6215    }
6216
6217    htmlParseDocument(ctxt);
6218
6219    ret = ctxt->myDoc;
6220    if (sax != NULL) {
6221        ctxt->sax = oldsax;
6222        ctxt->userData = NULL;
6223    }
6224    htmlFreeParserCtxt(ctxt);
6225
6226    return(ret);
6227}
6228
6229/**
6230 * htmlParseFile:
6231 * @filename:  the filename
6232 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6233 *
6234 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6235 * compressed document is provided by default if found at compile-time.
6236 *
6237 * Returns the resulting document tree
6238 */
6239
6240htmlDocPtr
6241htmlParseFile(const char *filename, const char *encoding) {
6242    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6243}
6244
6245/**
6246 * htmlHandleOmittedElem:
6247 * @val:  int 0 or 1
6248 *
6249 * Set and return the previous value for handling HTML omitted tags.
6250 *
6251 * Returns the last value for 0 for no handling, 1 for auto insertion.
6252 */
6253
6254int
6255htmlHandleOmittedElem(int val) {
6256    int old = htmlOmittedDefaultValue;
6257
6258    htmlOmittedDefaultValue = val;
6259    return(old);
6260}
6261
6262/**
6263 * htmlElementAllowedHere:
6264 * @parent: HTML parent element
6265 * @elt: HTML element
6266 *
6267 * Checks whether an HTML element may be a direct child of a parent element.
6268 * Note - doesn't check for deprecated elements
6269 *
6270 * Returns 1 if allowed; 0 otherwise.
6271 */
6272int
6273htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6274  const char** p ;
6275
6276  if ( ! elt || ! parent || ! parent->subelts )
6277	return 0 ;
6278
6279  for ( p = parent->subelts; *p; ++p )
6280    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6281      return 1 ;
6282
6283  return 0 ;
6284}
6285/**
6286 * htmlElementStatusHere:
6287 * @parent: HTML parent element
6288 * @elt: HTML element
6289 *
6290 * Checks whether an HTML element may be a direct child of a parent element.
6291 * and if so whether it is valid or deprecated.
6292 *
6293 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6294 */
6295htmlStatus
6296htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6297  if ( ! parent || ! elt )
6298    return HTML_INVALID ;
6299  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6300    return HTML_INVALID ;
6301
6302  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6303}
6304/**
6305 * htmlAttrAllowed:
6306 * @elt: HTML element
6307 * @attr: HTML attribute
6308 * @legacy: whether to allow deprecated attributes
6309 *
6310 * Checks whether an attribute is valid for an element
6311 * Has full knowledge of Required and Deprecated attributes
6312 *
6313 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6314 */
6315htmlStatus
6316htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6317  const char** p ;
6318
6319  if ( !elt || ! attr )
6320	return HTML_INVALID ;
6321
6322  if ( elt->attrs_req )
6323    for ( p = elt->attrs_req; *p; ++p)
6324      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6325        return HTML_REQUIRED ;
6326
6327  if ( elt->attrs_opt )
6328    for ( p = elt->attrs_opt; *p; ++p)
6329      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6330        return HTML_VALID ;
6331
6332  if ( legacy && elt->attrs_depr )
6333    for ( p = elt->attrs_depr; *p; ++p)
6334      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6335        return HTML_DEPRECATED ;
6336
6337  return HTML_INVALID ;
6338}
6339/**
6340 * htmlNodeStatus:
6341 * @node: an htmlNodePtr in a tree
6342 * @legacy: whether to allow deprecated elements (YES is faster here
6343 *	for Element nodes)
6344 *
6345 * Checks whether the tree node is valid.  Experimental (the author
6346 *     only uses the HTML enhancements in a SAX parser)
6347 *
6348 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6349 *	legacy allowed) or htmlElementStatusHere (otherwise).
6350 *	for Attribute nodes, a return from htmlAttrAllowed
6351 *	for other nodes, HTML_NA (no checks performed)
6352 */
6353htmlStatus
6354htmlNodeStatus(const htmlNodePtr node, int legacy) {
6355  if ( ! node )
6356    return HTML_INVALID ;
6357
6358  switch ( node->type ) {
6359    case XML_ELEMENT_NODE:
6360      return legacy
6361	? ( htmlElementAllowedHere (
6362		htmlTagLookup(node->parent->name) , node->name
6363		) ? HTML_VALID : HTML_INVALID )
6364	: htmlElementStatusHere(
6365		htmlTagLookup(node->parent->name) ,
6366		htmlTagLookup(node->name) )
6367	;
6368    case XML_ATTRIBUTE_NODE:
6369      return htmlAttrAllowed(
6370	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6371    default: return HTML_NA ;
6372  }
6373}
6374/************************************************************************
6375 *									*
6376 *	New set (2.6.0) of simpler and more flexible APIs		*
6377 *									*
6378 ************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used. The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement and must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6390
6391/**
6392 * htmlCtxtReset:
6393 * @ctxt: an HTML parser context
6394 *
6395 * Reset a parser context
6396 */
6397void
6398htmlCtxtReset(htmlParserCtxtPtr ctxt)
6399{
6400    xmlParserInputPtr input;
6401    xmlDictPtr dict;
6402
6403    if (ctxt == NULL)
6404        return;
6405
6406    xmlInitParser();
6407    dict = ctxt->dict;
6408
6409    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6410        xmlFreeInputStream(input);
6411    }
6412    ctxt->inputNr = 0;
6413    ctxt->input = NULL;
6414
6415    ctxt->spaceNr = 0;
6416    if (ctxt->spaceTab != NULL) {
6417	ctxt->spaceTab[0] = -1;
6418	ctxt->space = &ctxt->spaceTab[0];
6419    } else {
6420	ctxt->space = NULL;
6421    }
6422
6423
6424    ctxt->nodeNr = 0;
6425    ctxt->node = NULL;
6426
6427    ctxt->nameNr = 0;
6428    ctxt->name = NULL;
6429
6430    DICT_FREE(ctxt->version);
6431    ctxt->version = NULL;
6432    DICT_FREE(ctxt->encoding);
6433    ctxt->encoding = NULL;
6434    DICT_FREE(ctxt->directory);
6435    ctxt->directory = NULL;
6436    DICT_FREE(ctxt->extSubURI);
6437    ctxt->extSubURI = NULL;
6438    DICT_FREE(ctxt->extSubSystem);
6439    ctxt->extSubSystem = NULL;
6440    if (ctxt->myDoc != NULL)
6441        xmlFreeDoc(ctxt->myDoc);
6442    ctxt->myDoc = NULL;
6443
6444    ctxt->standalone = -1;
6445    ctxt->hasExternalSubset = 0;
6446    ctxt->hasPErefs = 0;
6447    ctxt->html = 1;
6448    ctxt->external = 0;
6449    ctxt->instate = XML_PARSER_START;
6450    ctxt->token = 0;
6451
6452    ctxt->wellFormed = 1;
6453    ctxt->nsWellFormed = 1;
6454    ctxt->valid = 1;
6455    ctxt->vctxt.userData = ctxt;
6456    ctxt->vctxt.error = xmlParserValidityError;
6457    ctxt->vctxt.warning = xmlParserValidityWarning;
6458    ctxt->record_info = 0;
6459    ctxt->nbChars = 0;
6460    ctxt->checkIndex = 0;
6461    ctxt->inSubset = 0;
6462    ctxt->errNo = XML_ERR_OK;
6463    ctxt->depth = 0;
6464    ctxt->charset = XML_CHAR_ENCODING_NONE;
6465    ctxt->catalogs = NULL;
6466    xmlInitNodeInfoSeq(&ctxt->node_seq);
6467
6468    if (ctxt->attsDefault != NULL) {
6469        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6470        ctxt->attsDefault = NULL;
6471    }
6472    if (ctxt->attsSpecial != NULL) {
6473        xmlHashFree(ctxt->attsSpecial, NULL);
6474        ctxt->attsSpecial = NULL;
6475    }
6476}
6477
6478/**
6479 * htmlCtxtUseOptions:
6480 * @ctxt: an HTML parser context
6481 * @options:  a combination of htmlParserOption(s)
6482 *
6483 * Applies the options to the parser context
6484 *
6485 * Returns 0 in case of success, the set of unknown or unimplemented options
6486 *         in case of error.
6487 */
6488int
6489htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6490{
6491    if (ctxt == NULL)
6492        return(-1);
6493
6494    if (options & HTML_PARSE_NOWARNING) {
6495        ctxt->sax->warning = NULL;
6496        ctxt->vctxt.warning = NULL;
6497        options -= XML_PARSE_NOWARNING;
6498	ctxt->options |= XML_PARSE_NOWARNING;
6499    }
6500    if (options & HTML_PARSE_NOERROR) {
6501        ctxt->sax->error = NULL;
6502        ctxt->vctxt.error = NULL;
6503        ctxt->sax->fatalError = NULL;
6504        options -= XML_PARSE_NOERROR;
6505	ctxt->options |= XML_PARSE_NOERROR;
6506    }
6507    if (options & HTML_PARSE_PEDANTIC) {
6508        ctxt->pedantic = 1;
6509        options -= XML_PARSE_PEDANTIC;
6510	ctxt->options |= XML_PARSE_PEDANTIC;
6511    } else
6512        ctxt->pedantic = 0;
6513    if (options & XML_PARSE_NOBLANKS) {
6514        ctxt->keepBlanks = 0;
6515        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6516        options -= XML_PARSE_NOBLANKS;
6517	ctxt->options |= XML_PARSE_NOBLANKS;
6518    } else
6519        ctxt->keepBlanks = 1;
6520    if (options & HTML_PARSE_RECOVER) {
6521        ctxt->recovery = 1;
6522	options -= HTML_PARSE_RECOVER;
6523    } else
6524        ctxt->recovery = 0;
6525    if (options & HTML_PARSE_COMPACT) {
6526	ctxt->options |= HTML_PARSE_COMPACT;
6527        options -= HTML_PARSE_COMPACT;
6528    }
6529    if (options & XML_PARSE_HUGE) {
6530	ctxt->options |= XML_PARSE_HUGE;
6531        options -= XML_PARSE_HUGE;
6532    }
6533    ctxt->dictNames = 0;
6534    return (options);
6535}
6536
6537/**
6538 * htmlDoRead:
6539 * @ctxt:  an HTML parser context
6540 * @URL:  the base URL to use for the document
6541 * @encoding:  the document encoding, or NULL
6542 * @options:  a combination of htmlParserOption(s)
6543 * @reuse:  keep the context for reuse
6544 *
6545 * Common front-end for the htmlRead functions
6546 *
6547 * Returns the resulting document tree or NULL
6548 */
6549static htmlDocPtr
6550htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6551          int options, int reuse)
6552{
6553    htmlDocPtr ret;
6554
6555    htmlCtxtUseOptions(ctxt, options);
6556    ctxt->html = 1;
6557    if (encoding != NULL) {
6558        xmlCharEncodingHandlerPtr hdlr;
6559
6560	hdlr = xmlFindCharEncodingHandler(encoding);
6561	if (hdlr != NULL) {
6562	    xmlSwitchToEncoding(ctxt, hdlr);
6563	    if (ctxt->input->encoding != NULL)
6564	      xmlFree((xmlChar *) ctxt->input->encoding);
6565            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6566        }
6567    }
6568    if ((URL != NULL) && (ctxt->input != NULL) &&
6569        (ctxt->input->filename == NULL))
6570        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6571    htmlParseDocument(ctxt);
6572    ret = ctxt->myDoc;
6573    ctxt->myDoc = NULL;
6574    if (!reuse) {
6575        if ((ctxt->dictNames) &&
6576	    (ret != NULL) &&
6577	    (ret->dict == ctxt->dict))
6578	    ctxt->dict = NULL;
6579	xmlFreeParserCtxt(ctxt);
6580    }
6581    return (ret);
6582}
6583
6584/**
6585 * htmlReadDoc:
6586 * @cur:  a pointer to a zero terminated string
6587 * @URL:  the base URL to use for the document
6588 * @encoding:  the document encoding, or NULL
6589 * @options:  a combination of htmlParserOption(s)
6590 *
6591 * parse an XML in-memory document and build a tree.
6592 *
6593 * Returns the resulting document tree
6594 */
6595htmlDocPtr
6596htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6597{
6598    htmlParserCtxtPtr ctxt;
6599
6600    if (cur == NULL)
6601        return (NULL);
6602
6603    xmlInitParser();
6604    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6605    if (ctxt == NULL)
6606        return (NULL);
6607    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6608}
6609
6610/**
6611 * htmlReadFile:
6612 * @filename:  a file or URL
6613 * @encoding:  the document encoding, or NULL
6614 * @options:  a combination of htmlParserOption(s)
6615 *
6616 * parse an XML file from the filesystem or the network.
6617 *
6618 * Returns the resulting document tree
6619 */
6620htmlDocPtr
6621htmlReadFile(const char *filename, const char *encoding, int options)
6622{
6623    htmlParserCtxtPtr ctxt;
6624
6625    xmlInitParser();
6626    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6627    if (ctxt == NULL)
6628        return (NULL);
6629    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6630}
6631
6632/**
6633 * htmlReadMemory:
6634 * @buffer:  a pointer to a char array
6635 * @size:  the size of the array
6636 * @URL:  the base URL to use for the document
6637 * @encoding:  the document encoding, or NULL
6638 * @options:  a combination of htmlParserOption(s)
6639 *
6640 * parse an XML in-memory document and build a tree.
6641 *
6642 * Returns the resulting document tree
6643 */
6644htmlDocPtr
6645htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6646{
6647    htmlParserCtxtPtr ctxt;
6648
6649    xmlInitParser();
6650    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6651    if (ctxt == NULL)
6652        return (NULL);
6653    htmlDefaultSAXHandlerInit();
6654    if (ctxt->sax != NULL)
6655        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6656    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6657}
6658
6659/**
6660 * htmlReadFd:
6661 * @fd:  an open file descriptor
6662 * @URL:  the base URL to use for the document
6663 * @encoding:  the document encoding, or NULL
6664 * @options:  a combination of htmlParserOption(s)
6665 *
6666 * parse an XML from a file descriptor and build a tree.
6667 *
6668 * Returns the resulting document tree
6669 */
6670htmlDocPtr
6671htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6672{
6673    htmlParserCtxtPtr ctxt;
6674    xmlParserInputBufferPtr input;
6675    xmlParserInputPtr stream;
6676
6677    if (fd < 0)
6678        return (NULL);
6679
6680    xmlInitParser();
6681    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6682    if (input == NULL)
6683        return (NULL);
6684    ctxt = xmlNewParserCtxt();
6685    if (ctxt == NULL) {
6686        xmlFreeParserInputBuffer(input);
6687        return (NULL);
6688    }
6689    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6690    if (stream == NULL) {
6691        xmlFreeParserInputBuffer(input);
6692	xmlFreeParserCtxt(ctxt);
6693        return (NULL);
6694    }
6695    inputPush(ctxt, stream);
6696    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6697}
6698
6699/**
6700 * htmlReadIO:
6701 * @ioread:  an I/O read function
6702 * @ioclose:  an I/O close function
6703 * @ioctx:  an I/O handler
6704 * @URL:  the base URL to use for the document
6705 * @encoding:  the document encoding, or NULL
6706 * @options:  a combination of htmlParserOption(s)
6707 *
6708 * parse an HTML document from I/O functions and source and build a tree.
6709 *
6710 * Returns the resulting document tree
6711 */
6712htmlDocPtr
6713htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6714          void *ioctx, const char *URL, const char *encoding, int options)
6715{
6716    htmlParserCtxtPtr ctxt;
6717    xmlParserInputBufferPtr input;
6718    xmlParserInputPtr stream;
6719
6720    if (ioread == NULL)
6721        return (NULL);
6722    xmlInitParser();
6723
6724    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6725                                         XML_CHAR_ENCODING_NONE);
6726    if (input == NULL)
6727        return (NULL);
6728    ctxt = htmlNewParserCtxt();
6729    if (ctxt == NULL) {
6730        xmlFreeParserInputBuffer(input);
6731        return (NULL);
6732    }
6733    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6734    if (stream == NULL) {
6735        xmlFreeParserInputBuffer(input);
6736	xmlFreeParserCtxt(ctxt);
6737        return (NULL);
6738    }
6739    inputPush(ctxt, stream);
6740    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6741}
6742
6743/**
6744 * htmlCtxtReadDoc:
6745 * @ctxt:  an HTML parser context
6746 * @cur:  a pointer to a zero terminated string
6747 * @URL:  the base URL to use for the document
6748 * @encoding:  the document encoding, or NULL
6749 * @options:  a combination of htmlParserOption(s)
6750 *
6751 * parse an XML in-memory document and build a tree.
6752 * This reuses the existing @ctxt parser context
6753 *
6754 * Returns the resulting document tree
6755 */
6756htmlDocPtr
6757htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6758               const char *URL, const char *encoding, int options)
6759{
6760    xmlParserInputPtr stream;
6761
6762    if (cur == NULL)
6763        return (NULL);
6764    if (ctxt == NULL)
6765        return (NULL);
6766
6767    htmlCtxtReset(ctxt);
6768
6769    stream = xmlNewStringInputStream(ctxt, cur);
6770    if (stream == NULL) {
6771        return (NULL);
6772    }
6773    inputPush(ctxt, stream);
6774    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6775}
6776
6777/**
6778 * htmlCtxtReadFile:
6779 * @ctxt:  an HTML parser context
6780 * @filename:  a file or URL
6781 * @encoding:  the document encoding, or NULL
6782 * @options:  a combination of htmlParserOption(s)
6783 *
6784 * parse an XML file from the filesystem or the network.
6785 * This reuses the existing @ctxt parser context
6786 *
6787 * Returns the resulting document tree
6788 */
6789htmlDocPtr
6790htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6791                const char *encoding, int options)
6792{
6793    xmlParserInputPtr stream;
6794
6795    if (filename == NULL)
6796        return (NULL);
6797    if (ctxt == NULL)
6798        return (NULL);
6799
6800    htmlCtxtReset(ctxt);
6801
6802    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6803    if (stream == NULL) {
6804        return (NULL);
6805    }
6806    inputPush(ctxt, stream);
6807    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6808}
6809
6810/**
6811 * htmlCtxtReadMemory:
6812 * @ctxt:  an HTML parser context
6813 * @buffer:  a pointer to a char array
6814 * @size:  the size of the array
6815 * @URL:  the base URL to use for the document
6816 * @encoding:  the document encoding, or NULL
6817 * @options:  a combination of htmlParserOption(s)
6818 *
6819 * parse an XML in-memory document and build a tree.
6820 * This reuses the existing @ctxt parser context
6821 *
6822 * Returns the resulting document tree
6823 */
6824htmlDocPtr
6825htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6826                  const char *URL, const char *encoding, int options)
6827{
6828    xmlParserInputBufferPtr input;
6829    xmlParserInputPtr stream;
6830
6831    if (ctxt == NULL)
6832        return (NULL);
6833    if (buffer == NULL)
6834        return (NULL);
6835
6836    htmlCtxtReset(ctxt);
6837
6838    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6839    if (input == NULL) {
6840	return(NULL);
6841    }
6842
6843    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6844    if (stream == NULL) {
6845	xmlFreeParserInputBuffer(input);
6846	return(NULL);
6847    }
6848
6849    inputPush(ctxt, stream);
6850    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6851}
6852
6853/**
6854 * htmlCtxtReadFd:
6855 * @ctxt:  an HTML parser context
6856 * @fd:  an open file descriptor
6857 * @URL:  the base URL to use for the document
6858 * @encoding:  the document encoding, or NULL
6859 * @options:  a combination of htmlParserOption(s)
6860 *
6861 * parse an XML from a file descriptor and build a tree.
6862 * This reuses the existing @ctxt parser context
6863 *
6864 * Returns the resulting document tree
6865 */
6866htmlDocPtr
6867htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6868              const char *URL, const char *encoding, int options)
6869{
6870    xmlParserInputBufferPtr input;
6871    xmlParserInputPtr stream;
6872
6873    if (fd < 0)
6874        return (NULL);
6875    if (ctxt == NULL)
6876        return (NULL);
6877
6878    htmlCtxtReset(ctxt);
6879
6880
6881    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6882    if (input == NULL)
6883        return (NULL);
6884    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6885    if (stream == NULL) {
6886        xmlFreeParserInputBuffer(input);
6887        return (NULL);
6888    }
6889    inputPush(ctxt, stream);
6890    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6891}
6892
6893/**
6894 * htmlCtxtReadIO:
6895 * @ctxt:  an HTML parser context
6896 * @ioread:  an I/O read function
6897 * @ioclose:  an I/O close function
6898 * @ioctx:  an I/O handler
6899 * @URL:  the base URL to use for the document
6900 * @encoding:  the document encoding, or NULL
6901 * @options:  a combination of htmlParserOption(s)
6902 *
6903 * parse an HTML document from I/O functions and source and build a tree.
6904 * This reuses the existing @ctxt parser context
6905 *
6906 * Returns the resulting document tree
6907 */
6908htmlDocPtr
6909htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6910              xmlInputCloseCallback ioclose, void *ioctx,
6911	      const char *URL,
6912              const char *encoding, int options)
6913{
6914    xmlParserInputBufferPtr input;
6915    xmlParserInputPtr stream;
6916
6917    if (ioread == NULL)
6918        return (NULL);
6919    if (ctxt == NULL)
6920        return (NULL);
6921
6922    htmlCtxtReset(ctxt);
6923
6924    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6925                                         XML_CHAR_ENCODING_NONE);
6926    if (input == NULL)
6927        return (NULL);
6928    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6929    if (stream == NULL) {
6930        xmlFreeParserInputBuffer(input);
6931        return (NULL);
6932    }
6933    inputPush(ctxt, stream);
6934    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6935}
6936
6937#define bottom_HTMLparser
6938#include "elfgcchack.h"
6939#endif /* LIBXML_HTML_ENABLED */
6940