1/*
2 * HTMLparser.c : an HTML 4.0 non-verifying parser
3 *
4 * See Copyright for the status of this software.
5 *
6 * daniel@veillard.com
7 */
8
9#define IN_LIBXML
10#include "libxml.h"
11#ifdef LIBXML_HTML_ENABLED
12
13#include <string.h>
14#ifdef HAVE_CTYPE_H
15#include <ctype.h>
16#endif
17#ifdef HAVE_STDLIB_H
18#include <stdlib.h>
19#endif
20#ifdef HAVE_SYS_STAT_H
21#include <sys/stat.h>
22#endif
23#ifdef HAVE_FCNTL_H
24#include <fcntl.h>
25#endif
26#ifdef HAVE_UNISTD_H
27#include <unistd.h>
28#endif
29#ifdef HAVE_ZLIB_H
30#include <zlib.h>
31#endif
32
33#include <libxml/xmlmemory.h>
34#include <libxml/tree.h>
35#include <libxml/parser.h>
36#include <libxml/parserInternals.h>
37#include <libxml/xmlerror.h>
38#include <libxml/HTMLparser.h>
39#include <libxml/HTMLtree.h>
40#include <libxml/entities.h>
41#include <libxml/encoding.h>
42#include <libxml/valid.h>
43#include <libxml/xmlIO.h>
44#include <libxml/globals.h>
45#include <libxml/uri.h>
46
/*
 * Size constants for the HTML parser. Their uses are not in this part of
 * the file; NOTE(review): presumably a name length limit and two working
 * buffer sizes -- confirm against the code that uses them.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Debug switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1. NOTE(review): the code reading it is
 * outside this excerpt -- presumably it controls whether omitted default
 * elements are generated; confirm.
 */
static int htmlOmittedDefaultValue = 1;

/* Forward declarations; the definitions are not in this part of the file. */
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
			     xmlChar end, xmlChar  end2, xmlChar end3);
static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60/************************************************************************
61 *									*
62 *		Some factorized error routines				*
63 *									*
64 ************************************************************************/
65
66/**
67 * htmlErrMemory:
68 * @ctxt:  an HTML parser context
69 * @extra:  extra informations
70 *
71 * Handle a redefinition of attribute error
72 */
73static void
74htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75{
76    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77        (ctxt->instate == XML_PARSER_EOF))
78	return;
79    if (ctxt != NULL) {
80        ctxt->errNo = XML_ERR_NO_MEMORY;
81        ctxt->instate = XML_PARSER_EOF;
82        ctxt->disableSAX = 1;
83    }
84    if (extra)
85        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87                        NULL, NULL, 0, 0,
88                        "Memory allocation failed : %s\n", extra);
89    else
90        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92                        NULL, NULL, 0, 0, "Memory allocation failed\n");
93}
94
95/**
96 * htmlParseErr:
97 * @ctxt:  an HTML parser context
98 * @error:  the error number
99 * @msg:  the error message
100 * @str1:  string infor
101 * @str2:  string infor
102 *
103 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
104 */
105static void
106htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107             const char *msg, const xmlChar *str1, const xmlChar *str2)
108{
109    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110        (ctxt->instate == XML_PARSER_EOF))
111	return;
112    if (ctxt != NULL)
113	ctxt->errNo = error;
114    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115                    XML_ERR_ERROR, NULL, 0,
116		    (const char *) str1, (const char *) str2,
117		    NULL, 0, 0,
118		    msg, str1, str2);
119    if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121}
122
123/**
124 * htmlParseErrInt:
125 * @ctxt:  an HTML parser context
126 * @error:  the error number
127 * @msg:  the error message
128 * @val:  integer info
129 *
130 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
131 */
132static void
133htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134             const char *msg, int val)
135{
136    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137        (ctxt->instate == XML_PARSER_EOF))
138	return;
139    if (ctxt != NULL)
140	ctxt->errNo = error;
141    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
143		    NULL, val, 0, msg, val);
144    if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146}
147
148/************************************************************************
149 *									*
150 *	Parser stacks related functions and macros		*
151 *									*
152 ************************************************************************/
153
154/**
155 * htmlnamePush:
156 * @ctxt:  an HTML parser context
157 * @value:  the element name
158 *
159 * Pushes a new element name on top of the name stack
160 *
161 * Returns 0 in case of error, the index in the stack otherwise
162 */
163static int
164htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165{
166    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
167        ctxt->html = 3;
168    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
169        ctxt->html = 10;
170    if (ctxt->nameNr >= ctxt->nameMax) {
171        ctxt->nameMax *= 2;
172        ctxt->nameTab = (const xmlChar * *)
173                         xmlRealloc((xmlChar * *)ctxt->nameTab,
174                                    ctxt->nameMax *
175                                    sizeof(ctxt->nameTab[0]));
176        if (ctxt->nameTab == NULL) {
177            htmlErrMemory(ctxt, NULL);
178            return (0);
179        }
180    }
181    ctxt->nameTab[ctxt->nameNr] = value;
182    ctxt->name = value;
183    return (ctxt->nameNr++);
184}
185/**
186 * htmlnamePop:
187 * @ctxt: an HTML parser context
188 *
189 * Pops the top element name from the name stack
190 *
191 * Returns the name just removed
192 */
193static const xmlChar *
194htmlnamePop(htmlParserCtxtPtr ctxt)
195{
196    const xmlChar *ret;
197
198    if (ctxt->nameNr <= 0)
199        return (NULL);
200    ctxt->nameNr--;
201    if (ctxt->nameNr < 0)
202        return (NULL);
203    if (ctxt->nameNr > 0)
204        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
205    else
206        ctxt->name = NULL;
207    ret = ctxt->nameTab[ctxt->nameNr];
208    ctxt->nameTab[ctxt->nameNr] = NULL;
209    return (ret);
210}
211
212/**
213 * htmlNodeInfoPush:
214 * @ctxt:  an HTML parser context
215 * @value:  the node info
216 *
217 * Pushes a new element name on top of the node info stack
218 *
219 * Returns 0 in case of error, the index in the stack otherwise
220 */
221static int
222htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
223{
224    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
225        if (ctxt->nodeInfoMax == 0)
226                ctxt->nodeInfoMax = 5;
227        ctxt->nodeInfoMax *= 2;
228        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
229                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
230                                    ctxt->nodeInfoMax *
231                                    sizeof(ctxt->nodeInfoTab[0]));
232        if (ctxt->nodeInfoTab == NULL) {
233            htmlErrMemory(ctxt, NULL);
234            return (0);
235        }
236    }
237    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
238    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
239    return (ctxt->nodeInfoNr++);
240}
241
242/**
243 * htmlNodeInfoPop:
244 * @ctxt:  an HTML parser context
245 *
246 * Pops the top element name from the node info stack
247 *
248 * Returns 0 in case of error, the pointer to NodeInfo otherwise
249 */
250static htmlParserNodeInfo *
251htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
252{
253    if (ctxt->nodeInfoNr <= 0)
254        return (NULL);
255    ctxt->nodeInfoNr--;
256    if (ctxt->nodeInfoNr < 0)
257        return (NULL);
258    if (ctxt->nodeInfoNr > 0)
259        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
260    else
261        ctxt->nodeInfo = NULL;
262    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
263}
264
265/*
266 * Macros for accessing the content. Those should be used only by the parser,
267 * and not exported.
268 *
269 * Dirty macros, i.e. one need to make assumption on the context to use them
270 *
271 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
272 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
273 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
274 *           in UNICODE mode. This should be used internally by the parser
275 *           only to compare to ASCII values otherwise it would break when
276 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR, it should be used
 *           only to compare on ASCII based substring.
279 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
280 *           it should be used only to compare on ASCII based substring.
281 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
282 *           strings without newlines within the parser.
283 *
284 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
285 *
286 *   CURRENT Returns the current char value, with the full decoding of
287 *           UTF-8 if we are using this mode. It returns an int.
288 *   NEXT    Skip to the next character, this does the proper decoding
289 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
290 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
291 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
292 */
293
/*
 * All macros below expect a variable named ctxt (the parser context) to be
 * in scope at the point of use.
 */

/* Current input byte converted to upper case with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/*
 * Advance the character count, the input pointer and the column by val.
 * The line counter is not touched, so this must not skip over newlines.
 */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Input byte at offset val from the current position. */
#define NXT(val) ctxt->input->cur[(val)]

/* Input byte at offset val from the current position, upper-cased. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* The current input pointer itself. */
#define CUR_PTR ctxt->input->cur

/*
 * Call xmlParserInputShrink() once more than 2 * INPUT_CHUNK bytes lie
 * behind the current position and fewer than 2 * INPUT_CHUNK remain ahead.
 * Expands to a bare "if" statement: beware of a following "else".
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
		   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
	xmlParserInputShrink(ctxt->input)

/*
 * When ctxt->progressive is 0 and fewer than INPUT_CHUNK bytes remain
 * ahead, call xmlParserInputGrow().
 * Expands to a bare "if" statement: beware of a following "else".
 */
#define GROW if ((ctxt->progressive == 0) &&				\
		 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))	\
	xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int; a single byte is read, nothing is decoded. */
#define CURRENT ((int) (*ctxt->input->cur))

/* Skip blank characters at the current position, see htmlSkipBlankChars(). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
/* Current input byte as an int (same expansion as CURRENT). */
#define CUR ((int) (*ctxt->input->cur))
/* Move to the next character through xmlNextChar(). */
#define NEXT xmlNextChar(ctxt)

/* -1 while ctxt->token is set, otherwise the current input byte. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


/*
 * Step over one character that is l bytes long: update line/column from
 * the byte being left, clear ctxt->token, advance the input pointer by l
 * and count one character.
 */
#define NEXTL(l) do {							\
    if (*(ctxt->input->cur) == '\n') {					\
	ctxt->input->line++; ctxt->input->col = 1;			\
    } else ctxt->input->col++;						\
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;		\
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);	\
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* Current character via htmlCurrentChar(); l (an int lvalue) gets its length. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
/* Current character of string s via xmlStringCurrentChar(); l gets its length. */
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append character v, whose encoded length is l, to buffer b at index i:
 * a length of 1 stores v as a single xmlChar and increments i, any other
 * length goes through xmlCopyChar() and i is advanced by its return value.
 * Expands to a bare "if/else" statement and evaluates its arguments more
 * than once.
 */
#define COPY_BUF(l,b,i,v)						\
    if (l == 1) b[i++] = (xmlChar) v;					\
    else i += xmlCopyChar(l,&b[i],v)
344
345/**
346 * htmlFindEncoding:
347 * @the HTML parser context
348 *
349 * Ty to find and encoding in the current data available in the input
350 * buffer this is needed to try to switch to the proper encoding when
351 * one face a character error.
352 * That's an heuristic, since it's operating outside of parsing it could
353 * try to use a meta which had been commented out, that's the reason it
354 * should only be used in case of error, not as a default.
355 *
356 * Returns an encoding string or NULL if not found, the string need to
357 *   be freed
358 */
359static xmlChar *
360htmlFindEncoding(xmlParserCtxtPtr ctxt) {
361    const xmlChar *start, *cur, *end;
362
363    if ((ctxt == NULL) || (ctxt->input == NULL) ||
364        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
365        (ctxt->input->buf->encoder != NULL))
366        return(NULL);
367    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
368        return(NULL);
369
370    start = ctxt->input->cur;
371    end = ctxt->input->end;
372    /* we also expect the input buffer to be zero terminated */
373    if (*end != 0)
374        return(NULL);
375
376    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
377    if (cur == NULL)
378        return(NULL);
379    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
380    if (cur == NULL)
381        return(NULL);
382    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
383    if (cur == NULL)
384        return(NULL);
385    cur += 8;
386    start = cur;
387    while (((*cur >= 'A') && (*cur <= 'Z')) ||
388           ((*cur >= 'a') && (*cur <= 'z')) ||
389           ((*cur >= '0') && (*cur <= '9')) ||
390           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
391           cur++;
392    if (cur == start)
393        return(NULL);
394    return(xmlStrndup(start, cur - start));
395}
396
397/**
398 * htmlCurrentChar:
399 * @ctxt:  the HTML parser context
400 * @len:  pointer to the length of the char read
401 *
402 * The current char value, if using UTF-8 this may actually span multiple
403 * bytes in the input buffer. Implement the end of line normalization:
404 * 2.11 End-of-Line Handling
405 * If the encoding is unspecified, in the case we find an ISO-Latin-1
406 * char, then the encoding converter is plugged in automatically.
407 *
408 * Returns the current char value and its length
409 */
410
411static int
412htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
413    if (ctxt->instate == XML_PARSER_EOF)
414	return(0);
415
416    if (ctxt->token != 0) {
417	*len = 0;
418	return(ctxt->token);
419    }
420    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
421	/*
422	 * We are supposed to handle UTF8, check it's valid
423	 * From rfc2044: encoding of the Unicode values on UTF-8:
424	 *
425	 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
426	 * 0000 0000-0000 007F   0xxxxxxx
427	 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
428	 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
429	 *
430	 * Check for the 0x110000 limit too
431	 */
432	const unsigned char *cur = ctxt->input->cur;
433	unsigned char c;
434	unsigned int val;
435
436	c = *cur;
437	if (c & 0x80) {
438	    if (cur[1] == 0) {
439		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
440                cur = ctxt->input->cur;
441            }
442	    if ((cur[1] & 0xc0) != 0x80)
443		goto encoding_error;
444	    if ((c & 0xe0) == 0xe0) {
445
446		if (cur[2] == 0) {
447		    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
448                    cur = ctxt->input->cur;
449                }
450		if ((cur[2] & 0xc0) != 0x80)
451		    goto encoding_error;
452		if ((c & 0xf0) == 0xf0) {
453		    if (cur[3] == 0) {
454			xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
455                        cur = ctxt->input->cur;
456                    }
457		    if (((c & 0xf8) != 0xf0) ||
458			((cur[3] & 0xc0) != 0x80))
459			goto encoding_error;
460		    /* 4-byte code */
461		    *len = 4;
462		    val = (cur[0] & 0x7) << 18;
463		    val |= (cur[1] & 0x3f) << 12;
464		    val |= (cur[2] & 0x3f) << 6;
465		    val |= cur[3] & 0x3f;
466		} else {
467		  /* 3-byte code */
468		    *len = 3;
469		    val = (cur[0] & 0xf) << 12;
470		    val |= (cur[1] & 0x3f) << 6;
471		    val |= cur[2] & 0x3f;
472		}
473	    } else {
474	      /* 2-byte code */
475		*len = 2;
476		val = (cur[0] & 0x1f) << 6;
477		val |= cur[1] & 0x3f;
478	    }
479	    if (!IS_CHAR(val)) {
480	        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
481				"Char 0x%X out of allowed range\n", val);
482	    }
483	    return(val);
484	} else {
485            if ((*ctxt->input->cur == 0) &&
486                (ctxt->input->cur < ctxt->input->end)) {
487                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
488				"Char 0x%X out of allowed range\n", 0);
489                *len = 1;
490                return(' ');
491            }
492	    /* 1-byte code */
493	    *len = 1;
494	    return((int) *ctxt->input->cur);
495	}
496    }
497    /*
498     * Assume it's a fixed length encoding (1) with
499     * a compatible encoding for the ASCII set, since
500     * XML constructs only use < 128 chars
501     */
502    *len = 1;
503    if ((int) *ctxt->input->cur < 0x80)
504	return((int) *ctxt->input->cur);
505
506    /*
507     * Humm this is bad, do an automatic flow conversion
508     */
509    {
510        xmlChar * guess;
511        xmlCharEncodingHandlerPtr handler;
512
513        guess = htmlFindEncoding(ctxt);
514        if (guess == NULL) {
515            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
516        } else {
517            if (ctxt->input->encoding != NULL)
518                xmlFree((xmlChar *) ctxt->input->encoding);
519            ctxt->input->encoding = guess;
520            handler = xmlFindCharEncodingHandler((const char *) guess);
521            if (handler != NULL) {
522                xmlSwitchToEncoding(ctxt, handler);
523            } else {
524                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
525                             "Unsupported encoding %s", guess, NULL);
526            }
527        }
528        ctxt->charset = XML_CHAR_ENCODING_UTF8;
529    }
530
531    return(xmlCurrentChar(ctxt, len));
532
533encoding_error:
534    /*
535     * If we detect an UTF8 error that probably mean that the
536     * input encoding didn't get properly advertized in the
537     * declaration header. Report the error and switch the encoding
538     * to ISO-Latin-1 (if you don't like this policy, just declare the
539     * encoding !)
540     */
541    {
542        char buffer[150];
543
544	if (ctxt->input->end - ctxt->input->cur >= 4) {
545	    snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
546			    ctxt->input->cur[0], ctxt->input->cur[1],
547			    ctxt->input->cur[2], ctxt->input->cur[3]);
548	} else {
549	    snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
550	}
551	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
552		     "Input is not proper UTF-8, indicate encoding !\n",
553		     BAD_CAST buffer, NULL);
554    }
555
556    ctxt->charset = XML_CHAR_ENCODING_8859_1;
557    *len = 1;
558    return((int) *ctxt->input->cur);
559}
560
561/**
562 * htmlSkipBlankChars:
563 * @ctxt:  the HTML parser context
564 *
565 * skip all blanks character found at that point in the input streams.
566 *
567 * Returns the number of space chars skipped
568 */
569
570static int
571htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
572    int res = 0;
573
574    while (IS_BLANK_CH(*(ctxt->input->cur))) {
575	if ((*ctxt->input->cur == 0) &&
576	    (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
577		xmlPopInput(ctxt);
578	} else {
579	    if (*(ctxt->input->cur) == '\n') {
580		ctxt->input->line++; ctxt->input->col = 1;
581	    } else ctxt->input->col++;
582	    ctxt->input->cur++;
583	    ctxt->nbChars++;
584	    if (*ctxt->input->cur == 0)
585		xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
586	}
587	res++;
588    }
589    return(res);
590}
591
592
593
594/************************************************************************
595 *									*
596 *	The list of HTML elements and their properties		*
597 *									*
598 ************************************************************************/
599
/*
 *  Start Tag: 1 means the start tag can be omitted
 *  End Tag:   1 means the end tag can be omitted
 *             2 means it's forbidden (empty elements)
 *             3 means the tag is stylistic and should be closed easily
 *  Depr:      this element is deprecated
 *  DTD:       1 means that this element is valid only in the Loose DTD
 *             2 means that this element is valid only in the Frameset DTD
 *
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
	, subElements , impliedsubelt , Attributes, userdata
 */
612
/*
 * Definitions and a couple of vars for HTML Elements
 *
 * Each upper-case macro expands to a comma separated list of element
 * names and has a matching NB_ macro giving the number of names in it.
 * Every array built from these lists is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER contribute no names: they expand to nothing */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
	"tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
		"archive", "alt", "name", "height", "width", "align",
		"hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
	{ "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
	"link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* NULL terminated like every other list, so that scans stop at its end */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast applied to the lists above where the element table refers to them */
#define DECL (const char**)
764
/*
 * Description table for the HTML 4.0 elements: one htmlElemDesc initializer
 * per element.  The rows are kept in alphabetical order of the element name;
 * htmlTagLookup() scans them linearly using a case-insensitive comparison.
 *
 * NOTE(review): htmlElemDesc, EMPTY and MODIFIER are declared outside this
 * part of the file, so the meaning of each positional field has to be
 * checked against those declarations.  Within this part of the file,
 * htmlAutoCloseOnClose() reports a tag mismatch when it auto-closes an
 * element whose endTag field is 3.
 */
static const htmlElemDesc
html40ElementTable[] = {
{ "a",		0, 0, 0, 0, 0, 0, 1, "anchor ",
	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
},
{ "abbr",	0, 0, 0, 0, 0, 0, 1, "abbreviated form",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "acronym",	0, 0, 0, 0, 0, 0, 1, "",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "address",	0, 0, 0, 0, 0, 0, 0, "information on author ",
	DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
},
{ "applet",	0, 0, 0, 0, 1, 1, 2, "java applet ",
	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
},
{ "area",	0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
	EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
},
{ "b",		0, 3, 0, 0, 0, 0, 1, "bold text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "base",	0, 2, 2, 1, 0, 0, 0, "document base uri ",
	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
},
{ "basefont",	0, 2, 2, 1, 1, 1, 1, "base font size " ,
	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
},
{ "bdo",	0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
},
{ "big",	0, 3, 0, 0, 0, 0, 1, "large text style",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "blockquote",	0, 0, 0, 0, 0, 0, 0, "long quotation ",
	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
},
{ "body",	1, 1, 0, 0, 0, 0, 0, "document body ",
	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
},
{ "br",		0, 2, 2, 1, 0, 0, 1, "forced line break ",
	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
},
{ "button",	0, 0, 0, 0, 0, 0, 2, "push button ",
	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
},
{ "caption",	0, 0, 0, 0, 0, 0, 0, "table caption ",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "center",	0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
},
{ "cite",	0, 0, 0, 0, 0, 0, 1, "citation",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "code",	0, 0, 0, 0, 0, 0, 1, "computer code fragment",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "col",	0, 2, 2, 1, 0, 0, 0, "table column ",
	EMPTY , NULL , DECL col_attrs , NULL, NULL
},
{ "colgroup",	0, 1, 0, 0, 0, 0, 0, "table column group ",
	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
},
{ "dd",		0, 1, 0, 0, 0, 0, 0, "definition description ",
	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
},
{ "del",	0, 0, 0, 0, 0, 0, 2, "deleted text ",
	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
},
{ "dfn",	0, 0, 0, 0, 0, 0, 1, "instance definition",
	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
},
{ "dir",	0, 0, 0, 0, 1, 1, 0, "directory list",
	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
},
{ "div",	0, 0, 0, 0, 0, 0, 0, "generic language/style container",
	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "dl",		0, 0, 0, 0, 0, 0, 0, "definition list ",
	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
},
{ "dt",		0, 1, 0, 0, 0, 0, 0, "definition term ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "em",		0, 3, 0, 0, 0, 0, 1, "emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "embed",	0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
	EMPTY, NULL, DECL embed_attrs, NULL, NULL
},
{ "fieldset",	0, 0, 0, 0, 0, 0, 0, "form control group ",
	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
},
{ "font",	0, 3, 0, 0, 1, 1, 1, "local change to font ",
	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
},
{ "form",	0, 0, 0, 0, 0, 0, 0, "interactive form ",
	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
},
{ "frame",	0, 2, 2, 1, 0, 2, 0, "subwindow " ,
	EMPTY, NULL, NULL, DECL frame_attrs, NULL
},
{ "frameset",	0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
},
{ "h1",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h2",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h3",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h4",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h5",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "h6",		0, 0, 0, 0, 0, 0, 0, "heading ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "head",	1, 1, 0, 0, 0, 0, 0, "document head ",
	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
},
{ "hr",		0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
},
{ "html",	1, 1, 0, 0, 0, 0, 0, "document root element ",
	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
},
{ "i",		0, 3, 0, 0, 0, 0, 1, "italic text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "iframe",	0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
},
{ "img",	0, 2, 2, 1, 0, 0, 1, "embedded image ",
	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
},
{ "input",	0, 2, 2, 1, 0, 0, 1, "form control ",
	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
},
{ "ins",	0, 0, 0, 0, 0, 0, 2, "inserted text",
	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
},
{ "isindex",	0, 2, 2, 1, 1, 1, 0, "single line prompt ",
	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
},
{ "kbd",	0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "label",	0, 0, 0, 0, 0, 0, 1, "form field label text ",
	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
},
{ "legend",	0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
},
{ "li",		0, 1, 1, 0, 0, 0, 0, "list item ",
	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
},
{ "link",	0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
},
{ "map",	0, 0, 0, 0, 0, 0, 2, "client-side image map ",
	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
},
{ "menu",	0, 0, 0, 0, 1, 1, 0, "menu list ",
	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
},
{ "meta",	0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
},
{ "noframes",	0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
},
{ "noscript",	0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
	DECL html_flow, "div", DECL html_attrs, NULL, NULL
},
{ "object",	0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
},
{ "ol",		0, 0, 0, 0, 0, 0, 0, "ordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
},
{ "optgroup",	0, 0, 0, 0, 0, 0, 0, "option group ",
	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
},
{ "option",	0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
},
{ "p",		0, 1, 0, 0, 0, 0, 0, "paragraph ",
	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
},
{ "param",	0, 2, 2, 1, 0, 0, 0, "named property value ",
	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
},
{ "pre",	0, 0, 0, 0, 0, 0, 0, "preformatted text ",
	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
},
{ "q",		0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
},
{ "s",		0, 3, 0, 0, 1, 1, 1, "strike-through text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "samp",	0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "script",	0, 0, 0, 0, 0, 0, 2, "script statements ",
	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
},
{ "select",	0, 0, 0, 0, 0, 0, 1, "option selector ",
	DECL select_content, NULL, DECL select_attrs, NULL, NULL
},
{ "small",	0, 3, 0, 0, 0, 0, 1, "small text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "span",	0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "strike",	0, 3, 0, 0, 1, 1, 1, "strike-through text",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "strong",	0, 3, 0, 0, 0, 0, 1, "strong emphasis",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "style",	0, 0, 0, 0, 0, 0, 0, "style info ",
	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
},
{ "sub",	0, 3, 0, 0, 0, 0, 1, "subscript",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "sup",	0, 3, 0, 0, 0, 0, 1, "superscript ",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "table",	0, 0, 0, 0, 0, 0, 0, "",
	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
},
{ "tbody",	1, 0, 0, 0, 0, 0, 0, "table body ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "td",		0, 0, 0, 0, 0, 0, 0, "table data cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "textarea",	0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
},
{ "tfoot",	0, 1, 0, 0, 0, 0, 0, "table footer ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "th",		0, 1, 0, 0, 0, 0, 0, "table header cell",
	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
},
{ "thead",	0, 1, 0, 0, 0, 0, 0, "table header ",
	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
},
{ "title",	0, 0, 0, 0, 0, 0, 0, "document title ",
	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
},
{ "tr",		0, 0, 0, 0, 0, 0, 0, "table row ",
	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
},
{ "tt",		0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
},
{ "u",		0, 3, 0, 0, 1, 1, 1, "underlined text style",
	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
},
{ "ul",		0, 0, 0, 0, 0, 0, 0, "unordered list ",
	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
},
{ "var",	0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
}
};
1044
/*
 * Start tags that imply the end of the current element.
 *
 * The array is a flat sequence of NULL-terminated groups.  The first string
 * of a group is the name of a start tag; the strings that follow it, up to
 * the group's NULL, are the names of open elements that this start tag
 * closes implicitly.  One extra NULL after the last group ends the table.
 *
 * htmlInitAutoClose() stores the address of the first string of each group
 * in htmlStartCloseIndex[] (at most 99 groups are indexed), and
 * htmlCheckAutoClose() walks a group to decide whether a new tag closes an
 * old one.
 */
static const char * const htmlStartClose[] = {
"form",		"form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
		"dl", "ul", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", "head", NULL,
"head",		"p", NULL,
"title",	"p", NULL,
"body",		"head", "style", "link", "title", "p", NULL,
"frameset",	"head", "style", "link", "title", "p", NULL,
"li",		"p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
		"pre", "listing", "xmp", "head", "li", NULL,
"hr",		"p", "head", NULL,
"h1",		"p", "head", NULL,
"h2",		"p", "head", NULL,
"h3",		"p", "head", NULL,
"h4",		"p", "head", NULL,
"h5",		"p", "head", NULL,
"h6",		"p", "head", NULL,
"dir",		"p", "head", NULL,
"address",	"p", "head", "ul", NULL,
"pre",		"p", "head", "ul", NULL,
"listing",	"p", "head", NULL,
"xmp",		"p", "head", NULL,
"blockquote",	"p", "head", NULL,
"dl",		"p", "dt", "menu", "dir", "address", "pre", "listing",
		"xmp", "head", NULL,
"dt",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dd", NULL,
"dd",		"p", "menu", "dir", "address", "pre", "listing", "xmp",
                "head", "dt", NULL,
"ul",		"p", "head", "ol", "menu", "dir", "address", "pre",
		"listing", "xmp", NULL,
"ol",		"p", "head", "ul", NULL,
"menu",		"p", "head", "ul", NULL,
"p",		"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div",		"p", "head", NULL,
"noscript",	"p", "head", NULL,
"center",	"font", "b", "i", "p", "head", NULL,
"a",		"a", NULL,
"caption",	"p", NULL,
"colgroup",	"caption", "colgroup", "col", "p", NULL,
"col",		"caption", "col", "p", NULL,
"table",	"p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
		"listing", "xmp", "a", NULL,
"th",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"td",		"th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
"tr",		"th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
"thead",	"caption", "col", "colgroup", NULL,
"tfoot",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tbody", "p", NULL,
"tbody",	"th", "td", "tr", "caption", "col", "colgroup", "thead",
		"tfoot", "tbody", "p", NULL,
"optgroup",	"option", NULL,
"option",	"option", NULL,
"fieldset",	"legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
		"pre", "listing", "xmp", "a", NULL,
NULL
};
1105
/*
 * The list of HTML elements which are not supposed to have
 * CDATA content and where a p element will be implied.
 *
 * The list is NULL-terminated.  htmlCheckParagraph() consults it: when the
 * current element is one of these names (and htmlOmittedDefaultValue is
 * non-zero) a "p" element is opened before characters are inserted.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html",
    "head",
    NULL
};
1118
/*
 * The list of HTML attributes which are of content %Script;
 * These are the HTML 4 intrinsic event attributes.  The array has no
 * NULL terminator: htmlIsScriptAttribute() derives the entry count from
 * sizeof, so every slot must hold a valid attribute name.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",	/* was misspelled "onrest", so "onreset" was never matched */
    "onchange",
    "onselect"
};
1144
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

/* One (element name, end-tag priority) pair. */
typedef struct {
    const char *name;	/* element name, NULL in the terminating entry */
    int priority;	/* priority returned by htmlGetEndPriority() */
} elementPriority;

/*
 * Priority table searched by htmlGetEndPriority().  The final entry has a
 * NULL name; it both ends the search and supplies the priority returned
 * for every element that is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    {"div",   150},
    {"td",    160},
    {"th",    160},
    {"tr",    170},
    {"thead", 180},
    {"tbody", 180},
    {"tfoot", 180},
    {"table", 190},
    {"head",  200},
    {"body",  200},
    {"html",  220},
    {NULL,    100} /* Default priority */
};
1172
/*
 * Index into htmlStartClose: each non-NULL slot points at the first string
 * of one group of that table.  Filled by htmlInitAutoClose(), which leaves
 * at least the last slot NULL so that scans can stop on it.
 */
static const char** htmlStartCloseIndex[100];
/* Set to 1 by htmlInitAutoClose() once htmlStartCloseIndex has been built. */
static int htmlStartCloseIndexinitialized = 0;
1175
1176/************************************************************************
1177 *									*
1178 *	functions to handle HTML specific data			*
1179 *									*
1180 ************************************************************************/
1181
1182/**
1183 * htmlInitAutoClose:
1184 *
1185 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1186 * This is not reentrant. Call xmlInitParser() once before processing in
1187 * case of use in multithreaded programs.
1188 */
1189void
1190htmlInitAutoClose(void) {
1191    int indx, i = 0;
1192
1193    if (htmlStartCloseIndexinitialized) return;
1194
1195    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1196    indx = 0;
1197    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1198        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1199	while (htmlStartClose[i] != NULL) i++;
1200	i++;
1201    }
1202    htmlStartCloseIndexinitialized = 1;
1203}
1204
1205/**
1206 * htmlTagLookup:
1207 * @tag:  The tag name in lowercase
1208 *
1209 * Lookup the HTML tag in the ElementTable
1210 *
1211 * Returns the related htmlElemDescPtr or NULL if not found.
1212 */
1213const htmlElemDesc *
1214htmlTagLookup(const xmlChar *tag) {
1215    unsigned int i;
1216
1217    for (i = 0; i < (sizeof(html40ElementTable) /
1218                     sizeof(html40ElementTable[0]));i++) {
1219        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1220	    return((htmlElemDescPtr) &html40ElementTable[i]);
1221    }
1222    return(NULL);
1223}
1224
1225/**
1226 * htmlGetEndPriority:
1227 * @name: The name of the element to look up the priority for.
1228 *
1229 * Return value: The "endtag" priority.
1230 **/
1231static int
1232htmlGetEndPriority (const xmlChar *name) {
1233    int i = 0;
1234
1235    while ((htmlEndPriority[i].name != NULL) &&
1236	   (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1237	i++;
1238
1239    return(htmlEndPriority[i].priority);
1240}
1241
1242
1243/**
1244 * htmlCheckAutoClose:
1245 * @newtag:  The new tag name
1246 * @oldtag:  The old tag name
1247 *
1248 * Checks whether the new tag is one of the registered valid tags for
1249 * closing old.
1250 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1251 *
1252 * Returns 0 if no, 1 if yes.
1253 */
1254static int
1255htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1256{
1257    int i, indx;
1258    const char **closed = NULL;
1259
1260    if (htmlStartCloseIndexinitialized == 0)
1261        htmlInitAutoClose();
1262
1263    /* inefficient, but not a big deal */
1264    for (indx = 0; indx < 100; indx++) {
1265        closed = htmlStartCloseIndex[indx];
1266        if (closed == NULL)
1267            return (0);
1268        if (xmlStrEqual(BAD_CAST * closed, newtag))
1269            break;
1270    }
1271
1272    i = closed - htmlStartClose;
1273    i++;
1274    while (htmlStartClose[i] != NULL) {
1275        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1276            return (1);
1277        }
1278        i++;
1279    }
1280    return (0);
1281}
1282
1283/**
1284 * htmlAutoCloseOnClose:
1285 * @ctxt:  an HTML parser context
1286 * @newtag:  The new tag name
1287 * @force:  force the tag closure
1288 *
1289 * The HTML DTD allows an ending tag to implicitly close other tags.
1290 */
1291static void
1292htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1293{
1294    const htmlElemDesc *info;
1295    int i, priority;
1296
1297    priority = htmlGetEndPriority(newtag);
1298
1299    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1300
1301        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1302            break;
1303        /*
1304         * A missplaced endtag can only close elements with lower
1305         * or equal priority, so if we find an element with higher
1306         * priority before we find an element with
1307         * matching name, we just ignore this endtag
1308         */
1309        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1310            return;
1311    }
1312    if (i < 0)
1313        return;
1314
1315    while (!xmlStrEqual(newtag, ctxt->name)) {
1316        info = htmlTagLookup(ctxt->name);
1317        if ((info != NULL) && (info->endTag == 3)) {
1318            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1319	                 "Opening and ending tag mismatch: %s and %s\n",
1320			 newtag, ctxt->name);
1321        }
1322        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1323            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1324	htmlnamePop(ctxt);
1325    }
1326}
1327
1328/**
1329 * htmlAutoCloseOnEnd:
1330 * @ctxt:  an HTML parser context
1331 *
1332 * Close all remaining tags at the end of the stream
1333 */
1334static void
1335htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1336{
1337    int i;
1338
1339    if (ctxt->nameNr == 0)
1340        return;
1341    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1342        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1343            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1344	htmlnamePop(ctxt);
1345    }
1346}
1347
1348/**
1349 * htmlAutoClose:
1350 * @ctxt:  an HTML parser context
1351 * @newtag:  The new tag name or NULL
1352 *
1353 * The HTML DTD allows a tag to implicitly close other tags.
1354 * The list is kept in htmlStartClose array. This function is
1355 * called when a new tag has been detected and generates the
1356 * appropriates closes if possible/needed.
1357 * If newtag is NULL this mean we are at the end of the resource
1358 * and we should check
1359 */
1360static void
1361htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1362{
1363    while ((newtag != NULL) && (ctxt->name != NULL) &&
1364           (htmlCheckAutoClose(newtag, ctxt->name))) {
1365        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1366            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1367	htmlnamePop(ctxt);
1368    }
1369    if (newtag == NULL) {
1370        htmlAutoCloseOnEnd(ctxt);
1371        return;
1372    }
1373    while ((newtag == NULL) && (ctxt->name != NULL) &&
1374           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1375            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1376            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1377        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1378            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1379	htmlnamePop(ctxt);
1380    }
1381}
1382
1383/**
1384 * htmlAutoCloseTag:
1385 * @doc:  the HTML document
1386 * @name:  The tag name
1387 * @elem:  the HTML element
1388 *
1389 * The HTML DTD allows a tag to implicitly close other tags.
1390 * The list is kept in htmlStartClose array. This function checks
1391 * if the element or one of it's children would autoclose the
1392 * given tag.
1393 *
1394 * Returns 1 if autoclose, 0 otherwise
1395 */
1396int
1397htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1398    htmlNodePtr child;
1399
1400    if (elem == NULL) return(1);
1401    if (xmlStrEqual(name, elem->name)) return(0);
1402    if (htmlCheckAutoClose(elem->name, name)) return(1);
1403    child = elem->children;
1404    while (child != NULL) {
1405        if (htmlAutoCloseTag(doc, name, child)) return(1);
1406	child = child->next;
1407    }
1408    return(0);
1409}
1410
1411/**
1412 * htmlIsAutoClosed:
1413 * @doc:  the HTML document
1414 * @elem:  the HTML element
1415 *
1416 * The HTML DTD allows a tag to implicitly close other tags.
1417 * The list is kept in htmlStartClose array. This function checks
1418 * if a tag is autoclosed by one of it's child
1419 *
1420 * Returns 1 if autoclosed, 0 otherwise
1421 */
1422int
1423htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1424    htmlNodePtr child;
1425
1426    if (elem == NULL) return(1);
1427    child = elem->children;
1428    while (child != NULL) {
1429	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1430	child = child->next;
1431    }
1432    return(0);
1433}
1434
1435/**
1436 * htmlCheckImplied:
1437 * @ctxt:  an HTML parser context
1438 * @newtag:  The new tag name
1439 *
1440 * The HTML DTD allows a tag to exists only implicitly
1441 * called when a new tag has been detected and generates the
1442 * appropriates implicit tags if missing
1443 */
1444static void
1445htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1446    int i;
1447
1448    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1449        return;
1450    if (!htmlOmittedDefaultValue)
1451	return;
1452    if (xmlStrEqual(newtag, BAD_CAST"html"))
1453	return;
1454    if (ctxt->nameNr <= 0) {
1455	htmlnamePush(ctxt, BAD_CAST"html");
1456	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1457	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1458    }
1459    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1460        return;
1461    if ((ctxt->nameNr <= 1) &&
1462        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1463	 (xmlStrEqual(newtag, BAD_CAST"style")) ||
1464	 (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1465	 (xmlStrEqual(newtag, BAD_CAST"link")) ||
1466	 (xmlStrEqual(newtag, BAD_CAST"title")) ||
1467	 (xmlStrEqual(newtag, BAD_CAST"base")))) {
1468        if (ctxt->html >= 3) {
1469            /* we already saw or generated an <head> before */
1470            return;
1471        }
1472        /*
1473         * dropped OBJECT ... i you put it first BODY will be
1474         * assumed !
1475         */
1476        htmlnamePush(ctxt, BAD_CAST"head");
1477        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1478            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1479    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1480	       (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1481	       (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1482        if (ctxt->html >= 10) {
1483            /* we already saw or generated a <body> before */
1484            return;
1485        }
1486	for (i = 0;i < ctxt->nameNr;i++) {
1487	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1488		return;
1489	    }
1490	    if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1491		return;
1492	    }
1493	}
1494
1495	htmlnamePush(ctxt, BAD_CAST"body");
1496	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1498    }
1499}
1500
1501/**
1502 * htmlCheckParagraph
1503 * @ctxt:  an HTML parser context
1504 *
1505 * Check whether a p element need to be implied before inserting
1506 * characters in the current element.
1507 *
1508 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1509 *         in case of error.
1510 */
1511
1512static int
1513htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1514    const xmlChar *tag;
1515    int i;
1516
1517    if (ctxt == NULL)
1518	return(-1);
1519    tag = ctxt->name;
1520    if (tag == NULL) {
1521	htmlAutoClose(ctxt, BAD_CAST"p");
1522	htmlCheckImplied(ctxt, BAD_CAST"p");
1523	htmlnamePush(ctxt, BAD_CAST"p");
1524	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1525	    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1526	return(1);
1527    }
1528    if (!htmlOmittedDefaultValue)
1529	return(0);
1530    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1531	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1532	    htmlAutoClose(ctxt, BAD_CAST"p");
1533	    htmlCheckImplied(ctxt, BAD_CAST"p");
1534	    htmlnamePush(ctxt, BAD_CAST"p");
1535	    if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1536		ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1537	    return(1);
1538	}
1539    }
1540    return(0);
1541}
1542
1543/**
1544 * htmlIsScriptAttribute:
1545 * @name:  an attribute name
1546 *
1547 * Check if an attribute is of content type Script
1548 *
1549 * Returns 1 is the attribute is a script 0 otherwise
1550 */
1551int
1552htmlIsScriptAttribute(const xmlChar *name) {
1553    unsigned int i;
1554
1555    if (name == NULL)
1556      return(0);
1557    /*
1558     * all script attributes start with 'on'
1559     */
1560    if ((name[0] != 'o') || (name[1] != 'n'))
1561      return(0);
1562    for (i = 0;
1563	 i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1564	 i++) {
1565	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1566	    return(1);
1567    }
1568    return(0);
1569}
1570
1571/************************************************************************
1572 *									*
1573 *	The list of HTML predefined entities			*
1574 *									*
1575 ************************************************************************/
1576
1577
1578static const htmlEntityDesc  html40EntitiesTable[] = {
1579/*
1580 * the 4 absolute ones, plus apostrophe.
1581 */
1582{ 34,	"quot",	"quotation mark = APL quote, U+0022 ISOnum" },
1583{ 38,	"amp",	"ampersand, U+0026 ISOnum" },
1584{ 39,	"apos",	"single quote" },
1585{ 60,	"lt",	"less-than sign, U+003C ISOnum" },
1586{ 62,	"gt",	"greater-than sign, U+003E ISOnum" },
1587
1588/*
1589 * A bunch still in the 128-255 range
1590 * Replacing them depend really on the charset used.
1591 */
1592{ 160,	"nbsp",	"no-break space = non-breaking space, U+00A0 ISOnum" },
1593{ 161,	"iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1594{ 162,	"cent",	"cent sign, U+00A2 ISOnum" },
1595{ 163,	"pound","pound sign, U+00A3 ISOnum" },
1596{ 164,	"curren","currency sign, U+00A4 ISOnum" },
1597{ 165,	"yen",	"yen sign = yuan sign, U+00A5 ISOnum" },
1598{ 166,	"brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1599{ 167,	"sect",	"section sign, U+00A7 ISOnum" },
1600{ 168,	"uml",	"diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1601{ 169,	"copy",	"copyright sign, U+00A9 ISOnum" },
1602{ 170,	"ordf",	"feminine ordinal indicator, U+00AA ISOnum" },
1603{ 171,	"laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1604{ 172,	"not",	"not sign, U+00AC ISOnum" },
1605{ 173,	"shy",	"soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1606{ 174,	"reg",	"registered sign = registered trade mark sign, U+00AE ISOnum" },
1607{ 175,	"macr",	"macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1608{ 176,	"deg",	"degree sign, U+00B0 ISOnum" },
1609{ 177,	"plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1610{ 178,	"sup2",	"superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1611{ 179,	"sup3",	"superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1612{ 180,	"acute","acute accent = spacing acute, U+00B4 ISOdia" },
1613{ 181,	"micro","micro sign, U+00B5 ISOnum" },
1614{ 182,	"para",	"pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1615{ 183,	"middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1616{ 184,	"cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1617{ 185,	"sup1",	"superscript one = superscript digit one, U+00B9 ISOnum" },
1618{ 186,	"ordm",	"masculine ordinal indicator, U+00BA ISOnum" },
1619{ 187,	"raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1620{ 188,	"frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1621{ 189,	"frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1622{ 190,	"frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1623{ 191,	"iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1624{ 192,	"Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1625{ 193,	"Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1626{ 194,	"Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1627{ 195,	"Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1628{ 196,	"Auml",	"latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1629{ 197,	"Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1630{ 198,	"AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1631{ 199,	"Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1632{ 200,	"Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1633{ 201,	"Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1634{ 202,	"Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1635{ 203,	"Euml",	"latin capital letter E with diaeresis, U+00CB ISOlat1" },
1636{ 204,	"Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1637{ 205,	"Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1638{ 206,	"Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1639{ 207,	"Iuml",	"latin capital letter I with diaeresis, U+00CF ISOlat1" },
1640{ 208,	"ETH",	"latin capital letter ETH, U+00D0 ISOlat1" },
1641{ 209,	"Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1642{ 210,	"Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1643{ 211,	"Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1644{ 212,	"Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1645{ 213,	"Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1646{ 214,	"Ouml",	"latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1647{ 215,	"times","multiplication sign, U+00D7 ISOnum" },
1648{ 216,	"Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1649{ 217,	"Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1650{ 218,	"Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1651{ 219,	"Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1652{ 220,	"Uuml",	"latin capital letter U with diaeresis, U+00DC ISOlat1" },
1653{ 221,	"Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1654{ 222,	"THORN","latin capital letter THORN, U+00DE ISOlat1" },
1655{ 223,	"szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1656{ 224,	"agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1657{ 225,	"aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1658{ 226,	"acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1659{ 227,	"atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1660{ 228,	"auml",	"latin small letter a with diaeresis, U+00E4 ISOlat1" },
1661{ 229,	"aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1662{ 230,	"aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1663{ 231,	"ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1664{ 232,	"egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1665{ 233,	"eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1666{ 234,	"ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1667{ 235,	"euml",	"latin small letter e with diaeresis, U+00EB ISOlat1" },
1668{ 236,	"igrave","latin small letter i with grave, U+00EC ISOlat1" },
1669{ 237,	"iacute","latin small letter i with acute, U+00ED ISOlat1" },
1670{ 238,	"icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1671{ 239,	"iuml",	"latin small letter i with diaeresis, U+00EF ISOlat1" },
1672{ 240,	"eth",	"latin small letter eth, U+00F0 ISOlat1" },
1673{ 241,	"ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1674{ 242,	"ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1675{ 243,	"oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1676{ 244,	"ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1677{ 245,	"otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1678{ 246,	"ouml",	"latin small letter o with diaeresis, U+00F6 ISOlat1" },
1679{ 247,	"divide","division sign, U+00F7 ISOnum" },
1680{ 248,	"oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1681{ 249,	"ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1682{ 250,	"uacute","latin small letter u with acute, U+00FA ISOlat1" },
1683{ 251,	"ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1684{ 252,	"uuml",	"latin small letter u with diaeresis, U+00FC ISOlat1" },
1685{ 253,	"yacute","latin small letter y with acute, U+00FD ISOlat1" },
1686{ 254,	"thorn","latin small letter thorn with, U+00FE ISOlat1" },
1687{ 255,	"yuml",	"latin small letter y with diaeresis, U+00FF ISOlat1" },
1688
1689{ 338,	"OElig","latin capital ligature OE, U+0152 ISOlat2" },
1690{ 339,	"oelig","latin small ligature oe, U+0153 ISOlat2" },
1691{ 352,	"Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1692{ 353,	"scaron","latin small letter s with caron, U+0161 ISOlat2" },
1693{ 376,	"Yuml",	"latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1694
1695/*
1696 * Anything below should really be kept as entities references
1697 */
1698{ 402,	"fnof",	"latin small f with hook = function = florin, U+0192 ISOtech" },
1699
1700{ 710,	"circ",	"modifier letter circumflex accent, U+02C6 ISOpub" },
1701{ 732,	"tilde","small tilde, U+02DC ISOdia" },
1702
1703{ 913,	"Alpha","greek capital letter alpha, U+0391" },
1704{ 914,	"Beta",	"greek capital letter beta, U+0392" },
1705{ 915,	"Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1706{ 916,	"Delta","greek capital letter delta, U+0394 ISOgrk3" },
1707{ 917,	"Epsilon","greek capital letter epsilon, U+0395" },
1708{ 918,	"Zeta",	"greek capital letter zeta, U+0396" },
1709{ 919,	"Eta",	"greek capital letter eta, U+0397" },
1710{ 920,	"Theta","greek capital letter theta, U+0398 ISOgrk3" },
1711{ 921,	"Iota",	"greek capital letter iota, U+0399" },
1712{ 922,	"Kappa","greek capital letter kappa, U+039A" },
1713{ 923,	"Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1714{ 924,	"Mu",	"greek capital letter mu, U+039C" },
1715{ 925,	"Nu",	"greek capital letter nu, U+039D" },
1716{ 926,	"Xi",	"greek capital letter xi, U+039E ISOgrk3" },
1717{ 927,	"Omicron","greek capital letter omicron, U+039F" },
1718{ 928,	"Pi",	"greek capital letter pi, U+03A0 ISOgrk3" },
1719{ 929,	"Rho",	"greek capital letter rho, U+03A1" },
1720{ 931,	"Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1721{ 932,	"Tau",	"greek capital letter tau, U+03A4" },
1722{ 933,	"Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1723{ 934,	"Phi",	"greek capital letter phi, U+03A6 ISOgrk3" },
1724{ 935,	"Chi",	"greek capital letter chi, U+03A7" },
1725{ 936,	"Psi",	"greek capital letter psi, U+03A8 ISOgrk3" },
1726{ 937,	"Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1727
1728{ 945,	"alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1729{ 946,	"beta",	"greek small letter beta, U+03B2 ISOgrk3" },
1730{ 947,	"gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1731{ 948,	"delta","greek small letter delta, U+03B4 ISOgrk3" },
1732{ 949,	"epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1733{ 950,	"zeta",	"greek small letter zeta, U+03B6 ISOgrk3" },
1734{ 951,	"eta",	"greek small letter eta, U+03B7 ISOgrk3" },
1735{ 952,	"theta","greek small letter theta, U+03B8 ISOgrk3" },
1736{ 953,	"iota",	"greek small letter iota, U+03B9 ISOgrk3" },
1737{ 954,	"kappa","greek small letter kappa, U+03BA ISOgrk3" },
1738{ 955,	"lambda","greek small letter lambda, U+03BB ISOgrk3" },
1739{ 956,	"mu",	"greek small letter mu, U+03BC ISOgrk3" },
1740{ 957,	"nu",	"greek small letter nu, U+03BD ISOgrk3" },
1741{ 958,	"xi",	"greek small letter xi, U+03BE ISOgrk3" },
1742{ 959,	"omicron","greek small letter omicron, U+03BF NEW" },
1743{ 960,	"pi",	"greek small letter pi, U+03C0 ISOgrk3" },
1744{ 961,	"rho",	"greek small letter rho, U+03C1 ISOgrk3" },
1745{ 962,	"sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1746{ 963,	"sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1747{ 964,	"tau",	"greek small letter tau, U+03C4 ISOgrk3" },
1748{ 965,	"upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1749{ 966,	"phi",	"greek small letter phi, U+03C6 ISOgrk3" },
1750{ 967,	"chi",	"greek small letter chi, U+03C7 ISOgrk3" },
1751{ 968,	"psi",	"greek small letter psi, U+03C8 ISOgrk3" },
1752{ 969,	"omega","greek small letter omega, U+03C9 ISOgrk3" },
1753{ 977,	"thetasym","greek small letter theta symbol, U+03D1 NEW" },
1754{ 978,	"upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1755{ 982,	"piv",	"greek pi symbol, U+03D6 ISOgrk3" },
1756
1757{ 8194,	"ensp",	"en space, U+2002 ISOpub" },
1758{ 8195,	"emsp",	"em space, U+2003 ISOpub" },
1759{ 8201,	"thinsp","thin space, U+2009 ISOpub" },
1760{ 8204,	"zwnj",	"zero width non-joiner, U+200C NEW RFC 2070" },
1761{ 8205,	"zwj",	"zero width joiner, U+200D NEW RFC 2070" },
1762{ 8206,	"lrm",	"left-to-right mark, U+200E NEW RFC 2070" },
1763{ 8207,	"rlm",	"right-to-left mark, U+200F NEW RFC 2070" },
1764{ 8211,	"ndash","en dash, U+2013 ISOpub" },
1765{ 8212,	"mdash","em dash, U+2014 ISOpub" },
1766{ 8216,	"lsquo","left single quotation mark, U+2018 ISOnum" },
1767{ 8217,	"rsquo","right single quotation mark, U+2019 ISOnum" },
1768{ 8218,	"sbquo","single low-9 quotation mark, U+201A NEW" },
1769{ 8220,	"ldquo","left double quotation mark, U+201C ISOnum" },
1770{ 8221,	"rdquo","right double quotation mark, U+201D ISOnum" },
1771{ 8222,	"bdquo","double low-9 quotation mark, U+201E NEW" },
1772{ 8224,	"dagger","dagger, U+2020 ISOpub" },
1773{ 8225,	"Dagger","double dagger, U+2021 ISOpub" },
1774
1775{ 8226,	"bull",	"bullet = black small circle, U+2022 ISOpub" },
1776{ 8230,	"hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1777
1778{ 8240,	"permil","per mille sign, U+2030 ISOtech" },
1779
1780{ 8242,	"prime","prime = minutes = feet, U+2032 ISOtech" },
1781{ 8243,	"Prime","double prime = seconds = inches, U+2033 ISOtech" },
1782
1783{ 8249,	"lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1784{ 8250,	"rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1785
1786{ 8254,	"oline","overline = spacing overscore, U+203E NEW" },
1787{ 8260,	"frasl","fraction slash, U+2044 NEW" },
1788
1789{ 8364,	"euro",	"euro sign, U+20AC NEW" },
1790
1791{ 8465,	"image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1792{ 8472,	"weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1793{ 8476,	"real",	"blackletter capital R = real part symbol, U+211C ISOamso" },
1794{ 8482,	"trade","trade mark sign, U+2122 ISOnum" },
1795{ 8501,	"alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1796{ 8592,	"larr",	"leftwards arrow, U+2190 ISOnum" },
1797{ 8593,	"uarr",	"upwards arrow, U+2191 ISOnum" },
1798{ 8594,	"rarr",	"rightwards arrow, U+2192 ISOnum" },
1799{ 8595,	"darr",	"downwards arrow, U+2193 ISOnum" },
1800{ 8596,	"harr",	"left right arrow, U+2194 ISOamsa" },
1801{ 8629,	"crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1802{ 8656,	"lArr",	"leftwards double arrow, U+21D0 ISOtech" },
1803{ 8657,	"uArr",	"upwards double arrow, U+21D1 ISOamsa" },
1804{ 8658,	"rArr",	"rightwards double arrow, U+21D2 ISOtech" },
1805{ 8659,	"dArr",	"downwards double arrow, U+21D3 ISOamsa" },
1806{ 8660,	"hArr",	"left right double arrow, U+21D4 ISOamsa" },
1807
1808{ 8704,	"forall","for all, U+2200 ISOtech" },
1809{ 8706,	"part",	"partial differential, U+2202 ISOtech" },
1810{ 8707,	"exist","there exists, U+2203 ISOtech" },
1811{ 8709,	"empty","empty set = null set = diameter, U+2205 ISOamso" },
1812{ 8711,	"nabla","nabla = backward difference, U+2207 ISOtech" },
1813{ 8712,	"isin",	"element of, U+2208 ISOtech" },
1814{ 8713,	"notin","not an element of, U+2209 ISOtech" },
1815{ 8715,	"ni",	"contains as member, U+220B ISOtech" },
1816{ 8719,	"prod",	"n-ary product = product sign, U+220F ISOamsb" },
1817{ 8721,	"sum",	"n-ary summation, U+2211 ISOamsb" },
1818{ 8722,	"minus","minus sign, U+2212 ISOtech" },
1819{ 8727,	"lowast","asterisk operator, U+2217 ISOtech" },
1820{ 8730,	"radic","square root = radical sign, U+221A ISOtech" },
1821{ 8733,	"prop",	"proportional to, U+221D ISOtech" },
1822{ 8734,	"infin","infinity, U+221E ISOtech" },
1823{ 8736,	"ang",	"angle, U+2220 ISOamso" },
1824{ 8743,	"and",	"logical and = wedge, U+2227 ISOtech" },
1825{ 8744,	"or",	"logical or = vee, U+2228 ISOtech" },
1826{ 8745,	"cap",	"intersection = cap, U+2229 ISOtech" },
1827{ 8746,	"cup",	"union = cup, U+222A ISOtech" },
1828{ 8747,	"int",	"integral, U+222B ISOtech" },
1829{ 8756,	"there4","therefore, U+2234 ISOtech" },
1830{ 8764,	"sim",	"tilde operator = varies with = similar to, U+223C ISOtech" },
1831{ 8773,	"cong",	"approximately equal to, U+2245 ISOtech" },
1832{ 8776,	"asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1833{ 8800,	"ne",	"not equal to, U+2260 ISOtech" },
1834{ 8801,	"equiv","identical to, U+2261 ISOtech" },
1835{ 8804,	"le",	"less-than or equal to, U+2264 ISOtech" },
1836{ 8805,	"ge",	"greater-than or equal to, U+2265 ISOtech" },
1837{ 8834,	"sub",	"subset of, U+2282 ISOtech" },
1838{ 8835,	"sup",	"superset of, U+2283 ISOtech" },
1839{ 8836,	"nsub",	"not a subset of, U+2284 ISOamsn" },
1840{ 8838,	"sube",	"subset of or equal to, U+2286 ISOtech" },
1841{ 8839,	"supe",	"superset of or equal to, U+2287 ISOtech" },
1842{ 8853,	"oplus","circled plus = direct sum, U+2295 ISOamsb" },
1843{ 8855,	"otimes","circled times = vector product, U+2297 ISOamsb" },
1844{ 8869,	"perp",	"up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1845{ 8901,	"sdot",	"dot operator, U+22C5 ISOamsb" },
1846{ 8968,	"lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1847{ 8969,	"rceil","right ceiling, U+2309 ISOamsc" },
1848{ 8970,	"lfloor","left floor = apl downstile, U+230A ISOamsc" },
1849{ 8971,	"rfloor","right floor, U+230B ISOamsc" },
1850{ 9001,	"lang",	"left-pointing angle bracket = bra, U+2329 ISOtech" },
1851{ 9002,	"rang",	"right-pointing angle bracket = ket, U+232A ISOtech" },
1852{ 9674,	"loz",	"lozenge, U+25CA ISOpub" },
1853
1854{ 9824,	"spades","black spade suit, U+2660 ISOpub" },
1855{ 9827,	"clubs","black club suit = shamrock, U+2663 ISOpub" },
1856{ 9829,	"hearts","black heart suit = valentine, U+2665 ISOpub" },
1857{ 9830,	"diams","black diamond suit, U+2666 ISOpub" },
1858
1859};
1860
1861/************************************************************************
1862 *									*
1863 *		Commodity functions to handle entities			*
1864 *									*
1865 ************************************************************************/
1866
/*
 * growBuffer:
 * @buffer:  name of the xmlChar * variable holding the buffer; a companion
 *           variable named <buffer>_size must hold its current size
 *
 * Macro used to grow the current buffer: it doubles <buffer>_size and
 * reallocates @buffer to that many xmlChar using xmlRealloc().
 *
 * On allocation failure it reports the problem through htmlErrMemory(),
 * frees the old buffer and executes return(NULL) in the enclosing
 * function. It therefore needs a variable named ctxt in scope and can
 * only be expanded inside functions for which return(NULL) is valid.
 * Pointers into the old buffer must be recomputed by the caller after
 * the macro has run, since the buffer may have moved.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1881
1882/**
1883 * htmlEntityLookup:
1884 * @name: the entity name
1885 *
1886 * Lookup the given entity in EntitiesTable
1887 *
1888 * TODO: the linear scan is really ugly, an hash table is really needed.
1889 *
1890 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1891 */
1892const htmlEntityDesc *
1893htmlEntityLookup(const xmlChar *name) {
1894    unsigned int i;
1895
1896    for (i = 0;i < (sizeof(html40EntitiesTable)/
1897                    sizeof(html40EntitiesTable[0]));i++) {
1898        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1899            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1900	}
1901    }
1902    return(NULL);
1903}
1904
1905/**
1906 * htmlEntityValueLookup:
1907 * @value: the entity's unicode value
1908 *
1909 * Lookup the given entity in EntitiesTable
1910 *
1911 * TODO: the linear scan is really ugly, an hash table is really needed.
1912 *
1913 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1914 */
1915const htmlEntityDesc *
1916htmlEntityValueLookup(unsigned int value) {
1917    unsigned int i;
1918
1919    for (i = 0;i < (sizeof(html40EntitiesTable)/
1920                    sizeof(html40EntitiesTable[0]));i++) {
1921        if (html40EntitiesTable[i].value >= value) {
1922	    if (html40EntitiesTable[i].value > value)
1923		break;
1924            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1925	}
1926    }
1927    return(NULL);
1928}
1929
1930/**
1931 * UTF8ToHtml:
1932 * @out:  a pointer to an array of bytes to store the result
1933 * @outlen:  the length of @out
1934 * @in:  a pointer to an array of UTF-8 chars
1935 * @inlen:  the length of @in
1936 *
1937 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938 * plus HTML entities block of chars out.
1939 *
1940 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1941 * The value of @inlen after return is the number of octets consumed
1942 *     as the return value is positive, else unpredictable.
1943 * The value of @outlen after return is the number of octets consumed.
1944 */
1945int
1946UTF8ToHtml(unsigned char* out, int *outlen,
1947              const unsigned char* in, int *inlen) {
1948    const unsigned char* processed = in;
1949    const unsigned char* outend;
1950    const unsigned char* outstart = out;
1951    const unsigned char* instart = in;
1952    const unsigned char* inend;
1953    unsigned int c, d;
1954    int trailing;
1955
1956    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1957    if (in == NULL) {
1958        /*
1959	 * initialization nothing to do
1960	 */
1961	*outlen = 0;
1962	*inlen = 0;
1963	return(0);
1964    }
1965    inend = in + (*inlen);
1966    outend = out + (*outlen);
1967    while (in < inend) {
1968	d = *in++;
1969	if      (d < 0x80)  { c= d; trailing= 0; }
1970	else if (d < 0xC0) {
1971	    /* trailing byte in leading position */
1972	    *outlen = out - outstart;
1973	    *inlen = processed - instart;
1974	    return(-2);
1975        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
1976        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
1977        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
1978	else {
1979	    /* no chance for this in Ascii */
1980	    *outlen = out - outstart;
1981	    *inlen = processed - instart;
1982	    return(-2);
1983	}
1984
1985	if (inend - in < trailing) {
1986	    break;
1987	}
1988
1989	for ( ; trailing; trailing--) {
1990	    if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
1991		break;
1992	    c <<= 6;
1993	    c |= d & 0x3F;
1994	}
1995
1996	/* assertion: c is a single UTF-4 value */
1997	if (c < 0x80) {
1998	    if (out + 1 >= outend)
1999		break;
2000	    *out++ = c;
2001	} else {
2002	    int len;
2003	    const htmlEntityDesc * ent;
2004	    const char *cp;
2005	    char nbuf[16];
2006
2007	    /*
2008	     * Try to lookup a predefined HTML entity for it
2009	     */
2010
2011	    ent = htmlEntityValueLookup(c);
2012	    if (ent == NULL) {
2013	      snprintf(nbuf, sizeof(nbuf), "#%u", c);
2014	      cp = nbuf;
2015	    }
2016	    else
2017	      cp = ent->name;
2018	    len = strlen(cp);
2019	    if (out + 2 + len >= outend)
2020		break;
2021	    *out++ = '&';
2022	    memcpy(out, cp, len);
2023	    out += len;
2024	    *out++ = ';';
2025	}
2026	processed = in;
2027    }
2028    *outlen = out - outstart;
2029    *inlen = processed - instart;
2030    return(0);
2031}
2032
2033/**
2034 * htmlEncodeEntities:
2035 * @out:  a pointer to an array of bytes to store the result
2036 * @outlen:  the length of @out
2037 * @in:  a pointer to an array of UTF-8 chars
2038 * @inlen:  the length of @in
2039 * @quoteChar: the quote character to escape (' or ") or zero.
2040 *
2041 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2042 * plus HTML entities block of chars out.
2043 *
2044 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2045 * The value of @inlen after return is the number of octets consumed
2046 *     as the return value is positive, else unpredictable.
2047 * The value of @outlen after return is the number of octets consumed.
2048 */
2049int
2050htmlEncodeEntities(unsigned char* out, int *outlen,
2051		   const unsigned char* in, int *inlen, int quoteChar) {
2052    const unsigned char* processed = in;
2053    const unsigned char* outend;
2054    const unsigned char* outstart = out;
2055    const unsigned char* instart = in;
2056    const unsigned char* inend;
2057    unsigned int c, d;
2058    int trailing;
2059
2060    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2061        return(-1);
2062    outend = out + (*outlen);
2063    inend = in + (*inlen);
2064    while (in < inend) {
2065	d = *in++;
2066	if      (d < 0x80)  { c= d; trailing= 0; }
2067	else if (d < 0xC0) {
2068	    /* trailing byte in leading position */
2069	    *outlen = out - outstart;
2070	    *inlen = processed - instart;
2071	    return(-2);
2072        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2073        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2074        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2075	else {
2076	    /* no chance for this in Ascii */
2077	    *outlen = out - outstart;
2078	    *inlen = processed - instart;
2079	    return(-2);
2080	}
2081
2082	if (inend - in < trailing)
2083	    break;
2084
2085	while (trailing--) {
2086	    if (((d= *in++) & 0xC0) != 0x80) {
2087		*outlen = out - outstart;
2088		*inlen = processed - instart;
2089		return(-2);
2090	    }
2091	    c <<= 6;
2092	    c |= d & 0x3F;
2093	}
2094
2095	/* assertion: c is a single UTF-4 value */
2096	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2097	    (c != '&') && (c != '<') && (c != '>')) {
2098	    if (out >= outend)
2099		break;
2100	    *out++ = c;
2101	} else {
2102	    const htmlEntityDesc * ent;
2103	    const char *cp;
2104	    char nbuf[16];
2105	    int len;
2106
2107	    /*
2108	     * Try to lookup a predefined HTML entity for it
2109	     */
2110	    ent = htmlEntityValueLookup(c);
2111	    if (ent == NULL) {
2112		snprintf(nbuf, sizeof(nbuf), "#%u", c);
2113		cp = nbuf;
2114	    }
2115	    else
2116		cp = ent->name;
2117	    len = strlen(cp);
2118	    if (out + 2 + len > outend)
2119		break;
2120	    *out++ = '&';
2121	    memcpy(out, cp, len);
2122	    out += len;
2123	    *out++ = ';';
2124	}
2125	processed = in;
2126    }
2127    *outlen = out - outstart;
2128    *inlen = processed - instart;
2129    return(0);
2130}
2131
2132/************************************************************************
2133 *									*
2134 *		Commodity functions to handle streams			*
2135 *									*
2136 ************************************************************************/
2137
2138/**
2139 * htmlNewInputStream:
2140 * @ctxt:  an HTML parser context
2141 *
2142 * Create a new input stream structure
2143 * Returns the new input stream or NULL
2144 */
2145static htmlParserInputPtr
2146htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2147    htmlParserInputPtr input;
2148
2149    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2150    if (input == NULL) {
2151        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2152	return(NULL);
2153    }
2154    memset(input, 0, sizeof(htmlParserInput));
2155    input->filename = NULL;
2156    input->directory = NULL;
2157    input->base = NULL;
2158    input->cur = NULL;
2159    input->buf = NULL;
2160    input->line = 1;
2161    input->col = 1;
2162    input->buf = NULL;
2163    input->free = NULL;
2164    input->version = NULL;
2165    input->consumed = 0;
2166    input->length = 0;
2167    return(input);
2168}
2169
2170
2171/************************************************************************
2172 *									*
2173 *		Commodity functions, cleanup needed ?			*
2174 *									*
2175 ************************************************************************/
/*
 * All tags allowing PCDATA according to the HTML 4.01 loose DTD,
 * kept in alphabetical order.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that would risk a binary
 * incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2190
2191/**
2192 * areBlanks:
2193 * @ctxt:  an HTML parser context
2194 * @str:  a xmlChar *
2195 * @len:  the size of @str
2196 *
2197 * Is this a sequence of blank chars that one can ignore ?
2198 *
2199 * Returns 1 if ignorable 0 otherwise.
2200 */
2201
2202static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2203    unsigned int i;
2204    int j;
2205    xmlNodePtr lastChild;
2206    xmlDtdPtr dtd;
2207
2208    for (j = 0;j < len;j++)
2209        if (!(IS_BLANK_CH(str[j]))) return(0);
2210
2211    if (CUR == 0) return(1);
2212    if (CUR != '<') return(0);
2213    if (ctxt->name == NULL)
2214	return(1);
2215    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2216	return(1);
2217    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2218	return(1);
2219
2220    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2221    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2222        dtd = xmlGetIntSubset(ctxt->myDoc);
2223        if (dtd != NULL && dtd->ExternalID != NULL) {
2224            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2225                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2226                return(1);
2227        }
2228    }
2229
2230    if (ctxt->node == NULL) return(0);
2231    lastChild = xmlGetLastChild(ctxt->node);
2232    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2233	lastChild = lastChild->prev;
2234    if (lastChild == NULL) {
2235        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2236            (ctxt->node->content != NULL)) return(0);
2237	/* keep ws in constructs like ...<b> </b>...
2238	   for all tags "b" allowing PCDATA */
2239	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2240	    if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2241		return(0);
2242	    }
2243	}
2244    } else if (xmlNodeIsText(lastChild)) {
2245        return(0);
2246    } else {
2247	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2248	   for all tags "p" allowing PCDATA */
2249	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2250	    if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2251		return(0);
2252	    }
2253	}
2254    }
2255    return(1);
2256}
2257
2258/**
2259 * htmlNewDocNoDtD:
2260 * @URI:  URI for the dtd, or NULL
2261 * @ExternalID:  the external ID of the DTD, or NULL
2262 *
2263 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2264 * are NULL
2265 *
2266 * Returns a new document, do not initialize the DTD if not provided
2267 */
2268htmlDocPtr
2269htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2270    xmlDocPtr cur;
2271
2272    /*
2273     * Allocate a new document and fill the fields.
2274     */
2275    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2276    if (cur == NULL) {
2277	htmlErrMemory(NULL, "HTML document creation failed\n");
2278	return(NULL);
2279    }
2280    memset(cur, 0, sizeof(xmlDoc));
2281
2282    cur->type = XML_HTML_DOCUMENT_NODE;
2283    cur->version = NULL;
2284    cur->intSubset = NULL;
2285    cur->doc = cur;
2286    cur->name = NULL;
2287    cur->children = NULL;
2288    cur->extSubset = NULL;
2289    cur->oldNs = NULL;
2290    cur->encoding = NULL;
2291    cur->standalone = 1;
2292    cur->compression = 0;
2293    cur->ids = NULL;
2294    cur->refs = NULL;
2295    cur->_private = NULL;
2296    cur->charset = XML_CHAR_ENCODING_UTF8;
2297    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2298    if ((ExternalID != NULL) ||
2299	(URI != NULL))
2300	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2301    return(cur);
2302}
2303
2304/**
2305 * htmlNewDoc:
2306 * @URI:  URI for the dtd, or NULL
2307 * @ExternalID:  the external ID of the DTD, or NULL
2308 *
2309 * Creates a new HTML document
2310 *
2311 * Returns a new document
2312 */
2313htmlDocPtr
2314htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2315    if ((URI == NULL) && (ExternalID == NULL))
2316	return(htmlNewDocNoDtD(
2317		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2318		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2319
2320    return(htmlNewDocNoDtD(URI, ExternalID));
2321}
2322
2323
2324/************************************************************************
2325 *									*
2326 *			The parser itself				*
2327 *	Relates to http://www.w3.org/TR/html40				*
2328 *									*
2329 ************************************************************************/
2330
2331/************************************************************************
2332 *									*
2333 *			The parser itself				*
2334 *									*
2335 ************************************************************************/
2336
/*
 * Fallback used by htmlParseName() when its ASCII fast path does not
 * apply; declared here because it is defined after htmlParseName().
 */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2338
2339/**
2340 * htmlParseHTMLName:
2341 * @ctxt:  an HTML parser context
2342 *
2343 * parse an HTML tag or attribute name, note that we convert it to lowercase
2344 * since HTML names are not case-sensitive.
2345 *
2346 * Returns the Tag Name parsed or NULL
2347 */
2348
2349static const xmlChar *
2350htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2351    int i = 0;
2352    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2353
2354    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2355        (CUR != ':') && (CUR != '.')) return(NULL);
2356
2357    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2358           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2359	   (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2360           (CUR == '.'))) {
2361	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2362        else loc[i] = CUR;
2363	i++;
2364
2365	NEXT;
2366    }
2367
2368    return(xmlDictLookup(ctxt->dict, loc, i));
2369}
2370
2371
2372/**
2373 * htmlParseHTMLName_nonInvasive:
2374 * @ctxt:  an HTML parser context
2375 *
2376 * parse an HTML tag or attribute name, note that we convert it to lowercase
2377 * since HTML names are not case-sensitive, this doesn't consume the data
2378 * from the stream, it's a look-ahead
2379 *
2380 * Returns the Tag Name parsed or NULL
2381 */
2382
2383static const xmlChar *
2384htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2385    int i = 0;
2386    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2387
2388    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2389        (NXT(1) != ':')) return(NULL);
2390
2391    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2392           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2393	   (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2394	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2395        else loc[i] = NXT(1+i);
2396	i++;
2397    }
2398
2399    return(xmlDictLookup(ctxt->dict, loc, i));
2400}
2401
2402
2403/**
2404 * htmlParseName:
2405 * @ctxt:  an HTML parser context
2406 *
2407 * parse an HTML name, this routine is case sensitive.
2408 *
2409 * Returns the Name parsed or NULL
2410 */
2411
2412static const xmlChar *
2413htmlParseName(htmlParserCtxtPtr ctxt) {
2414    const xmlChar *in;
2415    const xmlChar *ret;
2416    int count = 0;
2417
2418    GROW;
2419
2420    /*
2421     * Accelerator for simple ASCII names
2422     */
2423    in = ctxt->input->cur;
2424    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2425	((*in >= 0x41) && (*in <= 0x5A)) ||
2426	(*in == '_') || (*in == ':')) {
2427	in++;
2428	while (((*in >= 0x61) && (*in <= 0x7A)) ||
2429	       ((*in >= 0x41) && (*in <= 0x5A)) ||
2430	       ((*in >= 0x30) && (*in <= 0x39)) ||
2431	       (*in == '_') || (*in == '-') ||
2432	       (*in == ':') || (*in == '.'))
2433	    in++;
2434	if ((*in > 0) && (*in < 0x80)) {
2435	    count = in - ctxt->input->cur;
2436	    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2437	    ctxt->input->cur = in;
2438	    ctxt->nbChars += count;
2439	    ctxt->input->col += count;
2440	    return(ret);
2441	}
2442    }
2443    return(htmlParseNameComplex(ctxt));
2444}
2445
2446static const xmlChar *
2447htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2448    int len = 0, l;
2449    int c;
2450    int count = 0;
2451
2452    /*
2453     * Handler for more complex cases
2454     */
2455    GROW;
2456    c = CUR_CHAR(l);
2457    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2458	(!IS_LETTER(c) && (c != '_') &&
2459         (c != ':'))) {
2460	return(NULL);
2461    }
2462
2463    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2464	   ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2465            (c == '.') || (c == '-') ||
2466	    (c == '_') || (c == ':') ||
2467	    (IS_COMBINING(c)) ||
2468	    (IS_EXTENDER(c)))) {
2469	if (count++ > 100) {
2470	    count = 0;
2471	    GROW;
2472	}
2473	len += l;
2474	NEXTL(l);
2475	c = CUR_CHAR(l);
2476    }
2477    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2478}
2479
2480
2481/**
2482 * htmlParseHTMLAttribute:
2483 * @ctxt:  an HTML parser context
2484 * @stop:  a char stop value
2485 *
2486 * parse an HTML attribute value till the stop (quote), if
2487 * stop is 0 then it stops at the first space
2488 *
2489 * Returns the attribute parsed or NULL
2490 */
2491
2492static xmlChar *
2493htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2494    xmlChar *buffer = NULL;
2495    int buffer_size = 0;
2496    xmlChar *out = NULL;
2497    const xmlChar *name = NULL;
2498    const xmlChar *cur = NULL;
2499    const htmlEntityDesc * ent;
2500
2501    /*
2502     * allocate a translation buffer.
2503     */
2504    buffer_size = HTML_PARSER_BUFFER_SIZE;
2505    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2506    if (buffer == NULL) {
2507	htmlErrMemory(ctxt, "buffer allocation failed\n");
2508	return(NULL);
2509    }
2510    out = buffer;
2511
2512    /*
2513     * Ok loop until we reach one of the ending chars
2514     */
2515    while ((CUR != 0) && (CUR != stop)) {
2516	if ((stop == 0) && (CUR == '>')) break;
2517	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2518        if (CUR == '&') {
2519	    if (NXT(1) == '#') {
2520		unsigned int c;
2521		int bits;
2522
2523		c = htmlParseCharRef(ctxt);
2524		if      (c <    0x80)
2525		        { *out++  = c;                bits= -6; }
2526		else if (c <   0x800)
2527		        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2528		else if (c < 0x10000)
2529		        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2530		else
2531		        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2532
2533		for ( ; bits >= 0; bits-= 6) {
2534		    *out++  = ((c >> bits) & 0x3F) | 0x80;
2535		}
2536
2537		if (out - buffer > buffer_size - 100) {
2538			int indx = out - buffer;
2539
2540			growBuffer(buffer);
2541			out = &buffer[indx];
2542		}
2543	    } else {
2544		ent = htmlParseEntityRef(ctxt, &name);
2545		if (name == NULL) {
2546		    *out++ = '&';
2547		    if (out - buffer > buffer_size - 100) {
2548			int indx = out - buffer;
2549
2550			growBuffer(buffer);
2551			out = &buffer[indx];
2552		    }
2553		} else if (ent == NULL) {
2554		    *out++ = '&';
2555		    cur = name;
2556		    while (*cur != 0) {
2557			if (out - buffer > buffer_size - 100) {
2558			    int indx = out - buffer;
2559
2560			    growBuffer(buffer);
2561			    out = &buffer[indx];
2562			}
2563			*out++ = *cur++;
2564		    }
2565		} else {
2566		    unsigned int c;
2567		    int bits;
2568
2569		    if (out - buffer > buffer_size - 100) {
2570			int indx = out - buffer;
2571
2572			growBuffer(buffer);
2573			out = &buffer[indx];
2574		    }
2575		    c = ent->value;
2576		    if      (c <    0x80)
2577			{ *out++  = c;                bits= -6; }
2578		    else if (c <   0x800)
2579			{ *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2580		    else if (c < 0x10000)
2581			{ *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2582		    else
2583			{ *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2584
2585		    for ( ; bits >= 0; bits-= 6) {
2586			*out++  = ((c >> bits) & 0x3F) | 0x80;
2587		    }
2588		}
2589	    }
2590	} else {
2591	    unsigned int c;
2592	    int bits, l;
2593
2594	    if (out - buffer > buffer_size - 100) {
2595		int indx = out - buffer;
2596
2597		growBuffer(buffer);
2598		out = &buffer[indx];
2599	    }
2600	    c = CUR_CHAR(l);
2601	    if      (c <    0x80)
2602		    { *out++  = c;                bits= -6; }
2603	    else if (c <   0x800)
2604		    { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2605	    else if (c < 0x10000)
2606		    { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2607	    else
2608		    { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2609
2610	    for ( ; bits >= 0; bits-= 6) {
2611		*out++  = ((c >> bits) & 0x3F) | 0x80;
2612	    }
2613	    NEXT;
2614	}
2615    }
2616    *out = 0;
2617    return(buffer);
2618}
2619
2620/**
2621 * htmlParseEntityRef:
2622 * @ctxt:  an HTML parser context
2623 * @str:  location to store the entity name
2624 *
2625 * parse an HTML ENTITY references
2626 *
2627 * [68] EntityRef ::= '&' Name ';'
2628 *
2629 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2630 *         if non-NULL *str will have to be freed by the caller.
2631 */
2632const htmlEntityDesc *
2633htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2634    const xmlChar *name;
2635    const htmlEntityDesc * ent = NULL;
2636
2637    if (str != NULL) *str = NULL;
2638    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2639
2640    if (CUR == '&') {
2641        NEXT;
2642        name = htmlParseName(ctxt);
2643	if (name == NULL) {
2644	    htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2645	                 "htmlParseEntityRef: no name\n", NULL, NULL);
2646	} else {
2647	    GROW;
2648	    if (CUR == ';') {
2649	        if (str != NULL)
2650		    *str = name;
2651
2652		/*
2653		 * Lookup the entity in the table.
2654		 */
2655		ent = htmlEntityLookup(name);
2656		if (ent != NULL) /* OK that's ugly !!! */
2657		    NEXT;
2658	    } else {
2659		htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2660		             "htmlParseEntityRef: expecting ';'\n",
2661			     NULL, NULL);
2662	        if (str != NULL)
2663		    *str = name;
2664	    }
2665	}
2666    }
2667    return(ent);
2668}
2669
2670/**
2671 * htmlParseAttValue:
2672 * @ctxt:  an HTML parser context
2673 *
2674 * parse a value for an attribute
2675 * Note: the parser won't do substitution of entities here, this
2676 * will be handled later in xmlStringGetNodeList, unless it was
2677 * asked for ctxt->replaceEntities != 0
2678 *
2679 * Returns the AttValue parsed or NULL.
2680 */
2681
2682static xmlChar *
2683htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2684    xmlChar *ret = NULL;
2685
2686    if (CUR == '"') {
2687        NEXT;
2688	ret = htmlParseHTMLAttribute(ctxt, '"');
2689        if (CUR != '"') {
2690	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2691	                 "AttValue: \" expected\n", NULL, NULL);
2692	} else
2693	    NEXT;
2694    } else if (CUR == '\'') {
2695        NEXT;
2696	ret = htmlParseHTMLAttribute(ctxt, '\'');
2697        if (CUR != '\'') {
2698	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2699	                 "AttValue: ' expected\n", NULL, NULL);
2700	} else
2701	    NEXT;
2702    } else {
2703        /*
2704	 * That's an HTMLism, the attribute value may not be quoted
2705	 */
2706	ret = htmlParseHTMLAttribute(ctxt, 0);
2707	if (ret == NULL) {
2708	    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2709	                 "AttValue: no value found\n", NULL, NULL);
2710	}
2711    }
2712    return(ret);
2713}
2714
2715/**
2716 * htmlParseSystemLiteral:
2717 * @ctxt:  an HTML parser context
2718 *
2719 * parse an HTML Literal
2720 *
2721 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2722 *
2723 * Returns the SystemLiteral parsed or NULL
2724 */
2725
2726static xmlChar *
2727htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2728    const xmlChar *q;
2729    xmlChar *ret = NULL;
2730
2731    if (CUR == '"') {
2732        NEXT;
2733	q = CUR_PTR;
2734	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2735	    NEXT;
2736	if (!IS_CHAR_CH(CUR)) {
2737	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2738			 "Unfinished SystemLiteral\n", NULL, NULL);
2739	} else {
2740	    ret = xmlStrndup(q, CUR_PTR - q);
2741	    NEXT;
2742        }
2743    } else if (CUR == '\'') {
2744        NEXT;
2745	q = CUR_PTR;
2746	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2747	    NEXT;
2748	if (!IS_CHAR_CH(CUR)) {
2749	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2750			 "Unfinished SystemLiteral\n", NULL, NULL);
2751	} else {
2752	    ret = xmlStrndup(q, CUR_PTR - q);
2753	    NEXT;
2754        }
2755    } else {
2756	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2757	             " or ' expected\n", NULL, NULL);
2758    }
2759
2760    return(ret);
2761}
2762
2763/**
2764 * htmlParsePubidLiteral:
2765 * @ctxt:  an HTML parser context
2766 *
2767 * parse an HTML public literal
2768 *
2769 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2770 *
2771 * Returns the PubidLiteral parsed or NULL.
2772 */
2773
2774static xmlChar *
2775htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2776    const xmlChar *q;
2777    xmlChar *ret = NULL;
2778    /*
2779     * Name ::= (Letter | '_') (NameChar)*
2780     */
2781    if (CUR == '"') {
2782        NEXT;
2783	q = CUR_PTR;
2784	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2785	if (CUR != '"') {
2786	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2787	                 "Unfinished PubidLiteral\n", NULL, NULL);
2788	} else {
2789	    ret = xmlStrndup(q, CUR_PTR - q);
2790	    NEXT;
2791	}
2792    } else if (CUR == '\'') {
2793        NEXT;
2794	q = CUR_PTR;
2795	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2796	    NEXT;
2797	if (CUR != '\'') {
2798	    htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2799	                 "Unfinished PubidLiteral\n", NULL, NULL);
2800	} else {
2801	    ret = xmlStrndup(q, CUR_PTR - q);
2802	    NEXT;
2803	}
2804    } else {
2805	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2806	             "PubidLiteral \" or ' expected\n", NULL, NULL);
2807    }
2808
2809    return(ret);
2810}
2811
2812/**
2813 * htmlParseScript:
2814 * @ctxt:  an HTML parser context
2815 *
2816 * parse the content of an HTML SCRIPT or STYLE element
2817 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2818 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2819 * http://www.w3.org/TR/html4/types.html#type-script
2820 * http://www.w3.org/TR/html4/types.html#h-6.15
2821 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2822 *
2823 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2824 * element and the value of intrinsic event attributes. User agents must
2825 * not evaluate script data as HTML markup but instead must pass it on as
2826 * data to a script engine.
2827 * NOTES:
2828 * - The content is passed like CDATA
2829 * - the attributes for style and scripting "onXXX" are also described
2830 *   as CDATA but SGML allows entities references in attributes so their
2831 *   processing is identical as other attributes
2832 */
2833static void
2834htmlParseScript(htmlParserCtxtPtr ctxt) {
2835    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2836    int nbchar = 0;
2837    int cur,l;
2838
2839    SHRINK;
2840    cur = CUR_CHAR(l);
2841    while (IS_CHAR_CH(cur)) {
2842	if ((cur == '<') && (NXT(1) == '/')) {
2843            /*
2844             * One should break here, the specification is clear:
2845             * Authors should therefore escape "</" within the content.
2846             * Escape mechanisms are specific to each scripting or
2847             * style sheet language.
2848             *
2849             * In recovery mode, only break if end tag match the
2850             * current tag, effectively ignoring all tags inside the
2851             * script/style block and treating the entire block as
2852             * CDATA.
2853             */
2854            if (ctxt->recovery) {
2855                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2856				   xmlStrlen(ctxt->name)) == 0)
2857                {
2858                    break; /* while */
2859                } else {
2860		    htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2861				 "Element %s embeds close tag\n",
2862		                 ctxt->name, NULL);
2863		}
2864            } else {
2865                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2866                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2867                {
2868                    break; /* while */
2869                }
2870            }
2871	}
2872	COPY_BUF(l,buf,nbchar,cur);
2873	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2874	    if (ctxt->sax->cdataBlock!= NULL) {
2875		/*
2876		 * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2877		 */
2878		ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2879	    } else if (ctxt->sax->characters != NULL) {
2880		ctxt->sax->characters(ctxt->userData, buf, nbchar);
2881	    }
2882	    nbchar = 0;
2883	}
2884	GROW;
2885	NEXTL(l);
2886	cur = CUR_CHAR(l);
2887    }
2888
2889    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2890        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2891                    "Invalid char in CDATA 0x%X\n", cur);
2892        if (ctxt->input->cur < ctxt->input->end) {
2893            NEXT;
2894        }
2895    }
2896
2897    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2898	if (ctxt->sax->cdataBlock!= NULL) {
2899	    /*
2900	     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2901	     */
2902	    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2903	} else if (ctxt->sax->characters != NULL) {
2904	    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2905	}
2906    }
2907}
2908
2909
2910/**
2911 * htmlParseCharData:
2912 * @ctxt:  an HTML parser context
2913 *
2914 * parse a CharData section.
2915 * if we are within a CDATA section ']]>' marks an end of section.
2916 *
2917 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2918 */
2919
2920static void
2921htmlParseCharData(htmlParserCtxtPtr ctxt) {
2922    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2923    int nbchar = 0;
2924    int cur, l;
2925    int chunk = 0;
2926
2927    SHRINK;
2928    cur = CUR_CHAR(l);
2929    while (((cur != '<') || (ctxt->token == '<')) &&
2930           ((cur != '&') || (ctxt->token == '&')) &&
2931	   (cur != 0)) {
2932	if (!(IS_CHAR(cur))) {
2933	    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2934	                "Invalid char in CDATA 0x%X\n", cur);
2935	} else {
2936	    COPY_BUF(l,buf,nbchar,cur);
2937	}
2938	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2939	    /*
2940	     * Ok the segment is to be consumed as chars.
2941	     */
2942	    if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2943		if (areBlanks(ctxt, buf, nbchar)) {
2944		    if (ctxt->sax->ignorableWhitespace != NULL)
2945			ctxt->sax->ignorableWhitespace(ctxt->userData,
2946			                               buf, nbchar);
2947		} else {
2948		    htmlCheckParagraph(ctxt);
2949		    if (ctxt->sax->characters != NULL)
2950			ctxt->sax->characters(ctxt->userData, buf, nbchar);
2951		}
2952	    }
2953	    nbchar = 0;
2954	}
2955	NEXTL(l);
2956        chunk++;
2957        if (chunk > HTML_PARSER_BUFFER_SIZE) {
2958            chunk = 0;
2959            SHRINK;
2960            GROW;
2961        }
2962	cur = CUR_CHAR(l);
2963	if (cur == 0) {
2964	    SHRINK;
2965	    GROW;
2966	    cur = CUR_CHAR(l);
2967	}
2968    }
2969    if (nbchar != 0) {
2970        buf[nbchar] = 0;
2971
2972	/*
2973	 * Ok the segment is to be consumed as chars.
2974	 */
2975	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2976	    if (areBlanks(ctxt, buf, nbchar)) {
2977		if (ctxt->sax->ignorableWhitespace != NULL)
2978		    ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2979	    } else {
2980		htmlCheckParagraph(ctxt);
2981		if (ctxt->sax->characters != NULL)
2982		    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2983	    }
2984	}
2985    } else {
2986	/*
2987	 * Loop detection
2988	 */
2989	if (cur == 0)
2990	    ctxt->instate = XML_PARSER_EOF;
2991    }
2992}
2993
2994/**
2995 * htmlParseExternalID:
2996 * @ctxt:  an HTML parser context
2997 * @publicID:  a xmlChar** receiving PubidLiteral
2998 *
2999 * Parse an External ID or a Public ID
3000 *
3001 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3002 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3003 *
3004 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3005 *
3006 * Returns the function returns SystemLiteral and in the second
3007 *                case publicID receives PubidLiteral, is strict is off
3008 *                it is possible to return NULL and have publicID set.
3009 */
3010
3011static xmlChar *
3012htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3013    xmlChar *URI = NULL;
3014
3015    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3016         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3017	 (UPP(4) == 'E') && (UPP(5) == 'M')) {
3018        SKIP(6);
3019	if (!IS_BLANK_CH(CUR)) {
3020	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3021	                 "Space required after 'SYSTEM'\n", NULL, NULL);
3022	}
3023        SKIP_BLANKS;
3024	URI = htmlParseSystemLiteral(ctxt);
3025	if (URI == NULL) {
3026	    htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3027	                 "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3028        }
3029    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3030	       (UPP(2) == 'B') && (UPP(3) == 'L') &&
3031	       (UPP(4) == 'I') && (UPP(5) == 'C')) {
3032        SKIP(6);
3033	if (!IS_BLANK_CH(CUR)) {
3034	    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3035	                 "Space required after 'PUBLIC'\n", NULL, NULL);
3036	}
3037        SKIP_BLANKS;
3038	*publicID = htmlParsePubidLiteral(ctxt);
3039	if (*publicID == NULL) {
3040	    htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3041	                 "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3042			 NULL, NULL);
3043	}
3044        SKIP_BLANKS;
3045        if ((CUR == '"') || (CUR == '\'')) {
3046	    URI = htmlParseSystemLiteral(ctxt);
3047	}
3048    }
3049    return(URI);
3050}
3051
3052/**
3053 * xmlParsePI:
3054 * @ctxt:  an XML parser context
3055 *
3056 * parse an XML Processing Instruction.
3057 *
3058 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3059 */
3060static void
3061htmlParsePI(htmlParserCtxtPtr ctxt) {
3062    xmlChar *buf = NULL;
3063    int len = 0;
3064    int size = HTML_PARSER_BUFFER_SIZE;
3065    int cur, l;
3066    const xmlChar *target;
3067    xmlParserInputState state;
3068    int count = 0;
3069
3070    if ((RAW == '<') && (NXT(1) == '?')) {
3071	state = ctxt->instate;
3072        ctxt->instate = XML_PARSER_PI;
3073	/*
3074	 * this is a Processing Instruction.
3075	 */
3076	SKIP(2);
3077	SHRINK;
3078
3079	/*
3080	 * Parse the target name and check for special support like
3081	 * namespace.
3082	 */
3083        target = htmlParseName(ctxt);
3084	if (target != NULL) {
3085	    if (RAW == '>') {
3086		SKIP(1);
3087
3088		/*
3089		 * SAX: PI detected.
3090		 */
3091		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3092		    (ctxt->sax->processingInstruction != NULL))
3093		    ctxt->sax->processingInstruction(ctxt->userData,
3094		                                     target, NULL);
3095		ctxt->instate = state;
3096		return;
3097	    }
3098	    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3099	    if (buf == NULL) {
3100		htmlErrMemory(ctxt, NULL);
3101		ctxt->instate = state;
3102		return;
3103	    }
3104	    cur = CUR;
3105	    if (!IS_BLANK(cur)) {
3106		htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3107			  "ParsePI: PI %s space expected\n", target, NULL);
3108	    }
3109            SKIP_BLANKS;
3110	    cur = CUR_CHAR(l);
3111	    while (IS_CHAR(cur) && (cur != '>')) {
3112		if (len + 5 >= size) {
3113		    xmlChar *tmp;
3114
3115		    size *= 2;
3116		    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3117		    if (tmp == NULL) {
3118			htmlErrMemory(ctxt, NULL);
3119			xmlFree(buf);
3120			ctxt->instate = state;
3121			return;
3122		    }
3123		    buf = tmp;
3124		}
3125		count++;
3126		if (count > 50) {
3127		    GROW;
3128		    count = 0;
3129		}
3130		COPY_BUF(l,buf,len,cur);
3131		NEXTL(l);
3132		cur = CUR_CHAR(l);
3133		if (cur == 0) {
3134		    SHRINK;
3135		    GROW;
3136		    cur = CUR_CHAR(l);
3137		}
3138	    }
3139	    buf[len] = 0;
3140	    if (cur != '>') {
3141		htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3142		      "ParsePI: PI %s never end ...\n", target, NULL);
3143	    } else {
3144		SKIP(1);
3145
3146		/*
3147		 * SAX: PI detected.
3148		 */
3149		if ((ctxt->sax) && (!ctxt->disableSAX) &&
3150		    (ctxt->sax->processingInstruction != NULL))
3151		    ctxt->sax->processingInstruction(ctxt->userData,
3152		                                     target, buf);
3153	    }
3154	    xmlFree(buf);
3155	} else {
3156	    htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3157                         "PI is not started correctly", NULL, NULL);
3158	}
3159	ctxt->instate = state;
3160    }
3161}
3162
/**
 * htmlParseComment:
 * @ctxt:  an HTML parser context
 *
 * Parse an XML (SGML) comment <!-- .... -->
 *
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
 *
 * Does nothing unless the input is positioned on "<!--". The comment
 * text is accumulated in a heap buffer and passed to the SAX comment()
 * callback when the closing "-->" is found; an unterminated comment is
 * reported as XML_ERR_COMMENT_NOT_FINISHED instead. The parser state is
 * XML_PARSER_COMMENT while parsing and is restored before returning.
 */
static void
htmlParseComment(htmlParserCtxtPtr ctxt) {
    xmlChar *buf = NULL;
    int len;
    int size = HTML_PARSER_BUFFER_SIZE;
    /*
     * q, r and cur are three consecutive characters of the input (oldest
     * first); ql, rl and l are the lengths reported for them by
     * CUR_CHAR().
     */
    int q, ql;
    int r, rl;
    int cur, l;
    xmlParserInputState state;

    /*
     * Check that there is a comment right here.
     */
    if ((RAW != '<') || (NXT(1) != '!') ||
        (NXT(2) != '-') || (NXT(3) != '-')) return;

    state = ctxt->instate;
    ctxt->instate = XML_PARSER_COMMENT;
    SHRINK;
    SKIP(4);
    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
    if (buf == NULL) {
        htmlErrMemory(ctxt, "buffer allocation failed\n");
	ctxt->instate = state;
	return;
    }
    /* Prime the window with the first three characters after "<!--". */
    q = CUR_CHAR(ql);
    NEXTL(ql);
    r = CUR_CHAR(rl);
    NEXTL(rl);
    cur = CUR_CHAR(l);
    len = 0;
    /*
     * Run until cur is not a valid char or the window reads "-->".
     * Only q, the oldest character, is copied on each pass, so the "--"
     * of the terminator never reaches the buffer.
     */
    while (IS_CHAR(cur) &&
           ((cur != '>') ||
	    (r != '-') || (q != '-'))) {
	/* Double the buffer once fewer than 5 spare bytes remain. */
	if (len + 5 >= size) {
	    xmlChar *tmp;

	    size *= 2;
	    tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
	    if (tmp == NULL) {
	        xmlFree(buf);
	        htmlErrMemory(ctxt, "growing buffer failed\n");
		ctxt->instate = state;
		return;
	    }
	    buf = tmp;
	}
	COPY_BUF(ql,buf,len,q);
	/* Slide the window one character forward. */
	q = r;
	ql = rl;
	r = cur;
	rl = l;
	NEXTL(l);
	cur = CUR_CHAR(l);
	if (cur == 0) {
	    /* Got a 0: recycle and refill the input, then read again. */
	    SHRINK;
	    GROW;
	    cur = CUR_CHAR(l);
	}
    }
    buf[len] = 0;
    if (!IS_CHAR(cur)) {
	/* Stopped on an invalid char or end of input, not on "-->". */
	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
	             "Comment not terminated \n<!--%.50s\n", buf, NULL);
	xmlFree(buf);
    } else {
	/* cur is the closing '>': consume it and report the comment. */
        NEXT;
	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
	    (!ctxt->disableSAX))
	    ctxt->sax->comment(ctxt->userData, buf);
	xmlFree(buf);
    }
    ctxt->instate = state;
}
3246
3247/**
3248 * htmlParseCharRef:
3249 * @ctxt:  an HTML parser context
3250 *
3251 * parse Reference declarations
3252 *
3253 * [66] CharRef ::= '&#' [0-9]+ ';' |
3254 *                  '&#x' [0-9a-fA-F]+ ';'
3255 *
3256 * Returns the value parsed (as an int)
3257 */
3258int
3259htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3260    int val = 0;
3261
3262    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3263	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3264		     "htmlParseCharRef: context error\n",
3265		     NULL, NULL);
3266        return(0);
3267    }
3268    if ((CUR == '&') && (NXT(1) == '#') &&
3269        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3270	SKIP(3);
3271	while (CUR != ';') {
3272	    if ((CUR >= '0') && (CUR <= '9'))
3273	        val = val * 16 + (CUR - '0');
3274	    else if ((CUR >= 'a') && (CUR <= 'f'))
3275	        val = val * 16 + (CUR - 'a') + 10;
3276	    else if ((CUR >= 'A') && (CUR <= 'F'))
3277	        val = val * 16 + (CUR - 'A') + 10;
3278	    else {
3279	        htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3280		             "htmlParseCharRef: missing semicolon\n",
3281			     NULL, NULL);
3282		break;
3283	    }
3284	    NEXT;
3285	}
3286	if (CUR == ';')
3287	    NEXT;
3288    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3289	SKIP(2);
3290	while (CUR != ';') {
3291	    if ((CUR >= '0') && (CUR <= '9'))
3292	        val = val * 10 + (CUR - '0');
3293	    else {
3294	        htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3295		             "htmlParseCharRef: missing semicolon\n",
3296			     NULL, NULL);
3297		break;
3298	    }
3299	    NEXT;
3300	}
3301	if (CUR == ';')
3302	    NEXT;
3303    } else {
3304	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3305	             "htmlParseCharRef: invalid value\n", NULL, NULL);
3306    }
3307    /*
3308     * Check the value IS_CHAR ...
3309     */
3310    if (IS_CHAR(val)) {
3311        return(val);
3312    } else {
3313	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3314			"htmlParseCharRef: invalid xmlChar value %d\n",
3315			val);
3316    }
3317    return(0);
3318}
3319
3320
3321/**
3322 * htmlParseDocTypeDecl:
3323 * @ctxt:  an HTML parser context
3324 *
3325 * parse a DOCTYPE declaration
3326 *
3327 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3328 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3329 */
3330
3331static void
3332htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3333    const xmlChar *name;
3334    xmlChar *ExternalID = NULL;
3335    xmlChar *URI = NULL;
3336
3337    /*
3338     * We know that '<!DOCTYPE' has been detected.
3339     */
3340    SKIP(9);
3341
3342    SKIP_BLANKS;
3343
3344    /*
3345     * Parse the DOCTYPE name.
3346     */
3347    name = htmlParseName(ctxt);
3348    if (name == NULL) {
3349	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3350	             "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3351		     NULL, NULL);
3352    }
3353    /*
3354     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3355     */
3356
3357    SKIP_BLANKS;
3358
3359    /*
3360     * Check for SystemID and ExternalID
3361     */
3362    URI = htmlParseExternalID(ctxt, &ExternalID);
3363    SKIP_BLANKS;
3364
3365    /*
3366     * We should be at the end of the DOCTYPE declaration.
3367     */
3368    if (CUR != '>') {
3369	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3370	             "DOCTYPE improperly terminated\n", NULL, NULL);
3371        /* We shouldn't try to resynchronize ... */
3372    }
3373    NEXT;
3374
3375    /*
3376     * Create or update the document accordingly to the DOCTYPE
3377     */
3378    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3379	(!ctxt->disableSAX))
3380	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3381
3382    /*
3383     * Cleanup, since we don't use all those identifiers
3384     */
3385    if (URI != NULL) xmlFree(URI);
3386    if (ExternalID != NULL) xmlFree(ExternalID);
3387}
3388
3389/**
3390 * htmlParseAttribute:
3391 * @ctxt:  an HTML parser context
3392 * @value:  a xmlChar ** used to store the value of the attribute
3393 *
3394 * parse an attribute
3395 *
3396 * [41] Attribute ::= Name Eq AttValue
3397 *
3398 * [25] Eq ::= S? '=' S?
3399 *
3400 * With namespace:
3401 *
3402 * [NS 11] Attribute ::= QName Eq AttValue
3403 *
3404 * Also the case QName == xmlns:??? is handled independently as a namespace
3405 * definition.
3406 *
3407 * Returns the attribute name, and the value in *value.
3408 */
3409
3410static const xmlChar *
3411htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3412    const xmlChar *name;
3413    xmlChar *val = NULL;
3414
3415    *value = NULL;
3416    name = htmlParseHTMLName(ctxt);
3417    if (name == NULL) {
3418	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3419	             "error parsing attribute name\n", NULL, NULL);
3420        return(NULL);
3421    }
3422
3423    /*
3424     * read the value
3425     */
3426    SKIP_BLANKS;
3427    if (CUR == '=') {
3428        NEXT;
3429	SKIP_BLANKS;
3430	val = htmlParseAttValue(ctxt);
3431    }
3432
3433    *value = val;
3434    return(name);
3435}
3436
3437/**
3438 * htmlCheckEncoding:
3439 * @ctxt:  an HTML parser context
3440 * @attvalue: the attribute value
3441 *
3442 * Checks an http-equiv attribute from a Meta tag to detect
3443 * the encoding
3444 * If a new encoding is detected the parser is switched to decode
3445 * it and pass UTF8
3446 */
3447static void
3448htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3449    const xmlChar *encoding;
3450
3451    if ((ctxt == NULL) || (attvalue == NULL) ||
3452        (ctxt->options & HTML_PARSE_IGNORE_ENC))
3453	return;
3454
3455    /* do not change encoding */
3456    if (ctxt->input->encoding != NULL)
3457        return;
3458
3459    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3460    if (encoding != NULL) {
3461	encoding += 8;
3462    } else {
3463	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3464	if (encoding != NULL)
3465	    encoding += 9;
3466    }
3467    if (encoding != NULL) {
3468	xmlCharEncoding enc;
3469	xmlCharEncodingHandlerPtr handler;
3470
3471	while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3472
3473	if (ctxt->input->encoding != NULL)
3474	    xmlFree((xmlChar *) ctxt->input->encoding);
3475	ctxt->input->encoding = xmlStrdup(encoding);
3476
3477	enc = xmlParseCharEncoding((const char *) encoding);
3478	/*
3479	 * registered set of known encodings
3480	 */
3481	if (enc != XML_CHAR_ENCODING_ERROR) {
3482	    if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3483	         (enc == XML_CHAR_ENCODING_UTF16BE) ||
3484		 (enc == XML_CHAR_ENCODING_UCS4LE) ||
3485		 (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3486		(ctxt->input->buf != NULL) &&
3487		(ctxt->input->buf->encoder == NULL)) {
3488		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3489		             "htmlCheckEncoding: wrong encoding meta\n",
3490			     NULL, NULL);
3491	    } else {
3492		xmlSwitchEncoding(ctxt, enc);
3493	    }
3494	    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3495	} else {
3496	    /*
3497	     * fallback for unknown encodings
3498	     */
3499	    handler = xmlFindCharEncodingHandler((const char *) encoding);
3500	    if (handler != NULL) {
3501		xmlSwitchToEncoding(ctxt, handler);
3502		ctxt->charset = XML_CHAR_ENCODING_UTF8;
3503	    } else {
3504		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3505		             "htmlCheckEncoding: unknown encoding %s\n",
3506			     encoding, NULL);
3507	    }
3508	}
3509
3510	if ((ctxt->input->buf != NULL) &&
3511	    (ctxt->input->buf->encoder != NULL) &&
3512	    (ctxt->input->buf->raw != NULL) &&
3513	    (ctxt->input->buf->buffer != NULL)) {
3514	    int nbchars;
3515	    int processed;
3516
3517	    /*
3518	     * convert as much as possible to the parser reading buffer.
3519	     */
3520	    processed = ctxt->input->cur - ctxt->input->base;
3521	    xmlBufferShrink(ctxt->input->buf->buffer, processed);
3522	    nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3523		                       ctxt->input->buf->buffer,
3524				       ctxt->input->buf->raw);
3525	    if (nbchars < 0) {
3526		htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3527		             "htmlCheckEncoding: encoder error\n",
3528			     NULL, NULL);
3529	    }
3530	    ctxt->input->base =
3531	    ctxt->input->cur = ctxt->input->buf->buffer->content;
3532            ctxt->input->end =
3533                          &ctxt->input->base[ctxt->input->buf->buffer->use];
3534	}
3535    }
3536}
3537
3538/**
3539 * htmlCheckMeta:
3540 * @ctxt:  an HTML parser context
3541 * @atts:  the attributes values
3542 *
3543 * Checks an attributes from a Meta tag
3544 */
3545static void
3546htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3547    int i;
3548    const xmlChar *att, *value;
3549    int http = 0;
3550    const xmlChar *content = NULL;
3551
3552    if ((ctxt == NULL) || (atts == NULL))
3553	return;
3554
3555    i = 0;
3556    att = atts[i++];
3557    while (att != NULL) {
3558	value = atts[i++];
3559	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3560	 && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3561	    http = 1;
3562	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3563	    content = value;
3564	att = atts[i++];
3565    }
3566    if ((http) && (content != NULL))
3567	htmlCheckEncoding(ctxt, content);
3568
3569}
3570
3571/**
3572 * htmlParseStartTag:
3573 * @ctxt:  an HTML parser context
3574 *
3575 * parse a start of tag either for rule element or
3576 * EmptyElement. In both case we don't parse the tag closing chars.
3577 *
3578 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3579 *
3580 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3581 *
3582 * With namespace:
3583 *
3584 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3585 *
3586 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3587 *
3588 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3589 */
3590
3591static int
3592htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3593    const xmlChar *name;
3594    const xmlChar *attname;
3595    xmlChar *attvalue;
3596    const xmlChar **atts;
3597    int nbatts = 0;
3598    int maxatts;
3599    int meta = 0;
3600    int i;
3601    int discardtag = 0;
3602
3603    if (ctxt->instate == XML_PARSER_EOF)
3604        return(-1);
3605    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3606	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3607		     "htmlParseStartTag: context error\n", NULL, NULL);
3608	return -1;
3609    }
3610    if (CUR != '<') return -1;
3611    NEXT;
3612
3613    atts = ctxt->atts;
3614    maxatts = ctxt->maxatts;
3615
3616    GROW;
3617    name = htmlParseHTMLName(ctxt);
3618    if (name == NULL) {
3619	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3620	             "htmlParseStartTag: invalid element name\n",
3621		     NULL, NULL);
3622	/* Dump the bogus tag like browsers do */
3623	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3624               (ctxt->instate != XML_PARSER_EOF))
3625	    NEXT;
3626        return -1;
3627    }
3628    if (xmlStrEqual(name, BAD_CAST"meta"))
3629	meta = 1;
3630
3631    /*
3632     * Check for auto-closure of HTML elements.
3633     */
3634    htmlAutoClose(ctxt, name);
3635
3636    /*
3637     * Check for implied HTML elements.
3638     */
3639    htmlCheckImplied(ctxt, name);
3640
3641    /*
3642     * Avoid html at any level > 0, head at any level != 1
3643     * or any attempt to recurse body
3644     */
3645    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3646	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3647	             "htmlParseStartTag: misplaced <html> tag\n",
3648		     name, NULL);
3649	discardtag = 1;
3650	ctxt->depth++;
3651    }
3652    if ((ctxt->nameNr != 1) &&
3653	(xmlStrEqual(name, BAD_CAST"head"))) {
3654	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3655	             "htmlParseStartTag: misplaced <head> tag\n",
3656		     name, NULL);
3657	discardtag = 1;
3658	ctxt->depth++;
3659    }
3660    if (xmlStrEqual(name, BAD_CAST"body")) {
3661	int indx;
3662	for (indx = 0;indx < ctxt->nameNr;indx++) {
3663	    if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3664		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3665		             "htmlParseStartTag: misplaced <body> tag\n",
3666			     name, NULL);
3667		discardtag = 1;
3668		ctxt->depth++;
3669	    }
3670	}
3671    }
3672
3673    /*
3674     * Now parse the attributes, it ends up with the ending
3675     *
3676     * (S Attribute)* S?
3677     */
3678    SKIP_BLANKS;
3679    while ((IS_CHAR_CH(CUR)) &&
3680           (CUR != '>') &&
3681	   ((CUR != '/') || (NXT(1) != '>'))) {
3682	long cons = ctxt->nbChars;
3683
3684	GROW;
3685	attname = htmlParseAttribute(ctxt, &attvalue);
3686        if (attname != NULL) {
3687
3688	    /*
3689	     * Well formedness requires at most one declaration of an attribute
3690	     */
3691	    for (i = 0; i < nbatts;i += 2) {
3692	        if (xmlStrEqual(atts[i], attname)) {
3693		    htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3694		                 "Attribute %s redefined\n", attname, NULL);
3695		    if (attvalue != NULL)
3696			xmlFree(attvalue);
3697		    goto failed;
3698		}
3699	    }
3700
3701	    /*
3702	     * Add the pair to atts
3703	     */
3704	    if (atts == NULL) {
3705	        maxatts = 22; /* allow for 10 attrs by default */
3706	        atts = (const xmlChar **)
3707		       xmlMalloc(maxatts * sizeof(xmlChar *));
3708		if (atts == NULL) {
3709		    htmlErrMemory(ctxt, NULL);
3710		    if (attvalue != NULL)
3711			xmlFree(attvalue);
3712		    goto failed;
3713		}
3714		ctxt->atts = atts;
3715		ctxt->maxatts = maxatts;
3716	    } else if (nbatts + 4 > maxatts) {
3717	        const xmlChar **n;
3718
3719	        maxatts *= 2;
3720	        n = (const xmlChar **) xmlRealloc((void *) atts,
3721					     maxatts * sizeof(const xmlChar *));
3722		if (n == NULL) {
3723		    htmlErrMemory(ctxt, NULL);
3724		    if (attvalue != NULL)
3725			xmlFree(attvalue);
3726		    goto failed;
3727		}
3728		atts = n;
3729		ctxt->atts = atts;
3730		ctxt->maxatts = maxatts;
3731	    }
3732	    atts[nbatts++] = attname;
3733	    atts[nbatts++] = attvalue;
3734	    atts[nbatts] = NULL;
3735	    atts[nbatts + 1] = NULL;
3736	}
3737	else {
3738	    if (attvalue != NULL)
3739	        xmlFree(attvalue);
3740	    /* Dump the bogus attribute string up to the next blank or
3741	     * the end of the tag. */
3742	    while ((IS_CHAR_CH(CUR)) &&
3743	           !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3744		   ((CUR != '/') || (NXT(1) != '>')))
3745		NEXT;
3746	}
3747
3748failed:
3749	SKIP_BLANKS;
3750        if (cons == ctxt->nbChars) {
3751	    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3752	                 "htmlParseStartTag: problem parsing attributes\n",
3753			 NULL, NULL);
3754	    break;
3755	}
3756    }
3757
3758    /*
3759     * Handle specific association to the META tag
3760     */
3761    if (meta && (nbatts != 0))
3762	htmlCheckMeta(ctxt, atts);
3763
3764    /*
3765     * SAX: Start of Element !
3766     */
3767    if (!discardtag) {
3768	htmlnamePush(ctxt, name);
3769	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3770	    if (nbatts != 0)
3771		ctxt->sax->startElement(ctxt->userData, name, atts);
3772	    else
3773		ctxt->sax->startElement(ctxt->userData, name, NULL);
3774	}
3775    }
3776
3777    if (atts != NULL) {
3778        for (i = 1;i < nbatts;i += 2) {
3779	    if (atts[i] != NULL)
3780		xmlFree((xmlChar *) atts[i]);
3781	}
3782    }
3783
3784    return(discardtag);
3785}
3786
3787/**
3788 * htmlParseEndTag:
3789 * @ctxt:  an HTML parser context
3790 *
3791 * parse an end of tag
3792 *
3793 * [42] ETag ::= '</' Name S? '>'
3794 *
3795 * With namespace
3796 *
3797 * [NS 9] ETag ::= '</' QName S? '>'
3798 *
3799 * Returns 1 if the current level should be closed.
3800 */
3801
3802static int
3803htmlParseEndTag(htmlParserCtxtPtr ctxt)
3804{
3805    const xmlChar *name;
3806    const xmlChar *oldname;
3807    int i, ret;
3808
3809    if ((CUR != '<') || (NXT(1) != '/')) {
3810        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3811	             "htmlParseEndTag: '</' not found\n", NULL, NULL);
3812        return (0);
3813    }
3814    SKIP(2);
3815
3816    name = htmlParseHTMLName(ctxt);
3817    if (name == NULL)
3818        return (0);
3819    /*
3820     * We should definitely be at the ending "S? '>'" part
3821     */
3822    SKIP_BLANKS;
3823    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3824        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3825	             "End tag : expected '>'\n", NULL, NULL);
3826	if (ctxt->recovery) {
3827	    /*
3828	     * We're not at the ending > !!
3829	     * Error, unless in recover mode where we search forwards
3830	     * until we find a >
3831	     */
3832	    while (CUR != '\0' && CUR != '>') NEXT;
3833	    NEXT;
3834	}
3835    } else
3836        NEXT;
3837
3838    /*
3839     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3840     * out now.
3841     */
3842    if ((ctxt->depth > 0) &&
3843        (xmlStrEqual(name, BAD_CAST "html") ||
3844         xmlStrEqual(name, BAD_CAST "body") ||
3845	 xmlStrEqual(name, BAD_CAST "head"))) {
3846	ctxt->depth--;
3847	return (0);
3848    }
3849
3850    /*
3851     * If the name read is not one of the element in the parsing stack
3852     * then return, it's just an error.
3853     */
3854    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3855        if (xmlStrEqual(name, ctxt->nameTab[i]))
3856            break;
3857    }
3858    if (i < 0) {
3859        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3860	             "Unexpected end tag : %s\n", name, NULL);
3861        return (0);
3862    }
3863
3864
3865    /*
3866     * Check for auto-closure of HTML elements.
3867     */
3868
3869    htmlAutoCloseOnClose(ctxt, name);
3870
3871    /*
3872     * Well formedness constraints, opening and closing must match.
3873     * With the exception that the autoclose may have popped stuff out
3874     * of the stack.
3875     */
3876    if (!xmlStrEqual(name, ctxt->name)) {
3877        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3878            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3879	                 "Opening and ending tag mismatch: %s and %s\n",
3880			 name, ctxt->name);
3881        }
3882    }
3883
3884    /*
3885     * SAX: End of Tag
3886     */
3887    oldname = ctxt->name;
3888    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3889        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3890            ctxt->sax->endElement(ctxt->userData, name);
3891        htmlnamePop(ctxt);
3892        ret = 1;
3893    } else {
3894        ret = 0;
3895    }
3896
3897    return (ret);
3898}
3899
3900
3901/**
3902 * htmlParseReference:
3903 * @ctxt:  an HTML parser context
3904 *
3905 * parse and handle entity references in content,
3906 * this will end-up in a call to character() since this is either a
3907 * CharRef, or a predefined entity.
3908 */
3909static void
3910htmlParseReference(htmlParserCtxtPtr ctxt) {
3911    const htmlEntityDesc * ent;
3912    xmlChar out[6];
3913    const xmlChar *name;
3914    if (CUR != '&') return;
3915
3916    if (NXT(1) == '#') {
3917	unsigned int c;
3918	int bits, i = 0;
3919
3920	c = htmlParseCharRef(ctxt);
3921	if (c == 0)
3922	    return;
3923
3924        if      (c <    0x80) { out[i++]= c;                bits= -6; }
3925        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3926        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3927        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3928
3929        for ( ; bits >= 0; bits-= 6) {
3930            out[i++]= ((c >> bits) & 0x3F) | 0x80;
3931        }
3932	out[i] = 0;
3933
3934	htmlCheckParagraph(ctxt);
3935	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3936	    ctxt->sax->characters(ctxt->userData, out, i);
3937    } else {
3938	ent = htmlParseEntityRef(ctxt, &name);
3939	if (name == NULL) {
3940	    htmlCheckParagraph(ctxt);
3941	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3942	        ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3943	    return;
3944	}
3945	if ((ent == NULL) || !(ent->value > 0)) {
3946	    htmlCheckParagraph(ctxt);
3947	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3948		ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3949		ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3950		/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3951	    }
3952	} else {
3953	    unsigned int c;
3954	    int bits, i = 0;
3955
3956	    c = ent->value;
3957	    if      (c <    0x80)
3958	            { out[i++]= c;                bits= -6; }
3959	    else if (c <   0x800)
3960	            { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
3961	    else if (c < 0x10000)
3962	            { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
3963	    else
3964	            { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
3965
3966	    for ( ; bits >= 0; bits-= 6) {
3967		out[i++]= ((c >> bits) & 0x3F) | 0x80;
3968	    }
3969	    out[i] = 0;
3970
3971	    htmlCheckParagraph(ctxt);
3972	    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3973		ctxt->sax->characters(ctxt->userData, out, i);
3974	}
3975    }
3976}
3977
3978/**
3979 * htmlParseContent:
3980 * @ctxt:  an HTML parser context
3981 *
3982 * Parse a content: comment, sub-element, reference or text.
3983 * Kept for compatibility with old code
3984 */
3985
3986static void
3987htmlParseContent(htmlParserCtxtPtr ctxt) {
3988    xmlChar *currentNode;
3989    int depth;
3990    const xmlChar *name;
3991
3992    currentNode = xmlStrdup(ctxt->name);
3993    depth = ctxt->nameNr;
3994    while (1) {
3995	long cons = ctxt->nbChars;
3996
3997        GROW;
3998
3999        if (ctxt->instate == XML_PARSER_EOF)
4000            break;
4001
4002	/*
4003	 * Our tag or one of it's parent or children is ending.
4004	 */
4005        if ((CUR == '<') && (NXT(1) == '/')) {
4006	    if (htmlParseEndTag(ctxt) &&
4007		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4008		if (currentNode != NULL)
4009		    xmlFree(currentNode);
4010		return;
4011	    }
4012	    continue; /* while */
4013        }
4014
4015	else if ((CUR == '<') &&
4016	         ((IS_ASCII_LETTER(NXT(1))) ||
4017		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4018	    name = htmlParseHTMLName_nonInvasive(ctxt);
4019	    if (name == NULL) {
4020	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4021			 "htmlParseStartTag: invalid element name\n",
4022			 NULL, NULL);
4023	        /* Dump the bogus tag like browsers do */
4024        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4025	            NEXT;
4026
4027	        if (currentNode != NULL)
4028	            xmlFree(currentNode);
4029	        return;
4030	    }
4031
4032	    if (ctxt->name != NULL) {
4033	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4034	            htmlAutoClose(ctxt, name);
4035	            continue;
4036	        }
4037	    }
4038	}
4039
4040	/*
4041	 * Has this node been popped out during parsing of
4042	 * the next element
4043	 */
4044        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4045	    (!xmlStrEqual(currentNode, ctxt->name)))
4046	     {
4047	    if (currentNode != NULL) xmlFree(currentNode);
4048	    return;
4049	}
4050
4051	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4052	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4053	    /*
4054	     * Handle SCRIPT/STYLE separately
4055	     */
4056	    htmlParseScript(ctxt);
4057	} else {
4058	    /*
4059	     * Sometimes DOCTYPE arrives in the middle of the document
4060	     */
4061	    if ((CUR == '<') && (NXT(1) == '!') &&
4062		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4063		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4064		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4065		(UPP(8) == 'E')) {
4066		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4067		             "Misplaced DOCTYPE declaration\n",
4068			     BAD_CAST "DOCTYPE" , NULL);
4069		htmlParseDocTypeDecl(ctxt);
4070	    }
4071
4072	    /*
4073	     * First case :  a comment
4074	     */
4075	    if ((CUR == '<') && (NXT(1) == '!') &&
4076		(NXT(2) == '-') && (NXT(3) == '-')) {
4077		htmlParseComment(ctxt);
4078	    }
4079
4080	    /*
4081	     * Second case : a Processing Instruction.
4082	     */
4083	    else if ((CUR == '<') && (NXT(1) == '?')) {
4084		htmlParsePI(ctxt);
4085	    }
4086
4087	    /*
4088	     * Third case :  a sub-element.
4089	     */
4090	    else if (CUR == '<') {
4091		htmlParseElement(ctxt);
4092	    }
4093
4094	    /*
4095	     * Fourth case : a reference. If if has not been resolved,
4096	     *    parsing returns it's Name, create the node
4097	     */
4098	    else if (CUR == '&') {
4099		htmlParseReference(ctxt);
4100	    }
4101
4102	    /*
4103	     * Fifth case : end of the resource
4104	     */
4105	    else if (CUR == 0) {
4106		htmlAutoCloseOnEnd(ctxt);
4107		break;
4108	    }
4109
4110	    /*
4111	     * Last case, text. Note that References are handled directly.
4112	     */
4113	    else {
4114		htmlParseCharData(ctxt);
4115	    }
4116
4117	    if (cons == ctxt->nbChars) {
4118		if (ctxt->node != NULL) {
4119		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4120		                 "detected an error in element content\n",
4121				 NULL, NULL);
4122		}
4123		break;
4124	    }
4125	}
4126        GROW;
4127    }
4128    if (currentNode != NULL) xmlFree(currentNode);
4129}
4130
4131/**
4132 * htmlParseElement:
4133 * @ctxt:  an HTML parser context
4134 *
4135 * parse an HTML element, this is highly recursive
4136 * this is kept for compatibility with previous code versions
4137 *
4138 * [39] element ::= EmptyElemTag | STag content ETag
4139 *
4140 * [41] Attribute ::= Name Eq AttValue
4141 */
4142
4143void
4144htmlParseElement(htmlParserCtxtPtr ctxt) {
4145    const xmlChar *name;
4146    xmlChar *currentNode = NULL;
4147    const htmlElemDesc * info;
4148    htmlParserNodeInfo node_info;
4149    int failed;
4150    int depth;
4151    const xmlChar *oldptr;
4152
4153    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4154	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4155		     "htmlParseElement: context error\n", NULL, NULL);
4156	return;
4157    }
4158
4159    if (ctxt->instate == XML_PARSER_EOF)
4160        return;
4161
4162    /* Capture start position */
4163    if (ctxt->record_info) {
4164        node_info.begin_pos = ctxt->input->consumed +
4165                          (CUR_PTR - ctxt->input->base);
4166	node_info.begin_line = ctxt->input->line;
4167    }
4168
4169    failed = htmlParseStartTag(ctxt);
4170    name = ctxt->name;
4171    if ((failed == -1) || (name == NULL)) {
4172	if (CUR == '>')
4173	    NEXT;
4174        return;
4175    }
4176
4177    /*
4178     * Lookup the info for that element.
4179     */
4180    info = htmlTagLookup(name);
4181    if (info == NULL) {
4182	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4183	             "Tag %s invalid\n", name, NULL);
4184    }
4185
4186    /*
4187     * Check for an Empty Element labeled the XML/SGML way
4188     */
4189    if ((CUR == '/') && (NXT(1) == '>')) {
4190        SKIP(2);
4191	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4192	    ctxt->sax->endElement(ctxt->userData, name);
4193	htmlnamePop(ctxt);
4194	return;
4195    }
4196
4197    if (CUR == '>') {
4198        NEXT;
4199    } else {
4200	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4201	             "Couldn't find end of Start Tag %s\n", name, NULL);
4202
4203	/*
4204	 * end of parsing of this node.
4205	 */
4206	if (xmlStrEqual(name, ctxt->name)) {
4207	    nodePop(ctxt);
4208	    htmlnamePop(ctxt);
4209	}
4210
4211	/*
4212	 * Capture end position and add node
4213	 */
4214	if (ctxt->record_info) {
4215	   node_info.end_pos = ctxt->input->consumed +
4216			      (CUR_PTR - ctxt->input->base);
4217	   node_info.end_line = ctxt->input->line;
4218	   node_info.node = ctxt->node;
4219	   xmlParserAddNodeInfo(ctxt, &node_info);
4220	}
4221	return;
4222    }
4223
4224    /*
4225     * Check for an Empty Element from DTD definition
4226     */
4227    if ((info != NULL) && (info->empty)) {
4228	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4229	    ctxt->sax->endElement(ctxt->userData, name);
4230	htmlnamePop(ctxt);
4231	return;
4232    }
4233
4234    /*
4235     * Parse the content of the element:
4236     */
4237    currentNode = xmlStrdup(ctxt->name);
4238    depth = ctxt->nameNr;
4239    while (IS_CHAR_CH(CUR)) {
4240	oldptr = ctxt->input->cur;
4241	htmlParseContent(ctxt);
4242	if (oldptr==ctxt->input->cur) break;
4243	if (ctxt->nameNr < depth) break;
4244    }
4245
4246    /*
4247     * Capture end position and add node
4248     */
4249    if ( currentNode != NULL && ctxt->record_info ) {
4250       node_info.end_pos = ctxt->input->consumed +
4251                          (CUR_PTR - ctxt->input->base);
4252       node_info.end_line = ctxt->input->line;
4253       node_info.node = ctxt->node;
4254       xmlParserAddNodeInfo(ctxt, &node_info);
4255    }
4256    if (!IS_CHAR_CH(CUR)) {
4257	htmlAutoCloseOnEnd(ctxt);
4258    }
4259
4260    if (currentNode != NULL)
4261	xmlFree(currentNode);
4262}
4263
4264static void
4265htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4266    /*
4267     * Capture end position and add node
4268     */
4269    if ( ctxt->node != NULL && ctxt->record_info ) {
4270       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4271                                (CUR_PTR - ctxt->input->base);
4272       ctxt->nodeInfo->end_line = ctxt->input->line;
4273       ctxt->nodeInfo->node = ctxt->node;
4274       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4275       htmlNodeInfoPop(ctxt);
4276    }
4277    if (!IS_CHAR_CH(CUR)) {
4278       htmlAutoCloseOnEnd(ctxt);
4279    }
4280}
4281
4282/**
4283 * htmlParseElementInternal:
4284 * @ctxt:  an HTML parser context
4285 *
4286 * parse an HTML element, new version, non recursive
4287 *
4288 * [39] element ::= EmptyElemTag | STag content ETag
4289 *
4290 * [41] Attribute ::= Name Eq AttValue
4291 */
4292
4293static void
4294htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4295    const xmlChar *name;
4296    const htmlElemDesc * info;
4297    htmlParserNodeInfo node_info;
4298    int failed;
4299
4300    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4301	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4302		     "htmlParseElementInternal: context error\n", NULL, NULL);
4303	return;
4304    }
4305
4306    if (ctxt->instate == XML_PARSER_EOF)
4307        return;
4308
4309    /* Capture start position */
4310    if (ctxt->record_info) {
4311        node_info.begin_pos = ctxt->input->consumed +
4312                          (CUR_PTR - ctxt->input->base);
4313	node_info.begin_line = ctxt->input->line;
4314    }
4315
4316    failed = htmlParseStartTag(ctxt);
4317    name = ctxt->name;
4318    if ((failed == -1) || (name == NULL)) {
4319	if (CUR == '>')
4320	    NEXT;
4321        return;
4322    }
4323
4324    /*
4325     * Lookup the info for that element.
4326     */
4327    info = htmlTagLookup(name);
4328    if (info == NULL) {
4329	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4330	             "Tag %s invalid\n", name, NULL);
4331    }
4332
4333    /*
4334     * Check for an Empty Element labeled the XML/SGML way
4335     */
4336    if ((CUR == '/') && (NXT(1) == '>')) {
4337        SKIP(2);
4338	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4339	    ctxt->sax->endElement(ctxt->userData, name);
4340	htmlnamePop(ctxt);
4341	return;
4342    }
4343
4344    if (CUR == '>') {
4345        NEXT;
4346    } else {
4347	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4348	             "Couldn't find end of Start Tag %s\n", name, NULL);
4349
4350	/*
4351	 * end of parsing of this node.
4352	 */
4353	if (xmlStrEqual(name, ctxt->name)) {
4354	    nodePop(ctxt);
4355	    htmlnamePop(ctxt);
4356	}
4357
4358        if (ctxt->record_info)
4359            htmlNodeInfoPush(ctxt, &node_info);
4360        htmlParserFinishElementParsing(ctxt);
4361	return;
4362    }
4363
4364    /*
4365     * Check for an Empty Element from DTD definition
4366     */
4367    if ((info != NULL) && (info->empty)) {
4368	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4369	    ctxt->sax->endElement(ctxt->userData, name);
4370	htmlnamePop(ctxt);
4371	return;
4372    }
4373
4374    if (ctxt->record_info)
4375        htmlNodeInfoPush(ctxt, &node_info);
4376}
4377
4378/**
4379 * htmlParseContentInternal:
4380 * @ctxt:  an HTML parser context
4381 *
4382 * Parse a content: comment, sub-element, reference or text.
4383 * New version for non recursive htmlParseElementInternal
4384 */
4385
4386static void
4387htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4388    xmlChar *currentNode;
4389    int depth;
4390    const xmlChar *name;
4391
4392    currentNode = xmlStrdup(ctxt->name);
4393    depth = ctxt->nameNr;
4394    while (1) {
4395	long cons = ctxt->nbChars;
4396
4397        GROW;
4398
4399        if (ctxt->instate == XML_PARSER_EOF)
4400            break;
4401
4402	/*
4403	 * Our tag or one of it's parent or children is ending.
4404	 */
4405        if ((CUR == '<') && (NXT(1) == '/')) {
4406	    if (htmlParseEndTag(ctxt) &&
4407		((currentNode != NULL) || (ctxt->nameNr == 0))) {
4408		if (currentNode != NULL)
4409		    xmlFree(currentNode);
4410
4411	        currentNode = xmlStrdup(ctxt->name);
4412	        depth = ctxt->nameNr;
4413	    }
4414	    continue; /* while */
4415        }
4416
4417	else if ((CUR == '<') &&
4418	         ((IS_ASCII_LETTER(NXT(1))) ||
4419		  (NXT(1) == '_') || (NXT(1) == ':'))) {
4420	    name = htmlParseHTMLName_nonInvasive(ctxt);
4421	    if (name == NULL) {
4422	        htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4423			 "htmlParseStartTag: invalid element name\n",
4424			 NULL, NULL);
4425	        /* Dump the bogus tag like browsers do */
4426	        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4427	            NEXT;
4428
4429	        htmlParserFinishElementParsing(ctxt);
4430	        if (currentNode != NULL)
4431	            xmlFree(currentNode);
4432
4433	        currentNode = xmlStrdup(ctxt->name);
4434	        depth = ctxt->nameNr;
4435	        continue;
4436	    }
4437
4438	    if (ctxt->name != NULL) {
4439	        if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4440	            htmlAutoClose(ctxt, name);
4441	            continue;
4442	        }
4443	    }
4444	}
4445
4446	/*
4447	 * Has this node been popped out during parsing of
4448	 * the next element
4449	 */
4450        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4451	    (!xmlStrEqual(currentNode, ctxt->name)))
4452	     {
4453	    htmlParserFinishElementParsing(ctxt);
4454	    if (currentNode != NULL) xmlFree(currentNode);
4455
4456	    currentNode = xmlStrdup(ctxt->name);
4457	    depth = ctxt->nameNr;
4458	    continue;
4459	}
4460
4461	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4462	    (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4463	    /*
4464	     * Handle SCRIPT/STYLE separately
4465	     */
4466	    htmlParseScript(ctxt);
4467	} else {
4468	    /*
4469	     * Sometimes DOCTYPE arrives in the middle of the document
4470	     */
4471	    if ((CUR == '<') && (NXT(1) == '!') &&
4472		(UPP(2) == 'D') && (UPP(3) == 'O') &&
4473		(UPP(4) == 'C') && (UPP(5) == 'T') &&
4474		(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4475		(UPP(8) == 'E')) {
4476		htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4477		             "Misplaced DOCTYPE declaration\n",
4478			     BAD_CAST "DOCTYPE" , NULL);
4479		htmlParseDocTypeDecl(ctxt);
4480	    }
4481
4482	    /*
4483	     * First case :  a comment
4484	     */
4485	    if ((CUR == '<') && (NXT(1) == '!') &&
4486		(NXT(2) == '-') && (NXT(3) == '-')) {
4487		htmlParseComment(ctxt);
4488	    }
4489
4490	    /*
4491	     * Second case : a Processing Instruction.
4492	     */
4493	    else if ((CUR == '<') && (NXT(1) == '?')) {
4494		htmlParsePI(ctxt);
4495	    }
4496
4497	    /*
4498	     * Third case :  a sub-element.
4499	     */
4500	    else if (CUR == '<') {
4501		htmlParseElementInternal(ctxt);
4502		if (currentNode != NULL) xmlFree(currentNode);
4503
4504		currentNode = xmlStrdup(ctxt->name);
4505		depth = ctxt->nameNr;
4506	    }
4507
4508	    /*
4509	     * Fourth case : a reference. If if has not been resolved,
4510	     *    parsing returns it's Name, create the node
4511	     */
4512	    else if (CUR == '&') {
4513		htmlParseReference(ctxt);
4514	    }
4515
4516	    /*
4517	     * Fifth case : end of the resource
4518	     */
4519	    else if (CUR == 0) {
4520		htmlAutoCloseOnEnd(ctxt);
4521		break;
4522	    }
4523
4524	    /*
4525	     * Last case, text. Note that References are handled directly.
4526	     */
4527	    else {
4528		htmlParseCharData(ctxt);
4529	    }
4530
4531	    if (cons == ctxt->nbChars) {
4532		if (ctxt->node != NULL) {
4533		    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4534		                 "detected an error in element content\n",
4535				 NULL, NULL);
4536		}
4537		break;
4538	    }
4539	}
4540        GROW;
4541    }
4542    if (currentNode != NULL) xmlFree(currentNode);
4543}
4544
4545/**
4546 * htmlParseContent:
4547 * @ctxt:  an HTML parser context
4548 *
4549 * Parse a content: comment, sub-element, reference or text.
4550 * This is the entry point when called from parser.c
4551 */
4552
4553void
4554__htmlParseContent(void *ctxt) {
4555    if (ctxt != NULL)
4556	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4557}
4558
4559/**
4560 * htmlParseDocument:
4561 * @ctxt:  an HTML parser context
4562 *
4563 * parse an HTML document (and build a tree if using the standard SAX
4564 * interface).
4565 *
4566 * Returns 0, -1 in case of error. the parser context is augmented
4567 *                as a result of the parsing.
4568 */
4569
4570int
4571htmlParseDocument(htmlParserCtxtPtr ctxt) {
4572    xmlChar start[4];
4573    xmlCharEncoding enc;
4574    xmlDtdPtr dtd;
4575
4576    xmlInitParser();
4577
4578    htmlDefaultSAXHandlerInit();
4579
4580    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4581	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4582		     "htmlParseDocument: context error\n", NULL, NULL);
4583	return(XML_ERR_INTERNAL_ERROR);
4584    }
4585    ctxt->html = 1;
4586    ctxt->linenumbers = 1;
4587    GROW;
4588    /*
4589     * SAX: beginning of the document processing.
4590     */
4591    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4592        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4593
4594    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4595        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4596	/*
4597	 * Get the 4 first bytes and decode the charset
4598	 * if enc != XML_CHAR_ENCODING_NONE
4599	 * plug some encoding conversion routines.
4600	 */
4601	start[0] = RAW;
4602	start[1] = NXT(1);
4603	start[2] = NXT(2);
4604	start[3] = NXT(3);
4605	enc = xmlDetectCharEncoding(&start[0], 4);
4606	if (enc != XML_CHAR_ENCODING_NONE) {
4607	    xmlSwitchEncoding(ctxt, enc);
4608	}
4609    }
4610
4611    /*
4612     * Wipe out everything which is before the first '<'
4613     */
4614    SKIP_BLANKS;
4615    if (CUR == 0) {
4616	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4617	             "Document is empty\n", NULL, NULL);
4618    }
4619
4620    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4621	ctxt->sax->startDocument(ctxt->userData);
4622
4623
4624    /*
4625     * Parse possible comments and PIs before any content
4626     */
4627    while (((CUR == '<') && (NXT(1) == '!') &&
4628            (NXT(2) == '-') && (NXT(3) == '-')) ||
4629	   ((CUR == '<') && (NXT(1) == '?'))) {
4630        htmlParseComment(ctxt);
4631        htmlParsePI(ctxt);
4632	SKIP_BLANKS;
4633    }
4634
4635
4636    /*
4637     * Then possibly doc type declaration(s) and more Misc
4638     * (doctypedecl Misc*)?
4639     */
4640    if ((CUR == '<') && (NXT(1) == '!') &&
4641	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4642	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4643	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4644	(UPP(8) == 'E')) {
4645	htmlParseDocTypeDecl(ctxt);
4646    }
4647    SKIP_BLANKS;
4648
4649    /*
4650     * Parse possible comments and PIs before any content
4651     */
4652    while (((CUR == '<') && (NXT(1) == '!') &&
4653            (NXT(2) == '-') && (NXT(3) == '-')) ||
4654	   ((CUR == '<') && (NXT(1) == '?'))) {
4655        htmlParseComment(ctxt);
4656        htmlParsePI(ctxt);
4657	SKIP_BLANKS;
4658    }
4659
4660    /*
4661     * Time to start parsing the tree itself
4662     */
4663    htmlParseContentInternal(ctxt);
4664
4665    /*
4666     * autoclose
4667     */
4668    if (CUR == 0)
4669	htmlAutoCloseOnEnd(ctxt);
4670
4671
4672    /*
4673     * SAX: end of the document processing.
4674     */
4675    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4676        ctxt->sax->endDocument(ctxt->userData);
4677
4678    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4679	dtd = xmlGetIntSubset(ctxt->myDoc);
4680	if (dtd == NULL)
4681	    ctxt->myDoc->intSubset =
4682		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4683		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4684		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4685    }
4686    if (! ctxt->wellFormed) return(-1);
4687    return(0);
4688}
4689
4690
4691/************************************************************************
4692 *									*
4693 *			Parser contexts handling			*
4694 *									*
4695 ************************************************************************/
4696
4697/**
4698 * htmlInitParserCtxt:
4699 * @ctxt:  an HTML parser context
4700 *
4701 * Initialize a parser context
4702 *
4703 * Returns 0 in case of success and -1 in case of error
4704 */
4705
4706static int
4707htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4708{
4709    htmlSAXHandler *sax;
4710
4711    if (ctxt == NULL) return(-1);
4712    memset(ctxt, 0, sizeof(htmlParserCtxt));
4713
4714    ctxt->dict = xmlDictCreate();
4715    if (ctxt->dict == NULL) {
4716        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4717	return(-1);
4718    }
4719    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4720    if (sax == NULL) {
4721        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4722	return(-1);
4723    }
4724    else
4725        memset(sax, 0, sizeof(htmlSAXHandler));
4726
4727    /* Allocate the Input stack */
4728    ctxt->inputTab = (htmlParserInputPtr *)
4729                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4730    if (ctxt->inputTab == NULL) {
4731        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4732	ctxt->inputNr = 0;
4733	ctxt->inputMax = 0;
4734	ctxt->input = NULL;
4735	return(-1);
4736    }
4737    ctxt->inputNr = 0;
4738    ctxt->inputMax = 5;
4739    ctxt->input = NULL;
4740    ctxt->version = NULL;
4741    ctxt->encoding = NULL;
4742    ctxt->standalone = -1;
4743    ctxt->instate = XML_PARSER_START;
4744
4745    /* Allocate the Node stack */
4746    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4747    if (ctxt->nodeTab == NULL) {
4748        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4749	ctxt->nodeNr = 0;
4750	ctxt->nodeMax = 0;
4751	ctxt->node = NULL;
4752	ctxt->inputNr = 0;
4753	ctxt->inputMax = 0;
4754	ctxt->input = NULL;
4755	return(-1);
4756    }
4757    ctxt->nodeNr = 0;
4758    ctxt->nodeMax = 10;
4759    ctxt->node = NULL;
4760
4761    /* Allocate the Name stack */
4762    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4763    if (ctxt->nameTab == NULL) {
4764        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4765	ctxt->nameNr = 0;
4766	ctxt->nameMax = 0;
4767	ctxt->name = NULL;
4768	ctxt->nodeNr = 0;
4769	ctxt->nodeMax = 0;
4770	ctxt->node = NULL;
4771	ctxt->inputNr = 0;
4772	ctxt->inputMax = 0;
4773	ctxt->input = NULL;
4774	return(-1);
4775    }
4776    ctxt->nameNr = 0;
4777    ctxt->nameMax = 10;
4778    ctxt->name = NULL;
4779
4780    ctxt->nodeInfoTab = NULL;
4781    ctxt->nodeInfoNr  = 0;
4782    ctxt->nodeInfoMax = 0;
4783
4784    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4785    else {
4786        ctxt->sax = sax;
4787	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4788    }
4789    ctxt->userData = ctxt;
4790    ctxt->myDoc = NULL;
4791    ctxt->wellFormed = 1;
4792    ctxt->replaceEntities = 0;
4793    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4794    ctxt->html = 1;
4795    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4796    ctxt->vctxt.userData = ctxt;
4797    ctxt->vctxt.error = xmlParserValidityError;
4798    ctxt->vctxt.warning = xmlParserValidityWarning;
4799    ctxt->record_info = 0;
4800    ctxt->validate = 0;
4801    ctxt->nbChars = 0;
4802    ctxt->checkIndex = 0;
4803    ctxt->catalogs = NULL;
4804    xmlInitNodeInfoSeq(&ctxt->node_seq);
4805    return(0);
4806}
4807
4808/**
4809 * htmlFreeParserCtxt:
4810 * @ctxt:  an HTML parser context
4811 *
4812 * Free all the memory used by a parser context. However the parsed
4813 * document in ctxt->myDoc is not freed.
4814 */
4815
4816void
4817htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4818{
4819    xmlFreeParserCtxt(ctxt);
4820}
4821
4822/**
4823 * htmlNewParserCtxt:
4824 *
4825 * Allocate and initialize a new parser context.
4826 *
4827 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4828 */
4829
4830htmlParserCtxtPtr
4831htmlNewParserCtxt(void)
4832{
4833    xmlParserCtxtPtr ctxt;
4834
4835    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4836    if (ctxt == NULL) {
4837        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4838	return(NULL);
4839    }
4840    memset(ctxt, 0, sizeof(xmlParserCtxt));
4841    if (htmlInitParserCtxt(ctxt) < 0) {
4842        htmlFreeParserCtxt(ctxt);
4843	return(NULL);
4844    }
4845    return(ctxt);
4846}
4847
4848/**
4849 * htmlCreateMemoryParserCtxt:
4850 * @buffer:  a pointer to a char array
4851 * @size:  the size of the array
4852 *
4853 * Create a parser context for an HTML in-memory document.
4854 *
4855 * Returns the new parser context or NULL
4856 */
4857htmlParserCtxtPtr
4858htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4859    xmlParserCtxtPtr ctxt;
4860    xmlParserInputPtr input;
4861    xmlParserInputBufferPtr buf;
4862
4863    if (buffer == NULL)
4864	return(NULL);
4865    if (size <= 0)
4866	return(NULL);
4867
4868    ctxt = htmlNewParserCtxt();
4869    if (ctxt == NULL)
4870	return(NULL);
4871
4872    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4873    if (buf == NULL) return(NULL);
4874
4875    input = xmlNewInputStream(ctxt);
4876    if (input == NULL) {
4877	xmlFreeParserCtxt(ctxt);
4878	return(NULL);
4879    }
4880
4881    input->filename = NULL;
4882    input->buf = buf;
4883    input->base = input->buf->buffer->content;
4884    input->cur = input->buf->buffer->content;
4885    input->end = &input->buf->buffer->content[input->buf->buffer->use];
4886
4887    inputPush(ctxt, input);
4888    return(ctxt);
4889}
4890
4891/**
4892 * htmlCreateDocParserCtxt:
4893 * @cur:  a pointer to an array of xmlChar
4894 * @encoding:  a free form C string describing the HTML document encoding, or NULL
4895 *
4896 * Create a parser context for an HTML document.
4897 *
4898 * TODO: check the need to add encoding handling there
4899 *
4900 * Returns the new parser context or NULL
4901 */
4902static htmlParserCtxtPtr
4903htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
4904    int len;
4905    htmlParserCtxtPtr ctxt;
4906
4907    if (cur == NULL)
4908	return(NULL);
4909    len = xmlStrlen(cur);
4910    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4911    if (ctxt == NULL)
4912	return(NULL);
4913
4914    if (encoding != NULL) {
4915	xmlCharEncoding enc;
4916	xmlCharEncodingHandlerPtr handler;
4917
4918	if (ctxt->input->encoding != NULL)
4919	    xmlFree((xmlChar *) ctxt->input->encoding);
4920	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4921
4922	enc = xmlParseCharEncoding(encoding);
4923	/*
4924	 * registered set of known encodings
4925	 */
4926	if (enc != XML_CHAR_ENCODING_ERROR) {
4927	    xmlSwitchEncoding(ctxt, enc);
4928	    if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4929		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4930		             "Unsupported encoding %s\n",
4931			     (const xmlChar *) encoding, NULL);
4932	    }
4933	} else {
4934	    /*
4935	     * fallback for unknown encodings
4936	     */
4937	    handler = xmlFindCharEncodingHandler((const char *) encoding);
4938	    if (handler != NULL) {
4939		xmlSwitchToEncoding(ctxt, handler);
4940	    } else {
4941		htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4942		             "Unsupported encoding %s\n",
4943			     (const xmlChar *) encoding, NULL);
4944	    }
4945	}
4946    }
4947    return(ctxt);
4948}
4949
4950#ifdef LIBXML_PUSH_ENABLED
4951/************************************************************************
4952 *									*
4953 *	Progressive parsing interfaces				*
4954 *									*
4955 ************************************************************************/
4956
4957/**
4958 * htmlParseLookupSequence:
4959 * @ctxt:  an HTML parser context
4960 * @first:  the first char to lookup
4961 * @next:  the next char to lookup or zero
4962 * @third:  the next char to lookup or zero
4963 * @comment: flag to force checking inside comments
4964 *
4965 * Try to find if a sequence (first, next, third) or  just (first next) or
4966 * (first) is available in the input stream.
4967 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
4968 * to avoid rescanning sequences of bytes, it DOES change the state of the
4969 * parser, do not use liberally.
4970 * This is basically similar to xmlParseLookupSequence()
4971 *
4972 * Returns the index to the current parsing point if the full sequence
4973 *      is available, -1 otherwise.
4974 */
4975static int
4976htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4977                        xmlChar next, xmlChar third, int iscomment,
4978                        int ignoreattrval)
4979{
4980    int base, len;
4981    htmlParserInputPtr in;
4982    const xmlChar *buf;
4983    int incomment = 0;
4984    int invalue = 0;
4985    char valdellim = 0x0;
4986
4987    in = ctxt->input;
4988    if (in == NULL)
4989        return (-1);
4990
4991    base = in->cur - in->base;
4992    if (base < 0)
4993        return (-1);
4994
4995    if (ctxt->checkIndex > base)
4996        base = ctxt->checkIndex;
4997
4998    if (in->buf == NULL) {
4999        buf = in->base;
5000        len = in->length;
5001    } else {
5002        buf = in->buf->buffer->content;
5003        len = in->buf->buffer->use;
5004    }
5005
5006    /* take into account the sequence length */
5007    if (third)
5008        len -= 2;
5009    else if (next)
5010        len--;
5011    for (; base < len; base++) {
5012        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5013            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5014                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5015                incomment = 1;
5016                /* do not increment past <! - some people use <!--> */
5017                base += 2;
5018            }
5019        }
5020        if (ignoreattrval) {
5021            if (buf[base] == '"' || buf[base] == '\'') {
5022                if (invalue) {
5023                    if (buf[base] == valdellim) {
5024                        invalue = 0;
5025                        continue;
5026                    }
5027                } else {
5028                    valdellim = buf[base];
5029                    invalue = 1;
5030                    continue;
5031                }
5032            } else if (invalue) {
5033                continue;
5034            }
5035        }
5036        if (incomment) {
5037            if (base + 3 > len)
5038                return (-1);
5039            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5040                (buf[base + 2] == '>')) {
5041                incomment = 0;
5042                base += 2;
5043            }
5044            continue;
5045        }
5046        if (buf[base] == first) {
5047            if (third != 0) {
5048                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5049                    continue;
5050            } else if (next != 0) {
5051                if (buf[base + 1] != next)
5052                    continue;
5053            }
5054            ctxt->checkIndex = 0;
5055#ifdef DEBUG_PUSH
5056            if (next == 0)
5057                xmlGenericError(xmlGenericErrorContext,
5058                                "HPP: lookup '%c' found at %d\n",
5059                                first, base);
5060            else if (third == 0)
5061                xmlGenericError(xmlGenericErrorContext,
5062                                "HPP: lookup '%c%c' found at %d\n",
5063                                first, next, base);
5064            else
5065                xmlGenericError(xmlGenericErrorContext,
5066                                "HPP: lookup '%c%c%c' found at %d\n",
5067                                first, next, third, base);
5068#endif
5069            return (base - (in->cur - in->base));
5070        }
5071    }
5072    if ((!incomment) && (!invalue))
5073        ctxt->checkIndex = base;
5074#ifdef DEBUG_PUSH
5075    if (next == 0)
5076        xmlGenericError(xmlGenericErrorContext,
5077                        "HPP: lookup '%c' failed\n", first);
5078    else if (third == 0)
5079        xmlGenericError(xmlGenericErrorContext,
5080                        "HPP: lookup '%c%c' failed\n", first, next);
5081    else
5082        xmlGenericError(xmlGenericErrorContext,
5083                        "HPP: lookup '%c%c%c' failed\n", first, next,
5084                        third);
5085#endif
5086    return (-1);
5087}
5088
5089/**
5090 * htmlParseLookupChars:
5091 * @ctxt: an HTML parser context
5092 * @stop: Array of chars, which stop the lookup.
5093 * @stopLen: Length of stop-Array
5094 *
5095 * Try to find if any char of the stop-Array is available in the input
5096 * stream.
5097 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5098 * to avoid rescanning sequences of bytes, it DOES change the state of the
5099 * parser, do not use liberally.
5100 *
5101 * Returns the index to the current parsing point if a stopChar
5102 *      is available, -1 otherwise.
5103 */
5104static int
5105htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5106                     int stopLen)
5107{
5108    int base, len;
5109    htmlParserInputPtr in;
5110    const xmlChar *buf;
5111    int incomment = 0;
5112    int i;
5113
5114    in = ctxt->input;
5115    if (in == NULL)
5116        return (-1);
5117
5118    base = in->cur - in->base;
5119    if (base < 0)
5120        return (-1);
5121
5122    if (ctxt->checkIndex > base)
5123        base = ctxt->checkIndex;
5124
5125    if (in->buf == NULL) {
5126        buf = in->base;
5127        len = in->length;
5128    } else {
5129        buf = in->buf->buffer->content;
5130        len = in->buf->buffer->use;
5131    }
5132
5133    for (; base < len; base++) {
5134        if (!incomment && (base + 4 < len)) {
5135            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5136                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5137                incomment = 1;
5138                /* do not increment past <! - some people use <!--> */
5139                base += 2;
5140            }
5141        }
5142        if (incomment) {
5143            if (base + 3 > len)
5144                return (-1);
5145            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5146                (buf[base + 2] == '>')) {
5147                incomment = 0;
5148                base += 2;
5149            }
5150            continue;
5151        }
5152        for (i = 0; i < stopLen; ++i) {
5153            if (buf[base] == stop[i]) {
5154                ctxt->checkIndex = 0;
5155                return (base - (in->cur - in->base));
5156            }
5157        }
5158    }
5159    ctxt->checkIndex = base;
5160    return (-1);
5161}
5162
5163/**
5164 * htmlParseTryOrFinish:
5165 * @ctxt:  an HTML parser context
5166 * @terminate:  last chunk indicator
5167 *
5168 * Try to progress on parsing
5169 *
5170 * Returns zero if no parsing was possible
5171 */
5172static int
5173htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5174    int ret = 0;
5175    htmlParserInputPtr in;
5176    int avail = 0;
5177    xmlChar cur, next;
5178
5179#ifdef DEBUG_PUSH
5180    switch (ctxt->instate) {
5181	case XML_PARSER_EOF:
5182	    xmlGenericError(xmlGenericErrorContext,
5183		    "HPP: try EOF\n"); break;
5184	case XML_PARSER_START:
5185	    xmlGenericError(xmlGenericErrorContext,
5186		    "HPP: try START\n"); break;
5187	case XML_PARSER_MISC:
5188	    xmlGenericError(xmlGenericErrorContext,
5189		    "HPP: try MISC\n");break;
5190	case XML_PARSER_COMMENT:
5191	    xmlGenericError(xmlGenericErrorContext,
5192		    "HPP: try COMMENT\n");break;
5193	case XML_PARSER_PROLOG:
5194	    xmlGenericError(xmlGenericErrorContext,
5195		    "HPP: try PROLOG\n");break;
5196	case XML_PARSER_START_TAG:
5197	    xmlGenericError(xmlGenericErrorContext,
5198		    "HPP: try START_TAG\n");break;
5199	case XML_PARSER_CONTENT:
5200	    xmlGenericError(xmlGenericErrorContext,
5201		    "HPP: try CONTENT\n");break;
5202	case XML_PARSER_CDATA_SECTION:
5203	    xmlGenericError(xmlGenericErrorContext,
5204		    "HPP: try CDATA_SECTION\n");break;
5205	case XML_PARSER_END_TAG:
5206	    xmlGenericError(xmlGenericErrorContext,
5207		    "HPP: try END_TAG\n");break;
5208	case XML_PARSER_ENTITY_DECL:
5209	    xmlGenericError(xmlGenericErrorContext,
5210		    "HPP: try ENTITY_DECL\n");break;
5211	case XML_PARSER_ENTITY_VALUE:
5212	    xmlGenericError(xmlGenericErrorContext,
5213		    "HPP: try ENTITY_VALUE\n");break;
5214	case XML_PARSER_ATTRIBUTE_VALUE:
5215	    xmlGenericError(xmlGenericErrorContext,
5216		    "HPP: try ATTRIBUTE_VALUE\n");break;
5217	case XML_PARSER_DTD:
5218	    xmlGenericError(xmlGenericErrorContext,
5219		    "HPP: try DTD\n");break;
5220	case XML_PARSER_EPILOG:
5221	    xmlGenericError(xmlGenericErrorContext,
5222		    "HPP: try EPILOG\n");break;
5223	case XML_PARSER_PI:
5224	    xmlGenericError(xmlGenericErrorContext,
5225		    "HPP: try PI\n");break;
5226	case XML_PARSER_SYSTEM_LITERAL:
5227	    xmlGenericError(xmlGenericErrorContext,
5228		    "HPP: try SYSTEM_LITERAL\n");break;
5229    }
5230#endif
5231
5232    while (1) {
5233
5234	in = ctxt->input;
5235	if (in == NULL) break;
5236	if (in->buf == NULL)
5237	    avail = in->length - (in->cur - in->base);
5238	else
5239	    avail = in->buf->buffer->use - (in->cur - in->base);
5240	if ((avail == 0) && (terminate)) {
5241	    htmlAutoCloseOnEnd(ctxt);
5242	    if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5243		/*
5244		 * SAX: end of the document processing.
5245		 */
5246		ctxt->instate = XML_PARSER_EOF;
5247		if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5248		    ctxt->sax->endDocument(ctxt->userData);
5249	    }
5250	}
5251        if (avail < 1)
5252	    goto done;
5253	cur = in->cur[0];
5254	if (cur == 0) {
5255	    SKIP(1);
5256	    continue;
5257	}
5258
5259        switch (ctxt->instate) {
5260            case XML_PARSER_EOF:
5261	        /*
5262		 * Document parsing is done !
5263		 */
5264	        goto done;
5265            case XML_PARSER_START:
5266	        /*
5267		 * Very first chars read from the document flow.
5268		 */
5269		cur = in->cur[0];
5270		if (IS_BLANK_CH(cur)) {
5271		    SKIP_BLANKS;
5272		    if (in->buf == NULL)
5273			avail = in->length - (in->cur - in->base);
5274		    else
5275			avail = in->buf->buffer->use - (in->cur - in->base);
5276		}
5277		if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5278		    ctxt->sax->setDocumentLocator(ctxt->userData,
5279						  &xmlDefaultSAXLocator);
5280		if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5281	            (!ctxt->disableSAX))
5282		    ctxt->sax->startDocument(ctxt->userData);
5283
5284		cur = in->cur[0];
5285		next = in->cur[1];
5286		if ((cur == '<') && (next == '!') &&
5287		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5288		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5289		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5290		    (UPP(8) == 'E')) {
5291		    if ((!terminate) &&
5292		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5293			goto done;
5294#ifdef DEBUG_PUSH
5295		    xmlGenericError(xmlGenericErrorContext,
5296			    "HPP: Parsing internal subset\n");
5297#endif
5298		    htmlParseDocTypeDecl(ctxt);
5299		    ctxt->instate = XML_PARSER_PROLOG;
5300#ifdef DEBUG_PUSH
5301		    xmlGenericError(xmlGenericErrorContext,
5302			    "HPP: entering PROLOG\n");
5303#endif
5304                } else {
5305		    ctxt->instate = XML_PARSER_MISC;
5306#ifdef DEBUG_PUSH
5307		    xmlGenericError(xmlGenericErrorContext,
5308			    "HPP: entering MISC\n");
5309#endif
5310		}
5311		break;
5312            case XML_PARSER_MISC:
5313		SKIP_BLANKS;
5314		if (in->buf == NULL)
5315		    avail = in->length - (in->cur - in->base);
5316		else
5317		    avail = in->buf->buffer->use - (in->cur - in->base);
5318		if (avail < 2)
5319		    goto done;
5320		cur = in->cur[0];
5321		next = in->cur[1];
5322	        if ((cur == '<') && (next == '!') &&
5323		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5324		    if ((!terminate) &&
5325		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5326			goto done;
5327#ifdef DEBUG_PUSH
5328		    xmlGenericError(xmlGenericErrorContext,
5329			    "HPP: Parsing Comment\n");
5330#endif
5331		    htmlParseComment(ctxt);
5332		    ctxt->instate = XML_PARSER_MISC;
5333	        } else if ((cur == '<') && (next == '?')) {
5334		    if ((!terminate) &&
5335		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5336			goto done;
5337#ifdef DEBUG_PUSH
5338		    xmlGenericError(xmlGenericErrorContext,
5339			    "HPP: Parsing PI\n");
5340#endif
5341		    htmlParsePI(ctxt);
5342		    ctxt->instate = XML_PARSER_MISC;
5343		} else if ((cur == '<') && (next == '!') &&
5344		    (UPP(2) == 'D') && (UPP(3) == 'O') &&
5345		    (UPP(4) == 'C') && (UPP(5) == 'T') &&
5346		    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5347		    (UPP(8) == 'E')) {
5348		    if ((!terminate) &&
5349		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5350			goto done;
5351#ifdef DEBUG_PUSH
5352		    xmlGenericError(xmlGenericErrorContext,
5353			    "HPP: Parsing internal subset\n");
5354#endif
5355		    htmlParseDocTypeDecl(ctxt);
5356		    ctxt->instate = XML_PARSER_PROLOG;
5357#ifdef DEBUG_PUSH
5358		    xmlGenericError(xmlGenericErrorContext,
5359			    "HPP: entering PROLOG\n");
5360#endif
5361		} else if ((cur == '<') && (next == '!') &&
5362		           (avail < 9)) {
5363		    goto done;
5364		} else {
5365		    ctxt->instate = XML_PARSER_START_TAG;
5366#ifdef DEBUG_PUSH
5367		    xmlGenericError(xmlGenericErrorContext,
5368			    "HPP: entering START_TAG\n");
5369#endif
5370		}
5371		break;
5372            case XML_PARSER_PROLOG:
5373		SKIP_BLANKS;
5374		if (in->buf == NULL)
5375		    avail = in->length - (in->cur - in->base);
5376		else
5377		    avail = in->buf->buffer->use - (in->cur - in->base);
5378		if (avail < 2)
5379		    goto done;
5380		cur = in->cur[0];
5381		next = in->cur[1];
5382		if ((cur == '<') && (next == '!') &&
5383		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5384		    if ((!terminate) &&
5385		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5386			goto done;
5387#ifdef DEBUG_PUSH
5388		    xmlGenericError(xmlGenericErrorContext,
5389			    "HPP: Parsing Comment\n");
5390#endif
5391		    htmlParseComment(ctxt);
5392		    ctxt->instate = XML_PARSER_PROLOG;
5393	        } else if ((cur == '<') && (next == '?')) {
5394		    if ((!terminate) &&
5395		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5396			goto done;
5397#ifdef DEBUG_PUSH
5398		    xmlGenericError(xmlGenericErrorContext,
5399			    "HPP: Parsing PI\n");
5400#endif
5401		    htmlParsePI(ctxt);
5402		    ctxt->instate = XML_PARSER_PROLOG;
5403		} else if ((cur == '<') && (next == '!') &&
5404		           (avail < 4)) {
5405		    goto done;
5406		} else {
5407		    ctxt->instate = XML_PARSER_START_TAG;
5408#ifdef DEBUG_PUSH
5409		    xmlGenericError(xmlGenericErrorContext,
5410			    "HPP: entering START_TAG\n");
5411#endif
5412		}
5413		break;
5414            case XML_PARSER_EPILOG:
5415		if (in->buf == NULL)
5416		    avail = in->length - (in->cur - in->base);
5417		else
5418		    avail = in->buf->buffer->use - (in->cur - in->base);
5419		if (avail < 1)
5420		    goto done;
5421		cur = in->cur[0];
5422		if (IS_BLANK_CH(cur)) {
5423		    htmlParseCharData(ctxt);
5424		    goto done;
5425		}
5426		if (avail < 2)
5427		    goto done;
5428		next = in->cur[1];
5429	        if ((cur == '<') && (next == '!') &&
5430		    (in->cur[2] == '-') && (in->cur[3] == '-')) {
5431		    if ((!terminate) &&
5432		        (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5433			goto done;
5434#ifdef DEBUG_PUSH
5435		    xmlGenericError(xmlGenericErrorContext,
5436			    "HPP: Parsing Comment\n");
5437#endif
5438		    htmlParseComment(ctxt);
5439		    ctxt->instate = XML_PARSER_EPILOG;
5440	        } else if ((cur == '<') && (next == '?')) {
5441		    if ((!terminate) &&
5442		        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5443			goto done;
5444#ifdef DEBUG_PUSH
5445		    xmlGenericError(xmlGenericErrorContext,
5446			    "HPP: Parsing PI\n");
5447#endif
5448		    htmlParsePI(ctxt);
5449		    ctxt->instate = XML_PARSER_EPILOG;
5450		} else if ((cur == '<') && (next == '!') &&
5451		           (avail < 4)) {
5452		    goto done;
5453		} else {
5454		    ctxt->errNo = XML_ERR_DOCUMENT_END;
5455		    ctxt->wellFormed = 0;
5456		    ctxt->instate = XML_PARSER_EOF;
5457#ifdef DEBUG_PUSH
5458		    xmlGenericError(xmlGenericErrorContext,
5459			    "HPP: entering EOF\n");
5460#endif
5461		    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5462			ctxt->sax->endDocument(ctxt->userData);
5463		    goto done;
5464		}
5465		break;
5466            case XML_PARSER_START_TAG: {
5467	        const xmlChar *name;
5468		int failed;
5469		const htmlElemDesc * info;
5470
5471		if (avail < 2)
5472		    goto done;
5473		cur = in->cur[0];
5474	        if (cur != '<') {
5475		    ctxt->instate = XML_PARSER_CONTENT;
5476#ifdef DEBUG_PUSH
5477		    xmlGenericError(xmlGenericErrorContext,
5478			    "HPP: entering CONTENT\n");
5479#endif
5480		    break;
5481		}
5482		if (in->cur[1] == '/') {
5483		    ctxt->instate = XML_PARSER_END_TAG;
5484		    ctxt->checkIndex = 0;
5485#ifdef DEBUG_PUSH
5486		    xmlGenericError(xmlGenericErrorContext,
5487			    "HPP: entering END_TAG\n");
5488#endif
5489		    break;
5490		}
5491		if ((!terminate) &&
5492		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5493		    goto done;
5494
5495		failed = htmlParseStartTag(ctxt);
5496		name = ctxt->name;
5497		if ((failed == -1) ||
5498		    (name == NULL)) {
5499		    if (CUR == '>')
5500			NEXT;
5501		    break;
5502		}
5503
5504		/*
5505		 * Lookup the info for that element.
5506		 */
5507		info = htmlTagLookup(name);
5508		if (info == NULL) {
5509		    htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5510		                 "Tag %s invalid\n", name, NULL);
5511		}
5512
5513		/*
5514		 * Check for an Empty Element labeled the XML/SGML way
5515		 */
5516		if ((CUR == '/') && (NXT(1) == '>')) {
5517		    SKIP(2);
5518		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5519			ctxt->sax->endElement(ctxt->userData, name);
5520		    htmlnamePop(ctxt);
5521		    ctxt->instate = XML_PARSER_CONTENT;
5522#ifdef DEBUG_PUSH
5523		    xmlGenericError(xmlGenericErrorContext,
5524			    "HPP: entering CONTENT\n");
5525#endif
5526		    break;
5527		}
5528
5529		if (CUR == '>') {
5530		    NEXT;
5531		} else {
5532		    htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5533		                 "Couldn't find end of Start Tag %s\n",
5534				 name, NULL);
5535
5536		    /*
5537		     * end of parsing of this node.
5538		     */
5539		    if (xmlStrEqual(name, ctxt->name)) {
5540			nodePop(ctxt);
5541			htmlnamePop(ctxt);
5542		    }
5543
5544		    ctxt->instate = XML_PARSER_CONTENT;
5545#ifdef DEBUG_PUSH
5546		    xmlGenericError(xmlGenericErrorContext,
5547			    "HPP: entering CONTENT\n");
5548#endif
5549		    break;
5550		}
5551
5552		/*
5553		 * Check for an Empty Element from DTD definition
5554		 */
5555		if ((info != NULL) && (info->empty)) {
5556		    if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5557			ctxt->sax->endElement(ctxt->userData, name);
5558		    htmlnamePop(ctxt);
5559		}
5560		ctxt->instate = XML_PARSER_CONTENT;
5561#ifdef DEBUG_PUSH
5562		xmlGenericError(xmlGenericErrorContext,
5563			"HPP: entering CONTENT\n");
5564#endif
5565                break;
5566	    }
5567            case XML_PARSER_CONTENT: {
5568		long cons;
5569                /*
5570		 * Handle preparsed entities and charRef
5571		 */
5572		if (ctxt->token != 0) {
5573		    xmlChar chr[2] = { 0 , 0 } ;
5574
5575		    chr[0] = (xmlChar) ctxt->token;
5576		    htmlCheckParagraph(ctxt);
5577		    if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5578			ctxt->sax->characters(ctxt->userData, chr, 1);
5579		    ctxt->token = 0;
5580		    ctxt->checkIndex = 0;
5581		}
5582		if ((avail == 1) && (terminate)) {
5583		    cur = in->cur[0];
5584		    if ((cur != '<') && (cur != '&')) {
5585			if (ctxt->sax != NULL) {
5586			    if (IS_BLANK_CH(cur)) {
5587				if (ctxt->sax->ignorableWhitespace != NULL)
5588				    ctxt->sax->ignorableWhitespace(
5589					    ctxt->userData, &cur, 1);
5590			    } else {
5591				htmlCheckParagraph(ctxt);
5592				if (ctxt->sax->characters != NULL)
5593				    ctxt->sax->characters(
5594					    ctxt->userData, &cur, 1);
5595			    }
5596			}
5597			ctxt->token = 0;
5598			ctxt->checkIndex = 0;
5599			in->cur++;
5600			break;
5601		    }
5602		}
5603		if (avail < 2)
5604		    goto done;
5605		cur = in->cur[0];
5606		next = in->cur[1];
5607		cons = ctxt->nbChars;
5608		if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5609		    (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5610		    /*
5611		     * Handle SCRIPT/STYLE separately
5612		     */
5613		    if (!terminate) {
5614		        int idx;
5615			xmlChar val;
5616
5617			idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5618			if (idx < 0)
5619			    goto done;
5620		        val = in->cur[idx + 2];
5621			if (val == 0) /* bad cut of input */
5622			    goto done;
5623		    }
5624		    htmlParseScript(ctxt);
5625		    if ((cur == '<') && (next == '/')) {
5626			ctxt->instate = XML_PARSER_END_TAG;
5627			ctxt->checkIndex = 0;
5628#ifdef DEBUG_PUSH
5629			xmlGenericError(xmlGenericErrorContext,
5630				"HPP: entering END_TAG\n");
5631#endif
5632			break;
5633		    }
5634		} else {
5635		    /*
5636		     * Sometimes DOCTYPE arrives in the middle of the document
5637		     */
5638		    if ((cur == '<') && (next == '!') &&
5639			(UPP(2) == 'D') && (UPP(3) == 'O') &&
5640			(UPP(4) == 'C') && (UPP(5) == 'T') &&
5641			(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5642			(UPP(8) == 'E')) {
5643			if ((!terminate) &&
5644			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5645			    goto done;
5646			htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5647			             "Misplaced DOCTYPE declaration\n",
5648				     BAD_CAST "DOCTYPE" , NULL);
5649			htmlParseDocTypeDecl(ctxt);
5650		    } else if ((cur == '<') && (next == '!') &&
5651			(in->cur[2] == '-') && (in->cur[3] == '-')) {
5652			if ((!terminate) &&
5653			    (htmlParseLookupSequence(
5654				ctxt, '-', '-', '>', 1, 1) < 0))
5655			    goto done;
5656#ifdef DEBUG_PUSH
5657			xmlGenericError(xmlGenericErrorContext,
5658				"HPP: Parsing Comment\n");
5659#endif
5660			htmlParseComment(ctxt);
5661			ctxt->instate = XML_PARSER_CONTENT;
5662		    } else if ((cur == '<') && (next == '?')) {
5663			if ((!terminate) &&
5664			    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5665			    goto done;
5666#ifdef DEBUG_PUSH
5667			xmlGenericError(xmlGenericErrorContext,
5668				"HPP: Parsing PI\n");
5669#endif
5670			htmlParsePI(ctxt);
5671			ctxt->instate = XML_PARSER_CONTENT;
5672		    } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5673			goto done;
5674		    } else if ((cur == '<') && (next == '/')) {
5675			ctxt->instate = XML_PARSER_END_TAG;
5676			ctxt->checkIndex = 0;
5677#ifdef DEBUG_PUSH
5678			xmlGenericError(xmlGenericErrorContext,
5679				"HPP: entering END_TAG\n");
5680#endif
5681			break;
5682		    } else if (cur == '<') {
5683			ctxt->instate = XML_PARSER_START_TAG;
5684			ctxt->checkIndex = 0;
5685#ifdef DEBUG_PUSH
5686			xmlGenericError(xmlGenericErrorContext,
5687				"HPP: entering START_TAG\n");
5688#endif
5689			break;
5690		    } else if (cur == '&') {
5691			if ((!terminate) &&
5692			    (htmlParseLookupChars(ctxt,
5693                                                  BAD_CAST "; >/", 4) < 0))
5694			    goto done;
5695#ifdef DEBUG_PUSH
5696			xmlGenericError(xmlGenericErrorContext,
5697				"HPP: Parsing Reference\n");
5698#endif
5699			/* TODO: check generation of subtrees if noent !!! */
5700			htmlParseReference(ctxt);
5701		    } else {
5702		        /*
5703			 * check that the text sequence is complete
5704			 * before handing out the data to the parser
5705			 * to avoid problems with erroneous end of
5706			 * data detection.
5707			 */
5708			if ((!terminate) &&
5709                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5710			    goto done;
5711			ctxt->checkIndex = 0;
5712#ifdef DEBUG_PUSH
5713			xmlGenericError(xmlGenericErrorContext,
5714				"HPP: Parsing char data\n");
5715#endif
5716			htmlParseCharData(ctxt);
5717		    }
5718		}
5719		if (cons == ctxt->nbChars) {
5720		    if (ctxt->node != NULL) {
5721			htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5722			             "detected an error in element content\n",
5723				     NULL, NULL);
5724		    }
5725		    NEXT;
5726		    break;
5727		}
5728
5729		break;
5730	    }
5731            case XML_PARSER_END_TAG:
5732		if (avail < 2)
5733		    goto done;
5734		if ((!terminate) &&
5735		    (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5736		    goto done;
5737		htmlParseEndTag(ctxt);
5738		if (ctxt->nameNr == 0) {
5739		    ctxt->instate = XML_PARSER_EPILOG;
5740		} else {
5741		    ctxt->instate = XML_PARSER_CONTENT;
5742		}
5743		ctxt->checkIndex = 0;
5744#ifdef DEBUG_PUSH
5745		xmlGenericError(xmlGenericErrorContext,
5746			"HPP: entering CONTENT\n");
5747#endif
5748	        break;
5749            case XML_PARSER_CDATA_SECTION:
5750		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5751			"HPP: internal error, state == CDATA\n",
5752			     NULL, NULL);
5753		ctxt->instate = XML_PARSER_CONTENT;
5754		ctxt->checkIndex = 0;
5755#ifdef DEBUG_PUSH
5756		xmlGenericError(xmlGenericErrorContext,
5757			"HPP: entering CONTENT\n");
5758#endif
5759		break;
5760            case XML_PARSER_DTD:
5761		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5762			"HPP: internal error, state == DTD\n",
5763			     NULL, NULL);
5764		ctxt->instate = XML_PARSER_CONTENT;
5765		ctxt->checkIndex = 0;
5766#ifdef DEBUG_PUSH
5767		xmlGenericError(xmlGenericErrorContext,
5768			"HPP: entering CONTENT\n");
5769#endif
5770		break;
5771            case XML_PARSER_COMMENT:
5772		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5773			"HPP: internal error, state == COMMENT\n",
5774			     NULL, NULL);
5775		ctxt->instate = XML_PARSER_CONTENT;
5776		ctxt->checkIndex = 0;
5777#ifdef DEBUG_PUSH
5778		xmlGenericError(xmlGenericErrorContext,
5779			"HPP: entering CONTENT\n");
5780#endif
5781		break;
5782            case XML_PARSER_PI:
5783		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5784			"HPP: internal error, state == PI\n",
5785			     NULL, NULL);
5786		ctxt->instate = XML_PARSER_CONTENT;
5787		ctxt->checkIndex = 0;
5788#ifdef DEBUG_PUSH
5789		xmlGenericError(xmlGenericErrorContext,
5790			"HPP: entering CONTENT\n");
5791#endif
5792		break;
5793            case XML_PARSER_ENTITY_DECL:
5794		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5795			"HPP: internal error, state == ENTITY_DECL\n",
5796			     NULL, NULL);
5797		ctxt->instate = XML_PARSER_CONTENT;
5798		ctxt->checkIndex = 0;
5799#ifdef DEBUG_PUSH
5800		xmlGenericError(xmlGenericErrorContext,
5801			"HPP: entering CONTENT\n");
5802#endif
5803		break;
5804            case XML_PARSER_ENTITY_VALUE:
5805		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5806			"HPP: internal error, state == ENTITY_VALUE\n",
5807			     NULL, NULL);
5808		ctxt->instate = XML_PARSER_CONTENT;
5809		ctxt->checkIndex = 0;
5810#ifdef DEBUG_PUSH
5811		xmlGenericError(xmlGenericErrorContext,
5812			"HPP: entering DTD\n");
5813#endif
5814		break;
5815            case XML_PARSER_ATTRIBUTE_VALUE:
5816		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5817			"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5818			     NULL, NULL);
5819		ctxt->instate = XML_PARSER_START_TAG;
5820		ctxt->checkIndex = 0;
5821#ifdef DEBUG_PUSH
5822		xmlGenericError(xmlGenericErrorContext,
5823			"HPP: entering START_TAG\n");
5824#endif
5825		break;
5826	    case XML_PARSER_SYSTEM_LITERAL:
5827		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5828		    "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5829			     NULL, NULL);
5830		ctxt->instate = XML_PARSER_CONTENT;
5831		ctxt->checkIndex = 0;
5832#ifdef DEBUG_PUSH
5833		xmlGenericError(xmlGenericErrorContext,
5834			"HPP: entering CONTENT\n");
5835#endif
5836		break;
5837	    case XML_PARSER_IGNORE:
5838		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5839			"HPP: internal error, state == XML_PARSER_IGNORE\n",
5840			     NULL, NULL);
5841		ctxt->instate = XML_PARSER_CONTENT;
5842		ctxt->checkIndex = 0;
5843#ifdef DEBUG_PUSH
5844		xmlGenericError(xmlGenericErrorContext,
5845			"HPP: entering CONTENT\n");
5846#endif
5847		break;
5848	    case XML_PARSER_PUBLIC_LITERAL:
5849		htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5850			"HPP: internal error, state == XML_PARSER_LITERAL\n",
5851			     NULL, NULL);
5852		ctxt->instate = XML_PARSER_CONTENT;
5853		ctxt->checkIndex = 0;
5854#ifdef DEBUG_PUSH
5855		xmlGenericError(xmlGenericErrorContext,
5856			"HPP: entering CONTENT\n");
5857#endif
5858		break;
5859
5860	}
5861    }
5862done:
5863    if ((avail == 0) && (terminate)) {
5864	htmlAutoCloseOnEnd(ctxt);
5865	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5866	    /*
5867	     * SAX: end of the document processing.
5868	     */
5869	    ctxt->instate = XML_PARSER_EOF;
5870	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5871		ctxt->sax->endDocument(ctxt->userData);
5872	}
5873    }
5874    if ((ctxt->myDoc != NULL) &&
5875	((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5876	 (ctxt->instate == XML_PARSER_EPILOG))) {
5877	xmlDtdPtr dtd;
5878	dtd = xmlGetIntSubset(ctxt->myDoc);
5879	if (dtd == NULL)
5880	    ctxt->myDoc->intSubset =
5881		xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5882		    BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5883		    BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5884    }
5885#ifdef DEBUG_PUSH
5886    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5887#endif
5888    return(ret);
5889}
5890
5891/**
5892 * htmlParseChunk:
5893 * @ctxt:  an HTML parser context
5894 * @chunk:  an char array
5895 * @size:  the size in byte of the chunk
5896 * @terminate:  last chunk indicator
5897 *
5898 * Parse a Chunk of memory
5899 *
5900 * Returns zero if no error, the xmlParserErrors otherwise.
5901 */
5902int
5903htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5904              int terminate) {
5905    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5906	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5907		     "htmlParseChunk: context error\n", NULL, NULL);
5908	return(XML_ERR_INTERNAL_ERROR);
5909    }
5910    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5911        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5912	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5913	int cur = ctxt->input->cur - ctxt->input->base;
5914	int res;
5915
5916	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5917	if (res < 0) {
5918	    ctxt->errNo = XML_PARSER_EOF;
5919	    ctxt->disableSAX = 1;
5920	    return (XML_PARSER_EOF);
5921	}
5922	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5923	ctxt->input->cur = ctxt->input->base + cur;
5924	ctxt->input->end =
5925	  &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5926#ifdef DEBUG_PUSH
5927	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5928#endif
5929
5930#if 0
5931	if ((terminate) || (ctxt->input->buf->buffer->use > 80))
5932	    htmlParseTryOrFinish(ctxt, terminate);
5933#endif
5934    } else if (ctxt->instate != XML_PARSER_EOF) {
5935	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5936	    xmlParserInputBufferPtr in = ctxt->input->buf;
5937	    if ((in->encoder != NULL) && (in->buffer != NULL) &&
5938		    (in->raw != NULL)) {
5939		int nbchars;
5940
5941		nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5942		if (nbchars < 0) {
5943		    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5944			         "encoder error\n", NULL, NULL);
5945		    return(XML_ERR_INVALID_ENCODING);
5946		}
5947	    }
5948	}
5949    }
5950    htmlParseTryOrFinish(ctxt, terminate);
5951    if (terminate) {
5952	if ((ctxt->instate != XML_PARSER_EOF) &&
5953	    (ctxt->instate != XML_PARSER_EPILOG) &&
5954	    (ctxt->instate != XML_PARSER_MISC)) {
5955	    ctxt->errNo = XML_ERR_DOCUMENT_END;
5956	    ctxt->wellFormed = 0;
5957	}
5958	if (ctxt->instate != XML_PARSER_EOF) {
5959	    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5960		ctxt->sax->endDocument(ctxt->userData);
5961	}
5962	ctxt->instate = XML_PARSER_EOF;
5963    }
5964    return((xmlParserErrors) ctxt->errNo);
5965}
5966
5967/************************************************************************
5968 *									*
5969 *			User entry points				*
5970 *									*
5971 ************************************************************************/
5972
5973/**
5974 * htmlCreatePushParserCtxt:
5975 * @sax:  a SAX handler
5976 * @user_data:  The user data returned on SAX callbacks
5977 * @chunk:  a pointer to an array of chars
5978 * @size:  number of chars in the array
5979 * @filename:  an optional file name or URI
5980 * @enc:  an optional encoding
5981 *
5982 * Create a parser context for using the HTML parser in push mode
5983 * The value of @filename is used for fetching external entities
5984 * and error/warning reports.
5985 *
5986 * Returns the new parser context or NULL
5987 */
5988htmlParserCtxtPtr
5989htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5990                         const char *chunk, int size, const char *filename,
5991			 xmlCharEncoding enc) {
5992    htmlParserCtxtPtr ctxt;
5993    htmlParserInputPtr inputStream;
5994    xmlParserInputBufferPtr buf;
5995
5996    xmlInitParser();
5997
5998    buf = xmlAllocParserInputBuffer(enc);
5999    if (buf == NULL) return(NULL);
6000
6001    ctxt = htmlNewParserCtxt();
6002    if (ctxt == NULL) {
6003	xmlFreeParserInputBuffer(buf);
6004	return(NULL);
6005    }
6006    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6007	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6008    if (sax != NULL) {
6009	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6010	    xmlFree(ctxt->sax);
6011	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6012	if (ctxt->sax == NULL) {
6013	    xmlFree(buf);
6014	    xmlFree(ctxt);
6015	    return(NULL);
6016	}
6017	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6018	if (user_data != NULL)
6019	    ctxt->userData = user_data;
6020    }
6021    if (filename == NULL) {
6022	ctxt->directory = NULL;
6023    } else {
6024        ctxt->directory = xmlParserGetDirectory(filename);
6025    }
6026
6027    inputStream = htmlNewInputStream(ctxt);
6028    if (inputStream == NULL) {
6029	xmlFreeParserCtxt(ctxt);
6030	xmlFree(buf);
6031	return(NULL);
6032    }
6033
6034    if (filename == NULL)
6035	inputStream->filename = NULL;
6036    else
6037	inputStream->filename = (char *)
6038	    xmlCanonicPath((const xmlChar *) filename);
6039    inputStream->buf = buf;
6040    inputStream->base = inputStream->buf->buffer->content;
6041    inputStream->cur = inputStream->buf->buffer->content;
6042    inputStream->end =
6043	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
6044
6045    inputPush(ctxt, inputStream);
6046
6047    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6048        (ctxt->input->buf != NULL))  {
6049	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
6050	int cur = ctxt->input->cur - ctxt->input->base;
6051
6052	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6053
6054	ctxt->input->base = ctxt->input->buf->buffer->content + base;
6055	ctxt->input->cur = ctxt->input->base + cur;
6056	ctxt->input->end =
6057	    &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
6058#ifdef DEBUG_PUSH
6059	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6060#endif
6061    }
6062    ctxt->progressive = 1;
6063
6064    return(ctxt);
6065}
6066#endif /* LIBXML_PUSH_ENABLED */
6067
6068/**
6069 * htmlSAXParseDoc:
6070 * @cur:  a pointer to an array of xmlChar
6071 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6072 * @sax:  the SAX handler block
6073 * @userData: if using SAX, this pointer will be provided on callbacks.
6074 *
6075 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6076 * to handle parse events. If sax is NULL, fallback to the default DOM
6077 * behavior and return a tree.
6078 *
6079 * Returns the resulting document tree unless SAX is NULL or the document is
6080 *     not well formed.
6081 */
6082
6083htmlDocPtr
6084htmlSAXParseDoc(xmlChar *cur, const char *encoding, htmlSAXHandlerPtr sax, void *userData) {
6085    htmlDocPtr ret;
6086    htmlParserCtxtPtr ctxt;
6087
6088    xmlInitParser();
6089
6090    if (cur == NULL) return(NULL);
6091
6092
6093    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6094    if (ctxt == NULL) return(NULL);
6095    if (sax != NULL) {
6096        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6097        ctxt->sax = sax;
6098        ctxt->userData = userData;
6099    }
6100
6101    htmlParseDocument(ctxt);
6102    ret = ctxt->myDoc;
6103    if (sax != NULL) {
6104	ctxt->sax = NULL;
6105	ctxt->userData = NULL;
6106    }
6107    htmlFreeParserCtxt(ctxt);
6108
6109    return(ret);
6110}
6111
6112/**
6113 * htmlParseDoc:
6114 * @cur:  a pointer to an array of xmlChar
6115 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6116 *
6117 * parse an HTML in-memory document and build a tree.
6118 *
6119 * Returns the resulting document tree
6120 */
6121
6122htmlDocPtr
6123htmlParseDoc(xmlChar *cur, const char *encoding) {
6124    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6125}
6126
6127
6128/**
6129 * htmlCreateFileParserCtxt:
6130 * @filename:  the filename
6131 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6132 *
6133 * Create a parser context for a file content.
6134 * Automatic support for ZLIB/Compress compressed document is provided
6135 * by default if found at compile-time.
6136 *
6137 * Returns the new parser context or NULL
6138 */
6139htmlParserCtxtPtr
6140htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6141{
6142    htmlParserCtxtPtr ctxt;
6143    htmlParserInputPtr inputStream;
6144    char *canonicFilename;
6145    /* htmlCharEncoding enc; */
6146    xmlChar *content, *content_line = (xmlChar *) "charset=";
6147
6148    if (filename == NULL)
6149        return(NULL);
6150
6151    ctxt = htmlNewParserCtxt();
6152    if (ctxt == NULL) {
6153	return(NULL);
6154    }
6155    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6156    if (canonicFilename == NULL) {
6157#ifdef LIBXML_SAX1_ENABLED
6158	if (xmlDefaultSAXHandler.error != NULL) {
6159	    xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6160	}
6161#endif
6162	xmlFreeParserCtxt(ctxt);
6163	return(NULL);
6164    }
6165
6166    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6167    xmlFree(canonicFilename);
6168    if (inputStream == NULL) {
6169	xmlFreeParserCtxt(ctxt);
6170	return(NULL);
6171    }
6172
6173    inputPush(ctxt, inputStream);
6174
6175    /* set encoding */
6176    if (encoding) {
6177        content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
6178	if (content) {
6179	    strcpy ((char *)content, (char *)content_line);
6180            strcat ((char *)content, (char *)encoding);
6181            htmlCheckEncoding (ctxt, content);
6182	    xmlFree (content);
6183	}
6184    }
6185
6186    return(ctxt);
6187}
6188
6189/**
6190 * htmlSAXParseFile:
6191 * @filename:  the filename
6192 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6193 * @sax:  the SAX handler block
6194 * @userData: if using SAX, this pointer will be provided on callbacks.
6195 *
6196 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6197 * compressed document is provided by default if found at compile-time.
6198 * It use the given SAX function block to handle the parsing callback.
6199 * If sax is NULL, fallback to the default DOM tree building routines.
6200 *
6201 * Returns the resulting document tree unless SAX is NULL or the document is
6202 *     not well formed.
6203 */
6204
6205htmlDocPtr
6206htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6207                 void *userData) {
6208    htmlDocPtr ret;
6209    htmlParserCtxtPtr ctxt;
6210    htmlSAXHandlerPtr oldsax = NULL;
6211
6212    xmlInitParser();
6213
6214    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6215    if (ctxt == NULL) return(NULL);
6216    if (sax != NULL) {
6217	oldsax = ctxt->sax;
6218        ctxt->sax = sax;
6219        ctxt->userData = userData;
6220    }
6221
6222    htmlParseDocument(ctxt);
6223
6224    ret = ctxt->myDoc;
6225    if (sax != NULL) {
6226        ctxt->sax = oldsax;
6227        ctxt->userData = NULL;
6228    }
6229    htmlFreeParserCtxt(ctxt);
6230
6231    return(ret);
6232}
6233
6234/**
6235 * htmlParseFile:
6236 * @filename:  the filename
6237 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6238 *
6239 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6240 * compressed document is provided by default if found at compile-time.
6241 *
6242 * Returns the resulting document tree
6243 */
6244
6245htmlDocPtr
6246htmlParseFile(const char *filename, const char *encoding) {
6247    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6248}
6249
6250/**
6251 * htmlHandleOmittedElem:
6252 * @val:  int 0 or 1
6253 *
6254 * Set and return the previous value for handling HTML omitted tags.
6255 *
6256 * Returns the last value for 0 for no handling, 1 for auto insertion.
6257 */
6258
6259int
6260htmlHandleOmittedElem(int val) {
6261    int old = htmlOmittedDefaultValue;
6262
6263    htmlOmittedDefaultValue = val;
6264    return(old);
6265}
6266
6267/**
6268 * htmlElementAllowedHere:
6269 * @parent: HTML parent element
6270 * @elt: HTML element
6271 *
6272 * Checks whether an HTML element may be a direct child of a parent element.
6273 * Note - doesn't check for deprecated elements
6274 *
6275 * Returns 1 if allowed; 0 otherwise.
6276 */
6277int
6278htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6279  const char** p ;
6280
6281  if ( ! elt || ! parent || ! parent->subelts )
6282	return 0 ;
6283
6284  for ( p = parent->subelts; *p; ++p )
6285    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6286      return 1 ;
6287
6288  return 0 ;
6289}
6290/**
6291 * htmlElementStatusHere:
6292 * @parent: HTML parent element
6293 * @elt: HTML element
6294 *
6295 * Checks whether an HTML element may be a direct child of a parent element.
6296 * and if so whether it is valid or deprecated.
6297 *
6298 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6299 */
6300htmlStatus
6301htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6302  if ( ! parent || ! elt )
6303    return HTML_INVALID ;
6304  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6305    return HTML_INVALID ;
6306
6307  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6308}
6309/**
6310 * htmlAttrAllowed:
6311 * @elt: HTML element
6312 * @attr: HTML attribute
6313 * @legacy: whether to allow deprecated attributes
6314 *
6315 * Checks whether an attribute is valid for an element
6316 * Has full knowledge of Required and Deprecated attributes
6317 *
6318 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6319 */
6320htmlStatus
6321htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6322  const char** p ;
6323
6324  if ( !elt || ! attr )
6325	return HTML_INVALID ;
6326
6327  if ( elt->attrs_req )
6328    for ( p = elt->attrs_req; *p; ++p)
6329      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6330        return HTML_REQUIRED ;
6331
6332  if ( elt->attrs_opt )
6333    for ( p = elt->attrs_opt; *p; ++p)
6334      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6335        return HTML_VALID ;
6336
6337  if ( legacy && elt->attrs_depr )
6338    for ( p = elt->attrs_depr; *p; ++p)
6339      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6340        return HTML_DEPRECATED ;
6341
6342  return HTML_INVALID ;
6343}
6344/**
6345 * htmlNodeStatus:
6346 * @node: an htmlNodePtr in a tree
6347 * @legacy: whether to allow deprecated elements (YES is faster here
6348 *	for Element nodes)
6349 *
6350 * Checks whether the tree node is valid.  Experimental (the author
6351 *     only uses the HTML enhancements in a SAX parser)
6352 *
6353 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6354 *	legacy allowed) or htmlElementStatusHere (otherwise).
6355 *	for Attribute nodes, a return from htmlAttrAllowed
6356 *	for other nodes, HTML_NA (no checks performed)
6357 */
6358htmlStatus
6359htmlNodeStatus(const htmlNodePtr node, int legacy) {
6360  if ( ! node )
6361    return HTML_INVALID ;
6362
6363  switch ( node->type ) {
6364    case XML_ELEMENT_NODE:
6365      return legacy
6366	? ( htmlElementAllowedHere (
6367		htmlTagLookup(node->parent->name) , node->name
6368		) ? HTML_VALID : HTML_INVALID )
6369	: htmlElementStatusHere(
6370		htmlTagLookup(node->parent->name) ,
6371		htmlTagLookup(node->name) )
6372	;
6373    case XML_ATTRIBUTE_NODE:
6374      return htmlAttrAllowed(
6375	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6376    default: return HTML_NA ;
6377  }
6378}
6379/************************************************************************
6380 *									*
6381 *	New set (2.6.0) of simpler and more flexible APIs		*
6382 *									*
6383 ************************************************************************/
6384/**
6385 * DICT_FREE:
6386 * @str:  a string
6387 *
6388 * Free a string if it is not owned by the "dict" dictionnary in the
6389 * current scope
6390 */
6391#define DICT_FREE(str)						\
6392	if ((str) && ((!dict) ||				\
6393	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
6394	    xmlFree((char *)(str));
6395
6396/**
6397 * htmlCtxtReset:
6398 * @ctxt: an HTML parser context
6399 *
6400 * Reset a parser context
6401 */
6402void
6403htmlCtxtReset(htmlParserCtxtPtr ctxt)
6404{
6405    xmlParserInputPtr input;
6406    xmlDictPtr dict;
6407
6408    if (ctxt == NULL)
6409        return;
6410
6411    xmlInitParser();
6412    dict = ctxt->dict;
6413
6414    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6415        xmlFreeInputStream(input);
6416    }
6417    ctxt->inputNr = 0;
6418    ctxt->input = NULL;
6419
6420    ctxt->spaceNr = 0;
6421    if (ctxt->spaceTab != NULL) {
6422	ctxt->spaceTab[0] = -1;
6423	ctxt->space = &ctxt->spaceTab[0];
6424    } else {
6425	ctxt->space = NULL;
6426    }
6427
6428
6429    ctxt->nodeNr = 0;
6430    ctxt->node = NULL;
6431
6432    ctxt->nameNr = 0;
6433    ctxt->name = NULL;
6434
6435    DICT_FREE(ctxt->version);
6436    ctxt->version = NULL;
6437    DICT_FREE(ctxt->encoding);
6438    ctxt->encoding = NULL;
6439    DICT_FREE(ctxt->directory);
6440    ctxt->directory = NULL;
6441    DICT_FREE(ctxt->extSubURI);
6442    ctxt->extSubURI = NULL;
6443    DICT_FREE(ctxt->extSubSystem);
6444    ctxt->extSubSystem = NULL;
6445    if (ctxt->myDoc != NULL)
6446        xmlFreeDoc(ctxt->myDoc);
6447    ctxt->myDoc = NULL;
6448
6449    ctxt->standalone = -1;
6450    ctxt->hasExternalSubset = 0;
6451    ctxt->hasPErefs = 0;
6452    ctxt->html = 1;
6453    ctxt->external = 0;
6454    ctxt->instate = XML_PARSER_START;
6455    ctxt->token = 0;
6456
6457    ctxt->wellFormed = 1;
6458    ctxt->nsWellFormed = 1;
6459    ctxt->disableSAX = 0;
6460    ctxt->valid = 1;
6461    ctxt->vctxt.userData = ctxt;
6462    ctxt->vctxt.error = xmlParserValidityError;
6463    ctxt->vctxt.warning = xmlParserValidityWarning;
6464    ctxt->record_info = 0;
6465    ctxt->nbChars = 0;
6466    ctxt->checkIndex = 0;
6467    ctxt->inSubset = 0;
6468    ctxt->errNo = XML_ERR_OK;
6469    ctxt->depth = 0;
6470    ctxt->charset = XML_CHAR_ENCODING_NONE;
6471    ctxt->catalogs = NULL;
6472    xmlInitNodeInfoSeq(&ctxt->node_seq);
6473
6474    if (ctxt->attsDefault != NULL) {
6475        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6476        ctxt->attsDefault = NULL;
6477    }
6478    if (ctxt->attsSpecial != NULL) {
6479        xmlHashFree(ctxt->attsSpecial, NULL);
6480        ctxt->attsSpecial = NULL;
6481    }
6482}
6483
6484/**
6485 * htmlCtxtUseOptions:
6486 * @ctxt: an HTML parser context
6487 * @options:  a combination of htmlParserOption(s)
6488 *
6489 * Applies the options to the parser context
6490 *
6491 * Returns 0 in case of success, the set of unknown or unimplemented options
6492 *         in case of error.
6493 */
6494int
6495htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6496{
6497    if (ctxt == NULL)
6498        return(-1);
6499
6500    if (options & HTML_PARSE_NOWARNING) {
6501        ctxt->sax->warning = NULL;
6502        ctxt->vctxt.warning = NULL;
6503        options -= XML_PARSE_NOWARNING;
6504	ctxt->options |= XML_PARSE_NOWARNING;
6505    }
6506    if (options & HTML_PARSE_NOERROR) {
6507        ctxt->sax->error = NULL;
6508        ctxt->vctxt.error = NULL;
6509        ctxt->sax->fatalError = NULL;
6510        options -= XML_PARSE_NOERROR;
6511	ctxt->options |= XML_PARSE_NOERROR;
6512    }
6513    if (options & HTML_PARSE_PEDANTIC) {
6514        ctxt->pedantic = 1;
6515        options -= XML_PARSE_PEDANTIC;
6516	ctxt->options |= XML_PARSE_PEDANTIC;
6517    } else
6518        ctxt->pedantic = 0;
6519    if (options & XML_PARSE_NOBLANKS) {
6520        ctxt->keepBlanks = 0;
6521        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6522        options -= XML_PARSE_NOBLANKS;
6523	ctxt->options |= XML_PARSE_NOBLANKS;
6524    } else
6525        ctxt->keepBlanks = 1;
6526    if (options & HTML_PARSE_RECOVER) {
6527        ctxt->recovery = 1;
6528	options -= HTML_PARSE_RECOVER;
6529    } else
6530        ctxt->recovery = 0;
6531    if (options & HTML_PARSE_COMPACT) {
6532	ctxt->options |= HTML_PARSE_COMPACT;
6533        options -= HTML_PARSE_COMPACT;
6534    }
6535    if (options & XML_PARSE_HUGE) {
6536	ctxt->options |= XML_PARSE_HUGE;
6537        options -= XML_PARSE_HUGE;
6538    }
6539    if (options & HTML_PARSE_NODEFDTD) {
6540	ctxt->options |= HTML_PARSE_NODEFDTD;
6541        options -= HTML_PARSE_NODEFDTD;
6542    }
6543    if (options & HTML_PARSE_IGNORE_ENC) {
6544	ctxt->options |= HTML_PARSE_IGNORE_ENC;
6545        options -= HTML_PARSE_IGNORE_ENC;
6546    }
6547    ctxt->dictNames = 0;
6548    return (options);
6549}
6550
6551/**
6552 * htmlDoRead:
6553 * @ctxt:  an HTML parser context
6554 * @URL:  the base URL to use for the document
6555 * @encoding:  the document encoding, or NULL
6556 * @options:  a combination of htmlParserOption(s)
6557 * @reuse:  keep the context for reuse
6558 *
6559 * Common front-end for the htmlRead functions
6560 *
6561 * Returns the resulting document tree or NULL
6562 */
6563static htmlDocPtr
6564htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6565          int options, int reuse)
6566{
6567    htmlDocPtr ret;
6568
6569    htmlCtxtUseOptions(ctxt, options);
6570    ctxt->html = 1;
6571    if (encoding != NULL) {
6572        xmlCharEncodingHandlerPtr hdlr;
6573
6574	hdlr = xmlFindCharEncodingHandler(encoding);
6575	if (hdlr != NULL) {
6576	    xmlSwitchToEncoding(ctxt, hdlr);
6577	    if (ctxt->input->encoding != NULL)
6578	      xmlFree((xmlChar *) ctxt->input->encoding);
6579            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6580        }
6581    }
6582    if ((URL != NULL) && (ctxt->input != NULL) &&
6583        (ctxt->input->filename == NULL))
6584        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6585    htmlParseDocument(ctxt);
6586    ret = ctxt->myDoc;
6587    ctxt->myDoc = NULL;
6588    if (!reuse) {
6589        if ((ctxt->dictNames) &&
6590	    (ret != NULL) &&
6591	    (ret->dict == ctxt->dict))
6592	    ctxt->dict = NULL;
6593	xmlFreeParserCtxt(ctxt);
6594    }
6595    return (ret);
6596}
6597
6598/**
6599 * htmlReadDoc:
6600 * @cur:  a pointer to a zero terminated string
6601 * @URL:  the base URL to use for the document
6602 * @encoding:  the document encoding, or NULL
6603 * @options:  a combination of htmlParserOption(s)
6604 *
6605 * parse an XML in-memory document and build a tree.
6606 *
6607 * Returns the resulting document tree
6608 */
6609htmlDocPtr
6610htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6611{
6612    htmlParserCtxtPtr ctxt;
6613
6614    if (cur == NULL)
6615        return (NULL);
6616
6617    xmlInitParser();
6618    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6619    if (ctxt == NULL)
6620        return (NULL);
6621    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6622}
6623
6624/**
6625 * htmlReadFile:
6626 * @filename:  a file or URL
6627 * @encoding:  the document encoding, or NULL
6628 * @options:  a combination of htmlParserOption(s)
6629 *
6630 * parse an XML file from the filesystem or the network.
6631 *
6632 * Returns the resulting document tree
6633 */
6634htmlDocPtr
6635htmlReadFile(const char *filename, const char *encoding, int options)
6636{
6637    htmlParserCtxtPtr ctxt;
6638
6639    xmlInitParser();
6640    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6641    if (ctxt == NULL)
6642        return (NULL);
6643    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6644}
6645
6646/**
6647 * htmlReadMemory:
6648 * @buffer:  a pointer to a char array
6649 * @size:  the size of the array
6650 * @URL:  the base URL to use for the document
6651 * @encoding:  the document encoding, or NULL
6652 * @options:  a combination of htmlParserOption(s)
6653 *
6654 * parse an XML in-memory document and build a tree.
6655 *
6656 * Returns the resulting document tree
6657 */
6658htmlDocPtr
6659htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6660{
6661    htmlParserCtxtPtr ctxt;
6662
6663    xmlInitParser();
6664    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6665    if (ctxt == NULL)
6666        return (NULL);
6667    htmlDefaultSAXHandlerInit();
6668    if (ctxt->sax != NULL)
6669        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6670    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6671}
6672
6673/**
6674 * htmlReadFd:
6675 * @fd:  an open file descriptor
6676 * @URL:  the base URL to use for the document
6677 * @encoding:  the document encoding, or NULL
6678 * @options:  a combination of htmlParserOption(s)
6679 *
6680 * parse an XML from a file descriptor and build a tree.
6681 *
6682 * Returns the resulting document tree
6683 */
6684htmlDocPtr
6685htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6686{
6687    htmlParserCtxtPtr ctxt;
6688    xmlParserInputBufferPtr input;
6689    xmlParserInputPtr stream;
6690
6691    if (fd < 0)
6692        return (NULL);
6693
6694    xmlInitParser();
6695    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6696    if (input == NULL)
6697        return (NULL);
6698    ctxt = xmlNewParserCtxt();
6699    if (ctxt == NULL) {
6700        xmlFreeParserInputBuffer(input);
6701        return (NULL);
6702    }
6703    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6704    if (stream == NULL) {
6705        xmlFreeParserInputBuffer(input);
6706	xmlFreeParserCtxt(ctxt);
6707        return (NULL);
6708    }
6709    inputPush(ctxt, stream);
6710    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6711}
6712
6713/**
6714 * htmlReadIO:
6715 * @ioread:  an I/O read function
6716 * @ioclose:  an I/O close function
6717 * @ioctx:  an I/O handler
6718 * @URL:  the base URL to use for the document
6719 * @encoding:  the document encoding, or NULL
6720 * @options:  a combination of htmlParserOption(s)
6721 *
6722 * parse an HTML document from I/O functions and source and build a tree.
6723 *
6724 * Returns the resulting document tree
6725 */
6726htmlDocPtr
6727htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6728          void *ioctx, const char *URL, const char *encoding, int options)
6729{
6730    htmlParserCtxtPtr ctxt;
6731    xmlParserInputBufferPtr input;
6732    xmlParserInputPtr stream;
6733
6734    if (ioread == NULL)
6735        return (NULL);
6736    xmlInitParser();
6737
6738    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6739                                         XML_CHAR_ENCODING_NONE);
6740    if (input == NULL)
6741        return (NULL);
6742    ctxt = htmlNewParserCtxt();
6743    if (ctxt == NULL) {
6744        xmlFreeParserInputBuffer(input);
6745        return (NULL);
6746    }
6747    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6748    if (stream == NULL) {
6749        xmlFreeParserInputBuffer(input);
6750	xmlFreeParserCtxt(ctxt);
6751        return (NULL);
6752    }
6753    inputPush(ctxt, stream);
6754    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6755}
6756
6757/**
6758 * htmlCtxtReadDoc:
6759 * @ctxt:  an HTML parser context
6760 * @cur:  a pointer to a zero terminated string
6761 * @URL:  the base URL to use for the document
6762 * @encoding:  the document encoding, or NULL
6763 * @options:  a combination of htmlParserOption(s)
6764 *
6765 * parse an XML in-memory document and build a tree.
6766 * This reuses the existing @ctxt parser context
6767 *
6768 * Returns the resulting document tree
6769 */
6770htmlDocPtr
6771htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6772               const char *URL, const char *encoding, int options)
6773{
6774    xmlParserInputPtr stream;
6775
6776    if (cur == NULL)
6777        return (NULL);
6778    if (ctxt == NULL)
6779        return (NULL);
6780
6781    htmlCtxtReset(ctxt);
6782
6783    stream = xmlNewStringInputStream(ctxt, cur);
6784    if (stream == NULL) {
6785        return (NULL);
6786    }
6787    inputPush(ctxt, stream);
6788    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6789}
6790
6791/**
6792 * htmlCtxtReadFile:
6793 * @ctxt:  an HTML parser context
6794 * @filename:  a file or URL
6795 * @encoding:  the document encoding, or NULL
6796 * @options:  a combination of htmlParserOption(s)
6797 *
6798 * parse an XML file from the filesystem or the network.
6799 * This reuses the existing @ctxt parser context
6800 *
6801 * Returns the resulting document tree
6802 */
6803htmlDocPtr
6804htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6805                const char *encoding, int options)
6806{
6807    xmlParserInputPtr stream;
6808
6809    if (filename == NULL)
6810        return (NULL);
6811    if (ctxt == NULL)
6812        return (NULL);
6813
6814    htmlCtxtReset(ctxt);
6815
6816    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6817    if (stream == NULL) {
6818        return (NULL);
6819    }
6820    inputPush(ctxt, stream);
6821    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6822}
6823
6824/**
6825 * htmlCtxtReadMemory:
6826 * @ctxt:  an HTML parser context
6827 * @buffer:  a pointer to a char array
6828 * @size:  the size of the array
6829 * @URL:  the base URL to use for the document
6830 * @encoding:  the document encoding, or NULL
6831 * @options:  a combination of htmlParserOption(s)
6832 *
6833 * parse an XML in-memory document and build a tree.
6834 * This reuses the existing @ctxt parser context
6835 *
6836 * Returns the resulting document tree
6837 */
6838htmlDocPtr
6839htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6840                  const char *URL, const char *encoding, int options)
6841{
6842    xmlParserInputBufferPtr input;
6843    xmlParserInputPtr stream;
6844
6845    if (ctxt == NULL)
6846        return (NULL);
6847    if (buffer == NULL)
6848        return (NULL);
6849
6850    htmlCtxtReset(ctxt);
6851
6852    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6853    if (input == NULL) {
6854	return(NULL);
6855    }
6856
6857    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6858    if (stream == NULL) {
6859	xmlFreeParserInputBuffer(input);
6860	return(NULL);
6861    }
6862
6863    inputPush(ctxt, stream);
6864    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6865}
6866
6867/**
6868 * htmlCtxtReadFd:
6869 * @ctxt:  an HTML parser context
6870 * @fd:  an open file descriptor
6871 * @URL:  the base URL to use for the document
6872 * @encoding:  the document encoding, or NULL
6873 * @options:  a combination of htmlParserOption(s)
6874 *
6875 * parse an XML from a file descriptor and build a tree.
6876 * This reuses the existing @ctxt parser context
6877 *
6878 * Returns the resulting document tree
6879 */
6880htmlDocPtr
6881htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6882              const char *URL, const char *encoding, int options)
6883{
6884    xmlParserInputBufferPtr input;
6885    xmlParserInputPtr stream;
6886
6887    if (fd < 0)
6888        return (NULL);
6889    if (ctxt == NULL)
6890        return (NULL);
6891
6892    htmlCtxtReset(ctxt);
6893
6894
6895    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6896    if (input == NULL)
6897        return (NULL);
6898    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6899    if (stream == NULL) {
6900        xmlFreeParserInputBuffer(input);
6901        return (NULL);
6902    }
6903    inputPush(ctxt, stream);
6904    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6905}
6906
6907/**
6908 * htmlCtxtReadIO:
6909 * @ctxt:  an HTML parser context
6910 * @ioread:  an I/O read function
6911 * @ioclose:  an I/O close function
6912 * @ioctx:  an I/O handler
6913 * @URL:  the base URL to use for the document
6914 * @encoding:  the document encoding, or NULL
6915 * @options:  a combination of htmlParserOption(s)
6916 *
6917 * parse an HTML document from I/O functions and source and build a tree.
6918 * This reuses the existing @ctxt parser context
6919 *
6920 * Returns the resulting document tree
6921 */
6922htmlDocPtr
6923htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6924              xmlInputCloseCallback ioclose, void *ioctx,
6925	      const char *URL,
6926              const char *encoding, int options)
6927{
6928    xmlParserInputBufferPtr input;
6929    xmlParserInputPtr stream;
6930
6931    if (ioread == NULL)
6932        return (NULL);
6933    if (ctxt == NULL)
6934        return (NULL);
6935
6936    htmlCtxtReset(ctxt);
6937
6938    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6939                                         XML_CHAR_ENCODING_NONE);
6940    if (input == NULL)
6941        return (NULL);
6942    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6943    if (stream == NULL) {
6944        xmlFreeParserInputBuffer(input);
6945        return (NULL);
6946    }
6947    inputPush(ctxt, stream);
6948    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6949}
6950
6951#define bottom_HTMLparser
6952#include "elfgcchack.h"
6953#endif /* LIBXML_HTML_ENABLED */
6954