parser.h revision 5e2dace1ca6fbb023d1ce848d4e98deefbbfec31
/*
 * parser.h : Interfaces, constants and types related to the XML parser.
 *
 * See Copyright for the status of this software.
 *
 * daniel@veillard.com
 */
8
9#ifndef __XML_PARSER_H__
10#define __XML_PARSER_H__
11
12#include <libxml/tree.h>
13#include <libxml/valid.h>
14#include <libxml/xmlIO.h>
15#include <libxml/entities.h>
16
17
18#ifdef __cplusplus
19extern "C" {
20#endif
21
/**
 * XML_DEFAULT_VERSION:
 *
 * The default version of XML used: "1.0" (a string literal).
 */
#define XML_DEFAULT_VERSION "1.0"
28
29/**
30 * xmlParserInput:
31 *
32 * an xmlParserInput is an input flow for the XML processor.
33 * Each entity parsed is associated an xmlParserInput (except the
34 * few predefined ones). This is the case both for internal entities
35 * - in which case the flow is already completely in memory - or
36 * external entities - in which case we use the buf structure for
37 * progressive reading and I18N conversions to the internal UTF-8 format.
38 */
39
40typedef void (* xmlParserInputDeallocate)(xmlChar *);
41
42typedef struct _xmlParserInput xmlParserInput;
43typedef xmlParserInput *xmlParserInputPtr;
44struct _xmlParserInput {
45    /* Input buffer */
46    xmlParserInputBufferPtr buf;      /* UTF-8 encoded buffer */
47
48    const char *filename;             /* The file analyzed, if any */
49    const char *directory;            /* the directory/base of teh file */
50    const xmlChar *base;              /* Base of the array to parse */
51    const xmlChar *cur;               /* Current char being parsed */
52    const xmlChar *end;               /* end of the arry to parse */
53    int length;                       /* length if known */
54    int line;                         /* Current line */
55    int col;                          /* Current column */
56    int consumed;                     /* How many xmlChars already consumed */
57    xmlParserInputDeallocate free;    /* function to deallocate the base */
58    const xmlChar *encoding;          /* the encoding string for entity */
59    const xmlChar *version;           /* the version string for entity */
60    int standalone;                   /* Was that entity marked standalone */
61};
62
/**
 * xmlParserNodeInfo:
 *
 * The parser can be asked to collect node information, i.e. at what
 * place in the file each node was detected.
 * NOTE: This is off by default and not very well tested.
 */
typedef struct _xmlParserNodeInfo xmlParserNodeInfo;
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;

struct _xmlParserNodeInfo {
    const struct _xmlNode *node;    /* the node this record describes */
    /* Position & line # that the text that created the node begins & ends on */
    unsigned long begin_pos;
    unsigned long begin_line;
    unsigned long end_pos;
    unsigned long end_line;
};

/*
 * xmlParserNodeInfoSeq:
 *
 * A buffer of xmlParserNodeInfo records together with two counters.
 * NOTE(review): presumably "maximum" is the allocated capacity of
 * "buffer" and "length" the number of records in use - confirm against
 * the implementation.
 */
typedef struct _xmlParserNodeInfoSeq xmlParserNodeInfoSeq;
typedef xmlParserNodeInfoSeq *xmlParserNodeInfoSeqPtr;
struct _xmlParserNodeInfoSeq {
    unsigned long maximum;
    unsigned long length;
    xmlParserNodeInfo *buffer;
};
89
/**
 * xmlParserInputState:
 *
 * The parser now also works as a state based parser.
 * The recursive one uses the state info for entities processing.
 *
 * Values are contiguous: XML_PARSER_EOF is -1, XML_PARSER_START is 0 and
 * every following enumerator is one more than the previous one.
 */
typedef enum {
    XML_PARSER_EOF = -1,            /* nothing is to be parsed */
    XML_PARSER_START = 0,           /* nothing has been parsed */
    XML_PARSER_MISC,                /* Misc* before int subset */
    XML_PARSER_PI,                  /* within a processing instruction */
    XML_PARSER_DTD,                 /* within some DTD content */
    XML_PARSER_PROLOG,              /* Misc* after internal subset */
    XML_PARSER_COMMENT,             /* within a comment */
    XML_PARSER_START_TAG,           /* within a start tag */
    XML_PARSER_CONTENT,             /* within the content */
    XML_PARSER_CDATA_SECTION,       /* within a CDATA section */
    XML_PARSER_END_TAG,             /* within a closing tag */
    XML_PARSER_ENTITY_DECL,         /* within an entity declaration */
    XML_PARSER_ENTITY_VALUE,        /* within an entity value in a decl */
    XML_PARSER_ATTRIBUTE_VALUE,     /* within an attribute value */
    XML_PARSER_SYSTEM_LITERAL,      /* within a SYSTEM value */
    XML_PARSER_EPILOG,              /* the Misc* after the last end tag */
    XML_PARSER_IGNORE               /* within an IGNORED section */
} xmlParserInputState;
115
/**
 * XML_DETECT_IDS:
 *
 * Bit in the loadsubset context field requesting ID/REFs lookups.
 * Use it to initialize xmlLoadExtDtdDefaultValue.
 */
#define XML_DETECT_IDS 2
123
/**
 * XML_COMPLETE_ATTRS:
 *
 * Bit in the loadsubset context field requesting that the attribute
 * lists of elements be completed with the ones defaulted from the DTDs.
 * Use it to initialize xmlLoadExtDtdDefaultValue.
 */
#define XML_COMPLETE_ATTRS 4
132
133/**
134 * xmlParserCtxt:
135 *
136 * The parser context.
137 * NOTE This doesn't completely defines the parser state, the (current ?)
138 *      design of the parser uses recursive function calls since this allow
139 *      and easy mapping from the production rules of the specification
140 *      to the actual code. The drawback is that the actual function call
141 *      also reflect the parser state. However most of the parsing routines
142 *      takes as the only argument the parser context pointer, so migrating
143 *      to a state based parser for progressive parsing shouldn't be too hard.
144 */
145typedef struct _xmlParserCtxt xmlParserCtxt;
146typedef xmlParserCtxt *xmlParserCtxtPtr;
147struct _xmlParserCtxt {
148    struct _xmlSAXHandler *sax;       /* The SAX handler */
149    void            *userData;        /* For SAX interface only, used by DOM build */
150    xmlDocPtr           myDoc;        /* the document being built */
151    int            wellFormed;        /* is the document well formed */
152    int       replaceEntities;        /* shall we replace entities ? */
153    const xmlChar    *version;        /* the XML version string */
154    const xmlChar   *encoding;        /* the declared encoding, if any */
155    int            standalone;        /* standalone document */
156    int                  html;        /* an HTML(1)/Docbook(2) document */
157
158    /* Input stream stack */
159    xmlParserInputPtr  input;         /* Current input stream */
160    int                inputNr;       /* Number of current input streams */
161    int                inputMax;      /* Max number of input streams */
162    xmlParserInputPtr *inputTab;      /* stack of inputs */
163
164    /* Node analysis stack only used for DOM building */
165    xmlNodePtr         node;          /* Current parsed Node */
166    int                nodeNr;        /* Depth of the parsing stack */
167    int                nodeMax;       /* Max depth of the parsing stack */
168    xmlNodePtr        *nodeTab;       /* array of nodes */
169
170    int record_info;                  /* Whether node info should be kept */
171    xmlParserNodeInfoSeq node_seq;    /* info about each node parsed */
172
173    int errNo;                        /* error code */
174
175    int     hasExternalSubset;        /* reference and external subset */
176    int             hasPErefs;        /* the internal subset has PE refs */
177    int              external;        /* are we parsing an external entity */
178
179    int                 valid;        /* is the document valid */
180    int              validate;        /* shall we try to validate ? */
181    xmlValidCtxt        vctxt;        /* The validity context */
182
183    xmlParserInputState instate;      /* current type of input */
184    int                 token;        /* next char look-ahead */
185
186    char           *directory;        /* the data directory */
187
188    /* Node name stack */
189    xmlChar           *name;          /* Current parsed Node */
190    int                nameNr;        /* Depth of the parsing stack */
191    int                nameMax;       /* Max depth of the parsing stack */
192    xmlChar *         *nameTab;       /* array of nodes */
193
194    long               nbChars;       /* number of xmlChar processed */
195    long            checkIndex;       /* used by progressive parsing lookup */
196    int             keepBlanks;       /* ugly but ... */
197    int             disableSAX;       /* SAX callbacks are disabled */
198    int               inSubset;       /* Parsing is in int 1/ext 2 subset */
199    xmlChar *          intSubName;    /* name of subset */
200    xmlChar *          extSubURI;     /* URI of external subset */
201    xmlChar *          extSubSystem;  /* SYSTEM ID of external subset */
202
203    /* xml:space values */
204    int *              space;         /* Should the parser preserve spaces */
205    int                spaceNr;       /* Depth of the parsing stack */
206    int                spaceMax;      /* Max depth of the parsing stack */
207    int *              spaceTab;      /* array of space infos */
208
209    int                depth;         /* to prevent entity substitution loops */
210    xmlParserInputPtr  entity;        /* used to check entities boundaries */
211    int                charset;       /* encoding of the in-memory content
212				         actually an xmlCharEncoding */
213    int                nodelen;       /* Those two fields are there to */
214    int                nodemem;       /* Speed up large node parsing */
215    int                pedantic;      /* signal pedantic warnings */
216    void              *_private;      /* For user data, libxml won't touch it */
217
218    int                loadsubset;    /* should the external subset be loaded */
219};
220
221/**
222 * xmlSAXLocator:
223 *
224 * a SAX Locator.
225 */
226typedef struct _xmlSAXLocator xmlSAXLocator;
227typedef xmlSAXLocator *xmlSAXLocatorPtr;
228struct _xmlSAXLocator {
229    const xmlChar *(*getPublicId)(void *ctx);
230    const xmlChar *(*getSystemId)(void *ctx);
231    int (*getLineNumber)(void *ctx);
232    int (*getColumnNumber)(void *ctx);
233};
234
235/**
236 * xmlSAXHandler:
237 *
238 * a SAX handler is bunch of callbacks called by the parser when processing
239 * of the input generate data or structure informations.
240 */
241
242typedef xmlParserInputPtr (*resolveEntitySAXFunc) (void *ctx,
243			    const xmlChar *publicId, const xmlChar *systemId);
244typedef void (*internalSubsetSAXFunc) (void *ctx, const xmlChar *name,
245                            const xmlChar *ExternalID, const xmlChar *SystemID);
246typedef void (*externalSubsetSAXFunc) (void *ctx, const xmlChar *name,
247                            const xmlChar *ExternalID, const xmlChar *SystemID);
248typedef xmlEntityPtr (*getEntitySAXFunc) (void *ctx,
249                            const xmlChar *name);
250typedef xmlEntityPtr (*getParameterEntitySAXFunc) (void *ctx,
251                            const xmlChar *name);
252typedef void (*entityDeclSAXFunc) (void *ctx,
253                            const xmlChar *name, int type, const xmlChar *publicId,
254			    const xmlChar *systemId, xmlChar *content);
255typedef void (*notationDeclSAXFunc)(void *ctx, const xmlChar *name,
256			    const xmlChar *publicId, const xmlChar *systemId);
257typedef void (*attributeDeclSAXFunc)(void *ctx, const xmlChar *elem,
258                            const xmlChar *name, int type, int def,
259			    const xmlChar *defaultValue, xmlEnumerationPtr tree);
260typedef void (*elementDeclSAXFunc)(void *ctx, const xmlChar *name,
261			    int type, xmlElementContentPtr content);
262typedef void (*unparsedEntityDeclSAXFunc)(void *ctx,
263                            const xmlChar *name, const xmlChar *publicId,
264			    const xmlChar *systemId, const xmlChar *notationName);
265typedef void (*setDocumentLocatorSAXFunc) (void *ctx,
266                            xmlSAXLocatorPtr loc);
267typedef void (*startDocumentSAXFunc) (void *ctx);
268typedef void (*endDocumentSAXFunc) (void *ctx);
269typedef void (*startElementSAXFunc) (void *ctx, const xmlChar *name,
270                            const xmlChar **atts);
271typedef void (*endElementSAXFunc) (void *ctx, const xmlChar *name);
272typedef void (*attributeSAXFunc) (void *ctx, const xmlChar *name,
273                                  const xmlChar *value);
274typedef void (*referenceSAXFunc) (void *ctx, const xmlChar *name);
275typedef void (*charactersSAXFunc) (void *ctx, const xmlChar *ch,
276		            int len);
277typedef void (*ignorableWhitespaceSAXFunc) (void *ctx,
278			    const xmlChar *ch, int len);
279typedef void (*processingInstructionSAXFunc) (void *ctx,
280                            const xmlChar *target, const xmlChar *data);
281typedef void (*commentSAXFunc) (void *ctx, const xmlChar *value);
282typedef void (*cdataBlockSAXFunc) (void *ctx, const xmlChar *value, int len);
283typedef void (*warningSAXFunc) (void *ctx, const char *msg, ...);
284typedef void (*errorSAXFunc) (void *ctx, const char *msg, ...);
285typedef void (*fatalErrorSAXFunc) (void *ctx, const char *msg, ...);
286typedef int (*isStandaloneSAXFunc) (void *ctx);
287typedef int (*hasInternalSubsetSAXFunc) (void *ctx);
288typedef int (*hasExternalSubsetSAXFunc) (void *ctx);
289
290typedef struct _xmlSAXHandler xmlSAXHandler;
291typedef xmlSAXHandler *xmlSAXHandlerPtr;
292struct _xmlSAXHandler {
293    internalSubsetSAXFunc internalSubset;
294    isStandaloneSAXFunc isStandalone;
295    hasInternalSubsetSAXFunc hasInternalSubset;
296    hasExternalSubsetSAXFunc hasExternalSubset;
297    resolveEntitySAXFunc resolveEntity;
298    getEntitySAXFunc getEntity;
299    entityDeclSAXFunc entityDecl;
300    notationDeclSAXFunc notationDecl;
301    attributeDeclSAXFunc attributeDecl;
302    elementDeclSAXFunc elementDecl;
303    unparsedEntityDeclSAXFunc unparsedEntityDecl;
304    setDocumentLocatorSAXFunc setDocumentLocator;
305    startDocumentSAXFunc startDocument;
306    endDocumentSAXFunc endDocument;
307    startElementSAXFunc startElement;
308    endElementSAXFunc endElement;
309    referenceSAXFunc reference;
310    charactersSAXFunc characters;
311    ignorableWhitespaceSAXFunc ignorableWhitespace;
312    processingInstructionSAXFunc processingInstruction;
313    commentSAXFunc comment;
314    warningSAXFunc warning;
315    errorSAXFunc error;
316    fatalErrorSAXFunc fatalError;
317    getParameterEntitySAXFunc getParameterEntity;
318    cdataBlockSAXFunc cdataBlock;
319    externalSubsetSAXFunc externalSubset;
320};
321
322/**
323 * xmlExternalEntityLoader:
324 * @URL: The System ID of the resource requested
325 * @ID: The Public ID of the resource requested
326 * @xmlParserCtxtPtr: the XML parser context
327 *
328 * External entity loaders types
329 */
330typedef xmlParserInputPtr (*xmlExternalEntityLoader)(const char *URL,
331						     const char *ID,
332						     xmlParserCtxtPtr context);
333
334/*
335 * Global variables: just the default SAX interface tables and XML
336 * version infos.
337 */
338LIBXML_DLL_IMPORT extern const char *xmlParserVersion;
339
340LIBXML_DLL_IMPORT extern xmlSAXLocator xmlDefaultSAXLocator;
341LIBXML_DLL_IMPORT extern xmlSAXHandler xmlDefaultSAXHandler;
342LIBXML_DLL_IMPORT extern xmlSAXHandler htmlDefaultSAXHandler;
343LIBXML_DLL_IMPORT extern xmlSAXHandler docbDefaultSAXHandler;
344
345/*
346 * entity substitution default behaviour.
347 */
348
349#ifdef VMS
350/**
351 * xmlSubstituteEntitiesDefaultValue:
352 *
353 * global variable controlling the entity substitution default behaviour
354 */
355LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultVal;
356#define xmlSubstituteEntitiesDefaultValue xmlSubstituteEntitiesDefaultVal
357#else
358LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultValue;
359#endif
360LIBXML_DLL_IMPORT extern int xmlGetWarningsDefaultValue;
361
362
/*
 * Init/Cleanup
 *
 * Both routines take no argument and return nothing.
 */
void xmlInitParser(void);
void xmlCleanupParser(void);
368
369/*
370 * Input functions
371 */
372int		xmlParserInputRead	(xmlParserInputPtr in,
373					 int len);
374int		xmlParserInputGrow	(xmlParserInputPtr in,
375					 int len);
376
377/*
378 * xmlChar handling
379 */
380xmlChar *	xmlStrdup		(const xmlChar *cur);
381xmlChar *	xmlStrndup		(const xmlChar *cur,
382					 int len);
383xmlChar *	xmlCharStrndup		(const char *cur,
384					 int len);
385xmlChar *	xmlCharStrdup		(const char *cur);
386xmlChar *	xmlStrsub		(const xmlChar *str,
387					 int start,
388					 int len);
389const xmlChar *	xmlStrchr		(const xmlChar *str,
390					 xmlChar val);
391const xmlChar *	xmlStrstr		(const xmlChar *str,
392					 const xmlChar *val);
393const xmlChar *	xmlStrcasestr		(const xmlChar *str,
394					 xmlChar *val);
395int		xmlStrcmp		(const xmlChar *str1,
396					 const xmlChar *str2);
397int		xmlStrncmp		(const xmlChar *str1,
398					 const xmlChar *str2,
399					 int len);
400int		xmlStrcasecmp		(const xmlChar *str1,
401					 const xmlChar *str2);
402int		xmlStrncasecmp		(const xmlChar *str1,
403					 const xmlChar *str2,
404					 int len);
405int		xmlStrEqual		(const xmlChar *str1,
406					 const xmlChar *str2);
407int		xmlStrlen		(const xmlChar *str);
408xmlChar *	xmlStrcat		(xmlChar *cur,
409					 const xmlChar *add);
410xmlChar *	xmlStrncat		(xmlChar *cur,
411					 const xmlChar *add,
412					 int len);
413
414/*
415 * Basic parsing Interfaces
416 */
417xmlDocPtr	xmlParseDoc		(xmlChar *cur);
418xmlDocPtr	xmlParseMemory		(char *buffer,
419					 int size);
420xmlDocPtr	xmlParseFile		(const char *filename);
421int		xmlSubstituteEntitiesDefault(int val);
422int		xmlKeepBlanksDefault	(int val);
423void		xmlStopParser		(xmlParserCtxtPtr ctxt);
424int		xmlPedanticParserDefault(int val);
425
426/*
427 * Recovery mode
428 */
429xmlDocPtr	xmlRecoverDoc		(xmlChar *cur);
430xmlDocPtr	xmlRecoverMemory	(char *buffer,
431					 int size);
432xmlDocPtr	xmlRecoverFile		(const char *filename);
433
434/*
435 * Less common routines and SAX interfaces
436 */
437int		xmlParseDocument	(xmlParserCtxtPtr ctxt);
438int		xmlParseExtParsedEnt	(xmlParserCtxtPtr ctxt);
439xmlDocPtr	xmlSAXParseDoc		(xmlSAXHandlerPtr sax,
440					 xmlChar *cur,
441					 int recovery);
442int		xmlSAXUserParseFile	(xmlSAXHandlerPtr sax,
443					 void *user_data,
444					 const char *filename);
445int		xmlSAXUserParseMemory	(xmlSAXHandlerPtr sax,
446					 void *user_data,
447					 const char *buffer,
448					 int size);
449xmlDocPtr	xmlSAXParseMemory	(xmlSAXHandlerPtr sax,
450					 char *buffer,
451                                   	 int size,
452					 int recovery);
453xmlDocPtr	xmlSAXParseFile		(xmlSAXHandlerPtr sax,
454					 const char *filename,
455					 int recovery);
456xmlDocPtr	xmlSAXParseEntity	(xmlSAXHandlerPtr sax,
457					 const char *filename);
458xmlDocPtr	xmlParseEntity		(const char *filename);
459xmlDtdPtr	xmlParseDTD		(const xmlChar *ExternalID,
460					 const xmlChar *SystemID);
461xmlDtdPtr	xmlSAXParseDTD		(xmlSAXHandlerPtr sax,
462					 const xmlChar *ExternalID,
463					 const xmlChar *SystemID);
464xmlDtdPtr	xmlIOParseDTD		(xmlSAXHandlerPtr sax,
465					 xmlParserInputBufferPtr input,
466					 xmlCharEncoding enc);
467int		xmlParseBalancedChunkMemory(xmlDocPtr doc,
468					 xmlSAXHandlerPtr sax,
469					 void *user_data,
470					 int depth,
471					 const xmlChar *string,
472					 xmlNodePtr *list);
473int		xmlParseExternalEntity	(xmlDocPtr doc,
474					 xmlSAXHandlerPtr sax,
475					 void *user_data,
476					 int depth,
477					 const xmlChar *URL,
478					 const xmlChar *ID,
479					 xmlNodePtr *list);
480int		xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx,
481					 const xmlChar *URL,
482					 const xmlChar *ID,
483					 xmlNodePtr *list);
484
/*
 * SAX initialization routines
 *
 * Both take no argument and return nothing.
 */
void xmlDefaultSAXHandlerInit(void);
void htmlDefaultSAXHandlerInit(void);
490
491/*
492 * Parser contexts handling.
493 */
494void		xmlInitParserCtxt	(xmlParserCtxtPtr ctxt);
495void		xmlClearParserCtxt	(xmlParserCtxtPtr ctxt);
496void		xmlFreeParserCtxt	(xmlParserCtxtPtr ctxt);
497void		xmlSetupParserForBuffer	(xmlParserCtxtPtr ctxt,
498					 const xmlChar* buffer,
499					 const char* filename);
500xmlParserCtxtPtr xmlCreateDocParserCtxt	(xmlChar *cur);
501
502/*
503 * Reading/setting optional parsing features.
504 */
505
506int		xmlGetFeaturesList	(int *len,
507					 const char **result);
508int		xmlGetFeature		(xmlParserCtxtPtr ctxt,
509					 const char *name,
510					 void *result);
511int		xmlSetFeature		(xmlParserCtxtPtr ctxt,
512					 const char *name,
513					 void *value);
514
515/*
516 * Interfaces for the Push mode
517 */
518xmlParserCtxtPtr xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax,
519					 void *user_data,
520					 const char *chunk,
521					 int size,
522					 const char *filename);
523int		 xmlParseChunk		(xmlParserCtxtPtr ctxt,
524					 const char *chunk,
525					 int size,
526					 int terminate);
527
528/*
529 * Special I/O mode
530 */
531
532xmlParserCtxtPtr xmlCreateIOParserCtxt	(xmlSAXHandlerPtr sax,
533					 void *user_data,
534					 xmlInputReadCallback   ioread,
535					 xmlInputCloseCallback  ioclose,
536					 void *ioctx,
537					 xmlCharEncoding enc);
538
539xmlParserInputPtr xmlNewIOInputStream	(xmlParserCtxtPtr ctxt,
540					 xmlParserInputBufferPtr input,
541					 xmlCharEncoding enc);
542
543/*
544 * Node infos
545 */
546const xmlParserNodeInfo*
547		xmlParserFindNodeInfo	(const xmlParserCtxt* ctxt,
548                                               const xmlNode* node);
549void		xmlInitNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
550void		xmlClearNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
551unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
552                                         const xmlNode* node);
553void		xmlParserAddNodeInfo	(xmlParserCtxtPtr ctxt,
554					 const xmlParserNodeInfo* info);
555
556/*
557 * External entities handling actually implemented in xmlIO
558 */
559
560void		xmlSetExternalEntityLoader(xmlExternalEntityLoader f);
561xmlExternalEntityLoader
562		xmlGetExternalEntityLoader(void);
563xmlParserInputPtr
564		xmlLoadExternalEntity	(const char *URL,
565					 const char *ID,
566					 xmlParserCtxtPtr context);
567
568#ifdef __cplusplus
569}
570#endif
571
572#endif /* __XML_PARSER_H__ */
573
574