/*
 * parser.h : Interfaces, constants and types related to the XML parser.
 *            (revision d16df9f6efe5c0a4f41f4b3e60312c3f584659a5)
 *
 * See Copyright for the status of this software.
 *
 * Daniel.Veillard@w3.org
 */
8
9#ifndef __XML_PARSER_H__
10#define __XML_PARSER_H__
11
12#include <libxml/tree.h>
13#include <libxml/valid.h>
14#include <libxml/xmlIO.h>
15#include <libxml/entities.h>
16
17
18#ifdef __cplusplus
19extern "C" {
20#endif
21
/*
 * Constants.
 */
/* XML version string used as the default. */
#define XML_DEFAULT_VERSION	"1.0"
26
27/**
28 * xmlParserInput:
29 *
30 * an xmlParserInput is an input flow for the XML processor.
31 * Each entity parsed is associated an xmlParserInput (except the
32 * few predefined ones). This is the case both for internal entities
33 * - in which case the flow is already completely in memory - or
34 * external entities - in which case we use the buf structure for
35 * progressive reading and I18N conversions to the internal UTF-8 format.
36 */
37
38typedef void (* xmlParserInputDeallocate)(xmlChar *);
39typedef struct _xmlParserInput xmlParserInput;
40typedef xmlParserInput *xmlParserInputPtr;
41struct _xmlParserInput {
42    /* Input buffer */
43    xmlParserInputBufferPtr buf;      /* UTF-8 encoded buffer */
44
45    const char *filename;             /* The file analyzed, if any */
46    const char *directory;            /* the directory/base of teh file */
47    const xmlChar *base;              /* Base of the array to parse */
48    const xmlChar *cur;               /* Current char being parsed */
49    const xmlChar *end;               /* end of the arry to parse */
50    int length;                       /* length if known */
51    int line;                         /* Current line */
52    int col;                          /* Current column */
53    int consumed;                     /* How many xmlChars already consumed */
54    xmlParserInputDeallocate free;    /* function to deallocate the base */
55    const xmlChar *encoding;          /* the encoding string for entity */
56    const xmlChar *version;           /* the version string for entity */
57    int standalone;                   /* Was that entity marked standalone */
58};
59
/**
 * xmlParserNodeInfo:
 *
 * The parser can be asked to collect node information, i.e. at what
 * place in the file each node was detected.
 * NOTE: This is off by default and not very well tested.
 */
typedef struct _xmlParserNodeInfo xmlParserNodeInfo;
typedef xmlParserNodeInfo *xmlParserNodeInfoPtr;

struct _xmlParserNodeInfo {
  const struct _xmlNode* node;
  /* Position & line # where the text that created the node begins & ends */
  unsigned long begin_pos;
  unsigned long begin_line;
  unsigned long end_pos;
  unsigned long end_line;
};

/*
 * Sequence of xmlParserNodeInfo records: a buffer plus its length and
 * maximum counters.
 */
typedef struct _xmlParserNodeInfoSeq xmlParserNodeInfoSeq;
typedef xmlParserNodeInfoSeq *xmlParserNodeInfoSeqPtr;
struct _xmlParserNodeInfoSeq {
  unsigned long maximum;
  unsigned long length;
  xmlParserNodeInfo* buffer;
};
86
/**
 * xmlParserInputState:
 *
 * The parser now also works as a state-based parser.
 * The recursive one uses the state info for entities processing.
 */
typedef enum {
    XML_PARSER_EOF = -1,	/* nothing is to be parsed */
    XML_PARSER_START = 0,	/* nothing has been parsed */
    XML_PARSER_MISC,		/* Misc* before int subset */
    XML_PARSER_PI,		/* Within a processing instruction */
    XML_PARSER_DTD,		/* within some DTD content */
    XML_PARSER_PROLOG,		/* Misc* after internal subset */
    XML_PARSER_COMMENT,		/* within a comment */
    XML_PARSER_START_TAG,	/* within a start tag */
    XML_PARSER_CONTENT,		/* within the content */
    XML_PARSER_CDATA_SECTION,	/* within a CDATA section */
    XML_PARSER_END_TAG,		/* within a closing tag */
    XML_PARSER_ENTITY_DECL,	/* within an entity declaration */
    XML_PARSER_ENTITY_VALUE,	/* within an entity value in a decl */
    XML_PARSER_ATTRIBUTE_VALUE,	/* within an attribute value */
    XML_PARSER_SYSTEM_LITERAL,	/* within a SYSTEM value */
    XML_PARSER_EPILOG, 		/* the Misc* after the last end tag */
    XML_PARSER_IGNORE		/* within an IGNORED section */
} xmlParserInputState;
112
/**
 * XML_DETECT_IDS:
 *
 * Bit in the loadsubset context field telling the parser to do
 * ID/REFs lookups.
 * Use it to initialize xmlLoadExtDtdDefaultValue.
 */
#define XML_DETECT_IDS		2
120
/**
 * XML_COMPLETE_ATTRS:
 *
 * Bit in the loadsubset context field telling the parser to complete the
 * elements' attribute lists with the ones defaulted from the DTDs.
 * Use it to initialize xmlLoadExtDtdDefaultValue.
 */
#define XML_COMPLETE_ATTRS	4
129
130/**
131 * xmlParserCtxt:
132 *
133 * The parser context.
134 * NOTE This doesn't completely defines the parser state, the (current ?)
135 *      design of the parser uses recursive function calls since this allow
136 *      and easy mapping from the production rules of the specification
137 *      to the actual code. The drawback is that the actual function call
138 *      also reflect the parser state. However most of the parsing routines
139 *      takes as the only argument the parser context pointer, so migrating
140 *      to a state based parser for progressive parsing shouldn't be too hard.
141 */
142typedef struct _xmlParserCtxt xmlParserCtxt;
143typedef xmlParserCtxt *xmlParserCtxtPtr;
144struct _xmlParserCtxt {
145    struct _xmlSAXHandler *sax;       /* The SAX handler */
146    void            *userData;        /* For SAX interface only, used by DOM build */
147    xmlDocPtr           myDoc;        /* the document being built */
148    int            wellFormed;        /* is the document well formed */
149    int       replaceEntities;        /* shall we replace entities ? */
150    const xmlChar    *version;        /* the XML version string */
151    const xmlChar   *encoding;        /* the declared encoding, if any */
152    int            standalone;        /* standalone document */
153    int                  html;        /* an HTML(1)/Docbook(2) document */
154
155    /* Input stream stack */
156    xmlParserInputPtr  input;         /* Current input stream */
157    int                inputNr;       /* Number of current input streams */
158    int                inputMax;      /* Max number of input streams */
159    xmlParserInputPtr *inputTab;      /* stack of inputs */
160
161    /* Node analysis stack only used for DOM building */
162    xmlNodePtr         node;          /* Current parsed Node */
163    int                nodeNr;        /* Depth of the parsing stack */
164    int                nodeMax;       /* Max depth of the parsing stack */
165    xmlNodePtr        *nodeTab;       /* array of nodes */
166
167    int record_info;                  /* Whether node info should be kept */
168    xmlParserNodeInfoSeq node_seq;    /* info about each node parsed */
169
170    int errNo;                        /* error code */
171
172    int     hasExternalSubset;        /* reference and external subset */
173    int             hasPErefs;        /* the internal subset has PE refs */
174    int              external;        /* are we parsing an external entity */
175
176    int                 valid;        /* is the document valid */
177    int              validate;        /* shall we try to validate ? */
178    xmlValidCtxt        vctxt;        /* The validity context */
179
180    xmlParserInputState instate;      /* current type of input */
181    int                 token;        /* next char look-ahead */
182
183    char           *directory;        /* the data directory */
184
185    /* Node name stack */
186    xmlChar           *name;          /* Current parsed Node */
187    int                nameNr;        /* Depth of the parsing stack */
188    int                nameMax;       /* Max depth of the parsing stack */
189    xmlChar *         *nameTab;       /* array of nodes */
190
191    long               nbChars;       /* number of xmlChar processed */
192    long            checkIndex;       /* used by progressive parsing lookup */
193    int             keepBlanks;       /* ugly but ... */
194    int             disableSAX;       /* SAX callbacks are disabled */
195    int               inSubset;       /* Parsing is in int 1/ext 2 subset */
196    xmlChar *          intSubName;    /* name of subset */
197    xmlChar *          extSubURI;     /* URI of external subset */
198    xmlChar *          extSubSystem;  /* SYSTEM ID of external subset */
199
200    /* xml:space values */
201    int *              space;         /* Should the parser preserve spaces */
202    int                spaceNr;       /* Depth of the parsing stack */
203    int                spaceMax;      /* Max depth of the parsing stack */
204    int *              spaceTab;      /* array of space infos */
205
206    int                depth;         /* to prevent entity substitution loops */
207    xmlParserInputPtr  entity;        /* used to check entities boundaries */
208    int                charset;       /* encoding of the in-memory content
209				         actually an xmlCharEncoding */
210    int                nodelen;       /* Those two fields are there to */
211    int                nodemem;       /* Speed up large node parsing */
212    int                pedantic;      /* signal pedantic warnings */
213    void              *_private;      /* For user data, libxml won't touch it */
214
215    int                loadsubset;    /* should the external subset be loaded */
216};
217
218/**
219 * xmlSAXLocator:
220 *
221 * a SAX Locator.
222 */
223typedef struct _xmlSAXLocator xmlSAXLocator;
224typedef xmlSAXLocator *xmlSAXLocatorPtr;
225struct _xmlSAXLocator {
226    const xmlChar *(*getPublicId)(void *ctx);
227    const xmlChar *(*getSystemId)(void *ctx);
228    int (*getLineNumber)(void *ctx);
229    int (*getColumnNumber)(void *ctx);
230};
231
232/**
233 * xmlSAXHandler:
234 *
235 * a SAX handler is bunch of callbacks called by the parser when processing
236 * of the input generate data or structure informations.
237 */
238
239typedef xmlParserInputPtr (*resolveEntitySAXFunc) (void *ctx,
240			    const xmlChar *publicId, const xmlChar *systemId);
241typedef void (*internalSubsetSAXFunc) (void *ctx, const xmlChar *name,
242                            const xmlChar *ExternalID, const xmlChar *SystemID);
243typedef void (*externalSubsetSAXFunc) (void *ctx, const xmlChar *name,
244                            const xmlChar *ExternalID, const xmlChar *SystemID);
245typedef xmlEntityPtr (*getEntitySAXFunc) (void *ctx,
246                            const xmlChar *name);
247typedef xmlEntityPtr (*getParameterEntitySAXFunc) (void *ctx,
248                            const xmlChar *name);
249typedef void (*entityDeclSAXFunc) (void *ctx,
250                            const xmlChar *name, int type, const xmlChar *publicId,
251			    const xmlChar *systemId, xmlChar *content);
252typedef void (*notationDeclSAXFunc)(void *ctx, const xmlChar *name,
253			    const xmlChar *publicId, const xmlChar *systemId);
254typedef void (*attributeDeclSAXFunc)(void *ctx, const xmlChar *elem,
255                            const xmlChar *name, int type, int def,
256			    const xmlChar *defaultValue, xmlEnumerationPtr tree);
257typedef void (*elementDeclSAXFunc)(void *ctx, const xmlChar *name,
258			    int type, xmlElementContentPtr content);
259typedef void (*unparsedEntityDeclSAXFunc)(void *ctx,
260                            const xmlChar *name, const xmlChar *publicId,
261			    const xmlChar *systemId, const xmlChar *notationName);
262typedef void (*setDocumentLocatorSAXFunc) (void *ctx,
263                            xmlSAXLocatorPtr loc);
264typedef void (*startDocumentSAXFunc) (void *ctx);
265typedef void (*endDocumentSAXFunc) (void *ctx);
266typedef void (*startElementSAXFunc) (void *ctx, const xmlChar *name,
267                            const xmlChar **atts);
268typedef void (*endElementSAXFunc) (void *ctx, const xmlChar *name);
269typedef void (*attributeSAXFunc) (void *ctx, const xmlChar *name,
270                                  const xmlChar *value);
271typedef void (*referenceSAXFunc) (void *ctx, const xmlChar *name);
272typedef void (*charactersSAXFunc) (void *ctx, const xmlChar *ch,
273		            int len);
274typedef void (*ignorableWhitespaceSAXFunc) (void *ctx,
275			    const xmlChar *ch, int len);
276typedef void (*processingInstructionSAXFunc) (void *ctx,
277                            const xmlChar *target, const xmlChar *data);
278typedef void (*commentSAXFunc) (void *ctx, const xmlChar *value);
279typedef void (*cdataBlockSAXFunc) (void *ctx, const xmlChar *value, int len);
/*
 * Diagnostic callbacks: an opaque context pointer, a message string and
 * variadic arguments.
 */
typedef void (*warningSAXFunc) (void *ctx, const char *msg, ...);
typedef void (*errorSAXFunc) (void *ctx, const char *msg, ...);
typedef void (*fatalErrorSAXFunc) (void *ctx, const char *msg, ...);
/* Query callbacks: an opaque context pointer in, an int out. */
typedef int (*isStandaloneSAXFunc) (void *ctx);
typedef int (*hasInternalSubsetSAXFunc) (void *ctx);
typedef int (*hasExternalSubsetSAXFunc) (void *ctx);
286
287typedef struct _xmlSAXHandler xmlSAXHandler;
288typedef xmlSAXHandler *xmlSAXHandlerPtr;
289struct _xmlSAXHandler {
290    internalSubsetSAXFunc internalSubset;
291    isStandaloneSAXFunc isStandalone;
292    hasInternalSubsetSAXFunc hasInternalSubset;
293    hasExternalSubsetSAXFunc hasExternalSubset;
294    resolveEntitySAXFunc resolveEntity;
295    getEntitySAXFunc getEntity;
296    entityDeclSAXFunc entityDecl;
297    notationDeclSAXFunc notationDecl;
298    attributeDeclSAXFunc attributeDecl;
299    elementDeclSAXFunc elementDecl;
300    unparsedEntityDeclSAXFunc unparsedEntityDecl;
301    setDocumentLocatorSAXFunc setDocumentLocator;
302    startDocumentSAXFunc startDocument;
303    endDocumentSAXFunc endDocument;
304    startElementSAXFunc startElement;
305    endElementSAXFunc endElement;
306    referenceSAXFunc reference;
307    charactersSAXFunc characters;
308    ignorableWhitespaceSAXFunc ignorableWhitespace;
309    processingInstructionSAXFunc processingInstruction;
310    commentSAXFunc comment;
311    warningSAXFunc warning;
312    errorSAXFunc error;
313    fatalErrorSAXFunc fatalError;
314    getParameterEntitySAXFunc getParameterEntity;
315    cdataBlockSAXFunc cdataBlock;
316    externalSubsetSAXFunc externalSubset;
317};
318
319/**
320 * xmlExternalEntityLoader:
321 * @URL: The System ID of the resource requested
322 * @ID: The Public ID of the resource requested
323 * @xmlParserCtxtPtr: the XML parser context
324 *
325 * External entity loaders types
326 */
327typedef xmlParserInputPtr (*xmlExternalEntityLoader)(const char *URL,
328						     const char *ID,
329						     xmlParserCtxtPtr context);
330
331/*
332 * Global variables: just the default SAX interface tables and XML
333 * version infos.
334 */
335LIBXML_DLL_IMPORT extern const char *xmlParserVersion;
336
337LIBXML_DLL_IMPORT extern xmlSAXLocator xmlDefaultSAXLocator;
338LIBXML_DLL_IMPORT extern xmlSAXHandler xmlDefaultSAXHandler;
339LIBXML_DLL_IMPORT extern xmlSAXHandler htmlDefaultSAXHandler;
340LIBXML_DLL_IMPORT extern xmlSAXHandler docbDefaultSAXHandler;
341
342/*
343 * entity substitution default behaviour.
344 */
345
346#ifdef VMS
347LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultVal;
348#define xmlSubstituteEntitiesDefaultValue xmlSubstituteEntitiesDefaultVal
349#else
350LIBXML_DLL_IMPORT extern int xmlSubstituteEntitiesDefaultValue;
351#endif
352LIBXML_DLL_IMPORT extern int xmlGetWarningsDefaultValue;
353
354
/*
 * Init/Cleanup
 */
void		xmlInitParser		(void);
void		xmlCleanupParser	(void);
360
361/*
362 * Input functions
363 */
364int		xmlParserInputRead	(xmlParserInputPtr in,
365					 int len);
366int		xmlParserInputGrow	(xmlParserInputPtr in,
367					 int len);
368
369/*
370 * xmlChar handling
371 */
372xmlChar *	xmlStrdup		(const xmlChar *cur);
373xmlChar *	xmlStrndup		(const xmlChar *cur,
374					 int len);
375xmlChar *	xmlCharStrndup		(const char *cur,
376					 int len);
377xmlChar *	xmlCharStrdup		(const char *cur);
378xmlChar *	xmlStrsub		(const xmlChar *str,
379					 int start,
380					 int len);
381const xmlChar *	xmlStrchr		(const xmlChar *str,
382					 xmlChar val);
383const xmlChar *	xmlStrstr		(const xmlChar *str,
384					 xmlChar *val);
385const xmlChar *	xmlStrcasestr		(const xmlChar *str,
386					 xmlChar *val);
387int		xmlStrcmp		(const xmlChar *str1,
388					 const xmlChar *str2);
389int		xmlStrncmp		(const xmlChar *str1,
390					 const xmlChar *str2,
391					 int len);
392int		xmlStrcasecmp		(const xmlChar *str1,
393					 const xmlChar *str2);
394int		xmlStrncasecmp		(const xmlChar *str1,
395					 const xmlChar *str2,
396					 int len);
397int		xmlStrEqual		(const xmlChar *str1,
398					 const xmlChar *str2);
399int		xmlStrlen		(const xmlChar *str);
400xmlChar *	xmlStrcat		(xmlChar *cur,
401					 const xmlChar *add);
402xmlChar *	xmlStrncat		(xmlChar *cur,
403					 const xmlChar *add,
404					 int len);
405
406/*
407 * Basic parsing Interfaces
408 */
409xmlDocPtr	xmlParseDoc		(xmlChar *cur);
410xmlDocPtr	xmlParseMemory		(char *buffer,
411					 int size);
412xmlDocPtr	xmlParseFile		(const char *filename);
413int		xmlSubstituteEntitiesDefault(int val);
414int		xmlKeepBlanksDefault	(int val);
415void		xmlStopParser		(xmlParserCtxtPtr ctxt);
416int		xmlPedanticParserDefault(int val);
417
418/*
419 * Recovery mode
420 */
421xmlDocPtr	xmlRecoverDoc		(xmlChar *cur);
422xmlDocPtr	xmlRecoverMemory	(char *buffer,
423					 int size);
424xmlDocPtr	xmlRecoverFile		(const char *filename);
425
426/*
427 * Less common routines and SAX interfaces
428 */
429int		xmlParseDocument	(xmlParserCtxtPtr ctxt);
430int		xmlParseExtParsedEnt	(xmlParserCtxtPtr ctxt);
431xmlDocPtr	xmlSAXParseDoc		(xmlSAXHandlerPtr sax,
432					 xmlChar *cur,
433					 int recovery);
434int		xmlSAXUserParseFile	(xmlSAXHandlerPtr sax,
435					 void *user_data,
436					 const char *filename);
437int		xmlSAXUserParseMemory	(xmlSAXHandlerPtr sax,
438					 void *user_data,
439					 const char *buffer,
440					 int size);
441xmlDocPtr	xmlSAXParseMemory	(xmlSAXHandlerPtr sax,
442					 char *buffer,
443                                   	 int size,
444					 int recovery);
445xmlDocPtr	xmlSAXParseFile		(xmlSAXHandlerPtr sax,
446					 const char *filename,
447					 int recovery);
448xmlDocPtr	xmlSAXParseEntity	(xmlSAXHandlerPtr sax,
449					 const char *filename);
450xmlDocPtr	xmlParseEntity		(const char *filename);
451xmlDtdPtr	xmlParseDTD		(const xmlChar *ExternalID,
452					 const xmlChar *SystemID);
453xmlDtdPtr	xmlSAXParseDTD		(xmlSAXHandlerPtr sax,
454					 const xmlChar *ExternalID,
455					 const xmlChar *SystemID);
456xmlDtdPtr	xmlIOParseDTD		(xmlSAXHandlerPtr sax,
457					 xmlParserInputBufferPtr input,
458					 xmlCharEncoding enc);
459int		xmlParseBalancedChunkMemory(xmlDocPtr doc,
460					 xmlSAXHandlerPtr sax,
461					 void *user_data,
462					 int depth,
463					 const xmlChar *string,
464					 xmlNodePtr *list);
465int		xmlParseExternalEntity	(xmlDocPtr doc,
466					 xmlSAXHandlerPtr sax,
467					 void *user_data,
468					 int depth,
469					 const xmlChar *URL,
470					 const xmlChar *ID,
471					 xmlNodePtr *list);
472int		xmlParseCtxtExternalEntity(xmlParserCtxtPtr ctx,
473					 const xmlChar *URL,
474					 const xmlChar *ID,
475					 xmlNodePtr *list);
476
/*
 * SAX initialization routines
 */
void		xmlDefaultSAXHandlerInit(void);
void		htmlDefaultSAXHandlerInit(void);
482
483/*
484 * Parser contexts handling.
485 */
486void		xmlInitParserCtxt	(xmlParserCtxtPtr ctxt);
487void		xmlClearParserCtxt	(xmlParserCtxtPtr ctxt);
488void		xmlFreeParserCtxt	(xmlParserCtxtPtr ctxt);
489void		xmlSetupParserForBuffer	(xmlParserCtxtPtr ctxt,
490					 const xmlChar* buffer,
491					 const char* filename);
492xmlParserCtxtPtr xmlCreateDocParserCtxt	(xmlChar *cur);
493
494/*
495 * Reading/setting optional parsing features.
496 */
497
498int		xmlGetFeaturesList	(int *len,
499					 const char **result);
500int		xmlGetFeature		(xmlParserCtxtPtr ctxt,
501					 const char *name,
502					 void *result);
503int		xmlSetFeature		(xmlParserCtxtPtr ctxt,
504					 const char *name,
505					 void *value);
506
507/*
508 * Interfaces for the Push mode
509 */
510xmlParserCtxtPtr xmlCreatePushParserCtxt(xmlSAXHandlerPtr sax,
511					 void *user_data,
512					 const char *chunk,
513					 int size,
514					 const char *filename);
515int		 xmlParseChunk		(xmlParserCtxtPtr ctxt,
516					 const char *chunk,
517					 int size,
518					 int terminate);
519
520/*
521 * Special I/O mode
522 */
523
524xmlParserCtxtPtr xmlCreateIOParserCtxt	(xmlSAXHandlerPtr sax,
525					 void *user_data,
526					 xmlInputReadCallback   ioread,
527					 xmlInputCloseCallback  ioclose,
528					 void *ioctx,
529					 xmlCharEncoding enc);
530
531xmlParserInputPtr xmlNewIOInputStream	(xmlParserCtxtPtr ctxt,
532					 xmlParserInputBufferPtr input,
533					 xmlCharEncoding enc);
534
535/*
536 * Node infos
537 */
538const xmlParserNodeInfo*
539		xmlParserFindNodeInfo	(const xmlParserCtxt* ctxt,
540                                               const xmlNode* node);
541void		xmlInitNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
542void		xmlClearNodeInfoSeq	(xmlParserNodeInfoSeqPtr seq);
543unsigned long xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeq* seq,
544                                         const xmlNode* node);
545void		xmlParserAddNodeInfo	(xmlParserCtxtPtr ctxt,
546					 const xmlParserNodeInfo* info);
547
548/*
549 * External entities handling actually implemented in xmlIO
550 */
551
552void		xmlSetExternalEntityLoader(xmlExternalEntityLoader f);
553xmlExternalEntityLoader
554		xmlGetExternalEntityLoader(void);
555xmlParserInputPtr
556		xmlLoadExternalEntity	(const char *URL,
557					 const char *ID,
558					 xmlParserCtxtPtr context);
559
560#ifdef __cplusplus
561}
562#endif
563
564#endif /* __XML_PARSER_H__ */
565
566