xmlstring.c revision bf5cf2196c619c6ec2fec5fdf31cd6b040df508d
1/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 *                                                                      *
27 *                Commodity functions to handle xmlChars                *
28 *                                                                      *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur:  the input xmlChar *
34 * @len:  the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42    xmlChar *ret;
43
44    if ((cur == NULL) || (len < 0)) return(NULL);
45    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46    if (ret == NULL) {
47        xmlErrMemory(NULL, NULL);
48        return(NULL);
49    }
50    memcpy(ret, cur, len * sizeof(xmlChar));
51    ret[len] = 0;
52    return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur:  the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67    const xmlChar *p = cur;
68
69    if (cur == NULL) return(NULL);
70    while (*p != 0) p++; /* non input consuming */
71    return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur:  the input char *
77 * @len:  the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86    int i;
87    xmlChar *ret;
88
89    if ((cur == NULL) || (len < 0)) return(NULL);
90    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91    if (ret == NULL) {
92        xmlErrMemory(NULL, NULL);
93        return(NULL);
94    }
95    for (i = 0;i < len;i++)
96        ret[i] = (xmlChar) cur[i];
97    ret[len] = 0;
98    return(ret);
99}
100
101/**
102 * xmlCharStrdup:
103 * @cur:  the input char *
104 *
105 * a strdup for char's to xmlChar's
106 *
107 * Returns a new xmlChar * or NULL
108 */
109
110xmlChar *
111xmlCharStrdup(const char *cur) {
112    const char *p = cur;
113
114    if (cur == NULL) return(NULL);
115    while (*p != '\0') p++; /* non input consuming */
116    return(xmlCharStrndup(cur, p - cur));
117}
118
119/**
120 * xmlStrcmp:
121 * @str1:  the first xmlChar *
122 * @str2:  the second xmlChar *
123 *
124 * a strcmp for xmlChar's
125 *
126 * Returns the integer result of the comparison
127 */
128
129int
130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
131    register int tmp;
132
133    if (str1 == str2) return(0);
134    if (str1 == NULL) return(-1);
135    if (str2 == NULL) return(1);
136    do {
137        tmp = *str1++ - *str2;
138        if (tmp != 0) return(tmp);
139    } while (*str2++ != 0);
140    return 0;
141}
142
143/**
144 * xmlStrEqual:
145 * @str1:  the first xmlChar *
146 * @str2:  the second xmlChar *
147 *
148 * Check if both string are equal of have same content
149 * Should be a bit more readable and faster than xmlStrEqual()
150 *
151 * Returns 1 if they are equal, 0 if they are different
152 */
153
154int
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
156    if (str1 == str2) return(1);
157    if (str1 == NULL) return(0);
158    if (str2 == NULL) return(0);
159    do {
160        if (*str1++ != *str2) return(0);
161    } while (*str2++);
162    return(1);
163}
164
165/**
166 * xmlStrQEqual:
167 * @pref:  the prefix of the QName
168 * @name:  the localname of the QName
169 * @str:  the second xmlChar *
170 *
171 * Check if a QName is Equal to a given string
172 *
173 * Returns 1 if they are equal, 0 if they are different
174 */
175
176int
177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
178    if (pref == NULL) return(xmlStrEqual(name, str));
179    if (name == NULL) return(0);
180    if (str == NULL) return(0);
181
182    do {
183        if (*pref++ != *str) return(0);
184    } while ((*str++) && (*pref));
185    if (*str++ != ':') return(0);
186    do {
187        if (*name++ != *str) return(0);
188    } while (*str++);
189    return(1);
190}
191
192/**
193 * xmlStrncmp:
194 * @str1:  the first xmlChar *
195 * @str2:  the second xmlChar *
196 * @len:  the max comparison length
197 *
198 * a strncmp for xmlChar's
199 *
200 * Returns the integer result of the comparison
201 */
202
203int
204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
205    register int tmp;
206
207    if (len <= 0) return(0);
208    if (str1 == str2) return(0);
209    if (str1 == NULL) return(-1);
210    if (str2 == NULL) return(1);
211#ifdef __GNUC__
212    tmp = strncmp(str1, str2, len);
213    return tmp;
214#else
215    do {
216        tmp = *str1++ - *str2;
217        if (tmp != 0 || --len == 0) return(tmp);
218    } while (*str2++ != 0);
219    return 0;
220#endif
221}
222
223static const xmlChar casemap[256] = {
224    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
225    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
226    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
227    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
228    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
229    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
230    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
231    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
232    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
233    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
234    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
235    0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
236    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
237    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
238    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
239    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
240    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
241    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
242    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
243    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
244    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
245    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
246    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
247    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
248    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
249    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
250    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
251    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
252    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
253    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
254    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
255    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
256};
257
258/**
259 * xmlStrcasecmp:
260 * @str1:  the first xmlChar *
261 * @str2:  the second xmlChar *
262 *
263 * a strcasecmp for xmlChar's
264 *
265 * Returns the integer result of the comparison
266 */
267
268int
269xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
270    register int tmp;
271
272    if (str1 == str2) return(0);
273    if (str1 == NULL) return(-1);
274    if (str2 == NULL) return(1);
275    do {
276        tmp = casemap[*str1++] - casemap[*str2];
277        if (tmp != 0) return(tmp);
278    } while (*str2++ != 0);
279    return 0;
280}
281
282/**
283 * xmlStrncasecmp:
284 * @str1:  the first xmlChar *
285 * @str2:  the second xmlChar *
286 * @len:  the max comparison length
287 *
288 * a strncasecmp for xmlChar's
289 *
290 * Returns the integer result of the comparison
291 */
292
293int
294xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
295    register int tmp;
296
297    if (len <= 0) return(0);
298    if (str1 == str2) return(0);
299    if (str1 == NULL) return(-1);
300    if (str2 == NULL) return(1);
301    do {
302        tmp = casemap[*str1++] - casemap[*str2];
303        if (tmp != 0 || --len == 0) return(tmp);
304    } while (*str2++ != 0);
305    return 0;
306}
307
308/**
309 * xmlStrchr:
310 * @str:  the xmlChar * array
311 * @val:  the xmlChar to search
312 *
313 * a strchr for xmlChar's
314 *
315 * Returns the xmlChar * for the first occurrence or NULL.
316 */
317
318const xmlChar *
319xmlStrchr(const xmlChar *str, xmlChar val) {
320    if (str == NULL) return(NULL);
321    while (*str != 0) { /* non input consuming */
322        if (*str == val) return((xmlChar *) str);
323        str++;
324    }
325    return(NULL);
326}
327
328/**
329 * xmlStrstr:
330 * @str:  the xmlChar * array (haystack)
331 * @val:  the xmlChar to search (needle)
332 *
333 * a strstr for xmlChar's
334 *
335 * Returns the xmlChar * for the first occurrence or NULL.
336 */
337
338const xmlChar *
339xmlStrstr(const xmlChar *str, const xmlChar *val) {
340    int n;
341
342    if (str == NULL) return(NULL);
343    if (val == NULL) return(NULL);
344    n = xmlStrlen(val);
345
346    if (n == 0) return(str);
347    while (*str != 0) { /* non input consuming */
348        if (*str == *val) {
349            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
350        }
351        str++;
352    }
353    return(NULL);
354}
355
356/**
357 * xmlStrcasestr:
358 * @str:  the xmlChar * array (haystack)
359 * @val:  the xmlChar to search (needle)
360 *
361 * a case-ignoring strstr for xmlChar's
362 *
363 * Returns the xmlChar * for the first occurrence or NULL.
364 */
365
366const xmlChar *
367xmlStrcasestr(const xmlChar *str, xmlChar *val) {
368    int n;
369
370    if (str == NULL) return(NULL);
371    if (val == NULL) return(NULL);
372    n = xmlStrlen(val);
373
374    if (n == 0) return(str);
375    while (*str != 0) { /* non input consuming */
376        if (casemap[*str] == casemap[*val])
377            if (!xmlStrncasecmp(str, val, n)) return(str);
378        str++;
379    }
380    return(NULL);
381}
382
383/**
384 * xmlStrsub:
385 * @str:  the xmlChar * array (haystack)
386 * @start:  the index of the first char (zero based)
387 * @len:  the length of the substring
388 *
389 * Extract a substring of a given string
390 *
391 * Returns the xmlChar * for the first occurrence or NULL.
392 */
393
394xmlChar *
395xmlStrsub(const xmlChar *str, int start, int len) {
396    int i;
397
398    if (str == NULL) return(NULL);
399    if (start < 0) return(NULL);
400    if (len < 0) return(NULL);
401
402    for (i = 0;i < start;i++) {
403        if (*str == 0) return(NULL);
404        str++;
405    }
406    if (*str == 0) return(NULL);
407    return(xmlStrndup(str, len));
408}
409
410/**
411 * xmlStrlen:
412 * @str:  the xmlChar * array
413 *
414 * length of a xmlChar's string
415 *
416 * Returns the number of xmlChar contained in the ARRAY.
417 */
418
419int
420xmlStrlen(const xmlChar *str) {
421    int len = 0;
422
423    if (str == NULL) return(0);
424    while (*str != 0) { /* non input consuming */
425        str++;
426        len++;
427    }
428    return(len);
429}
430
431/**
432 * xmlStrncat:
433 * @cur:  the original xmlChar * array
434 * @add:  the xmlChar * array added
435 * @len:  the length of @add
436 *
437 * a strncat for array of xmlChar's, it will extend @cur with the len
438 * first bytes of @add.
439 *
440 * Returns a new xmlChar *, the original @cur is reallocated if needed
441 * and should not be freed
442 */
443
444xmlChar *
445xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
446    int size;
447    xmlChar *ret;
448
449    if ((add == NULL) || (len == 0))
450        return(cur);
451    if (cur == NULL)
452        return(xmlStrndup(add, len));
453
454    size = xmlStrlen(cur);
455    ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
456    if (ret == NULL) {
457        xmlErrMemory(NULL, NULL);
458        return(cur);
459    }
460    memcpy(&ret[size], add, len * sizeof(xmlChar));
461    ret[size + len] = 0;
462    return(ret);
463}
464
465/**
466 * xmlStrncatNew:
467 * @str1:  first xmlChar string
468 * @str2:  second xmlChar string
469 * @len:  the len of @str2
470 *
471 * same as xmlStrncat, but creates a new string.  The original
472 * two strings are not freed.
473 *
474 * Returns a new xmlChar * or NULL
475 */
476xmlChar *
477xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
478    int size;
479    xmlChar *ret;
480
481    if ((str2 == NULL) || (len == 0))
482        return(xmlStrdup(str1));
483    if (str1 == NULL)
484        return(xmlStrndup(str2, len));
485
486    size = xmlStrlen(str1);
487    ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
488    if (ret == NULL) {
489        xmlErrMemory(NULL, NULL);
490        return(xmlStrndup(str1, size));
491    }
492    memcpy(ret, str1, size * sizeof(xmlChar));
493    memcpy(&ret[size], str2, len * sizeof(xmlChar));
494    ret[size + len] = 0;
495    return(ret);
496}
497
498/**
499 * xmlStrcat:
500 * @cur:  the original xmlChar * array
501 * @add:  the xmlChar * array added
502 *
503 * a strcat for array of xmlChar's. Since they are supposed to be
504 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
505 * a termination mark of '0'.
506 *
507 * Returns a new xmlChar * containing the concatenated string.
508 */
509xmlChar *
510xmlStrcat(xmlChar *cur, const xmlChar *add) {
511    const xmlChar *p = add;
512
513    if (add == NULL) return(cur);
514    if (cur == NULL)
515        return(xmlStrdup(add));
516
517    while (*p != 0) p++; /* non input consuming */
518    return(xmlStrncat(cur, add, p - add));
519}
520
521/**
522 * xmlStrPrintf:
523 * @buf:   the result buffer.
524 * @len:   the result buffer length.
525 * @msg:   the message with printf formatting.
526 * @...:   extra parameters for the message.
527 *
528 * Formats @msg and places result into @buf.
529 *
530 * Returns the number of characters written to @buf or -1 if an error occurs.
531 */
532int
533xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
534    va_list args;
535    int ret;
536
537    if((buf == NULL) || (msg == NULL)) {
538        return(-1);
539    }
540
541    va_start(args, msg);
542    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
543    va_end(args);
544    buf[len - 1] = 0; /* be safe ! */
545
546    return(ret);
547}
548
549/**
550 * xmlStrVPrintf:
551 * @buf:   the result buffer.
552 * @len:   the result buffer length.
553 * @msg:   the message with printf formatting.
554 * @ap:    extra parameters for the message.
555 *
556 * Formats @msg and places result into @buf.
557 *
558 * Returns the number of characters written to @buf or -1 if an error occurs.
559 */
560int
561xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
562    int ret;
563
564    if((buf == NULL) || (msg == NULL)) {
565        return(-1);
566    }
567
568    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
569    buf[len - 1] = 0; /* be safe ! */
570
571    return(ret);
572}
573
574/************************************************************************
575 *                                                                      *
576 *              Generic UTF8 handling routines                          *
577 *                                                                      *
578 * From rfc2044: encoding of the Unicode values on UTF-8:               *
579 *                                                                      *
580 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
581 * 0000 0000-0000 007F   0xxxxxxx                                       *
582 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
583 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
584 *                                                                      *
585 * I hope we won't use values > 0xFFFF anytime soon !                   *
586 *                                                                      *
587 ************************************************************************/
588
589
590/**
591 * xmlUTF8Size:
592 * @utf: pointer to the UTF8 character
593 *
594 * calculates the internal size of a UTF8 character
595 *
596 * returns the numbers of bytes in the character, -1 on format error
597 */
598int
599xmlUTF8Size(const xmlChar *utf) {
600    xmlChar mask;
601    int len;
602
603    if (utf == NULL)
604        return -1;
605    if (*utf < 0x80)
606        return 1;
607    /* check valid UTF8 character */
608    if (!(*utf & 0x40))
609        return -1;
610    /* determine number of bytes in char */
611    len = 2;
612    for (mask=0x20; mask != 0; mask>>=1) {
613        if (!(*utf & mask))
614            return len;
615        len++;
616    }
617    return -1;
618}
619
620/**
621 * xmlUTF8Charcmp:
622 * @utf1: pointer to first UTF8 char
623 * @utf2: pointer to second UTF8 char
624 *
625 * compares the two UCS4 values
626 *
627 * returns result of the compare as with xmlStrncmp
628 */
629int
630xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
631
632    if (utf1 == NULL ) {
633        if (utf2 == NULL)
634            return 0;
635        return -1;
636    }
637    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
638}
639
640/**
641 * xmlUTF8Strlen:
642 * @utf:  a sequence of UTF-8 encoded bytes
643 *
644 * compute the length of an UTF8 string, it doesn't do a full UTF8
645 * checking of the content of the string.
646 *
647 * Returns the number of characters in the string or -1 in case of error
648 */
649int
650xmlUTF8Strlen(const xmlChar *utf) {
651    int ret = 0;
652
653    if (utf == NULL)
654        return(-1);
655
656    while (*utf != 0) {
657        if (utf[0] & 0x80) {
658            if ((utf[1] & 0xc0) != 0x80)
659                return(-1);
660            if ((utf[0] & 0xe0) == 0xe0) {
661                if ((utf[2] & 0xc0) != 0x80)
662                    return(-1);
663                if ((utf[0] & 0xf0) == 0xf0) {
664                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
665                        return(-1);
666                    utf += 4;
667                } else {
668                    utf += 3;
669                }
670            } else {
671                utf += 2;
672            }
673        } else {
674            utf++;
675        }
676        ret++;
677    }
678    return(ret);
679}
680
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read one UTF8 Char from @utf. On entry *@len is the number of bytes
 * that may be read; the character is rejected if it needs more.
 * Trailing bytes must have the form 10xxxxxx. The lead byte is only
 * classified by its high bits, so a lead byte of the form 10xxxxxx is
 * decoded as a 2-byte sequence rather than rejected, and overlong
 * encodings are not detected.
 *
 * Returns the char value or -1 in case of error, and updates *len with
 *        the number of bytes consumed (0 on error). A NULL @len is an
 *        error and is never dereferenced.
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int c;

    if (utf == NULL)
        goto error;
    if (len == NULL)
        goto error;
    if (*len < 1)
        goto error;

    c = utf[0];
    if (c & 0x80) {
        if (*len < 2)
            goto error;
        if ((utf[1] & 0xc0) != 0x80)
            goto error;
        if ((c & 0xe0) == 0xe0) {
            if (*len < 3)
                goto error;
            if ((utf[2] & 0xc0) != 0x80)
                goto error;
            if ((c & 0xf0) == 0xf0) {
                if (*len < 4)
                    goto error;
                if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
                    goto error;
                *len = 4;
                /* 4-byte code */
                c = (utf[0] & 0x7) << 18;
                c |= (utf[1] & 0x3f) << 12;
                c |= (utf[2] & 0x3f) << 6;
                c |= utf[3] & 0x3f;
            } else {
              /* 3-byte code */
                *len = 3;
                c = (utf[0] & 0xf) << 12;
                c |= (utf[1] & 0x3f) << 6;
                c |= utf[2] & 0x3f;
            }
        } else {
          /* 2-byte code */
            *len = 2;
            c = (utf[0] & 0x1f) << 6;
            c |= utf[1] & 0x3f;
        }
    } else {
        /* 1-byte code */
        *len = 1;
    }
    return(c);

error:
    /* this label is also reached when len itself is NULL */
    if (len != NULL)
        *len = 0;
    return(-1);
}
747
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: 1 if @utf is valid, 0 if it is not or if @utf is NULL.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    int ix;
    unsigned char c;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     *
     * The trailing bytes are tested left to right and the test stops at
     * the first mismatch, so a sequence cut short by the terminator is
     * rejected without reading past it.
     */
    for (ix = 0; (c = utf[ix]);) {      /* string is 0-terminated */
        if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 0 */
            ix++;
        } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
            if ((utf[ix+1] & 0xc0 ) != 0x80)
                return 0;
            ix += 2;
        } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80))
                    return 0;
            ix += 3;
        } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
            if (((utf[ix+1] & 0xc0) != 0x80) ||
                ((utf[ix+2] & 0xc0) != 0x80) ||
                ((utf[ix+3] & 0xc0) != 0x80))
                    return 0;
            ix += 4;
        } else                          /* unknown encoding */
            return 0;
    }
    return(1);
}
798
799/**
800 * xmlUTF8Strsize:
801 * @utf:  a sequence of UTF-8 encoded bytes
802 * @len:  the number of characters in the array
803 *
804 * storage size of an UTF8 string
805 *
806 * Returns the storage size of
807 * the first 'len' characters of ARRAY
808 *
809 */
810
811int
812xmlUTF8Strsize(const xmlChar *utf, int len) {
813    const xmlChar   *ptr=utf;
814    xmlChar         ch;
815
816    if (len <= 0)
817        return(0);
818
819    while ( len-- > 0) {
820        if ( !*ptr )
821            break;
822        if ( (ch = *ptr++) & 0x80)
823            while ( (ch<<=1) & 0x80 )
824                ptr++;
825    }
826    return (ptr - utf);
827}
828
829
830/**
831 * xmlUTF8Strndup:
832 * @utf:  the input UTF8 *
833 * @len:  the len of @utf (in chars)
834 *
835 * a strndup for array of UTF8's
836 *
837 * Returns a new UTF8 * or NULL
838 */
839xmlChar *
840xmlUTF8Strndup(const xmlChar *utf, int len) {
841    xmlChar *ret;
842    int i;
843
844    if ((utf == NULL) || (len < 0)) return(NULL);
845    i = xmlUTF8Strsize(utf, len);
846    ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
847    if (ret == NULL) {
848        xmlGenericError(xmlGenericErrorContext,
849                "malloc of %ld byte failed\n",
850                (len + 1) * (long)sizeof(xmlChar));
851        return(NULL);
852    }
853    memcpy(ret, utf, i * sizeof(xmlChar));
854    ret[i] = 0;
855    return(ret);
856}
857
858/**
859 * xmlUTF8Strpos:
860 * @utf:  the input UTF8 *
861 * @pos:  the position of the desired UTF8 char (in chars)
862 *
863 * a function to provide the equivalent of fetching a
864 * character from a string array
865 *
866 * Returns a pointer to the UTF8 character or NULL
867 */
868xmlChar *
869xmlUTF8Strpos(const xmlChar *utf, int pos) {
870    xmlChar ch;
871
872    if (utf == NULL) return(NULL);
873    if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) )
874        return(NULL);
875    while (pos--) {
876        if ((ch=*utf++) == 0) return(NULL);
877        if ( ch & 0x80 ) {
878            /* if not simple ascii, verify proper format */
879            if ( (ch & 0xc0) != 0xc0 )
880                return(NULL);
881            /* then skip over remaining bytes for this char */
882            while ( (ch <<= 1) & 0x80 )
883                if ( (*utf++ & 0xc0) != 0x80 )
884                    return(NULL);
885        }
886    }
887    return((xmlChar *)utf);
888}
889
890/**
891 * xmlUTF8Strloc:
892 * @utf:  the input UTF8 *
893 * @utfchar:  the UTF8 character to be found
894 *
895 * a function to provide the relative location of a UTF8 char
896 *
897 * Returns the relative character position of the desired char
898 * or -1 if not found
899 */
900int
901xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
902    int i, size;
903    xmlChar ch;
904
905    if (utf==NULL || utfchar==NULL) return -1;
906    size = xmlUTF8Strsize(utfchar, 1);
907        for(i=0; (ch=*utf) != 0; i++) {
908            if (xmlStrncmp(utf, utfchar, size)==0)
909                return(i);
910            utf++;
911            if ( ch & 0x80 ) {
912                /* if not simple ascii, verify proper format */
913                if ( (ch & 0xc0) != 0xc0 )
914                    return(-1);
915                /* then skip over remaining bytes for this char */
916                while ( (ch <<= 1) & 0x80 )
917                    if ( (*utf++ & 0xc0) != 0x80 )
918                        return(-1);
919            }
920        }
921
922    return(-1);
923}
924/**
925 * xmlUTF8Strsub:
926 * @utf:  a sequence of UTF-8 encoded bytes
927 * @start: relative pos of first char
928 * @len:   total number to copy
929 *
930 * Create a substring from a given UTF-8 string
931 * Note:  positions are given in units of UTF-8 chars
932 *
933 * Returns a pointer to a newly created string
934 * or NULL if any problem
935 */
936
937xmlChar *
938xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
939    int            i;
940    xmlChar ch;
941
942    if (utf == NULL) return(NULL);
943    if (start < 0) return(NULL);
944    if (len < 0) return(NULL);
945
946    /*
947     * Skip over any leading chars
948     */
949    for (i = 0;i < start;i++) {
950        if ((ch=*utf++) == 0) return(NULL);
951        if ( ch & 0x80 ) {
952            /* if not simple ascii, verify proper format */
953            if ( (ch & 0xc0) != 0xc0 )
954                return(NULL);
955            /* then skip over remaining bytes for this char */
956            while ( (ch <<= 1) & 0x80 )
957                if ( (*utf++ & 0xc0) != 0x80 )
958                    return(NULL);
959        }
960    }
961
962    return(xmlUTF8Strndup(utf, len));
963}
964