xmlstring.c revision 8fbbf5513d609c1770b391b99e33314cd0742704
1/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 *                                                                      *
27 *                Commodity functions to handle xmlChars                *
28 *                                                                      *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur:  the input xmlChar *
34 * @len:  the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42    xmlChar *ret;
43
44    if ((cur == NULL) || (len < 0)) return(NULL);
45    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46    if (ret == NULL) {
47        xmlErrMemory(NULL, NULL);
48        return(NULL);
49    }
50    memcpy(ret, cur, len * sizeof(xmlChar));
51    ret[len] = 0;
52    return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur:  the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67    const xmlChar *p = cur;
68
69    if (cur == NULL) return(NULL);
70    while (*p != 0) p++; /* non input consuming */
71    return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur:  the input char *
77 * @len:  the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86    int i;
87    xmlChar *ret;
88
89    if ((cur == NULL) || (len < 0)) return(NULL);
90    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91    if (ret == NULL) {
92        xmlErrMemory(NULL, NULL);
93        return(NULL);
94    }
95    for (i = 0;i < len;i++) {
96        ret[i] = (xmlChar) cur[i];
97        if (ret[i] == 0) return(ret);
98    }
99    ret[len] = 0;
100    return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur:  the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114    const char *p = cur;
115
116    if (cur == NULL) return(NULL);
117    while (*p != '\0') p++; /* non input consuming */
118    return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1:  the first xmlChar *
124 * @str2:  the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133    register int tmp;
134
135    if (str1 == str2) return(0);
136    if (str1 == NULL) return(-1);
137    if (str2 == NULL) return(1);
138    do {
139        tmp = *str1++ - *str2;
140        if (tmp != 0) return(tmp);
141    } while (*str2++ != 0);
142    return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1:  the first xmlChar *
148 * @str2:  the second xmlChar *
149 *
150 * Check if both strings are equal of have same content.
151 * Should be a bit more readable and faster than xmlStrcmp()
152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158    if (str1 == str2) return(1);
159    if (str1 == NULL) return(0);
160    if (str2 == NULL) return(0);
161    do {
162        if (*str1++ != *str2) return(0);
163    } while (*str2++);
164    return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref:  the prefix of the QName
170 * @name:  the localname of the QName
171 * @str:  the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180    if (pref == NULL) return(xmlStrEqual(name, str));
181    if (name == NULL) return(0);
182    if (str == NULL) return(0);
183
184    do {
185        if (*pref++ != *str) return(0);
186    } while ((*str++) && (*pref));
187    if (*str++ != ':') return(0);
188    do {
189        if (*name++ != *str) return(0);
190    } while (*str++);
191    return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1:  the first xmlChar *
197 * @str2:  the second xmlChar *
198 * @len:  the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207    register int tmp;
208
209    if (len <= 0) return(0);
210    if (str1 == str2) return(0);
211    if (str1 == NULL) return(-1);
212    if (str2 == NULL) return(1);
213#ifdef __GNUC__
214    tmp = strncmp((const char *)str1, (const char *)str2, len);
215    return tmp;
216#else
217    do {
218        tmp = *str1++ - *str2;
219        if (tmp != 0 || --len == 0) return(tmp);
220    } while (*str2++ != 0);
221    return 0;
222#endif
223}
224
225static const xmlChar casemap[256] = {
226    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237    0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1:  the first xmlChar *
263 * @str2:  the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272    register int tmp;
273
274    if (str1 == str2) return(0);
275    if (str1 == NULL) return(-1);
276    if (str2 == NULL) return(1);
277    do {
278        tmp = casemap[*str1++] - casemap[*str2];
279        if (tmp != 0) return(tmp);
280    } while (*str2++ != 0);
281    return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1:  the first xmlChar *
287 * @str2:  the second xmlChar *
288 * @len:  the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297    register int tmp;
298
299    if (len <= 0) return(0);
300    if (str1 == str2) return(0);
301    if (str1 == NULL) return(-1);
302    if (str2 == NULL) return(1);
303    do {
304        tmp = casemap[*str1++] - casemap[*str2];
305        if (tmp != 0 || --len == 0) return(tmp);
306    } while (*str2++ != 0);
307    return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str:  the xmlChar * array
313 * @val:  the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322    if (str == NULL) return(NULL);
323    while (*str != 0) { /* non input consuming */
324        if (*str == val) return((xmlChar *) str);
325        str++;
326    }
327    return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str:  the xmlChar * array (haystack)
333 * @val:  the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342    int n;
343
344    if (str == NULL) return(NULL);
345    if (val == NULL) return(NULL);
346    n = xmlStrlen(val);
347
348    if (n == 0) return(str);
349    while (*str != 0) { /* non input consuming */
350        if (*str == *val) {
351            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352        }
353        str++;
354    }
355    return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str:  the xmlChar * array (haystack)
361 * @val:  the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
369xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
370    int n;
371
372    if (str == NULL) return(NULL);
373    if (val == NULL) return(NULL);
374    n = xmlStrlen(val);
375
376    if (n == 0) return(str);
377    while (*str != 0) { /* non input consuming */
378        if (casemap[*str] == casemap[*val])
379            if (!xmlStrncasecmp(str, val, n)) return(str);
380        str++;
381    }
382    return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str:  the xmlChar * array (haystack)
388 * @start:  the index of the first char (zero based)
389 * @len:  the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398    int i;
399
400    if (str == NULL) return(NULL);
401    if (start < 0) return(NULL);
402    if (len < 0) return(NULL);
403
404    for (i = 0;i < start;i++) {
405        if (*str == 0) return(NULL);
406        str++;
407    }
408    if (*str == 0) return(NULL);
409    return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str:  the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423    int len = 0;
424
425    if (str == NULL) return(0);
426    while (*str != 0) { /* non input consuming */
427        str++;
428        len++;
429    }
430    return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur:  the original xmlChar * array
436 * @add:  the xmlChar * array added
437 * @len:  the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
442 *
443 * Returns a new xmlChar *, the original @cur is reallocated if needed
444 * and should not be freed
445 */
446
447xmlChar *
448xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449    int size;
450    xmlChar *ret;
451
452    if ((add == NULL) || (len == 0))
453        return(cur);
454    if (len < 0)
455	return(NULL);
456    if (cur == NULL)
457        return(xmlStrndup(add, len));
458
459    size = xmlStrlen(cur);
460    if (size < 0)
461        return(NULL);
462    ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
463    if (ret == NULL) {
464        xmlErrMemory(NULL, NULL);
465        return(cur);
466    }
467    memcpy(&ret[size], add, len * sizeof(xmlChar));
468    ret[size + len] = 0;
469    return(ret);
470}
471
472/**
473 * xmlStrncatNew:
474 * @str1:  first xmlChar string
475 * @str2:  second xmlChar string
476 * @len:  the len of @str2 or < 0
477 *
478 * same as xmlStrncat, but creates a new string.  The original
479 * two strings are not freed. If @len is < 0 then the length
480 * will be calculated automatically.
481 *
482 * Returns a new xmlChar * or NULL
483 */
484xmlChar *
485xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
486    int size;
487    xmlChar *ret;
488
489    if (len < 0) {
490        len = xmlStrlen(str2);
491        if (len < 0)
492            return(NULL);
493    }
494    if ((str2 == NULL) || (len == 0))
495        return(xmlStrdup(str1));
496    if (str1 == NULL)
497        return(xmlStrndup(str2, len));
498
499    size = xmlStrlen(str1);
500    if (size < 0)
501        return(NULL);
502    ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
503    if (ret == NULL) {
504        xmlErrMemory(NULL, NULL);
505        return(xmlStrndup(str1, size));
506    }
507    memcpy(ret, str1, size * sizeof(xmlChar));
508    memcpy(&ret[size], str2, len * sizeof(xmlChar));
509    ret[size + len] = 0;
510    return(ret);
511}
512
513/**
514 * xmlStrcat:
515 * @cur:  the original xmlChar * array
516 * @add:  the xmlChar * array added
517 *
518 * a strcat for array of xmlChar's. Since they are supposed to be
519 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
520 * a termination mark of '0'.
521 *
522 * Returns a new xmlChar * containing the concatenated string.
523 */
524xmlChar *
525xmlStrcat(xmlChar *cur, const xmlChar *add) {
526    const xmlChar *p = add;
527
528    if (add == NULL) return(cur);
529    if (cur == NULL)
530        return(xmlStrdup(add));
531
532    while (*p != 0) p++; /* non input consuming */
533    return(xmlStrncat(cur, add, p - add));
534}
535
536/**
537 * xmlStrPrintf:
538 * @buf:   the result buffer.
539 * @len:   the result buffer length.
540 * @msg:   the message with printf formatting.
541 * @...:   extra parameters for the message.
542 *
543 * Formats @msg and places result into @buf.
544 *
545 * Returns the number of characters written to @buf or -1 if an error occurs.
546 */
547int XMLCDECL
548xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
549    va_list args;
550    int ret;
551
552    if((buf == NULL) || (msg == NULL)) {
553        return(-1);
554    }
555
556    va_start(args, msg);
557    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
558    va_end(args);
559    buf[len - 1] = 0; /* be safe ! */
560
561    return(ret);
562}
563
564/**
565 * xmlStrVPrintf:
566 * @buf:   the result buffer.
567 * @len:   the result buffer length.
568 * @msg:   the message with printf formatting.
569 * @ap:    extra parameters for the message.
570 *
571 * Formats @msg and places result into @buf.
572 *
573 * Returns the number of characters written to @buf or -1 if an error occurs.
574 */
575int
576xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
577    int ret;
578
579    if((buf == NULL) || (msg == NULL)) {
580        return(-1);
581    }
582
583    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
584    buf[len - 1] = 0; /* be safe ! */
585
586    return(ret);
587}
588
/************************************************************************
 *                                                                      *
 *              Generic UTF8 handling routines                          *
 *                                                                      *
 * From rfc2044: encoding of the Unicode values on UTF-8:               *
 *                                                                      *
 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
 * 0000 0000-0000 007F   0xxxxxxx                                       *
 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * I hope we won't use values > 0xFFFF anytime soon !                   *
 * Note: some routines below also accept the 4-byte form shown above.   *
 *                                                                      *
 ************************************************************************/
603
604
605/**
606 * xmlUTF8Size:
607 * @utf: pointer to the UTF8 character
608 *
609 * calculates the internal size of a UTF8 character
610 *
611 * returns the numbers of bytes in the character, -1 on format error
612 */
613int
614xmlUTF8Size(const xmlChar *utf) {
615    xmlChar mask;
616    int len;
617
618    if (utf == NULL)
619        return -1;
620    if (*utf < 0x80)
621        return 1;
622    /* check valid UTF8 character */
623    if (!(*utf & 0x40))
624        return -1;
625    /* determine number of bytes in char */
626    len = 2;
627    for (mask=0x20; mask != 0; mask>>=1) {
628        if (!(*utf & mask))
629            return len;
630        len++;
631    }
632    return -1;
633}
634
635/**
636 * xmlUTF8Charcmp:
637 * @utf1: pointer to first UTF8 char
638 * @utf2: pointer to second UTF8 char
639 *
640 * compares the two UCS4 values
641 *
642 * returns result of the compare as with xmlStrncmp
643 */
644int
645xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
646
647    if (utf1 == NULL ) {
648        if (utf2 == NULL)
649            return 0;
650        return -1;
651    }
652    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
653}
654
655/**
656 * xmlUTF8Strlen:
657 * @utf:  a sequence of UTF-8 encoded bytes
658 *
659 * compute the length of an UTF8 string, it doesn't do a full UTF8
660 * checking of the content of the string.
661 *
662 * Returns the number of characters in the string or -1 in case of error
663 */
664int
665xmlUTF8Strlen(const xmlChar *utf) {
666    int ret = 0;
667
668    if (utf == NULL)
669        return(-1);
670
671    while (*utf != 0) {
672        if (utf[0] & 0x80) {
673            if ((utf[1] & 0xc0) != 0x80)
674                return(-1);
675            if ((utf[0] & 0xe0) == 0xe0) {
676                if ((utf[2] & 0xc0) != 0x80)
677                    return(-1);
678                if ((utf[0] & 0xf0) == 0xf0) {
679                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
680                        return(-1);
681                    utf += 4;
682                } else {
683                    utf += 3;
684                }
685            } else {
686                utf += 2;
687            }
688        } else {
689            utf++;
690        }
691        ret++;
692    }
693    return(ret);
694}
695
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the minimum number of bytes present in
 *        the sequence.  This is used to assure the next character
 *        is completely contained within the sequence.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes
 * are decoded; only the shape of the continuation bytes is verified.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead;
    unsigned int val;

    if ((utf == NULL) || (len == NULL))
        goto error;
    if (*len < 1)
        goto error;

    lead = utf[0];

    /* 1-byte code */
    if ((lead & 0x80) == 0) {
        *len = 1;
        return((int) lead);
    }

    /* every multi-byte code needs a first continuation byte */
    if (*len < 2)
        goto error;
    if ((utf[1] & 0xc0) != 0x80)
        goto error;

    /* 2-byte code */
    if ((lead & 0xe0) != 0xe0) {
        *len = 2;
        val = (utf[0] & 0x1f) << 6;
        val |= utf[1] & 0x3f;
        return((int) val);
    }

    if (*len < 3)
        goto error;
    if ((utf[2] & 0xc0) != 0x80)
        goto error;

    /* 3-byte code */
    if ((lead & 0xf0) != 0xf0) {
        *len = 3;
        val = (utf[0] & 0xf) << 12;
        val |= (utf[1] & 0x3f) << 6;
        val |= utf[2] & 0x3f;
        return((int) val);
    }

    /* 4-byte code, anything longer is refused */
    if (*len < 4)
        goto error;
    if (((lead & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    *len = 4;
    val = (utf[0] & 0x7) << 18;
    val |= (utf[1] & 0x3f) << 12;
    val |= (utf[2] & 0x3f) << 6;
    val |= utf[3] & 0x3f;
    return((int) val);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
765
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid (0 for a NULL @utf).
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;
    int trail;
    int i;

    if (utf == NULL)
        return(0);
    /*
     * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    cur = utf;
    while ((lead = *cur) != 0) {        /* string is 0-terminated */
        /* how many continuation bytes does the first byte announce ? */
        if ((lead & 0x80) == 0x00)      /* 1-byte code, starts with 0 */
            trail = 0;
        else if ((lead & 0xe0) == 0xc0) /* 2-byte code, starts with 110 */
            trail = 1;
        else if ((lead & 0xf0) == 0xe0) /* 3-byte code, starts with 1110 */
            trail = 2;
        else if ((lead & 0xf8) == 0xf0) /* 4-byte code, starts with 11110 */
            trail = 3;
        else                            /* unknown encoding */
            return(0);

        /* stops at the first bad byte, so a 0 is never stepped over */
        for (i = 1; i <= trail; i++) {
            if ((cur[i] & 0xc0) != 0x80)
                return(0);
        }
        cur += trail + 1;
    }
    return(1);
}
818
819/**
820 * xmlUTF8Strsize:
821 * @utf:  a sequence of UTF-8 encoded bytes
822 * @len:  the number of characters in the array
823 *
824 * storage size of an UTF8 string
825 * the behaviour is not garanteed if the input string is not UTF-8
826 *
827 * Returns the storage size of
828 * the first 'len' characters of ARRAY
829 */
830
831int
832xmlUTF8Strsize(const xmlChar *utf, int len) {
833    const xmlChar   *ptr=utf;
834    xmlChar         ch;
835
836    if (utf == NULL)
837        return(0);
838
839    if (len <= 0)
840        return(0);
841
842    while ( len-- > 0) {
843        if ( !*ptr )
844            break;
845        if ( (ch = *ptr++) & 0x80)
846            while ((ch<<=1) & 0x80 ) {
847		if (*ptr == 0) break;
848                ptr++;
849	    }
850    }
851    return (ptr - utf);
852}
853
854
855/**
856 * xmlUTF8Strndup:
857 * @utf:  the input UTF8 *
858 * @len:  the len of @utf (in chars)
859 *
860 * a strndup for array of UTF8's
861 *
862 * Returns a new UTF8 * or NULL
863 */
864xmlChar *
865xmlUTF8Strndup(const xmlChar *utf, int len) {
866    xmlChar *ret;
867    int i;
868
869    if ((utf == NULL) || (len < 0)) return(NULL);
870    i = xmlUTF8Strsize(utf, len);
871    ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
872    if (ret == NULL) {
873        xmlGenericError(xmlGenericErrorContext,
874                "malloc of %ld byte failed\n",
875                (len + 1) * (long)sizeof(xmlChar));
876        return(NULL);
877    }
878    memcpy(ret, utf, i * sizeof(xmlChar));
879    ret[i] = 0;
880    return(ret);
881}
882
883/**
884 * xmlUTF8Strpos:
885 * @utf:  the input UTF8 *
886 * @pos:  the position of the desired UTF8 char (in chars)
887 *
888 * a function to provide the equivalent of fetching a
889 * character from a string array
890 *
891 * Returns a pointer to the UTF8 character or NULL
892 */
893const xmlChar *
894xmlUTF8Strpos(const xmlChar *utf, int pos) {
895    xmlChar ch;
896
897    if (utf == NULL) return(NULL);
898    if (pos < 0)
899        return(NULL);
900    while (pos--) {
901        if ((ch=*utf++) == 0) return(NULL);
902        if ( ch & 0x80 ) {
903            /* if not simple ascii, verify proper format */
904            if ( (ch & 0xc0) != 0xc0 )
905                return(NULL);
906            /* then skip over remaining bytes for this char */
907            while ( (ch <<= 1) & 0x80 )
908                if ( (*utf++ & 0xc0) != 0x80 )
909                    return(NULL);
910        }
911    }
912    return((xmlChar *)utf);
913}
914
915/**
916 * xmlUTF8Strloc:
917 * @utf:  the input UTF8 *
918 * @utfchar:  the UTF8 character to be found
919 *
920 * a function to provide the relative location of a UTF8 char
921 *
922 * Returns the relative character position of the desired char
923 * or -1 if not found
924 */
925int
926xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
927    int i, size;
928    xmlChar ch;
929
930    if (utf==NULL || utfchar==NULL) return -1;
931    size = xmlUTF8Strsize(utfchar, 1);
932        for(i=0; (ch=*utf) != 0; i++) {
933            if (xmlStrncmp(utf, utfchar, size)==0)
934                return(i);
935            utf++;
936            if ( ch & 0x80 ) {
937                /* if not simple ascii, verify proper format */
938                if ( (ch & 0xc0) != 0xc0 )
939                    return(-1);
940                /* then skip over remaining bytes for this char */
941                while ( (ch <<= 1) & 0x80 )
942                    if ( (*utf++ & 0xc0) != 0x80 )
943                        return(-1);
944            }
945        }
946
947    return(-1);
948}
949/**
950 * xmlUTF8Strsub:
951 * @utf:  a sequence of UTF-8 encoded bytes
952 * @start: relative pos of first char
953 * @len:   total number to copy
954 *
955 * Create a substring from a given UTF-8 string
956 * Note:  positions are given in units of UTF-8 chars
957 *
958 * Returns a pointer to a newly created string
959 * or NULL if any problem
960 */
961
962xmlChar *
963xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
964    int            i;
965    xmlChar ch;
966
967    if (utf == NULL) return(NULL);
968    if (start < 0) return(NULL);
969    if (len < 0) return(NULL);
970
971    /*
972     * Skip over any leading chars
973     */
974    for (i = 0;i < start;i++) {
975        if ((ch=*utf++) == 0) return(NULL);
976        if ( ch & 0x80 ) {
977            /* if not simple ascii, verify proper format */
978            if ( (ch & 0xc0) != 0xc0 )
979                return(NULL);
980            /* then skip over remaining bytes for this char */
981            while ( (ch <<= 1) & 0x80 )
982                if ( (*utf++ & 0xc0) != 0x80 )
983                    return(NULL);
984        }
985    }
986
987    return(xmlUTF8Strndup(utf, len));
988}
989
990#define bottom_xmlstring
991#include "elfgcchack.h"
992