1/*
2 * string.c : an XML string utilities module
3 *
4 * This module provides various utility functions for manipulating
5 * the xmlChar* type. All functions named xmlStr* have been moved here
6 * from the parser.c file (their original home).
7 *
8 * See Copyright for the status of this software.
9 *
10 * UTF8 string routines from:
11 * William Brack <wbrack@mmm.com.hk>
12 *
13 * daniel@veillard.com
14 */
15
16#define IN_LIBXML
17#include "libxml.h"
18
19#include <stdlib.h>
20#include <string.h>
21#include <libxml/xmlmemory.h>
22#include <libxml/parserInternals.h>
23#include <libxml/xmlstring.h>
24
25/************************************************************************
26 *                                                                      *
27 *                Commodity functions to handle xmlChars                *
28 *                                                                      *
29 ************************************************************************/
30
31/**
32 * xmlStrndup:
33 * @cur:  the input xmlChar *
34 * @len:  the len of @cur
35 *
36 * a strndup for array of xmlChar's
37 *
38 * Returns a new xmlChar * or NULL
39 */
40xmlChar *
41xmlStrndup(const xmlChar *cur, int len) {
42    xmlChar *ret;
43
44    if ((cur == NULL) || (len < 0)) return(NULL);
45    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
46    if (ret == NULL) {
47        xmlErrMemory(NULL, NULL);
48        return(NULL);
49    }
50    memcpy(ret, cur, len * sizeof(xmlChar));
51    ret[len] = 0;
52    return(ret);
53}
54
55/**
56 * xmlStrdup:
57 * @cur:  the input xmlChar *
58 *
59 * a strdup for array of xmlChar's. Since they are supposed to be
60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
61 * a termination mark of '0'.
62 *
63 * Returns a new xmlChar * or NULL
64 */
65xmlChar *
66xmlStrdup(const xmlChar *cur) {
67    const xmlChar *p = cur;
68
69    if (cur == NULL) return(NULL);
70    while (*p != 0) p++; /* non input consuming */
71    return(xmlStrndup(cur, p - cur));
72}
73
74/**
75 * xmlCharStrndup:
76 * @cur:  the input char *
77 * @len:  the len of @cur
78 *
79 * a strndup for char's to xmlChar's
80 *
81 * Returns a new xmlChar * or NULL
82 */
83
84xmlChar *
85xmlCharStrndup(const char *cur, int len) {
86    int i;
87    xmlChar *ret;
88
89    if ((cur == NULL) || (len < 0)) return(NULL);
90    ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar));
91    if (ret == NULL) {
92        xmlErrMemory(NULL, NULL);
93        return(NULL);
94    }
95    for (i = 0;i < len;i++) {
96        ret[i] = (xmlChar) cur[i];
97        if (ret[i] == 0) return(ret);
98    }
99    ret[len] = 0;
100    return(ret);
101}
102
103/**
104 * xmlCharStrdup:
105 * @cur:  the input char *
106 *
107 * a strdup for char's to xmlChar's
108 *
109 * Returns a new xmlChar * or NULL
110 */
111
112xmlChar *
113xmlCharStrdup(const char *cur) {
114    const char *p = cur;
115
116    if (cur == NULL) return(NULL);
117    while (*p != '\0') p++; /* non input consuming */
118    return(xmlCharStrndup(cur, p - cur));
119}
120
121/**
122 * xmlStrcmp:
123 * @str1:  the first xmlChar *
124 * @str2:  the second xmlChar *
125 *
126 * a strcmp for xmlChar's
127 *
128 * Returns the integer result of the comparison
129 */
130
131int
132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) {
133    register int tmp;
134
135    if (str1 == str2) return(0);
136    if (str1 == NULL) return(-1);
137    if (str2 == NULL) return(1);
138    do {
139        tmp = *str1++ - *str2;
140        if (tmp != 0) return(tmp);
141    } while (*str2++ != 0);
142    return 0;
143}
144
145/**
146 * xmlStrEqual:
147 * @str1:  the first xmlChar *
148 * @str2:  the second xmlChar *
149 *
150 * Check if both strings are equal of have same content.
151 * Should be a bit more readable and faster than xmlStrcmp()
152 *
153 * Returns 1 if they are equal, 0 if they are different
154 */
155
156int
157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) {
158    if (str1 == str2) return(1);
159    if (str1 == NULL) return(0);
160    if (str2 == NULL) return(0);
161    do {
162        if (*str1++ != *str2) return(0);
163    } while (*str2++);
164    return(1);
165}
166
167/**
168 * xmlStrQEqual:
169 * @pref:  the prefix of the QName
170 * @name:  the localname of the QName
171 * @str:  the second xmlChar *
172 *
173 * Check if a QName is Equal to a given string
174 *
175 * Returns 1 if they are equal, 0 if they are different
176 */
177
178int
179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) {
180    if (pref == NULL) return(xmlStrEqual(name, str));
181    if (name == NULL) return(0);
182    if (str == NULL) return(0);
183
184    do {
185        if (*pref++ != *str) return(0);
186    } while ((*str++) && (*pref));
187    if (*str++ != ':') return(0);
188    do {
189        if (*name++ != *str) return(0);
190    } while (*str++);
191    return(1);
192}
193
194/**
195 * xmlStrncmp:
196 * @str1:  the first xmlChar *
197 * @str2:  the second xmlChar *
198 * @len:  the max comparison length
199 *
200 * a strncmp for xmlChar's
201 *
202 * Returns the integer result of the comparison
203 */
204
205int
206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) {
207    register int tmp;
208
209    if (len <= 0) return(0);
210    if (str1 == str2) return(0);
211    if (str1 == NULL) return(-1);
212    if (str2 == NULL) return(1);
213#ifdef __GNUC__
214    tmp = strncmp((const char *)str1, (const char *)str2, len);
215    return tmp;
216#else
217    do {
218        tmp = *str1++ - *str2;
219        if (tmp != 0 || --len == 0) return(tmp);
220    } while (*str2++ != 0);
221    return 0;
222#endif
223}
224
225static const xmlChar casemap[256] = {
226    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
227    0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
228    0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
229    0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
230    0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
231    0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
232    0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
233    0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
234    0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
235    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
236    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
237    0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F,
238    0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
239    0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
240    0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
241    0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
242    0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87,
243    0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,
244    0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97,
245    0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F,
246    0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,
247    0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF,
248    0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7,
249    0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF,
250    0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,
251    0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,
252    0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7,
253    0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF,
254    0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,
255    0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,
256    0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,
257    0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF
258};
259
260/**
261 * xmlStrcasecmp:
262 * @str1:  the first xmlChar *
263 * @str2:  the second xmlChar *
264 *
265 * a strcasecmp for xmlChar's
266 *
267 * Returns the integer result of the comparison
268 */
269
270int
271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) {
272    register int tmp;
273
274    if (str1 == str2) return(0);
275    if (str1 == NULL) return(-1);
276    if (str2 == NULL) return(1);
277    do {
278        tmp = casemap[*str1++] - casemap[*str2];
279        if (tmp != 0) return(tmp);
280    } while (*str2++ != 0);
281    return 0;
282}
283
284/**
285 * xmlStrncasecmp:
286 * @str1:  the first xmlChar *
287 * @str2:  the second xmlChar *
288 * @len:  the max comparison length
289 *
290 * a strncasecmp for xmlChar's
291 *
292 * Returns the integer result of the comparison
293 */
294
295int
296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) {
297    register int tmp;
298
299    if (len <= 0) return(0);
300    if (str1 == str2) return(0);
301    if (str1 == NULL) return(-1);
302    if (str2 == NULL) return(1);
303    do {
304        tmp = casemap[*str1++] - casemap[*str2];
305        if (tmp != 0 || --len == 0) return(tmp);
306    } while (*str2++ != 0);
307    return 0;
308}
309
310/**
311 * xmlStrchr:
312 * @str:  the xmlChar * array
313 * @val:  the xmlChar to search
314 *
315 * a strchr for xmlChar's
316 *
317 * Returns the xmlChar * for the first occurrence or NULL.
318 */
319
320const xmlChar *
321xmlStrchr(const xmlChar *str, xmlChar val) {
322    if (str == NULL) return(NULL);
323    while (*str != 0) { /* non input consuming */
324        if (*str == val) return((xmlChar *) str);
325        str++;
326    }
327    return(NULL);
328}
329
330/**
331 * xmlStrstr:
332 * @str:  the xmlChar * array (haystack)
333 * @val:  the xmlChar to search (needle)
334 *
335 * a strstr for xmlChar's
336 *
337 * Returns the xmlChar * for the first occurrence or NULL.
338 */
339
340const xmlChar *
341xmlStrstr(const xmlChar *str, const xmlChar *val) {
342    int n;
343
344    if (str == NULL) return(NULL);
345    if (val == NULL) return(NULL);
346    n = xmlStrlen(val);
347
348    if (n == 0) return(str);
349    while (*str != 0) { /* non input consuming */
350        if (*str == *val) {
351            if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str);
352        }
353        str++;
354    }
355    return(NULL);
356}
357
358/**
359 * xmlStrcasestr:
360 * @str:  the xmlChar * array (haystack)
361 * @val:  the xmlChar to search (needle)
362 *
363 * a case-ignoring strstr for xmlChar's
364 *
365 * Returns the xmlChar * for the first occurrence or NULL.
366 */
367
368const xmlChar *
369xmlStrcasestr(const xmlChar *str, const xmlChar *val) {
370    int n;
371
372    if (str == NULL) return(NULL);
373    if (val == NULL) return(NULL);
374    n = xmlStrlen(val);
375
376    if (n == 0) return(str);
377    while (*str != 0) { /* non input consuming */
378        if (casemap[*str] == casemap[*val])
379            if (!xmlStrncasecmp(str, val, n)) return(str);
380        str++;
381    }
382    return(NULL);
383}
384
385/**
386 * xmlStrsub:
387 * @str:  the xmlChar * array (haystack)
388 * @start:  the index of the first char (zero based)
389 * @len:  the length of the substring
390 *
391 * Extract a substring of a given string
392 *
393 * Returns the xmlChar * for the first occurrence or NULL.
394 */
395
396xmlChar *
397xmlStrsub(const xmlChar *str, int start, int len) {
398    int i;
399
400    if (str == NULL) return(NULL);
401    if (start < 0) return(NULL);
402    if (len < 0) return(NULL);
403
404    for (i = 0;i < start;i++) {
405        if (*str == 0) return(NULL);
406        str++;
407    }
408    if (*str == 0) return(NULL);
409    return(xmlStrndup(str, len));
410}
411
412/**
413 * xmlStrlen:
414 * @str:  the xmlChar * array
415 *
416 * length of a xmlChar's string
417 *
418 * Returns the number of xmlChar contained in the ARRAY.
419 */
420
421int
422xmlStrlen(const xmlChar *str) {
423    int len = 0;
424
425    if (str == NULL) return(0);
426    while (*str != 0) { /* non input consuming */
427        str++;
428        len++;
429    }
430    return(len);
431}
432
433/**
434 * xmlStrncat:
435 * @cur:  the original xmlChar * array
436 * @add:  the xmlChar * array added
437 * @len:  the length of @add
438 *
439 * a strncat for array of xmlChar's, it will extend @cur with the len
440 * first bytes of @add. Note that if @len < 0 then this is an API error
441 * and NULL will be returned.
442 *
443 * Returns a new xmlChar *, the original @cur is reallocated if needed
444 * and should not be freed
445 */
446
447xmlChar *
448xmlStrncat(xmlChar *cur, const xmlChar *add, int len) {
449    int size;
450    xmlChar *ret;
451
452    if ((add == NULL) || (len == 0))
453        return(cur);
454    if (len < 0)
455	return(NULL);
456    if (cur == NULL)
457        return(xmlStrndup(add, len));
458
459    size = xmlStrlen(cur);
460    ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar));
461    if (ret == NULL) {
462        xmlErrMemory(NULL, NULL);
463        return(cur);
464    }
465    memcpy(&ret[size], add, len * sizeof(xmlChar));
466    ret[size + len] = 0;
467    return(ret);
468}
469
470/**
471 * xmlStrncatNew:
472 * @str1:  first xmlChar string
473 * @str2:  second xmlChar string
474 * @len:  the len of @str2 or < 0
475 *
476 * same as xmlStrncat, but creates a new string.  The original
477 * two strings are not freed. If @len is < 0 then the length
478 * will be calculated automatically.
479 *
480 * Returns a new xmlChar * or NULL
481 */
482xmlChar *
483xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) {
484    int size;
485    xmlChar *ret;
486
487    if (len < 0)
488        len = xmlStrlen(str2);
489    if ((str2 == NULL) || (len == 0))
490        return(xmlStrdup(str1));
491    if (str1 == NULL)
492        return(xmlStrndup(str2, len));
493
494    size = xmlStrlen(str1);
495    ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar));
496    if (ret == NULL) {
497        xmlErrMemory(NULL, NULL);
498        return(xmlStrndup(str1, size));
499    }
500    memcpy(ret, str1, size * sizeof(xmlChar));
501    memcpy(&ret[size], str2, len * sizeof(xmlChar));
502    ret[size + len] = 0;
503    return(ret);
504}
505
506/**
507 * xmlStrcat:
508 * @cur:  the original xmlChar * array
509 * @add:  the xmlChar * array added
510 *
511 * a strcat for array of xmlChar's. Since they are supposed to be
512 * encoded in UTF-8 or an encoding with 8bit based chars, we assume
513 * a termination mark of '0'.
514 *
515 * Returns a new xmlChar * containing the concatenated string.
516 */
517xmlChar *
518xmlStrcat(xmlChar *cur, const xmlChar *add) {
519    const xmlChar *p = add;
520
521    if (add == NULL) return(cur);
522    if (cur == NULL)
523        return(xmlStrdup(add));
524
525    while (*p != 0) p++; /* non input consuming */
526    return(xmlStrncat(cur, add, p - add));
527}
528
529/**
530 * xmlStrPrintf:
531 * @buf:   the result buffer.
532 * @len:   the result buffer length.
533 * @msg:   the message with printf formatting.
534 * @...:   extra parameters for the message.
535 *
536 * Formats @msg and places result into @buf.
537 *
538 * Returns the number of characters written to @buf or -1 if an error occurs.
539 */
540int XMLCDECL
541xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) {
542    va_list args;
543    int ret;
544
545    if((buf == NULL) || (msg == NULL)) {
546        return(-1);
547    }
548
549    va_start(args, msg);
550    ret = vsnprintf((char *) buf, len, (const char *) msg, args);
551    va_end(args);
552    buf[len - 1] = 0; /* be safe ! */
553
554    return(ret);
555}
556
557/**
558 * xmlStrVPrintf:
559 * @buf:   the result buffer.
560 * @len:   the result buffer length.
561 * @msg:   the message with printf formatting.
562 * @ap:    extra parameters for the message.
563 *
564 * Formats @msg and places result into @buf.
565 *
566 * Returns the number of characters written to @buf or -1 if an error occurs.
567 */
568int
569xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) {
570    int ret;
571
572    if((buf == NULL) || (msg == NULL)) {
573        return(-1);
574    }
575
576    ret = vsnprintf((char *) buf, len, (const char *) msg, ap);
577    buf[len - 1] = 0; /* be safe ! */
578
579    return(ret);
580}
581
582/************************************************************************
583 *                                                                      *
584 *              Generic UTF8 handling routines                          *
585 *                                                                      *
586 * From rfc2044: encoding of the Unicode values on UTF-8:               *
587 *                                                                      *
588 * UCS-4 range (hex.)           UTF-8 octet sequence (binary)           *
589 * 0000 0000-0000 007F   0xxxxxxx                                       *
590 * 0000 0080-0000 07FF   110xxxxx 10xxxxxx                              *
 * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx                     *
 * 0001 0000-001F FFFF   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx            *
 *                                                                      *
 * Values > 0xFFFF use the 4-byte form; the routines below accept it.   *
594 *                                                                      *
595 ************************************************************************/
596
597
598/**
599 * xmlUTF8Size:
600 * @utf: pointer to the UTF8 character
601 *
602 * calculates the internal size of a UTF8 character
603 *
604 * returns the numbers of bytes in the character, -1 on format error
605 */
606int
607xmlUTF8Size(const xmlChar *utf) {
608    xmlChar mask;
609    int len;
610
611    if (utf == NULL)
612        return -1;
613    if (*utf < 0x80)
614        return 1;
615    /* check valid UTF8 character */
616    if (!(*utf & 0x40))
617        return -1;
618    /* determine number of bytes in char */
619    len = 2;
620    for (mask=0x20; mask != 0; mask>>=1) {
621        if (!(*utf & mask))
622            return len;
623        len++;
624    }
625    return -1;
626}
627
628/**
629 * xmlUTF8Charcmp:
630 * @utf1: pointer to first UTF8 char
631 * @utf2: pointer to second UTF8 char
632 *
633 * compares the two UCS4 values
634 *
635 * returns result of the compare as with xmlStrncmp
636 */
637int
638xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) {
639
640    if (utf1 == NULL ) {
641        if (utf2 == NULL)
642            return 0;
643        return -1;
644    }
645    return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1));
646}
647
648/**
649 * xmlUTF8Strlen:
650 * @utf:  a sequence of UTF-8 encoded bytes
651 *
652 * compute the length of an UTF8 string, it doesn't do a full UTF8
653 * checking of the content of the string.
654 *
655 * Returns the number of characters in the string or -1 in case of error
656 */
657int
658xmlUTF8Strlen(const xmlChar *utf) {
659    int ret = 0;
660
661    if (utf == NULL)
662        return(-1);
663
664    while (*utf != 0) {
665        if (utf[0] & 0x80) {
666            if ((utf[1] & 0xc0) != 0x80)
667                return(-1);
668            if ((utf[0] & 0xe0) == 0xe0) {
669                if ((utf[2] & 0xc0) != 0x80)
670                    return(-1);
671                if ((utf[0] & 0xf0) == 0xf0) {
672                    if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80)
673                        return(-1);
674                    utf += 4;
675                } else {
676                    utf += 3;
677                }
678            } else {
679                utf += 2;
680            }
681        } else {
682            utf++;
683        }
684        ret++;
685    }
686    return(ret);
687}
688
/**
 * xmlGetUTF8Char:
 * @utf:  a sequence of UTF-8 encoded bytes
 * @len:  a pointer to the number of bytes available in the sequence;
 *        no byte beyond that count is read.
 *
 * Read the first UTF8 character from @utf. Sequences of 1 to 4 bytes
 * are decoded; continuation bytes have to be of the form 10xxxxxx.
 *
 * Returns the char value or -1 in case of error, and sets *len to
 *        the actual number of bytes consumed (0 in case of error)
 */
int
xmlGetUTF8Char(const unsigned char *utf, int *len) {
    unsigned int lead, value;
    int avail;

    if ((utf == NULL) || (len == NULL))
        goto error;
    avail = *len;
    if (avail < 1)
        goto error;

    lead = utf[0];
    if ((lead & 0x80) == 0) {
        /* 1-byte code */
        *len = 1;
        return((int) lead);
    }

    /* every longer form needs a first continuation byte */
    if ((avail < 2) || ((utf[1] & 0xc0) != 0x80))
        goto error;
    if ((lead & 0xe0) != 0xe0) {
        /* 2-byte code */
        value = (lead & 0x1f) << 6;
        value |= utf[1] & 0x3f;
        *len = 2;
        return((int) value);
    }

    if ((avail < 3) || ((utf[2] & 0xc0) != 0x80))
        goto error;
    if ((lead & 0xf0) != 0xf0) {
        /* 3-byte code */
        value = (lead & 0xf) << 12;
        value |= (utf[1] & 0x3f) << 6;
        value |= utf[2] & 0x3f;
        *len = 3;
        return((int) value);
    }

    /* 4-byte code, the lead byte has to be 11110xxx */
    if ((avail < 4) || ((lead & 0xf8) != 0xf0) || ((utf[3] & 0xc0) != 0x80))
        goto error;
    value = (lead & 0x7) << 18;
    value |= (utf[1] & 0x3f) << 12;
    value |= (utf[2] & 0x3f) << 6;
    value |= utf[3] & 0x3f;
    *len = 4;
    return((int) value);

error:
    if (len != NULL)
        *len = 0;
    return(-1);
}
758
/**
 * xmlCheckUTF8:
 * @utf: Pointer to putative UTF-8 encoded string.
 *
 * Checks @utf for being valid UTF-8. @utf is assumed to be
 * null-terminated. This function is not super-strict, as it will
 * allow longer UTF-8 sequences than necessary. Note that Java is
 * capable of producing these sequences if provoked. Also note, this
 * routine checks for the 4-byte maximum size, but does not check for
 * 0x10ffff maximum value.
 *
 * Return value: true if @utf is valid.
 **/
int
xmlCheckUTF8(const unsigned char *utf)
{
    const unsigned char *cur;
    unsigned char lead;
    int trail, k;

    if (utf == NULL)
        return(0);
    /*
     * Each character is made of 1, 2, 3 or 4 bytes.  The valid forms
     * are as follows (in "bit format"):
     *    0xxxxxxx                                      valid 1-byte
     *    110xxxxx 10xxxxxx                             valid 2-byte
     *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
     *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
     */
    for (cur = utf; (lead = *cur) != 0; cur += trail + 1) {
        /* the lead byte fixes how many continuation bytes follow */
        if ((lead & 0x80) == 0x00)
            trail = 0;                  /* starts with 0 */
        else if ((lead & 0xe0) == 0xc0)
            trail = 1;                  /* starts with 110 */
        else if ((lead & 0xf0) == 0xe0)
            trail = 2;                  /* starts with 1110 */
        else if ((lead & 0xf8) == 0xf0)
            trail = 3;                  /* starts with 11110 */
        else
            return(0);                  /* unknown encoding */

        /* a terminating 0 fails this test, so nothing is read past it */
        for (k = 1; k <= trail; k++) {
            if ((cur[k] & 0xc0) != 0x80)
                return(0);
        }
    }
    return(1);
}
811
812/**
813 * xmlUTF8Strsize:
814 * @utf:  a sequence of UTF-8 encoded bytes
815 * @len:  the number of characters in the array
816 *
817 * storage size of an UTF8 string
818 * the behaviour is not garanteed if the input string is not UTF-8
819 *
820 * Returns the storage size of
821 * the first 'len' characters of ARRAY
822 */
823
824int
825xmlUTF8Strsize(const xmlChar *utf, int len) {
826    const xmlChar   *ptr=utf;
827    xmlChar         ch;
828
829    if (utf == NULL)
830        return(0);
831
832    if (len <= 0)
833        return(0);
834
835    while ( len-- > 0) {
836        if ( !*ptr )
837            break;
838        if ( (ch = *ptr++) & 0x80)
839            while ((ch<<=1) & 0x80 ) {
840                ptr++;
841		if (*ptr == 0) break;
842	    }
843    }
844    return (ptr - utf);
845}
846
847
848/**
849 * xmlUTF8Strndup:
850 * @utf:  the input UTF8 *
851 * @len:  the len of @utf (in chars)
852 *
853 * a strndup for array of UTF8's
854 *
855 * Returns a new UTF8 * or NULL
856 */
857xmlChar *
858xmlUTF8Strndup(const xmlChar *utf, int len) {
859    xmlChar *ret;
860    int i;
861
862    if ((utf == NULL) || (len < 0)) return(NULL);
863    i = xmlUTF8Strsize(utf, len);
864    ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar));
865    if (ret == NULL) {
866        xmlGenericError(xmlGenericErrorContext,
867                "malloc of %ld byte failed\n",
868                (len + 1) * (long)sizeof(xmlChar));
869        return(NULL);
870    }
871    memcpy(ret, utf, i * sizeof(xmlChar));
872    ret[i] = 0;
873    return(ret);
874}
875
876/**
877 * xmlUTF8Strpos:
878 * @utf:  the input UTF8 *
879 * @pos:  the position of the desired UTF8 char (in chars)
880 *
881 * a function to provide the equivalent of fetching a
882 * character from a string array
883 *
884 * Returns a pointer to the UTF8 character or NULL
885 */
886const xmlChar *
887xmlUTF8Strpos(const xmlChar *utf, int pos) {
888    xmlChar ch;
889
890    if (utf == NULL) return(NULL);
891    if (pos < 0)
892        return(NULL);
893    while (pos--) {
894        if ((ch=*utf++) == 0) return(NULL);
895        if ( ch & 0x80 ) {
896            /* if not simple ascii, verify proper format */
897            if ( (ch & 0xc0) != 0xc0 )
898                return(NULL);
899            /* then skip over remaining bytes for this char */
900            while ( (ch <<= 1) & 0x80 )
901                if ( (*utf++ & 0xc0) != 0x80 )
902                    return(NULL);
903        }
904    }
905    return((xmlChar *)utf);
906}
907
908/**
909 * xmlUTF8Strloc:
910 * @utf:  the input UTF8 *
911 * @utfchar:  the UTF8 character to be found
912 *
913 * a function to provide the relative location of a UTF8 char
914 *
915 * Returns the relative character position of the desired char
916 * or -1 if not found
917 */
918int
919xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) {
920    int i, size;
921    xmlChar ch;
922
923    if (utf==NULL || utfchar==NULL) return -1;
924    size = xmlUTF8Strsize(utfchar, 1);
925        for(i=0; (ch=*utf) != 0; i++) {
926            if (xmlStrncmp(utf, utfchar, size)==0)
927                return(i);
928            utf++;
929            if ( ch & 0x80 ) {
930                /* if not simple ascii, verify proper format */
931                if ( (ch & 0xc0) != 0xc0 )
932                    return(-1);
933                /* then skip over remaining bytes for this char */
934                while ( (ch <<= 1) & 0x80 )
935                    if ( (*utf++ & 0xc0) != 0x80 )
936                        return(-1);
937            }
938        }
939
940    return(-1);
941}
942/**
943 * xmlUTF8Strsub:
944 * @utf:  a sequence of UTF-8 encoded bytes
945 * @start: relative pos of first char
946 * @len:   total number to copy
947 *
948 * Create a substring from a given UTF-8 string
949 * Note:  positions are given in units of UTF-8 chars
950 *
951 * Returns a pointer to a newly created string
952 * or NULL if any problem
953 */
954
955xmlChar *
956xmlUTF8Strsub(const xmlChar *utf, int start, int len) {
957    int            i;
958    xmlChar ch;
959
960    if (utf == NULL) return(NULL);
961    if (start < 0) return(NULL);
962    if (len < 0) return(NULL);
963
964    /*
965     * Skip over any leading chars
966     */
967    for (i = 0;i < start;i++) {
968        if ((ch=*utf++) == 0) return(NULL);
969        if ( ch & 0x80 ) {
970            /* if not simple ascii, verify proper format */
971            if ( (ch & 0xc0) != 0xc0 )
972                return(NULL);
973            /* then skip over remaining bytes for this char */
974            while ( (ch <<= 1) & 0x80 )
975                if ( (*utf++ & 0xc0) != 0x80 )
976                    return(NULL);
977        }
978    }
979
980    return(xmlUTF8Strndup(utf, len));
981}
982
983#define bottom_xmlstring
984#include "elfgcchack.h"
985