/* xmlstring.c revision f409515f15f34811f319ca4d5d9ba77b525eda8c */
1/* 2 * string.c : an XML string utilities module 3 * 4 * This module provides various utility functions for manipulating 5 * the xmlChar* type. All functions named xmlStr* have been moved here 6 * from the parser.c file (their original home). 7 * 8 * See Copyright for the status of this software. 9 * 10 * UTF8 string routines from: 11 * William Brack <wbrack@mmm.com.hk> 12 * 13 * daniel@veillard.com 14 */ 15 16#define IN_LIBXML 17#include "libxml.h" 18 19#include <stdlib.h> 20#include <string.h> 21#include <libxml/xmlmemory.h> 22#include <libxml/parserInternals.h> 23#include <libxml/xmlstring.h> 24 25/************************************************************************ 26 * * 27 * Commodity functions to handle xmlChars * 28 * * 29 ************************************************************************/ 30 31/** 32 * xmlStrndup: 33 * @cur: the input xmlChar * 34 * @len: the len of @cur 35 * 36 * a strndup for array of xmlChar's 37 * 38 * Returns a new xmlChar * or NULL 39 */ 40xmlChar * 41xmlStrndup(const xmlChar *cur, int len) { 42 xmlChar *ret; 43 44 if ((cur == NULL) || (len < 0)) return(NULL); 45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 46 if (ret == NULL) { 47 xmlErrMemory(NULL, NULL); 48 return(NULL); 49 } 50 memcpy(ret, cur, len * sizeof(xmlChar)); 51 ret[len] = 0; 52 return(ret); 53} 54 55/** 56 * xmlStrdup: 57 * @cur: the input xmlChar * 58 * 59 * a strdup for array of xmlChar's. Since they are supposed to be 60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 61 * a termination mark of '0'. 
62 * 63 * Returns a new xmlChar * or NULL 64 */ 65xmlChar * 66xmlStrdup(const xmlChar *cur) { 67 const xmlChar *p = cur; 68 69 if (cur == NULL) return(NULL); 70 while (*p != 0) p++; /* non input consuming */ 71 return(xmlStrndup(cur, p - cur)); 72} 73 74/** 75 * xmlCharStrndup: 76 * @cur: the input char * 77 * @len: the len of @cur 78 * 79 * a strndup for char's to xmlChar's 80 * 81 * Returns a new xmlChar * or NULL 82 */ 83 84xmlChar * 85xmlCharStrndup(const char *cur, int len) { 86 int i; 87 xmlChar *ret; 88 89 if ((cur == NULL) || (len < 0)) return(NULL); 90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 91 if (ret == NULL) { 92 xmlErrMemory(NULL, NULL); 93 return(NULL); 94 } 95 for (i = 0;i < len;i++) 96 ret[i] = (xmlChar) cur[i]; 97 ret[len] = 0; 98 return(ret); 99} 100 101/** 102 * xmlCharStrdup: 103 * @cur: the input char * 104 * 105 * a strdup for char's to xmlChar's 106 * 107 * Returns a new xmlChar * or NULL 108 */ 109 110xmlChar * 111xmlCharStrdup(const char *cur) { 112 const char *p = cur; 113 114 if (cur == NULL) return(NULL); 115 while (*p != '\0') p++; /* non input consuming */ 116 return(xmlCharStrndup(cur, p - cur)); 117} 118 119/** 120 * xmlStrcmp: 121 * @str1: the first xmlChar * 122 * @str2: the second xmlChar * 123 * 124 * a strcmp for xmlChar's 125 * 126 * Returns the integer result of the comparison 127 */ 128 129int 130xmlStrcmp(const xmlChar *str1, const xmlChar *str2) { 131 register int tmp; 132 133 if (str1 == str2) return(0); 134 if (str1 == NULL) return(-1); 135 if (str2 == NULL) return(1); 136 do { 137 tmp = *str1++ - *str2; 138 if (tmp != 0) return(tmp); 139 } while (*str2++ != 0); 140 return 0; 141} 142 143/** 144 * xmlStrEqual: 145 * @str1: the first xmlChar * 146 * @str2: the second xmlChar * 147 * 148 * Check if both string are equal of have same content 149 * Should be a bit more readable and faster than xmlStrEqual() 150 * 151 * Returns 1 if they are equal, 0 if they are different 152 */ 153 154int 
155xmlStrEqual(const xmlChar *str1, const xmlChar *str2) { 156 if (str1 == str2) return(1); 157 if (str1 == NULL) return(0); 158 if (str2 == NULL) return(0); 159 do { 160 if (*str1++ != *str2) return(0); 161 } while (*str2++); 162 return(1); 163} 164 165/** 166 * xmlStrQEqual: 167 * @pref: the prefix of the QName 168 * @name: the localname of the QName 169 * @str: the second xmlChar * 170 * 171 * Check if a QName is Equal to a given string 172 * 173 * Returns 1 if they are equal, 0 if they are different 174 */ 175 176int 177xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) { 178 if (pref == NULL) return(xmlStrEqual(name, str)); 179 if (name == NULL) return(0); 180 if (str == NULL) return(0); 181 182 do { 183 if (*pref++ != *str) return(0); 184 } while ((*str++) && (*pref)); 185 if (*str++ != ':') return(0); 186 do { 187 if (*name++ != *str) return(0); 188 } while (*str++); 189 return(1); 190} 191 192/** 193 * xmlStrncmp: 194 * @str1: the first xmlChar * 195 * @str2: the second xmlChar * 196 * @len: the max comparison length 197 * 198 * a strncmp for xmlChar's 199 * 200 * Returns the integer result of the comparison 201 */ 202 203int 204xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) { 205 register int tmp; 206 207 if (len <= 0) return(0); 208 if (str1 == str2) return(0); 209 if (str1 == NULL) return(-1); 210 if (str2 == NULL) return(1); 211#ifdef __GNUC__ 212 tmp = strncmp(str1, str2, len); 213 return tmp; 214#else 215 do { 216 tmp = *str1++ - *str2; 217 if (tmp != 0 || --len == 0) return(tmp); 218 } while (*str2++ != 0); 219 return 0; 220#endif 221} 222 223static const xmlChar casemap[256] = { 224 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 225 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 226 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 227 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 228 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 229 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, 230 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 231 
0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, 232 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 233 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 234 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 235 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F, 236 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 237 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 238 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 239 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, 240 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 241 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 242 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 243 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 244 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, 245 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, 246 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, 247 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, 248 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, 249 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, 250 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, 251 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 252 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, 253 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 254 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, 255 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF 256}; 257 258/** 259 * xmlStrcasecmp: 260 * @str1: the first xmlChar * 261 * @str2: the second xmlChar * 262 * 263 * a strcasecmp for xmlChar's 264 * 265 * Returns the integer result of the comparison 266 */ 267 268int 269xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) { 270 register int tmp; 271 272 if (str1 == str2) return(0); 273 if (str1 == NULL) return(-1); 274 if (str2 == NULL) return(1); 275 do { 276 tmp = casemap[*str1++] - casemap[*str2]; 277 if (tmp != 0) return(tmp); 278 } while (*str2++ != 0); 279 return 0; 280} 281 282/** 283 * xmlStrncasecmp: 284 * @str1: the first xmlChar * 285 * @str2: the second xmlChar * 286 * @len: the max comparison length 287 * 288 * a strncasecmp for xmlChar's 289 * 290 * Returns the integer result of the comparison 291 */ 292 293int 294xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) { 295 register int 
tmp; 296 297 if (len <= 0) return(0); 298 if (str1 == str2) return(0); 299 if (str1 == NULL) return(-1); 300 if (str2 == NULL) return(1); 301 do { 302 tmp = casemap[*str1++] - casemap[*str2]; 303 if (tmp != 0 || --len == 0) return(tmp); 304 } while (*str2++ != 0); 305 return 0; 306} 307 308/** 309 * xmlStrchr: 310 * @str: the xmlChar * array 311 * @val: the xmlChar to search 312 * 313 * a strchr for xmlChar's 314 * 315 * Returns the xmlChar * for the first occurrence or NULL. 316 */ 317 318const xmlChar * 319xmlStrchr(const xmlChar *str, xmlChar val) { 320 if (str == NULL) return(NULL); 321 while (*str != 0) { /* non input consuming */ 322 if (*str == val) return((xmlChar *) str); 323 str++; 324 } 325 return(NULL); 326} 327 328/** 329 * xmlStrstr: 330 * @str: the xmlChar * array (haystack) 331 * @val: the xmlChar to search (needle) 332 * 333 * a strstr for xmlChar's 334 * 335 * Returns the xmlChar * for the first occurrence or NULL. 336 */ 337 338const xmlChar * 339xmlStrstr(const xmlChar *str, const xmlChar *val) { 340 int n; 341 342 if (str == NULL) return(NULL); 343 if (val == NULL) return(NULL); 344 n = xmlStrlen(val); 345 346 if (n == 0) return(str); 347 while (*str != 0) { /* non input consuming */ 348 if (*str == *val) { 349 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str); 350 } 351 str++; 352 } 353 return(NULL); 354} 355 356/** 357 * xmlStrcasestr: 358 * @str: the xmlChar * array (haystack) 359 * @val: the xmlChar to search (needle) 360 * 361 * a case-ignoring strstr for xmlChar's 362 * 363 * Returns the xmlChar * for the first occurrence or NULL. 
364 */ 365 366const xmlChar * 367xmlStrcasestr(const xmlChar *str, xmlChar *val) { 368 int n; 369 370 if (str == NULL) return(NULL); 371 if (val == NULL) return(NULL); 372 n = xmlStrlen(val); 373 374 if (n == 0) return(str); 375 while (*str != 0) { /* non input consuming */ 376 if (casemap[*str] == casemap[*val]) 377 if (!xmlStrncasecmp(str, val, n)) return(str); 378 str++; 379 } 380 return(NULL); 381} 382 383/** 384 * xmlStrsub: 385 * @str: the xmlChar * array (haystack) 386 * @start: the index of the first char (zero based) 387 * @len: the length of the substring 388 * 389 * Extract a substring of a given string 390 * 391 * Returns the xmlChar * for the first occurrence or NULL. 392 */ 393 394xmlChar * 395xmlStrsub(const xmlChar *str, int start, int len) { 396 int i; 397 398 if (str == NULL) return(NULL); 399 if (start < 0) return(NULL); 400 if (len < 0) return(NULL); 401 402 for (i = 0;i < start;i++) { 403 if (*str == 0) return(NULL); 404 str++; 405 } 406 if (*str == 0) return(NULL); 407 return(xmlStrndup(str, len)); 408} 409 410/** 411 * xmlStrlen: 412 * @str: the xmlChar * array 413 * 414 * length of a xmlChar's string 415 * 416 * Returns the number of xmlChar contained in the ARRAY. 417 */ 418 419int 420xmlStrlen(const xmlChar *str) { 421 int len = 0; 422 423 if (str == NULL) return(0); 424 while (*str != 0) { /* non input consuming */ 425 str++; 426 len++; 427 } 428 return(len); 429} 430 431/** 432 * xmlStrncat: 433 * @cur: the original xmlChar * array 434 * @add: the xmlChar * array added 435 * @len: the length of @add 436 * 437 * a strncat for array of xmlChar's, it will extend @cur with the len 438 * first bytes of @add. 
439 * 440 * Returns a new xmlChar *, the original @cur is reallocated if needed 441 * and should not be freed 442 */ 443 444xmlChar * 445xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { 446 int size; 447 xmlChar *ret; 448 449 if ((add == NULL) || (len == 0)) 450 return(cur); 451 if (cur == NULL) 452 return(xmlStrndup(add, len)); 453 454 size = xmlStrlen(cur); 455 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); 456 if (ret == NULL) { 457 xmlErrMemory(NULL, NULL); 458 return(cur); 459 } 460 memcpy(&ret[size], add, len * sizeof(xmlChar)); 461 ret[size + len] = 0; 462 return(ret); 463} 464 465/** 466 * xmlStrncatNew: 467 * @str1: first xmlChar string 468 * @str2: second xmlChar string 469 * @len: the len of @str2 470 * 471 * same as xmlStrncat, but creates a new string. The original 472 * two strings are not freed. 473 * 474 * Returns a new xmlChar * or NULL 475 */ 476xmlChar * 477xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) { 478 int size; 479 xmlChar *ret; 480 481 if ((str2 == NULL) || (len == 0)) 482 return(xmlStrdup(str1)); 483 if (str1 == NULL) 484 return(xmlStrndup(str2, len)); 485 486 size = xmlStrlen(str1); 487 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar)); 488 if (ret == NULL) { 489 xmlErrMemory(NULL, NULL); 490 return(xmlStrndup(str1, size)); 491 } 492 memcpy(ret, str1, size * sizeof(xmlChar)); 493 memcpy(&ret[size], str2, len * sizeof(xmlChar)); 494 ret[size + len] = 0; 495 return(ret); 496} 497 498/** 499 * xmlStrcat: 500 * @cur: the original xmlChar * array 501 * @add: the xmlChar * array added 502 * 503 * a strcat for array of xmlChar's. Since they are supposed to be 504 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 505 * a termination mark of '0'. 506 * 507 * Returns a new xmlChar * containing the concatenated string. 
508 */ 509xmlChar * 510xmlStrcat(xmlChar *cur, const xmlChar *add) { 511 const xmlChar *p = add; 512 513 if (add == NULL) return(cur); 514 if (cur == NULL) 515 return(xmlStrdup(add)); 516 517 while (*p != 0) p++; /* non input consuming */ 518 return(xmlStrncat(cur, add, p - add)); 519} 520 521/** 522 * xmlStrPrintf: 523 * @buf: the result buffer. 524 * @len: the result buffer length. 525 * @msg: the message with printf formatting. 526 * @...: extra parameters for the message. 527 * 528 * Formats @msg and places result into @buf. 529 * 530 * Returns the number of characters written to @buf or -1 if an error occurs. 531 */ 532int 533xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) { 534 va_list args; 535 int ret; 536 537 if((buf == NULL) || (msg == NULL)) { 538 return(-1); 539 } 540 541 va_start(args, msg); 542 ret = vsnprintf((char *) buf, len, (const char *) msg, args); 543 va_end(args); 544 buf[len - 1] = 0; /* be safe ! */ 545 546 return(ret); 547} 548 549/** 550 * xmlStrVPrintf: 551 * @buf: the result buffer. 552 * @len: the result buffer length. 553 * @msg: the message with printf formatting. 554 * @ap: extra parameters for the message. 555 * 556 * Formats @msg and places result into @buf. 557 * 558 * Returns the number of characters written to @buf or -1 if an error occurs. 559 */ 560int 561xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) { 562 int ret; 563 564 if((buf == NULL) || (msg == NULL)) { 565 return(-1); 566 } 567 568 ret = vsnprintf((char *) buf, len, (const char *) msg, ap); 569 buf[len - 1] = 0; /* be safe ! */ 570 571 return(ret); 572} 573 574/************************************************************************ 575 * * 576 * Generic UTF8 handling routines * 577 * * 578 * From rfc2044: encoding of the Unicode values on UTF-8: * 579 * * 580 * UCS-4 range (hex.) 
UTF-8 octet sequence (binary) * 581 * 0000 0000-0000 007F 0xxxxxxx * 582 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 583 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 584 * * 585 * I hope we won't use values > 0xFFFF anytime soon ! * 586 * * 587 ************************************************************************/ 588 589 590/** 591 * xmlUTF8Size: 592 * @utf: pointer to the UTF8 character 593 * 594 * calculates the internal size of a UTF8 character 595 * 596 * returns the numbers of bytes in the character, -1 on format error 597 */ 598int 599xmlUTF8Size(const xmlChar *utf) { 600 xmlChar mask; 601 int len; 602 603 if (utf == NULL) 604 return -1; 605 if (*utf < 0x80) 606 return 1; 607 /* check valid UTF8 character */ 608 if (!(*utf & 0x40)) 609 return -1; 610 /* determine number of bytes in char */ 611 len = 2; 612 for (mask=0x20; mask != 0; mask>>=1) { 613 if (!(*utf & mask)) 614 return len; 615 len++; 616 } 617 return -1; 618} 619 620/** 621 * xmlUTF8Charcmp: 622 * @utf1: pointer to first UTF8 char 623 * @utf2: pointer to second UTF8 char 624 * 625 * compares the two UCS4 values 626 * 627 * returns result of the compare as with xmlStrncmp 628 */ 629int 630xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { 631 632 if (utf1 == NULL ) { 633 if (utf2 == NULL) 634 return 0; 635 return -1; 636 } 637 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); 638} 639 640/** 641 * xmlUTF8Strlen: 642 * @utf: a sequence of UTF-8 encoded bytes 643 * 644 * compute the length of an UTF8 string, it doesn't do a full UTF8 645 * checking of the content of the string. 
646 * 647 * Returns the number of characters in the string or -1 in case of error 648 */ 649int 650xmlUTF8Strlen(const xmlChar *utf) { 651 int ret = 0; 652 653 if (utf == NULL) 654 return(-1); 655 656 while (*utf != 0) { 657 if (utf[0] & 0x80) { 658 if ((utf[1] & 0xc0) != 0x80) 659 return(-1); 660 if ((utf[0] & 0xe0) == 0xe0) { 661 if ((utf[2] & 0xc0) != 0x80) 662 return(-1); 663 if ((utf[0] & 0xf0) == 0xf0) { 664 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 665 return(-1); 666 utf += 4; 667 } else { 668 utf += 3; 669 } 670 } else { 671 utf += 2; 672 } 673 } else { 674 utf++; 675 } 676 ret++; 677 } 678 return(ret); 679} 680 681/** 682 * xmlGetUTF8Char: 683 * @utf: a sequence of UTF-8 encoded bytes 684 * @len: a pointer to @bytes len 685 * 686 * Read one UTF8 Char from @utf 687 * 688 * Returns the char value or -1 in case of error, and updates *len with the 689 * number of bytes consumed 690 */ 691int 692xmlGetUTF8Char(const unsigned char *utf, int *len) { 693 unsigned int c; 694 695 if (utf == NULL) 696 goto error; 697 if (len == NULL) 698 goto error; 699 if (*len < 1) 700 goto error; 701 702 c = utf[0]; 703 if (c & 0x80) { 704 if (*len < 2) 705 goto error; 706 if ((utf[1] & 0xc0) != 0x80) 707 goto error; 708 if ((c & 0xe0) == 0xe0) { 709 if (*len < 3) 710 goto error; 711 if ((utf[2] & 0xc0) != 0x80) 712 goto error; 713 if ((c & 0xf0) == 0xf0) { 714 if (*len < 4) 715 goto error; 716 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 717 goto error; 718 *len = 4; 719 /* 4-byte code */ 720 c = (utf[0] & 0x7) << 18; 721 c |= (utf[1] & 0x3f) << 12; 722 c |= (utf[2] & 0x3f) << 6; 723 c |= utf[3] & 0x3f; 724 } else { 725 /* 3-byte code */ 726 *len = 3; 727 c = (utf[0] & 0xf) << 12; 728 c |= (utf[1] & 0x3f) << 6; 729 c |= utf[2] & 0x3f; 730 } 731 } else { 732 /* 2-byte code */ 733 *len = 2; 734 c = (utf[0] & 0x1f) << 6; 735 c |= utf[1] & 0x3f; 736 } 737 } else { 738 /* 1-byte code */ 739 *len = 1; 740 } 741 return(c); 742 743error: 744 *len = 0; 745 return(-1); 
746} 747 748/** 749 * xmlCheckUTF8: 750 * @utf: Pointer to putative UTF-8 encoded string. 751 * 752 * Checks @utf for being valid UTF-8. @utf is assumed to be 753 * null-terminated. This function is not super-strict, as it will 754 * allow longer UTF-8 sequences than necessary. Note that Java is 755 * capable of producing these sequences if provoked. Also note, this 756 * routine checks for the 4-byte maximum size, but does not check for 757 * 0x10ffff maximum value. 758 * 759 * Return value: true if @utf is valid. 760 **/ 761int 762xmlCheckUTF8(const unsigned char *utf) 763{ 764 int ix; 765 unsigned char c; 766 767 /* 768 * utf is a string of 1, 2, 3 or 4 bytes. The valid strings 769 * are as follows (in "bit format"): 770 * 0xxxxxxx valid 1-byte 771 * 110xxxxx 10xxxxxx valid 2-byte 772 * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte 773 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte 774 */ 775 for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */ 776 if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */ 777 ix++; 778 } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ 779 if ((utf[ix+1] & 0xc0 ) != 0x80) 780 return 0; 781 ix += 2; 782 } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ 783 if (((utf[ix+1] & 0xc0) != 0x80) || 784 ((utf[ix+2] & 0xc0) != 0x80)) 785 return 0; 786 ix += 3; 787 } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ 788 if (((utf[ix+1] & 0xc0) != 0x80) || 789 ((utf[ix+2] & 0xc0) != 0x80) || 790 ((utf[ix+3] & 0xc0) != 0x80)) 791 return 0; 792 ix += 4; 793 } else /* unknown encoding */ 794 return 0; 795 } 796 return(1); 797} 798 799/** 800 * xmlUTF8Strsize: 801 * @utf: a sequence of UTF-8 encoded bytes 802 * @len: the number of characters in the array 803 * 804 * storage size of an UTF8 string 805 * 806 * Returns the storage size of 807 * the first 'len' characters of ARRAY 808 * 809 */ 810 811int 812xmlUTF8Strsize(const xmlChar *utf, int len) { 813 const xmlChar *ptr=utf; 814 xmlChar 
ch; 815 816 if (len <= 0) 817 return(0); 818 819 while ( len-- > 0) { 820 if ( !*ptr ) 821 break; 822 if ( (ch = *ptr++) & 0x80) 823 while ( (ch<<=1) & 0x80 ) 824 ptr++; 825 } 826 return (ptr - utf); 827} 828 829 830/** 831 * xmlUTF8Strndup: 832 * @utf: the input UTF8 * 833 * @len: the len of @utf (in chars) 834 * 835 * a strndup for array of UTF8's 836 * 837 * Returns a new UTF8 * or NULL 838 */ 839xmlChar * 840xmlUTF8Strndup(const xmlChar *utf, int len) { 841 xmlChar *ret; 842 int i; 843 844 if ((utf == NULL) || (len < 0)) return(NULL); 845 i = xmlUTF8Strsize(utf, len); 846 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar)); 847 if (ret == NULL) { 848 xmlGenericError(xmlGenericErrorContext, 849 "malloc of %ld byte failed\n", 850 (len + 1) * (long)sizeof(xmlChar)); 851 return(NULL); 852 } 853 memcpy(ret, utf, i * sizeof(xmlChar)); 854 ret[i] = 0; 855 return(ret); 856} 857 858/** 859 * xmlUTF8Strpos: 860 * @utf: the input UTF8 * 861 * @pos: the position of the desired UTF8 char (in chars) 862 * 863 * a function to provide the equivalent of fetching a 864 * character from a string array 865 * 866 * Returns a pointer to the UTF8 character or NULL 867 */ 868xmlChar * 869xmlUTF8Strpos(const xmlChar *utf, int pos) { 870 xmlChar ch; 871 872 if (utf == NULL) return(NULL); 873 if ( (pos < 0) || (pos >= xmlUTF8Strlen(utf)) ) 874 return(NULL); 875 while (pos--) { 876 if ((ch=*utf++) == 0) return(NULL); 877 if ( ch & 0x80 ) { 878 /* if not simple ascii, verify proper format */ 879 if ( (ch & 0xc0) != 0xc0 ) 880 return(NULL); 881 /* then skip over remaining bytes for this char */ 882 while ( (ch <<= 1) & 0x80 ) 883 if ( (*utf++ & 0xc0) != 0x80 ) 884 return(NULL); 885 } 886 } 887 return((xmlChar *)utf); 888} 889 890/** 891 * xmlUTF8Strloc: 892 * @utf: the input UTF8 * 893 * @utfchar: the UTF8 character to be found 894 * 895 * a function to provide the relative location of a UTF8 char 896 * 897 * Returns the relative character position of the desired char 898 * or -1 
if not found 899 */ 900int 901xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { 902 int i, size; 903 xmlChar ch; 904 905 if (utf==NULL || utfchar==NULL) return -1; 906 size = xmlUTF8Strsize(utfchar, 1); 907 for(i=0; (ch=*utf) != 0; i++) { 908 if (xmlStrncmp(utf, utfchar, size)==0) 909 return(i); 910 utf++; 911 if ( ch & 0x80 ) { 912 /* if not simple ascii, verify proper format */ 913 if ( (ch & 0xc0) != 0xc0 ) 914 return(-1); 915 /* then skip over remaining bytes for this char */ 916 while ( (ch <<= 1) & 0x80 ) 917 if ( (*utf++ & 0xc0) != 0x80 ) 918 return(-1); 919 } 920 } 921 922 return(-1); 923} 924/** 925 * xmlUTF8Strsub: 926 * @utf: a sequence of UTF-8 encoded bytes 927 * @start: relative pos of first char 928 * @len: total number to copy 929 * 930 * Create a substring from a given UTF-8 string 931 * Note: positions are given in units of UTF-8 chars 932 * 933 * Returns a pointer to a newly created string 934 * or NULL if any problem 935 */ 936 937xmlChar * 938xmlUTF8Strsub(const xmlChar *utf, int start, int len) { 939 int i; 940 xmlChar ch; 941 942 if (utf == NULL) return(NULL); 943 if (start < 0) return(NULL); 944 if (len < 0) return(NULL); 945 946 /* 947 * Skip over any leading chars 948 */ 949 for (i = 0;i < start;i++) { 950 if ((ch=*utf++) == 0) return(NULL); 951 if ( ch & 0x80 ) { 952 /* if not simple ascii, verify proper format */ 953 if ( (ch & 0xc0) != 0xc0 ) 954 return(NULL); 955 /* then skip over remaining bytes for this char */ 956 while ( (ch <<= 1) & 0x80 ) 957 if ( (*utf++ & 0xc0) != 0x80 ) 958 return(NULL); 959 } 960 } 961 962 return(xmlUTF8Strndup(utf, len)); 963} 964