xmlstring.c revision ffa3c74933baa45bef5e0d7f15473c38a4c3f9e8
1/* 2 * string.c : an XML string utilities module 3 * 4 * This module provides various utility functions for manipulating 5 * the xmlChar* type. All functions named xmlStr* have been moved here 6 * from the parser.c file (their original home). 7 * 8 * See Copyright for the status of this software. 9 * 10 * UTF8 string routines from: 11 * William Brack <wbrack@mmm.com.hk> 12 * 13 * daniel@veillard.com 14 */ 15 16#define IN_LIBXML 17#include "libxml.h" 18 19#include <stdlib.h> 20#include <string.h> 21#include <libxml/xmlmemory.h> 22#include <libxml/parserInternals.h> 23#include <libxml/xmlstring.h> 24 25/************************************************************************ 26 * * 27 * Commodity functions to handle xmlChars * 28 * * 29 ************************************************************************/ 30 31/** 32 * xmlStrndup: 33 * @cur: the input xmlChar * 34 * @len: the len of @cur 35 * 36 * a strndup for array of xmlChar's 37 * 38 * Returns a new xmlChar * or NULL 39 */ 40xmlChar * 41xmlStrndup(const xmlChar *cur, int len) { 42 xmlChar *ret; 43 44 if ((cur == NULL) || (len < 0)) return(NULL); 45 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 46 if (ret == NULL) { 47 xmlErrMemory(NULL, NULL); 48 return(NULL); 49 } 50 memcpy(ret, cur, len * sizeof(xmlChar)); 51 ret[len] = 0; 52 return(ret); 53} 54 55/** 56 * xmlStrdup: 57 * @cur: the input xmlChar * 58 * 59 * a strdup for array of xmlChar's. Since they are supposed to be 60 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 61 * a termination mark of '0'. 62 * 63 * Returns a new xmlChar * or NULL 64 */ 65xmlChar * 66xmlStrdup(const xmlChar *cur) { 67 const xmlChar *p = cur; 68 69 if (cur == NULL) return(NULL); 70 while (*p != 0) p++; /* non input consuming */ 71 return(xmlStrndup(cur, p - cur)); 72} 73 74/** 75 * xmlCharStrndup: 76 * @cur: the input char * 77 * @len: the len of @cur 78 * 79 * a strndup for char's to xmlChar's 80 * 81 * Returns a new xmlChar * or NULL 82 */ 83 84xmlChar * 85xmlCharStrndup(const char *cur, int len) { 86 int i; 87 xmlChar *ret; 88 89 if ((cur == NULL) || (len < 0)) return(NULL); 90 ret = (xmlChar *) xmlMallocAtomic((len + 1) * sizeof(xmlChar)); 91 if (ret == NULL) { 92 xmlErrMemory(NULL, NULL); 93 return(NULL); 94 } 95 for (i = 0;i < len;i++) { 96 ret[i] = (xmlChar) cur[i]; 97 if (ret[i] == 0) return(ret); 98 } 99 ret[len] = 0; 100 return(ret); 101} 102 103/** 104 * xmlCharStrdup: 105 * @cur: the input char * 106 * 107 * a strdup for char's to xmlChar's 108 * 109 * Returns a new xmlChar * or NULL 110 */ 111 112xmlChar * 113xmlCharStrdup(const char *cur) { 114 const char *p = cur; 115 116 if (cur == NULL) return(NULL); 117 while (*p != '\0') p++; /* non input consuming */ 118 return(xmlCharStrndup(cur, p - cur)); 119} 120 121/** 122 * xmlStrcmp: 123 * @str1: the first xmlChar * 124 * @str2: the second xmlChar * 125 * 126 * a strcmp for xmlChar's 127 * 128 * Returns the integer result of the comparison 129 */ 130 131int 132xmlStrcmp(const xmlChar *str1, const xmlChar *str2) { 133 register int tmp; 134 135 if (str1 == str2) return(0); 136 if (str1 == NULL) return(-1); 137 if (str2 == NULL) return(1); 138 do { 139 tmp = *str1++ - *str2; 140 if (tmp != 0) return(tmp); 141 } while (*str2++ != 0); 142 return 0; 143} 144 145/** 146 * xmlStrEqual: 147 * @str1: the first xmlChar * 148 * @str2: the second xmlChar * 149 * 150 * Check if both string are equal of have same content 151 * Should be a bit more readable and faster than xmlStrEqual() 152 * 153 * Returns 1 if they are equal, 0 if they are different 154 */ 155 156int 157xmlStrEqual(const xmlChar *str1, const xmlChar *str2) { 158 if (str1 == str2) return(1); 159 if (str1 == NULL) return(0); 160 if (str2 == NULL) return(0); 161 do { 162 if (*str1++ != *str2) return(0); 163 } while (*str2++); 164 return(1); 165} 166 167/** 168 * xmlStrQEqual: 169 * @pref: the prefix of the QName 170 * @name: the localname of the QName 171 * @str: the second xmlChar * 172 * 173 * Check if a QName is Equal to a given string 174 * 175 * Returns 1 if they are equal, 0 if they are different 176 */ 177 178int 179xmlStrQEqual(const xmlChar *pref, const xmlChar *name, const xmlChar *str) { 180 if (pref == NULL) return(xmlStrEqual(name, str)); 181 if (name == NULL) return(0); 182 if (str == NULL) return(0); 183 184 do { 185 if (*pref++ != *str) return(0); 186 } while ((*str++) && (*pref)); 187 if (*str++ != ':') return(0); 188 do { 189 if (*name++ != *str) return(0); 190 } while (*str++); 191 return(1); 192} 193 194/** 195 * xmlStrncmp: 196 * @str1: the first xmlChar * 197 * @str2: the second xmlChar * 198 * @len: the max comparison length 199 * 200 * a strncmp for xmlChar's 201 * 202 * Returns the integer result of the comparison 203 */ 204 205int 206xmlStrncmp(const xmlChar *str1, const xmlChar *str2, int len) { 207 register int tmp; 208 209 if (len <= 0) return(0); 210 if (str1 == str2) return(0); 211 if (str1 == NULL) return(-1); 212 if (str2 == NULL) return(1); 213#ifdef __GNUC__ 214 tmp = strncmp((const char *)str1, (const char *)str2, len); 215 return tmp; 216#else 217 do { 218 tmp = *str1++ - *str2; 219 if (tmp != 0 || --len == 0) return(tmp); 220 } while (*str2++ != 0); 221 return 0; 222#endif 223} 224 225static const xmlChar casemap[256] = { 226 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 227 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, 228 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, 229 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, 230 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, 231 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, 232 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, 233 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, 234 0x40,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 235 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 236 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 237 0x78,0x79,0x7A,0x7B,0x5C,0x5D,0x5E,0x5F, 238 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, 239 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, 240 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, 241 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, 242 0x80,0x81,0x82,0x83,0x84,0x85,0x86,0x87, 243 0x88,0x89,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F, 244 0x90,0x91,0x92,0x93,0x94,0x95,0x96,0x97, 245 0x98,0x99,0x9A,0x9B,0x9C,0x9D,0x9E,0x9F, 246 0xA0,0xA1,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7, 247 0xA8,0xA9,0xAA,0xAB,0xAC,0xAD,0xAE,0xAF, 248 0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,0xB7, 249 0xB8,0xB9,0xBA,0xBB,0xBC,0xBD,0xBE,0xBF, 250 0xC0,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7, 251 0xC8,0xC9,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF, 252 0xD0,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,0xD7, 253 0xD8,0xD9,0xDA,0xDB,0xDC,0xDD,0xDE,0xDF, 254 0xE0,0xE1,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7, 255 0xE8,0xE9,0xEA,0xEB,0xEC,0xED,0xEE,0xEF, 256 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7, 257 0xF8,0xF9,0xFA,0xFB,0xFC,0xFD,0xFE,0xFF 258}; 259 260/** 261 * xmlStrcasecmp: 262 * @str1: the first xmlChar * 263 * @str2: the second xmlChar * 264 * 265 * a strcasecmp for xmlChar's 266 * 267 * Returns the integer result of the comparison 268 */ 269 270int 271xmlStrcasecmp(const xmlChar *str1, const xmlChar *str2) { 272 register int tmp; 273 274 if (str1 == str2) return(0); 275 if (str1 == NULL) return(-1); 276 if (str2 == NULL) return(1); 277 do { 278 tmp = casemap[*str1++] - casemap[*str2]; 279 if (tmp != 0) return(tmp); 280 } while (*str2++ != 0); 281 return 0; 282} 283 284/** 285 * xmlStrncasecmp: 286 * @str1: the first xmlChar * 287 * @str2: the second xmlChar * 288 * @len: the max comparison length 289 * 290 * a strncasecmp for xmlChar's 291 * 292 * Returns the integer result of the comparison 293 */ 294 295int 296xmlStrncasecmp(const xmlChar *str1, const xmlChar *str2, int len) { 297 register int tmp; 298 299 if (len <= 0) return(0); 300 if (str1 == str2) return(0); 301 if (str1 == NULL) return(-1); 302 if (str2 == NULL) return(1); 303 do { 304 tmp = casemap[*str1++] - casemap[*str2]; 305 if (tmp != 0 || --len == 0) return(tmp); 306 } while (*str2++ != 0); 307 return 0; 308} 309 310/** 311 * xmlStrchr: 312 * @str: the xmlChar * array 313 * @val: the xmlChar to search 314 * 315 * a strchr for xmlChar's 316 * 317 * Returns the xmlChar * for the first occurrence or NULL. 318 */ 319 320const xmlChar * 321xmlStrchr(const xmlChar *str, xmlChar val) { 322 if (str == NULL) return(NULL); 323 while (*str != 0) { /* non input consuming */ 324 if (*str == val) return((xmlChar *) str); 325 str++; 326 } 327 return(NULL); 328} 329 330/** 331 * xmlStrstr: 332 * @str: the xmlChar * array (haystack) 333 * @val: the xmlChar to search (needle) 334 * 335 * a strstr for xmlChar's 336 * 337 * Returns the xmlChar * for the first occurrence or NULL. 338 */ 339 340const xmlChar * 341xmlStrstr(const xmlChar *str, const xmlChar *val) { 342 int n; 343 344 if (str == NULL) return(NULL); 345 if (val == NULL) return(NULL); 346 n = xmlStrlen(val); 347 348 if (n == 0) return(str); 349 while (*str != 0) { /* non input consuming */ 350 if (*str == *val) { 351 if (!xmlStrncmp(str, val, n)) return((const xmlChar *) str); 352 } 353 str++; 354 } 355 return(NULL); 356} 357 358/** 359 * xmlStrcasestr: 360 * @str: the xmlChar * array (haystack) 361 * @val: the xmlChar to search (needle) 362 * 363 * a case-ignoring strstr for xmlChar's 364 * 365 * Returns the xmlChar * for the first occurrence or NULL. 366 */ 367 368const xmlChar * 369xmlStrcasestr(const xmlChar *str, xmlChar *val) { 370 int n; 371 372 if (str == NULL) return(NULL); 373 if (val == NULL) return(NULL); 374 n = xmlStrlen(val); 375 376 if (n == 0) return(str); 377 while (*str != 0) { /* non input consuming */ 378 if (casemap[*str] == casemap[*val]) 379 if (!xmlStrncasecmp(str, val, n)) return(str); 380 str++; 381 } 382 return(NULL); 383} 384 385/** 386 * xmlStrsub: 387 * @str: the xmlChar * array (haystack) 388 * @start: the index of the first char (zero based) 389 * @len: the length of the substring 390 * 391 * Extract a substring of a given string 392 * 393 * Returns the xmlChar * for the first occurrence or NULL. 394 */ 395 396xmlChar * 397xmlStrsub(const xmlChar *str, int start, int len) { 398 int i; 399 400 if (str == NULL) return(NULL); 401 if (start < 0) return(NULL); 402 if (len < 0) return(NULL); 403 404 for (i = 0;i < start;i++) { 405 if (*str == 0) return(NULL); 406 str++; 407 } 408 if (*str == 0) return(NULL); 409 return(xmlStrndup(str, len)); 410} 411 412/** 413 * xmlStrlen: 414 * @str: the xmlChar * array 415 * 416 * length of a xmlChar's string 417 * 418 * Returns the number of xmlChar contained in the ARRAY. 419 */ 420 421int 422xmlStrlen(const xmlChar *str) { 423 int len = 0; 424 425 if (str == NULL) return(0); 426 while (*str != 0) { /* non input consuming */ 427 str++; 428 len++; 429 } 430 return(len); 431} 432 433/** 434 * xmlStrncat: 435 * @cur: the original xmlChar * array 436 * @add: the xmlChar * array added 437 * @len: the length of @add 438 * 439 * a strncat for array of xmlChar's, it will extend @cur with the len 440 * first bytes of @add. 441 * 442 * Returns a new xmlChar *, the original @cur is reallocated if needed 443 * and should not be freed 444 */ 445 446xmlChar * 447xmlStrncat(xmlChar *cur, const xmlChar *add, int len) { 448 int size; 449 xmlChar *ret; 450 451 if ((add == NULL) || (len == 0)) 452 return(cur); 453 if (cur == NULL) 454 return(xmlStrndup(add, len)); 455 456 size = xmlStrlen(cur); 457 ret = (xmlChar *) xmlRealloc(cur, (size + len + 1) * sizeof(xmlChar)); 458 if (ret == NULL) { 459 xmlErrMemory(NULL, NULL); 460 return(cur); 461 } 462 memcpy(&ret[size], add, len * sizeof(xmlChar)); 463 ret[size + len] = 0; 464 return(ret); 465} 466 467/** 468 * xmlStrncatNew: 469 * @str1: first xmlChar string 470 * @str2: second xmlChar string 471 * @len: the len of @str2 472 * 473 * same as xmlStrncat, but creates a new string. The original 474 * two strings are not freed. 475 * 476 * Returns a new xmlChar * or NULL 477 */ 478xmlChar * 479xmlStrncatNew(const xmlChar *str1, const xmlChar *str2, int len) { 480 int size; 481 xmlChar *ret; 482 483 if (len < 0) 484 len = xmlStrlen(str2); 485 if ((str2 == NULL) || (len == 0)) 486 return(xmlStrdup(str1)); 487 if (str1 == NULL) 488 return(xmlStrndup(str2, len)); 489 490 size = xmlStrlen(str1); 491 ret = (xmlChar *) xmlMalloc((size + len + 1) * sizeof(xmlChar)); 492 if (ret == NULL) { 493 xmlErrMemory(NULL, NULL); 494 return(xmlStrndup(str1, size)); 495 } 496 memcpy(ret, str1, size * sizeof(xmlChar)); 497 memcpy(&ret[size], str2, len * sizeof(xmlChar)); 498 ret[size + len] = 0; 499 return(ret); 500} 501 502/** 503 * xmlStrcat: 504 * @cur: the original xmlChar * array 505 * @add: the xmlChar * array added 506 * 507 * a strcat for array of xmlChar's. Since they are supposed to be 508 * encoded in UTF-8 or an encoding with 8bit based chars, we assume 509 * a termination mark of '0'. 510 * 511 * Returns a new xmlChar * containing the concatenated string. 512 */ 513xmlChar * 514xmlStrcat(xmlChar *cur, const xmlChar *add) { 515 const xmlChar *p = add; 516 517 if (add == NULL) return(cur); 518 if (cur == NULL) 519 return(xmlStrdup(add)); 520 521 while (*p != 0) p++; /* non input consuming */ 522 return(xmlStrncat(cur, add, p - add)); 523} 524 525/** 526 * xmlStrPrintf: 527 * @buf: the result buffer. 528 * @len: the result buffer length. 529 * @msg: the message with printf formatting. 530 * @...: extra parameters for the message. 531 * 532 * Formats @msg and places result into @buf. 533 * 534 * Returns the number of characters written to @buf or -1 if an error occurs. 535 */ 536int XMLCDECL 537xmlStrPrintf(xmlChar *buf, int len, const xmlChar *msg, ...) { 538 va_list args; 539 int ret; 540 541 if((buf == NULL) || (msg == NULL)) { 542 return(-1); 543 } 544 545 va_start(args, msg); 546 ret = vsnprintf((char *) buf, len, (const char *) msg, args); 547 va_end(args); 548 buf[len - 1] = 0; /* be safe ! */ 549 550 return(ret); 551} 552 553/** 554 * xmlStrVPrintf: 555 * @buf: the result buffer. 556 * @len: the result buffer length. 557 * @msg: the message with printf formatting. 558 * @ap: extra parameters for the message. 559 * 560 * Formats @msg and places result into @buf. 561 * 562 * Returns the number of characters written to @buf or -1 if an error occurs. 563 */ 564int 565xmlStrVPrintf(xmlChar *buf, int len, const xmlChar *msg, va_list ap) { 566 int ret; 567 568 if((buf == NULL) || (msg == NULL)) { 569 return(-1); 570 } 571 572 ret = vsnprintf((char *) buf, len, (const char *) msg, ap); 573 buf[len - 1] = 0; /* be safe ! */ 574 575 return(ret); 576} 577 578/************************************************************************ 579 * * 580 * Generic UTF8 handling routines * 581 * * 582 * From rfc2044: encoding of the Unicode values on UTF-8: * 583 * * 584 * UCS-4 range (hex.) UTF-8 octet sequence (binary) * 585 * 0000 0000-0000 007F 0xxxxxxx * 586 * 0000 0080-0000 07FF 110xxxxx 10xxxxxx * 587 * 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx * 588 * * 589 * I hope we won't use values > 0xFFFF anytime soon ! * 590 * * 591 ************************************************************************/ 592 593 594/** 595 * xmlUTF8Size: 596 * @utf: pointer to the UTF8 character 597 * 598 * calculates the internal size of a UTF8 character 599 * 600 * returns the numbers of bytes in the character, -1 on format error 601 */ 602int 603xmlUTF8Size(const xmlChar *utf) { 604 xmlChar mask; 605 int len; 606 607 if (utf == NULL) 608 return -1; 609 if (*utf < 0x80) 610 return 1; 611 /* check valid UTF8 character */ 612 if (!(*utf & 0x40)) 613 return -1; 614 /* determine number of bytes in char */ 615 len = 2; 616 for (mask=0x20; mask != 0; mask>>=1) { 617 if (!(*utf & mask)) 618 return len; 619 len++; 620 } 621 return -1; 622} 623 624/** 625 * xmlUTF8Charcmp: 626 * @utf1: pointer to first UTF8 char 627 * @utf2: pointer to second UTF8 char 628 * 629 * compares the two UCS4 values 630 * 631 * returns result of the compare as with xmlStrncmp 632 */ 633int 634xmlUTF8Charcmp(const xmlChar *utf1, const xmlChar *utf2) { 635 636 if (utf1 == NULL ) { 637 if (utf2 == NULL) 638 return 0; 639 return -1; 640 } 641 return xmlStrncmp(utf1, utf2, xmlUTF8Size(utf1)); 642} 643 644/** 645 * xmlUTF8Strlen: 646 * @utf: a sequence of UTF-8 encoded bytes 647 * 648 * compute the length of an UTF8 string, it doesn't do a full UTF8 649 * checking of the content of the string. 650 * 651 * Returns the number of characters in the string or -1 in case of error 652 */ 653int 654xmlUTF8Strlen(const xmlChar *utf) { 655 int ret = 0; 656 657 if (utf == NULL) 658 return(-1); 659 660 while (*utf != 0) { 661 if (utf[0] & 0x80) { 662 if ((utf[1] & 0xc0) != 0x80) 663 return(-1); 664 if ((utf[0] & 0xe0) == 0xe0) { 665 if ((utf[2] & 0xc0) != 0x80) 666 return(-1); 667 if ((utf[0] & 0xf0) == 0xf0) { 668 if ((utf[0] & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 669 return(-1); 670 utf += 4; 671 } else { 672 utf += 3; 673 } 674 } else { 675 utf += 2; 676 } 677 } else { 678 utf++; 679 } 680 ret++; 681 } 682 return(ret); 683} 684 685/** 686 * xmlGetUTF8Char: 687 * @utf: a sequence of UTF-8 encoded bytes 688 * @len: a pointer to the minimum number of bytes present in 689 * the sequence. This is used to assure the next character 690 * is completely contained within the sequence. 691 * 692 * Read the first UTF8 character from @utf 693 * 694 * Returns the char value or -1 in case of error, and sets *len to 695 * the actual number of bytes consumed (0 in case of error) 696 */ 697int 698xmlGetUTF8Char(const unsigned char *utf, int *len) { 699 unsigned int c; 700 701 if (utf == NULL) 702 goto error; 703 if (len == NULL) 704 goto error; 705 if (*len < 1) 706 goto error; 707 708 c = utf[0]; 709 if (c & 0x80) { 710 if (*len < 2) 711 goto error; 712 if ((utf[1] & 0xc0) != 0x80) 713 goto error; 714 if ((c & 0xe0) == 0xe0) { 715 if (*len < 3) 716 goto error; 717 if ((utf[2] & 0xc0) != 0x80) 718 goto error; 719 if ((c & 0xf0) == 0xf0) { 720 if (*len < 4) 721 goto error; 722 if ((c & 0xf8) != 0xf0 || (utf[3] & 0xc0) != 0x80) 723 goto error; 724 *len = 4; 725 /* 4-byte code */ 726 c = (utf[0] & 0x7) << 18; 727 c |= (utf[1] & 0x3f) << 12; 728 c |= (utf[2] & 0x3f) << 6; 729 c |= utf[3] & 0x3f; 730 } else { 731 /* 3-byte code */ 732 *len = 3; 733 c = (utf[0] & 0xf) << 12; 734 c |= (utf[1] & 0x3f) << 6; 735 c |= utf[2] & 0x3f; 736 } 737 } else { 738 /* 2-byte code */ 739 *len = 2; 740 c = (utf[0] & 0x1f) << 6; 741 c |= utf[1] & 0x3f; 742 } 743 } else { 744 /* 1-byte code */ 745 *len = 1; 746 } 747 return(c); 748 749error: 750 if (len != NULL) 751 *len = 0; 752 return(-1); 753} 754 755/** 756 * xmlCheckUTF8: 757 * @utf: Pointer to putative UTF-8 encoded string. 758 * 759 * Checks @utf for being valid UTF-8. @utf is assumed to be 760 * null-terminated. This function is not super-strict, as it will 761 * allow longer UTF-8 sequences than necessary. Note that Java is 762 * capable of producing these sequences if provoked. Also note, this 763 * routine checks for the 4-byte maximum size, but does not check for 764 * 0x10ffff maximum value. 765 * 766 * Return value: true if @utf is valid. 767 **/ 768int 769xmlCheckUTF8(const unsigned char *utf) 770{ 771 int ix; 772 unsigned char c; 773 774 if (utf == NULL) 775 return(0); 776 /* 777 * utf is a string of 1, 2, 3 or 4 bytes. The valid strings 778 * are as follows (in "bit format"): 779 * 0xxxxxxx valid 1-byte 780 * 110xxxxx 10xxxxxx valid 2-byte 781 * 1110xxxx 10xxxxxx 10xxxxxx valid 3-byte 782 * 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx valid 4-byte 783 */ 784 for (ix = 0; (c = utf[ix]);) { /* string is 0-terminated */ 785 if ((c & 0x80) == 0x00) { /* 1-byte code, starts with 10 */ 786 ix++; 787 } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */ 788 if ((utf[ix+1] & 0xc0 ) != 0x80) 789 return 0; 790 ix += 2; 791 } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */ 792 if (((utf[ix+1] & 0xc0) != 0x80) || 793 ((utf[ix+2] & 0xc0) != 0x80)) 794 return 0; 795 ix += 3; 796 } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */ 797 if (((utf[ix+1] & 0xc0) != 0x80) || 798 ((utf[ix+2] & 0xc0) != 0x80) || 799 ((utf[ix+3] & 0xc0) != 0x80)) 800 return 0; 801 ix += 4; 802 } else /* unknown encoding */ 803 return 0; 804 } 805 return(1); 806} 807 808/** 809 * xmlUTF8Strsize: 810 * @utf: a sequence of UTF-8 encoded bytes 811 * @len: the number of characters in the array 812 * 813 * storage size of an UTF8 string 814 * the behaviour is not garanteed if the input string is not UTF-8 815 * 816 * Returns the storage size of 817 * the first 'len' characters of ARRAY 818 */ 819 820int 821xmlUTF8Strsize(const xmlChar *utf, int len) { 822 const xmlChar *ptr=utf; 823 xmlChar ch; 824 825 if (utf == NULL) 826 return(0); 827 828 if (len <= 0) 829 return(0); 830 831 while ( len-- > 0) { 832 if ( !*ptr ) 833 break; 834 if ( (ch = *ptr++) & 0x80) 835 while ((ch<<=1) & 0x80 ) { 836 ptr++; 837 if (*ptr == 0) break; 838 } 839 } 840 return (ptr - utf); 841} 842 843 844/** 845 * xmlUTF8Strndup: 846 * @utf: the input UTF8 * 847 * @len: the len of @utf (in chars) 848 * 849 * a strndup for array of UTF8's 850 * 851 * Returns a new UTF8 * or NULL 852 */ 853xmlChar * 854xmlUTF8Strndup(const xmlChar *utf, int len) { 855 xmlChar *ret; 856 int i; 857 858 if ((utf == NULL) || (len < 0)) return(NULL); 859 i = xmlUTF8Strsize(utf, len); 860 ret = (xmlChar *) xmlMallocAtomic((i + 1) * sizeof(xmlChar)); 861 if (ret == NULL) { 862 xmlGenericError(xmlGenericErrorContext, 863 "malloc of %ld byte failed\n", 864 (len + 1) * (long)sizeof(xmlChar)); 865 return(NULL); 866 } 867 memcpy(ret, utf, i * sizeof(xmlChar)); 868 ret[i] = 0; 869 return(ret); 870} 871 872/** 873 * xmlUTF8Strpos: 874 * @utf: the input UTF8 * 875 * @pos: the position of the desired UTF8 char (in chars) 876 * 877 * a function to provide the equivalent of fetching a 878 * character from a string array 879 * 880 * Returns a pointer to the UTF8 character or NULL 881 */ 882const xmlChar * 883xmlUTF8Strpos(const xmlChar *utf, int pos) { 884 xmlChar ch; 885 886 if (utf == NULL) return(NULL); 887 if (pos < 0) 888 return(NULL); 889 while (pos--) { 890 if ((ch=*utf++) == 0) return(NULL); 891 if ( ch & 0x80 ) { 892 /* if not simple ascii, verify proper format */ 893 if ( (ch & 0xc0) != 0xc0 ) 894 return(NULL); 895 /* then skip over remaining bytes for this char */ 896 while ( (ch <<= 1) & 0x80 ) 897 if ( (*utf++ & 0xc0) != 0x80 ) 898 return(NULL); 899 } 900 } 901 return((xmlChar *)utf); 902} 903 904/** 905 * xmlUTF8Strloc: 906 * @utf: the input UTF8 * 907 * @utfchar: the UTF8 character to be found 908 * 909 * a function to provide the relative location of a UTF8 char 910 * 911 * Returns the relative character position of the desired char 912 * or -1 if not found 913 */ 914int 915xmlUTF8Strloc(const xmlChar *utf, const xmlChar *utfchar) { 916 int i, size; 917 xmlChar ch; 918 919 if (utf==NULL || utfchar==NULL) return -1; 920 size = xmlUTF8Strsize(utfchar, 1); 921 for(i=0; (ch=*utf) != 0; i++) { 922 if (xmlStrncmp(utf, utfchar, size)==0) 923 return(i); 924 utf++; 925 if ( ch & 0x80 ) { 926 /* if not simple ascii, verify proper format */ 927 if ( (ch & 0xc0) != 0xc0 ) 928 return(-1); 929 /* then skip over remaining bytes for this char */ 930 while ( (ch <<= 1) & 0x80 ) 931 if ( (*utf++ & 0xc0) != 0x80 ) 932 return(-1); 933 } 934 } 935 936 return(-1); 937} 938/** 939 * xmlUTF8Strsub: 940 * @utf: a sequence of UTF-8 encoded bytes 941 * @start: relative pos of first char 942 * @len: total number to copy 943 * 944 * Create a substring from a given UTF-8 string 945 * Note: positions are given in units of UTF-8 chars 946 * 947 * Returns a pointer to a newly created string 948 * or NULL if any problem 949 */ 950 951xmlChar * 952xmlUTF8Strsub(const xmlChar *utf, int start, int len) { 953 int i; 954 xmlChar ch; 955 956 if (utf == NULL) return(NULL); 957 if (start < 0) return(NULL); 958 if (len < 0) return(NULL); 959 960 /* 961 * Skip over any leading chars 962 */ 963 for (i = 0;i < start;i++) { 964 if ((ch=*utf++) == 0) return(NULL); 965 if ( ch & 0x80 ) { 966 /* if not simple ascii, verify proper format */ 967 if ( (ch & 0xc0) != 0xc0 ) 968 return(NULL); 969 /* then skip over remaining bytes for this char */ 970 while ( (ch <<= 1) & 0x80 ) 971 if ( (*utf++ & 0xc0) != 0x80 ) 972 return(NULL); 973 } 974 } 975 976 return(xmlUTF8Strndup(utf, len)); 977} 978 979#define bottom_xmlstring 980#include "elfgcchack.h" 981