1/* 2****************************************************************************** 3* 4* Copyright (C) 1998-2012, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7****************************************************************************** 8* 9* File ustring.cpp 10* 11* Modification History: 12* 13* Date Name Description 14* 12/07/98 bertrand Creation. 15****************************************************************************** 16*/ 17 18#include "unicode/utypes.h" 19#include "unicode/putil.h" 20#include "unicode/ustring.h" 21#include "unicode/utf16.h" 22#include "cstring.h" 23#include "cwchar.h" 24#include "cmemory.h" 25#include "ustr_imp.h" 26 27/* ANSI string.h - style functions ------------------------------------------ */ 28 29/* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit UChar */ 30#define U_BMP_MAX 0xffff 31 32/* Forward binary string search functions ----------------------------------- */ 33 34/* 35 * Test if a substring match inside a string is at code point boundaries. 36 * All pointers refer to the same buffer. 37 * The limit pointer may be NULL, all others must be real pointers. 38 */ 39static inline UBool 40isMatchAtCPBoundary(const UChar *start, const UChar *match, const UChar *matchLimit, const UChar *limit) { 41 if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) { 42 /* the leading edge of the match is in the middle of a surrogate pair */ 43 return FALSE; 44 } 45 if(U16_IS_LEAD(*(matchLimit-1)) && match!=limit && U16_IS_TRAIL(*matchLimit)) { 46 /* the trailing edge of the match is in the middle of a surrogate pair */ 47 return FALSE; 48 } 49 return TRUE; 50} 51 52U_CAPI UChar * U_EXPORT2 53u_strFindFirst(const UChar *s, int32_t length, 54 const UChar *sub, int32_t subLength) { 55 const UChar *start, *p, *q, *subLimit; 56 UChar c, cs, cq; 57 58 if(sub==NULL || subLength<-1) { 59 return (UChar *)s; 60 } 61 if(s==NULL || length<-1) { 62 return NULL; 63 } 64 65 start=s; 66 67 if(length<0 && subLength<0) { 68 /* both strings are NUL-terminated */ 69 if((cs=*sub++)==0) { 70 return (UChar *)s; 71 } 72 if(*sub==0 && !U16_IS_SURROGATE(cs)) { 73 /* the substring consists of a single, non-surrogate BMP code point */ 74 return u_strchr(s, cs); 75 } 76 77 while((c=*s++)!=0) { 78 if(c==cs) { 79 /* found first substring UChar, compare rest */ 80 p=s; 81 q=sub; 82 for(;;) { 83 if((cq=*q)==0) { 84 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 85 return (UChar *)(s-1); /* well-formed match */ 86 } else { 87 break; /* no match because surrogate pair is split */ 88 } 89 } 90 if((c=*p)==0) { 91 return NULL; /* no match, and none possible after s */ 92 } 93 if(c!=cq) { 94 break; /* no match */ 95 } 96 ++p; 97 ++q; 98 } 99 } 100 } 101 102 /* not found */ 103 return NULL; 104 } 105 106 if(subLength<0) { 107 subLength=u_strlen(sub); 108 } 109 if(subLength==0) { 110 return (UChar *)s; 111 } 112 113 /* get sub[0] to search for it fast */ 114 cs=*sub++; 115 --subLength; 116 subLimit=sub+subLength; 117 118 if(subLength==0 && !U16_IS_SURROGATE(cs)) { 119 /* the substring consists of a single, non-surrogate BMP code point */ 120 return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length); 121 } 122 123 if(length<0) { 124 /* s is NUL-terminated */ 125 while((c=*s++)!=0) { 126 if(c==cs) { 127 /* found first substring UChar, compare rest */ 128 p=s; 129 q=sub; 130 for(;;) { 131 if(q==subLimit) { 132 if(isMatchAtCPBoundary(start, s-1, p, NULL)) { 133 return (UChar *)(s-1); /* well-formed match */ 134 } else { 135 break; /* no match because surrogate pair is split */ 136 } 137 } 138 if((c=*p)==0) { 139 return NULL; /* no match, and none possible after s */ 140 } 141 if(c!=*q) { 142 break; /* no match */ 143 } 144 ++p; 145 ++q; 146 } 147 } 148 } 149 } else { 150 const UChar *limit, *preLimit; 151 152 /* subLength was decremented above */ 153 if(length<=subLength) { 154 return NULL; /* s is shorter than sub */ 155 } 156 157 limit=s+length; 158 159 /* the substring must start before preLimit */ 160 preLimit=limit-subLength; 161 162 while(s!=preLimit) { 163 c=*s++; 164 if(c==cs) { 165 /* found first substring UChar, compare rest */ 166 p=s; 167 q=sub; 168 for(;;) { 169 if(q==subLimit) { 170 if(isMatchAtCPBoundary(start, s-1, p, limit)) { 171 return (UChar *)(s-1); /* well-formed match */ 172 } else { 173 break; /* no match because surrogate pair is split */ 174 } 175 } 176 if(*p!=*q) { 177 break; /* no match */ 178 } 179 ++p; 180 ++q; 181 } 182 } 183 } 184 } 185 186 /* not found */ 187 return NULL; 188} 189 190U_CAPI UChar * U_EXPORT2 191u_strstr(const UChar *s, const UChar *substring) { 192 return u_strFindFirst(s, -1, substring, -1); 193} 194 195U_CAPI UChar * U_EXPORT2 196u_strchr(const UChar *s, UChar c) { 197 if(U16_IS_SURROGATE(c)) { 198 /* make sure to not find half of a surrogate pair */ 199 return u_strFindFirst(s, -1, &c, 1); 200 } else { 201 UChar cs; 202 203 /* trivial search for a BMP code point */ 204 for(;;) { 205 if((cs=*s)==c) { 206 return (UChar *)s; 207 } 208 if(cs==0) { 209 return NULL; 210 } 211 ++s; 212 } 213 } 214} 215 216U_CAPI UChar * U_EXPORT2 217u_strchr32(const UChar *s, UChar32 c) { 218 if((uint32_t)c<=U_BMP_MAX) { 219 /* find BMP code point */ 220 return u_strchr(s, (UChar)c); 221 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 222 /* find supplementary code point as surrogate pair */ 223 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 224 225 while((cs=*s++)!=0) { 226 if(cs==lead && *s==trail) { 227 return (UChar *)(s-1); 228 } 229 } 230 return NULL; 231 } else { 232 /* not a Unicode code point, not findable */ 233 return NULL; 234 } 235} 236 237U_CAPI UChar * U_EXPORT2 238u_memchr(const UChar *s, UChar c, int32_t count) { 239 if(count<=0) { 240 return NULL; /* no string */ 241 } else if(U16_IS_SURROGATE(c)) { 242 /* make sure to not find half of a surrogate pair */ 243 return u_strFindFirst(s, count, &c, 1); 244 } else { 245 /* trivial search for a BMP code point */ 246 const UChar *limit=s+count; 247 do { 248 if(*s==c) { 249 return (UChar *)s; 250 } 251 } while(++s!=limit); 252 return NULL; 253 } 254} 255 256U_CAPI UChar * U_EXPORT2 257u_memchr32(const UChar *s, UChar32 c, int32_t count) { 258 if((uint32_t)c<=U_BMP_MAX) { 259 /* find BMP code point */ 260 return u_memchr(s, (UChar)c, count); 261 } else if(count<2) { 262 /* too short for a surrogate pair */ 263 return NULL; 264 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 265 /* find supplementary code point as surrogate pair */ 266 const UChar *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */ 267 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 268 269 do { 270 if(*s==lead && *(s+1)==trail) { 271 return (UChar *)s; 272 } 273 } while(++s!=limit); 274 return NULL; 275 } else { 276 /* not a Unicode code point, not findable */ 277 return NULL; 278 } 279} 280 281/* Backward binary string search functions ---------------------------------- */ 282 283U_CAPI UChar * U_EXPORT2 284u_strFindLast(const UChar *s, int32_t length, 285 const UChar *sub, int32_t subLength) { 286 const UChar *start, *limit, *p, *q, *subLimit; 287 UChar c, cs; 288 289 if(sub==NULL || subLength<-1) { 290 return (UChar *)s; 291 } 292 if(s==NULL || length<-1) { 293 return NULL; 294 } 295 296 /* 297 * This implementation is more lazy than the one for u_strFindFirst(): 298 * There is no special search code for NUL-terminated strings. 299 * It does not seem to be worth it for searching substrings to 300 * search forward and find all matches like in u_strrchr() and similar. 301 * Therefore, we simply get both string lengths and search backward. 302 * 303 * markus 2002oct23 304 */ 305 306 if(subLength<0) { 307 subLength=u_strlen(sub); 308 } 309 if(subLength==0) { 310 return (UChar *)s; 311 } 312 313 /* get sub[subLength-1] to search for it fast */ 314 subLimit=sub+subLength; 315 cs=*(--subLimit); 316 --subLength; 317 318 if(subLength==0 && !U16_IS_SURROGATE(cs)) { 319 /* the substring consists of a single, non-surrogate BMP code point */ 320 return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length); 321 } 322 323 if(length<0) { 324 length=u_strlen(s); 325 } 326 327 /* subLength was decremented above */ 328 if(length<=subLength) { 329 return NULL; /* s is shorter than sub */ 330 } 331 332 start=s; 333 limit=s+length; 334 335 /* the substring must start no later than s+subLength */ 336 s+=subLength; 337 338 while(s!=limit) { 339 c=*(--limit); 340 if(c==cs) { 341 /* found last substring UChar, compare rest */ 342 p=limit; 343 q=subLimit; 344 for(;;) { 345 if(q==sub) { 346 if(isMatchAtCPBoundary(start, p, limit+1, start+length)) { 347 return (UChar *)p; /* well-formed match */ 348 } else { 349 break; /* no match because surrogate pair is split */ 350 } 351 } 352 if(*(--p)!=*(--q)) { 353 break; /* no match */ 354 } 355 } 356 } 357 } 358 359 /* not found */ 360 return NULL; 361} 362 363U_CAPI UChar * U_EXPORT2 364u_strrstr(const UChar *s, const UChar *substring) { 365 return u_strFindLast(s, -1, substring, -1); 366} 367 368U_CAPI UChar * U_EXPORT2 369u_strrchr(const UChar *s, UChar c) { 370 if(U16_IS_SURROGATE(c)) { 371 /* make sure to not find half of a surrogate pair */ 372 return u_strFindLast(s, -1, &c, 1); 373 } else { 374 const UChar *result=NULL; 375 UChar cs; 376 377 /* trivial search for a BMP code point */ 378 for(;;) { 379 if((cs=*s)==c) { 380 result=s; 381 } 382 if(cs==0) { 383 return (UChar *)result; 384 } 385 ++s; 386 } 387 } 388} 389 390U_CAPI UChar * U_EXPORT2 391u_strrchr32(const UChar *s, UChar32 c) { 392 if((uint32_t)c<=U_BMP_MAX) { 393 /* find BMP code point */ 394 return u_strrchr(s, (UChar)c); 395 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 396 /* find supplementary code point as surrogate pair */ 397 const UChar *result=NULL; 398 UChar cs, lead=U16_LEAD(c), trail=U16_TRAIL(c); 399 400 while((cs=*s++)!=0) { 401 if(cs==lead && *s==trail) { 402 result=s-1; 403 } 404 } 405 return (UChar *)result; 406 } else { 407 /* not a Unicode code point, not findable */ 408 return NULL; 409 } 410} 411 412U_CAPI UChar * U_EXPORT2 413u_memrchr(const UChar *s, UChar c, int32_t count) { 414 if(count<=0) { 415 return NULL; /* no string */ 416 } else if(U16_IS_SURROGATE(c)) { 417 /* make sure to not find half of a surrogate pair */ 418 return u_strFindLast(s, count, &c, 1); 419 } else { 420 /* trivial search for a BMP code point */ 421 const UChar *limit=s+count; 422 do { 423 if(*(--limit)==c) { 424 return (UChar *)limit; 425 } 426 } while(s!=limit); 427 return NULL; 428 } 429} 430 431U_CAPI UChar * U_EXPORT2 432u_memrchr32(const UChar *s, UChar32 c, int32_t count) { 433 if((uint32_t)c<=U_BMP_MAX) { 434 /* find BMP code point */ 435 return u_memrchr(s, (UChar)c, count); 436 } else if(count<2) { 437 /* too short for a surrogate pair */ 438 return NULL; 439 } else if((uint32_t)c<=UCHAR_MAX_VALUE) { 440 /* find supplementary code point as surrogate pair */ 441 const UChar *limit=s+count-1; 442 UChar lead=U16_LEAD(c), trail=U16_TRAIL(c); 443 444 do { 445 if(*limit==trail && *(limit-1)==lead) { 446 return (UChar *)(limit-1); 447 } 448 } while(s!=--limit); 449 return NULL; 450 } else { 451 /* not a Unicode code point, not findable */ 452 return NULL; 453 } 454} 455 456/* Tokenization functions --------------------------------------------------- */ 457 458/* 459 * Match each code point in a string against each code point in the matchSet. 460 * Return the index of the first string code point that 461 * is (polarity==TRUE) or is not (FALSE) contained in the matchSet. 462 * Return -(string length)-1 if there is no such code point. 463 */ 464static int32_t 465_matchFromSet(const UChar *string, const UChar *matchSet, UBool polarity) { 466 int32_t matchLen, matchBMPLen, strItr, matchItr; 467 UChar32 stringCh, matchCh; 468 UChar c, c2; 469 470 /* first part of matchSet contains only BMP code points */ 471 matchBMPLen = 0; 472 while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) { 473 ++matchBMPLen; 474 } 475 476 /* second part of matchSet contains BMP and supplementary code points */ 477 matchLen = matchBMPLen; 478 while(matchSet[matchLen] != 0) { 479 ++matchLen; 480 } 481 482 for(strItr = 0; (c = string[strItr]) != 0;) { 483 ++strItr; 484 if(U16_IS_SINGLE(c)) { 485 if(polarity) { 486 for(matchItr = 0; matchItr < matchLen; ++matchItr) { 487 if(c == matchSet[matchItr]) { 488 return strItr - 1; /* one matches */ 489 } 490 } 491 } else { 492 for(matchItr = 0; matchItr < matchLen; ++matchItr) { 493 if(c == matchSet[matchItr]) { 494 goto endloop; 495 } 496 } 497 return strItr - 1; /* none matches */ 498 } 499 } else { 500 /* 501 * No need to check for string length before U16_IS_TRAIL 502 * because c2 could at worst be the terminating NUL. 503 */ 504 if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) { 505 ++strItr; 506 stringCh = U16_GET_SUPPLEMENTARY(c, c2); 507 } else { 508 stringCh = c; /* unpaired trail surrogate */ 509 } 510 511 if(polarity) { 512 for(matchItr = matchBMPLen; matchItr < matchLen;) { 513 U16_NEXT(matchSet, matchItr, matchLen, matchCh); 514 if(stringCh == matchCh) { 515 return strItr - U16_LENGTH(stringCh); /* one matches */ 516 } 517 } 518 } else { 519 for(matchItr = matchBMPLen; matchItr < matchLen;) { 520 U16_NEXT(matchSet, matchItr, matchLen, matchCh); 521 if(stringCh == matchCh) { 522 goto endloop; 523 } 524 } 525 return strItr - U16_LENGTH(stringCh); /* none matches */ 526 } 527 } 528endloop: 529 /* wish C had continue with labels like Java... */; 530 } 531 532 /* Didn't find it. */ 533 return -strItr-1; 534} 535 536/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 537U_CAPI UChar * U_EXPORT2 538u_strpbrk(const UChar *string, const UChar *matchSet) 539{ 540 int32_t idx = _matchFromSet(string, matchSet, TRUE); 541 if(idx >= 0) { 542 return (UChar *)string + idx; 543 } else { 544 return NULL; 545 } 546} 547 548/* Search for a codepoint in a string that matches one of the matchSet codepoints. */ 549U_CAPI int32_t U_EXPORT2 550u_strcspn(const UChar *string, const UChar *matchSet) 551{ 552 int32_t idx = _matchFromSet(string, matchSet, TRUE); 553 if(idx >= 0) { 554 return idx; 555 } else { 556 return -idx - 1; /* == u_strlen(string) */ 557 } 558} 559 560/* Search for a codepoint in a string that does not match one of the matchSet codepoints. */ 561U_CAPI int32_t U_EXPORT2 562u_strspn(const UChar *string, const UChar *matchSet) 563{ 564 int32_t idx = _matchFromSet(string, matchSet, FALSE); 565 if(idx >= 0) { 566 return idx; 567 } else { 568 return -idx - 1; /* == u_strlen(string) */ 569 } 570} 571 572/* ----- Text manipulation functions --- */ 573 574U_CAPI UChar* U_EXPORT2 575u_strtok_r(UChar *src, 576 const UChar *delim, 577 UChar **saveState) 578{ 579 UChar *tokSource; 580 UChar *nextToken; 581 uint32_t nonDelimIdx; 582 583 /* If saveState is NULL, the user messed up. */ 584 if (src != NULL) { 585 tokSource = src; 586 *saveState = src; /* Set to "src" in case there are no delimiters */ 587 } 588 else if (*saveState) { 589 tokSource = *saveState; 590 } 591 else { 592 /* src == NULL && *saveState == NULL */ 593 /* This shouldn't happen. We already finished tokenizing. */ 594 return NULL; 595 } 596 597 /* Skip initial delimiters */ 598 nonDelimIdx = u_strspn(tokSource, delim); 599 tokSource = &tokSource[nonDelimIdx]; 600 601 if (*tokSource) { 602 nextToken = u_strpbrk(tokSource, delim); 603 if (nextToken != NULL) { 604 /* Create a token */ 605 *(nextToken++) = 0; 606 *saveState = nextToken; 607 return tokSource; 608 } 609 else if (*saveState) { 610 /* Return the last token */ 611 *saveState = NULL; 612 return tokSource; 613 } 614 } 615 else { 616 /* No tokens were found. Only delimiters were left. */ 617 *saveState = NULL; 618 } 619 return NULL; 620} 621 622/* Miscellaneous functions -------------------------------------------------- */ 623 624U_CAPI UChar* U_EXPORT2 625u_strcat(UChar *dst, 626 const UChar *src) 627{ 628 UChar *anchor = dst; /* save a pointer to start of dst */ 629 630 while(*dst != 0) { /* To end of first string */ 631 ++dst; 632 } 633 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 634 } 635 636 return anchor; 637} 638 639U_CAPI UChar* U_EXPORT2 640u_strncat(UChar *dst, 641 const UChar *src, 642 int32_t n ) 643{ 644 if(n > 0) { 645 UChar *anchor = dst; /* save a pointer to start of dst */ 646 647 while(*dst != 0) { /* To end of first string */ 648 ++dst; 649 } 650 while((*dst = *src) != 0) { /* copy string 2 over */ 651 ++dst; 652 if(--n == 0) { 653 *dst = 0; 654 break; 655 } 656 ++src; 657 } 658 659 return anchor; 660 } else { 661 return dst; 662 } 663} 664 665/* ----- Text property functions --- */ 666 667U_CAPI int32_t U_EXPORT2 668u_strcmp(const UChar *s1, 669 const UChar *s2) 670{ 671 UChar c1, c2; 672 673 for(;;) { 674 c1=*s1++; 675 c2=*s2++; 676 if (c1 != c2 || c1 == 0) { 677 break; 678 } 679 } 680 return (int32_t)c1 - (int32_t)c2; 681} 682 683U_CFUNC int32_t U_EXPORT2 684uprv_strCompare(const UChar *s1, int32_t length1, 685 const UChar *s2, int32_t length2, 686 UBool strncmpStyle, UBool codePointOrder) { 687 const UChar *start1, *start2, *limit1, *limit2; 688 UChar c1, c2; 689 690 /* setup for fix-up */ 691 start1=s1; 692 start2=s2; 693 694 /* compare identical prefixes - they do not need to be fixed up */ 695 if(length1<0 && length2<0) { 696 /* strcmp style, both NUL-terminated */ 697 if(s1==s2) { 698 return 0; 699 } 700 701 for(;;) { 702 c1=*s1; 703 c2=*s2; 704 if(c1!=c2) { 705 break; 706 } 707 if(c1==0) { 708 return 0; 709 } 710 ++s1; 711 ++s2; 712 } 713 714 /* setup for fix-up */ 715 limit1=limit2=NULL; 716 } else if(strncmpStyle) { 717 /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */ 718 if(s1==s2) { 719 return 0; 720 } 721 722 limit1=start1+length1; 723 724 for(;;) { 725 /* both lengths are same, check only one limit */ 726 if(s1==limit1) { 727 return 0; 728 } 729 730 c1=*s1; 731 c2=*s2; 732 if(c1!=c2) { 733 break; 734 } 735 if(c1==0) { 736 return 0; 737 } 738 ++s1; 739 ++s2; 740 } 741 742 /* setup for fix-up */ 743 limit2=start2+length1; /* use length1 here, too, to enforce assumption */ 744 } else { 745 /* memcmp/UnicodeString style, both length-specified */ 746 int32_t lengthResult; 747 748 if(length1<0) { 749 length1=u_strlen(s1); 750 } 751 if(length2<0) { 752 length2=u_strlen(s2); 753 } 754 755 /* limit1=start1+min(lenght1, length2) */ 756 if(length1<length2) { 757 lengthResult=-1; 758 limit1=start1+length1; 759 } else if(length1==length2) { 760 lengthResult=0; 761 limit1=start1+length1; 762 } else /* length1>length2 */ { 763 lengthResult=1; 764 limit1=start1+length2; 765 } 766 767 if(s1==s2) { 768 return lengthResult; 769 } 770 771 for(;;) { 772 /* check pseudo-limit */ 773 if(s1==limit1) { 774 return lengthResult; 775 } 776 777 c1=*s1; 778 c2=*s2; 779 if(c1!=c2) { 780 break; 781 } 782 ++s1; 783 ++s2; 784 } 785 786 /* setup for fix-up */ 787 limit1=start1+length1; 788 limit2=start2+length2; 789 } 790 791 /* if both values are in or above the surrogate range, fix them up */ 792 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 793 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 794 if( 795 (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) || 796 (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1))) 797 ) { 798 /* part of a surrogate pair, leave >=d800 */ 799 } else { 800 /* BMP code point - may be surrogate code point - make <d800 */ 801 c1-=0x2800; 802 } 803 804 if( 805 (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) || 806 (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1))) 807 ) { 808 /* part of a surrogate pair, leave >=d800 */ 809 } else { 810 /* BMP code point - may be surrogate code point - make <d800 */ 811 c2-=0x2800; 812 } 813 } 814 815 /* now c1 and c2 are in the requested (code unit or code point) order */ 816 return (int32_t)c1-(int32_t)c2; 817} 818 819/* 820 * Compare two strings as presented by UCharIterators. 821 * Use code unit or code point order. 822 * When the function returns, it is undefined where the iterators 823 * have stopped. 824 */ 825U_CAPI int32_t U_EXPORT2 826u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) { 827 UChar32 c1, c2; 828 829 /* argument checking */ 830 if(iter1==NULL || iter2==NULL) { 831 return 0; /* bad arguments */ 832 } 833 if(iter1==iter2) { 834 return 0; /* identical iterators */ 835 } 836 837 /* reset iterators to start? */ 838 iter1->move(iter1, 0, UITER_START); 839 iter2->move(iter2, 0, UITER_START); 840 841 /* compare identical prefixes - they do not need to be fixed up */ 842 for(;;) { 843 c1=iter1->next(iter1); 844 c2=iter2->next(iter2); 845 if(c1!=c2) { 846 break; 847 } 848 if(c1==-1) { 849 return 0; 850 } 851 } 852 853 /* if both values are in or above the surrogate range, fix them up */ 854 if(c1>=0xd800 && c2>=0xd800 && codePointOrder) { 855 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 856 if( 857 (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) || 858 (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1)))) 859 ) { 860 /* part of a surrogate pair, leave >=d800 */ 861 } else { 862 /* BMP code point - may be surrogate code point - make <d800 */ 863 c1-=0x2800; 864 } 865 866 if( 867 (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) || 868 (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2)))) 869 ) { 870 /* part of a surrogate pair, leave >=d800 */ 871 } else { 872 /* BMP code point - may be surrogate code point - make <d800 */ 873 c2-=0x2800; 874 } 875 } 876 877 /* now c1 and c2 are in the requested (code unit or code point) order */ 878 return (int32_t)c1-(int32_t)c2; 879} 880 881#if 0 882/* 883 * u_strCompareIter() does not leave the iterators _on_ the different units. 884 * This is possible but would cost a few extra indirect function calls to back 885 * up if the last unit (c1 or c2 respectively) was >=0. 886 * 887 * Consistently leaving them _behind_ the different units is not an option 888 * because the current "unit" is the end of the string if that is reached, 889 * and in such a case the iterator does not move. 890 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end 891 * of their strings. Calling previous() on each does not move them to where 892 * the comparison fails. 893 * 894 * So the simplest semantics is to not define where the iterators end up. 895 * 896 * The following fragment is part of what would need to be done for backing up. 897 */ 898void fragment { 899 /* iff a surrogate is part of a surrogate pair, leave >=d800 */ 900 if(c1<=0xdbff) { 901 if(!U16_IS_TRAIL(iter1->current(iter1))) { 902 /* lead surrogate code point - make <d800 */ 903 c1-=0x2800; 904 } 905 } else if(c1<=0xdfff) { 906 int32_t idx=iter1->getIndex(iter1, UITER_CURRENT); 907 iter1->previous(iter1); /* ==c1 */ 908 if(!U16_IS_LEAD(iter1->previous(iter1))) { 909 /* trail surrogate code point - make <d800 */ 910 c1-=0x2800; 911 } 912 /* go back to behind where the difference is */ 913 iter1->move(iter1, idx, UITER_ZERO); 914 } else /* 0xe000<=c1<=0xffff */ { 915 /* BMP code point - make <d800 */ 916 c1-=0x2800; 917 } 918} 919#endif 920 921U_CAPI int32_t U_EXPORT2 922u_strCompare(const UChar *s1, int32_t length1, 923 const UChar *s2, int32_t length2, 924 UBool codePointOrder) { 925 /* argument checking */ 926 if(s1==NULL || length1<-1 || s2==NULL || length2<-1) { 927 return 0; 928 } 929 return uprv_strCompare(s1, length1, s2, length2, FALSE, codePointOrder); 930} 931 932/* String compare in code point order - u_strcmp() compares in code unit order. */ 933U_CAPI int32_t U_EXPORT2 934u_strcmpCodePointOrder(const UChar *s1, const UChar *s2) { 935 return uprv_strCompare(s1, -1, s2, -1, FALSE, TRUE); 936} 937 938U_CAPI int32_t U_EXPORT2 939u_strncmp(const UChar *s1, 940 const UChar *s2, 941 int32_t n) 942{ 943 if(n > 0) { 944 int32_t rc; 945 for(;;) { 946 rc = (int32_t)*s1 - (int32_t)*s2; 947 if(rc != 0 || *s1 == 0 || --n == 0) { 948 return rc; 949 } 950 ++s1; 951 ++s2; 952 } 953 } else { 954 return 0; 955 } 956} 957 958U_CAPI int32_t U_EXPORT2 959u_strncmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t n) { 960 return uprv_strCompare(s1, n, s2, n, TRUE, TRUE); 961} 962 963U_CAPI UChar* U_EXPORT2 964u_strcpy(UChar *dst, 965 const UChar *src) 966{ 967 UChar *anchor = dst; /* save a pointer to start of dst */ 968 969 while((*(dst++) = *(src++)) != 0) { /* copy string 2 over */ 970 } 971 972 return anchor; 973} 974 975U_CAPI UChar* U_EXPORT2 976u_strncpy(UChar *dst, 977 const UChar *src, 978 int32_t n) 979{ 980 UChar *anchor = dst; /* save a pointer to start of dst */ 981 982 /* copy string 2 over */ 983 while(n > 0 && (*(dst++) = *(src++)) != 0) { 984 --n; 985 } 986 987 return anchor; 988} 989 990U_CAPI int32_t U_EXPORT2 991u_strlen(const UChar *s) 992{ 993#if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR 994 return (int32_t)uprv_wcslen(s); 995#else 996 const UChar *t = s; 997 while(*t != 0) { 998 ++t; 999 } 1000 return t - s; 1001#endif 1002} 1003 1004U_CAPI int32_t U_EXPORT2 1005u_countChar32(const UChar *s, int32_t length) { 1006 int32_t count; 1007 1008 if(s==NULL || length<-1) { 1009 return 0; 1010 } 1011 1012 count=0; 1013 if(length>=0) { 1014 while(length>0) { 1015 ++count; 1016 if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) { 1017 s+=2; 1018 length-=2; 1019 } else { 1020 ++s; 1021 --length; 1022 } 1023 } 1024 } else /* length==-1 */ { 1025 UChar c; 1026 1027 for(;;) { 1028 if((c=*s++)==0) { 1029 break; 1030 } 1031 ++count; 1032 1033 /* 1034 * sufficient to look ahead one because of UTF-16; 1035 * safe to look ahead one because at worst that would be the terminating NUL 1036 */ 1037 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { 1038 ++s; 1039 } 1040 } 1041 } 1042 return count; 1043} 1044 1045U_CAPI UBool U_EXPORT2 1046u_strHasMoreChar32Than(const UChar *s, int32_t length, int32_t number) { 1047 1048 if(number<0) { 1049 return TRUE; 1050 } 1051 if(s==NULL || length<-1) { 1052 return FALSE; 1053 } 1054 1055 if(length==-1) { 1056 /* s is NUL-terminated */ 1057 UChar c; 1058 1059 /* count code points until they exceed */ 1060 for(;;) { 1061 if((c=*s++)==0) { 1062 return FALSE; 1063 } 1064 if(number==0) { 1065 return TRUE; 1066 } 1067 if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) { 1068 ++s; 1069 } 1070 --number; 1071 } 1072 } else { 1073 /* length>=0 known */ 1074 const UChar *limit; 1075 int32_t maxSupplementary; 1076 1077 /* s contains at least (length+1)/2 code points: <=2 UChars per cp */ 1078 if(((length+1)/2)>number) { 1079 return TRUE; 1080 } 1081 1082 /* check if s does not even contain enough UChars */ 1083 maxSupplementary=length-number; 1084 if(maxSupplementary<=0) { 1085 return FALSE; 1086 } 1087 /* there are maxSupplementary=length-number more UChars than asked-for code points */ 1088 1089 /* 1090 * count code points until they exceed and also check that there are 1091 * no more than maxSupplementary supplementary code points (UChar pairs) 1092 */ 1093 limit=s+length; 1094 for(;;) { 1095 if(s==limit) { 1096 return FALSE; 1097 } 1098 if(number==0) { 1099 return TRUE; 1100 } 1101 if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) { 1102 ++s; 1103 if(--maxSupplementary<=0) { 1104 /* too many pairs - too few code points */ 1105 return FALSE; 1106 } 1107 } 1108 --number; 1109 } 1110 } 1111} 1112 1113U_CAPI UChar * U_EXPORT2 1114u_memcpy(UChar *dest, const UChar *src, int32_t count) { 1115 if(count > 0) { 1116 uprv_memcpy(dest, src, count*U_SIZEOF_UCHAR); 1117 } 1118 return dest; 1119} 1120 1121U_CAPI UChar * U_EXPORT2 1122u_memmove(UChar *dest, const UChar *src, int32_t count) { 1123 if(count > 0) { 1124 uprv_memmove(dest, src, count*U_SIZEOF_UCHAR); 1125 } 1126 return dest; 1127} 1128 1129U_CAPI UChar * U_EXPORT2 1130u_memset(UChar *dest, UChar c, int32_t count) { 1131 if(count > 0) { 1132 UChar *ptr = dest; 1133 UChar *limit = dest + count; 1134 1135 while (ptr < limit) { 1136 *(ptr++) = c; 1137 } 1138 } 1139 return dest; 1140} 1141 1142U_CAPI int32_t U_EXPORT2 1143u_memcmp(const UChar *buf1, const UChar *buf2, int32_t count) { 1144 if(count > 0) { 1145 const UChar *limit = buf1 + count; 1146 int32_t result; 1147 1148 while (buf1 < limit) { 1149 result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2; 1150 if (result != 0) { 1151 return result; 1152 } 1153 buf1++; 1154 buf2++; 1155 } 1156 } 1157 return 0; 1158} 1159 1160U_CAPI int32_t U_EXPORT2 1161u_memcmpCodePointOrder(const UChar *s1, const UChar *s2, int32_t count) { 1162 return uprv_strCompare(s1, count, s2, count, FALSE, TRUE); 1163} 1164 1165/* u_unescape & support fns ------------------------------------------------- */ 1166 1167/* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */ 1168static const UChar UNESCAPE_MAP[] = { 1169 /*" 0x22, 0x22 */ 1170 /*' 0x27, 0x27 */ 1171 /*? 0x3F, 0x3F */ 1172 /*\ 0x5C, 0x5C */ 1173 /*a*/ 0x61, 0x07, 1174 /*b*/ 0x62, 0x08, 1175 /*e*/ 0x65, 0x1b, 1176 /*f*/ 0x66, 0x0c, 1177 /*n*/ 0x6E, 0x0a, 1178 /*r*/ 0x72, 0x0d, 1179 /*t*/ 0x74, 0x09, 1180 /*v*/ 0x76, 0x0b 1181}; 1182enum { UNESCAPE_MAP_LENGTH = sizeof(UNESCAPE_MAP) / sizeof(UNESCAPE_MAP[0]) }; 1183 1184/* Convert one octal digit to a numeric value 0..7, or -1 on failure */ 1185static int8_t _digit8(UChar c) { 1186 if (c >= 0x0030 && c <= 0x0037) { 1187 return (int8_t)(c - 0x0030); 1188 } 1189 return -1; 1190} 1191 1192/* Convert one hex digit to a numeric value 0..F, or -1 on failure */ 1193static int8_t _digit16(UChar c) { 1194 if (c >= 0x0030 && c <= 0x0039) { 1195 return (int8_t)(c - 0x0030); 1196 } 1197 if (c >= 0x0041 && c <= 0x0046) { 1198 return (int8_t)(c - (0x0041 - 10)); 1199 } 1200 if (c >= 0x0061 && c <= 0x0066) { 1201 return (int8_t)(c - (0x0061 - 10)); 1202 } 1203 return -1; 1204} 1205 1206/* Parse a single escape sequence. Although this method deals in 1207 * UChars, it does not use C++ or UnicodeString. This allows it to 1208 * be used from C contexts. */ 1209U_CAPI UChar32 U_EXPORT2 1210u_unescapeAt(UNESCAPE_CHAR_AT charAt, 1211 int32_t *offset, 1212 int32_t length, 1213 void *context) { 1214 1215 int32_t start = *offset; 1216 UChar c; 1217 UChar32 result = 0; 1218 int8_t n = 0; 1219 int8_t minDig = 0; 1220 int8_t maxDig = 0; 1221 int8_t bitsPerDigit = 4; 1222 int8_t dig; 1223 int32_t i; 1224 UBool braces = FALSE; 1225 1226 /* Check that offset is in range */ 1227 if (*offset < 0 || *offset >= length) { 1228 goto err; 1229 } 1230 1231 /* Fetch first UChar after '\\' */ 1232 c = charAt((*offset)++, context); 1233 1234 /* Convert hexadecimal and octal escapes */ 1235 switch (c) { 1236 case 0x0075 /*'u'*/: 1237 minDig = maxDig = 4; 1238 break; 1239 case 0x0055 /*'U'*/: 1240 minDig = maxDig = 8; 1241 break; 1242 case 0x0078 /*'x'*/: 1243 minDig = 1; 1244 if (*offset < length && charAt(*offset, context) == 0x7B /*{*/) { 1245 ++(*offset); 1246 braces = TRUE; 1247 maxDig = 8; 1248 } else { 1249 maxDig = 2; 1250 } 1251 break; 1252 default: 1253 dig = _digit8(c); 1254 if (dig >= 0) { 1255 minDig = 1; 1256 maxDig = 3; 1257 n = 1; /* Already have first octal digit */ 1258 bitsPerDigit = 3; 1259 result = dig; 1260 } 1261 break; 1262 } 1263 if (minDig != 0) { 1264 while (*offset < length && n < maxDig) { 1265 c = charAt(*offset, context); 1266 dig = (int8_t)((bitsPerDigit == 3) ? _digit8(c) : _digit16(c)); 1267 if (dig < 0) { 1268 break; 1269 } 1270 result = (result << bitsPerDigit) | dig; 1271 ++(*offset); 1272 ++n; 1273 } 1274 if (n < minDig) { 1275 goto err; 1276 } 1277 if (braces) { 1278 if (c != 0x7D /*}*/) { 1279 goto err; 1280 } 1281 ++(*offset); 1282 } 1283 if (result < 0 || result >= 0x110000) { 1284 goto err; 1285 } 1286 /* If an escape sequence specifies a lead surrogate, see if 1287 * there is a trail surrogate after it, either as an escape or 1288 * as a literal. If so, join them up into a supplementary. 1289 */ 1290 if (*offset < length && U16_IS_LEAD(result)) { 1291 int32_t ahead = *offset + 1; 1292 c = charAt(*offset, context); 1293 if (c == 0x5C /*'\\'*/ && ahead < length) { 1294 c = (UChar) u_unescapeAt(charAt, &ahead, length, context); 1295 } 1296 if (U16_IS_TRAIL(c)) { 1297 *offset = ahead; 1298 result = U16_GET_SUPPLEMENTARY(result, c); 1299 } 1300 } 1301 return result; 1302 } 1303 1304 /* Convert C-style escapes in table */ 1305 for (i=0; i<UNESCAPE_MAP_LENGTH; i+=2) { 1306 if (c == UNESCAPE_MAP[i]) { 1307 return UNESCAPE_MAP[i+1]; 1308 } else if (c < UNESCAPE_MAP[i]) { 1309 break; 1310 } 1311 } 1312 1313 /* Map \cX to control-X: X & 0x1F */ 1314 if (c == 0x0063 /*'c'*/ && *offset < length) { 1315 c = charAt((*offset)++, context); 1316 if (U16_IS_LEAD(c) && *offset < length) { 1317 UChar c2 = charAt(*offset, context); 1318 if (U16_IS_TRAIL(c2)) { 1319 ++(*offset); 1320 c = (UChar) U16_GET_SUPPLEMENTARY(c, c2); /* [sic] */ 1321 } 1322 } 1323 return 0x1F & c; 1324 } 1325 1326 /* If no special forms are recognized, then consider 1327 * the backslash to generically escape the next character. 1328 * Deal with surrogate pairs. */ 1329 if (U16_IS_LEAD(c) && *offset < length) { 1330 UChar c2 = charAt(*offset, context); 1331 if (U16_IS_TRAIL(c2)) { 1332 ++(*offset); 1333 return U16_GET_SUPPLEMENTARY(c, c2); 1334 } 1335 } 1336 return c; 1337 1338 err: 1339 /* Invalid escape sequence */ 1340 *offset = start; /* Reset to initial value */ 1341 return (UChar32)0xFFFFFFFF; 1342} 1343 1344/* u_unescapeAt() callback to return a UChar from a char* */ 1345static UChar U_CALLCONV 1346_charPtr_charAt(int32_t offset, void *context) { 1347 UChar c16; 1348 /* It would be more efficient to access the invariant tables 1349 * directly but there is no API for that. */ 1350 u_charsToUChars(((char*) context) + offset, &c16, 1); 1351 return c16; 1352} 1353 1354/* Append an escape-free segment of the text; used by u_unescape() */ 1355static void _appendUChars(UChar *dest, int32_t destCapacity, 1356 const char *src, int32_t srcLen) { 1357 if (destCapacity < 0) { 1358 destCapacity = 0; 1359 } 1360 if (srcLen > destCapacity) { 1361 srcLen = destCapacity; 1362 } 1363 u_charsToUChars(src, dest, srcLen); 1364} 1365 1366/* Do an invariant conversion of char* -> UChar*, with escape parsing */ 1367U_CAPI int32_t U_EXPORT2 1368u_unescape(const char *src, UChar *dest, int32_t destCapacity) { 1369 const char *segment = src; 1370 int32_t i = 0; 1371 char c; 1372 1373 while ((c=*src) != 0) { 1374 /* '\\' intentionally written as compiler-specific 1375 * character constant to correspond to compiler-specific 1376 * char* constants. */ 1377 if (c == '\\') { 1378 int32_t lenParsed = 0; 1379 UChar32 c32; 1380 if (src != segment) { 1381 if (dest != NULL) { 1382 _appendUChars(dest + i, destCapacity - i, 1383 segment, (int32_t)(src - segment)); 1384 } 1385 i += (int32_t)(src - segment); 1386 } 1387 ++src; /* advance past '\\' */ 1388 c32 = (UChar32)u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), (void*)src); 1389 if (lenParsed == 0) { 1390 goto err; 1391 } 1392 src += lenParsed; /* advance past escape seq. */ 1393 if (dest != NULL && U16_LENGTH(c32) <= (destCapacity - i)) { 1394 U16_APPEND_UNSAFE(dest, i, c32); 1395 } else { 1396 i += U16_LENGTH(c32); 1397 } 1398 segment = src; 1399 } else { 1400 ++src; 1401 } 1402 } 1403 if (src != segment) { 1404 if (dest != NULL) { 1405 _appendUChars(dest + i, destCapacity - i, 1406 segment, (int32_t)(src - segment)); 1407 } 1408 i += (int32_t)(src - segment); 1409 } 1410 if (dest != NULL && i < destCapacity) { 1411 dest[i] = 0; 1412 } 1413 return i; 1414 1415 err: 1416 if (dest != NULL && destCapacity > 0) { 1417 *dest = 0; 1418 } 1419 return 0; 1420} 1421 1422/* NUL-termination of strings ----------------------------------------------- */ 1423 1424/** 1425 * NUL-terminate a string no matter what its type. 1426 * Set warning and error codes accordingly. 1427 */ 1428#define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) \ 1429 if(pErrorCode!=NULL && U_SUCCESS(*pErrorCode)) { \ 1430 /* not a public function, so no complete argument checking */ \ 1431 \ 1432 if(length<0) { \ 1433 /* assume that the caller handles this */ \ 1434 } else if(length<destCapacity) { \ 1435 /* NUL-terminate the string, the NUL fits */ \ 1436 dest[length]=0; \ 1437 /* unset the not-terminated warning but leave all others */ \ 1438 if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) { \ 1439 *pErrorCode=U_ZERO_ERROR; \ 1440 } \ 1441 } else if(length==destCapacity) { \ 1442 /* unable to NUL-terminate, but the string itself fit - set a warning code */ \ 1443 *pErrorCode=U_STRING_NOT_TERMINATED_WARNING; \ 1444 } else /* length>destCapacity */ { \ 1445 /* even the string itself did not fit - set an error code */ \ 1446 *pErrorCode=U_BUFFER_OVERFLOW_ERROR; \ 1447 } \ 1448 } 1449 1450U_CAPI int32_t U_EXPORT2 1451u_terminateUChars(UChar *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1452 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1453 return length; 1454} 1455 1456U_CAPI int32_t U_EXPORT2 1457u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1458 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1459 return length; 1460} 1461 1462U_CAPI int32_t U_EXPORT2 1463u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1464 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1465 return length; 1466} 1467 1468U_CAPI int32_t U_EXPORT2 1469u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) { 1470 __TERMINATE_STRING(dest, destCapacity, length, pErrorCode); 1471 return length; 1472} 1473 1474// Compute the hash code for a string -------------------------------------- *** 1475 1476// Moved here from uhash.c so that UnicodeString::hashCode() does not depend 1477// on UHashtable code. 1478 1479/* 1480 Compute the hash by iterating sparsely over about 32 (up to 63) 1481 characters spaced evenly through the string. For each character, 1482 multiply the previous hash value by a prime number and add the new 1483 character in, like a linear congruential random number generator, 1484 producing a pseudorandom deterministic value well distributed over 1485 the output range. [LIU] 1486*/ 1487 1488#define STRING_HASH(TYPE, STR, STRLEN, DEREF) \ 1489 int32_t hash = 0; \ 1490 const TYPE *p = (const TYPE*) STR; \ 1491 if (p != NULL) { \ 1492 int32_t len = (int32_t)(STRLEN); \ 1493 int32_t inc = ((len - 32) / 32) + 1; \ 1494 const TYPE *limit = p + len; \ 1495 while (p<limit) { \ 1496 hash = (hash * 37) + DEREF; \ 1497 p += inc; \ 1498 } \ 1499 } \ 1500 return hash 1501 1502/* Used by UnicodeString to compute its hashcode - Not public API. */ 1503U_CAPI int32_t U_EXPORT2 1504ustr_hashUCharsN(const UChar *str, int32_t length) { 1505 STRING_HASH(UChar, str, length, *p); 1506} 1507 1508U_CAPI int32_t U_EXPORT2 1509ustr_hashCharsN(const char *str, int32_t length) { 1510 STRING_HASH(uint8_t, str, length, *p); 1511} 1512 1513U_CAPI int32_t U_EXPORT2 1514ustr_hashICharsN(const char *str, int32_t length) { 1515 STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p)); 1516} 1517