1/* 2******************************************************************************* 3* 4* Copyright (C) 2001-2014, International Business Machines 5* Corporation and others. All Rights Reserved. 6* 7******************************************************************************* 8* file name: unormcmp.cpp 9* encoding: US-ASCII 10* tab size: 8 (not used) 11* indentation:4 12* 13* created on: 2004sep13 14* created by: Markus W. Scherer 15* 16* unorm_compare() function moved here from unorm.cpp for better modularization. 17* Depends on both normalization and case folding. 18* Allows unorm.cpp to not depend on any character properties code. 19*/ 20 21#include "unicode/utypes.h" 22 23#if !UCONFIG_NO_NORMALIZATION 24 25#include "unicode/unorm.h" 26#include "unicode/ustring.h" 27#include "cmemory.h" 28#include "normalizer2impl.h" 29#include "ucase.h" 30#include "uprops.h" 31#include "ustr_imp.h" 32 33U_NAMESPACE_USE 34 35/* compare canonically equivalent ------------------------------------------- */ 36 37/* 38 * Compare two strings for canonical equivalence. 39 * Further options include case-insensitive comparison and 40 * code point order (as opposed to code unit order). 41 * 42 * In this function, canonical equivalence is optional as well. 43 * If canonical equivalence is tested, then both strings must fulfill 44 * the FCD check. 45 * 46 * Semantically, this is equivalent to 47 * strcmp[CodePointOrder](NFD(foldCase(s1)), NFD(foldCase(s2))) 48 * where code point order, NFD and foldCase are all optional. 49 * 50 * String comparisons almost always yield results before processing both strings 51 * completely. 52 * They are generally more efficient working incrementally instead of 53 * performing the sub-processing (strlen, normalization, case-folding) 54 * on the entire strings first. 55 * 56 * It is also unnecessary to not normalize identical characters. 57 * 58 * This function works in principle as follows: 59 * 60 * loop { 61 * get one code unit c1 from s1 (-1 if end of source) 62 * get one code unit c2 from s2 (-1 if end of source) 63 * 64 * if(either string finished) { 65 * return result; 66 * } 67 * if(c1==c2) { 68 * continue; 69 * } 70 * 71 * // c1!=c2 72 * try to decompose/case-fold c1/c2, and continue if one does; 73 * 74 * // still c1!=c2 and neither decomposes/case-folds, return result 75 * return c1-c2; 76 * } 77 * 78 * When a character decomposes, then the pointer for that source changes to 79 * the decomposition, pushing the previous pointer onto a stack. 80 * When the end of the decomposition is reached, then the code unit reader 81 * pops the previous source from the stack. 82 * (Same for case-folding.) 83 * 84 * This is complicated further by operating on variable-width UTF-16. 85 * The top part of the loop works on code units, while lookups for decomposition 86 * and case-folding need code points. 87 * Code points are assembled after the equality/end-of-source part. 88 * The source pointer is only advanced beyond all code units when the code point 89 * actually decomposes/case-folds. 90 * 91 * If we were on a trail surrogate unit when assembling a code point, 92 * and the code point decomposes/case-folds, then the decomposition/folding 93 * result must be compared with the part of the other string that corresponds to 94 * this string's lead surrogate. 95 * Since we only assemble a code point when hitting a trail unit when the 96 * preceding lead units were identical, we back up the other string by one unit 97 * in such a case. 98 * 99 * The optional code point order comparison at the end works with 100 * the same fix-up as the other code point order comparison functions. 101 * See ustring.c and the comment near the end of this function. 102 * 103 * Assumption: A decomposition or case-folding result string never contains 104 * a single surrogate. This is a safe assumption in the Unicode Standard. 105 * Therefore, we do not need to check for surrogate pairs across 106 * decomposition/case-folding boundaries. 107 * 108 * Further assumptions (see verifications tstnorm.cpp): 109 * The API function checks for FCD first, while the core function 110 * first case-folds and then decomposes. This requires that case-folding does not 111 * un-FCD any strings. 112 * 113 * The API function may also NFD the input and turn off decomposition. 114 * This requires that case-folding does not un-NFD strings either. 115 * 116 * TODO If any of the above two assumptions is violated, 117 * then this entire code must be re-thought. 118 * If this happens, then a simple solution is to case-fold both strings up front 119 * and to turn off UNORM_INPUT_IS_FCD. 120 * We already do this when not both strings are in FCD because makeFCD 121 * would be a partial NFD before the case folding, which does not work. 122 * Note that all of this is only a problem when case-folding _and_ 123 * canonical equivalence come together. 124 * (Comments in unorm_compare() are more up to date than this TODO.) 125 */ 126 127/* stack element for previous-level source/decomposition pointers */ 128struct CmpEquivLevel { 129 const UChar *start, *s, *limit; 130}; 131typedef struct CmpEquivLevel CmpEquivLevel; 132 133/** 134 * Internal option for unorm_cmpEquivFold() for decomposing. 135 * If not set, just do strcasecmp(). 136 */ 137#define _COMPARE_EQUIV 0x80000 138 139/* internal function */ 140static int32_t 141unorm_cmpEquivFold(const UChar *s1, int32_t length1, 142 const UChar *s2, int32_t length2, 143 uint32_t options, 144 UErrorCode *pErrorCode) { 145 const Normalizer2Impl *nfcImpl; 146 const UCaseProps *csp; 147 148 /* current-level start/limit - s1/s2 as current */ 149 const UChar *start1, *start2, *limit1, *limit2; 150 151 /* decomposition and case folding variables */ 152 const UChar *p; 153 int32_t length; 154 155 /* stacks of previous-level start/current/limit */ 156 CmpEquivLevel stack1[2], stack2[2]; 157 158 /* buffers for algorithmic decompositions */ 159 UChar decomp1[4], decomp2[4]; 160 161 /* case folding buffers, only use current-level start/limit */ 162 UChar fold1[UCASE_MAX_STRING_LENGTH+1], fold2[UCASE_MAX_STRING_LENGTH+1]; 163 164 /* track which is the current level per string */ 165 int32_t level1, level2; 166 167 /* current code units, and code points for lookups */ 168 UChar32 c1, c2, cp1, cp2; 169 170 /* no argument error checking because this itself is not an API */ 171 172 /* 173 * assume that at least one of the options _COMPARE_EQUIV and U_COMPARE_IGNORE_CASE is set 174 * otherwise this function must behave exactly as uprv_strCompare() 175 * not checking for that here makes testing this function easier 176 */ 177 178 /* normalization/properties data loaded? */ 179 if((options&_COMPARE_EQUIV)!=0) { 180 nfcImpl=Normalizer2Factory::getNFCImpl(*pErrorCode); 181 } else { 182 nfcImpl=NULL; 183 } 184 if((options&U_COMPARE_IGNORE_CASE)!=0) { 185 csp=ucase_getSingleton(); 186 } else { 187 csp=NULL; 188 } 189 if(U_FAILURE(*pErrorCode)) { 190 return 0; 191 } 192 193 /* initialize */ 194 start1=s1; 195 if(length1==-1) { 196 limit1=NULL; 197 } else { 198 limit1=s1+length1; 199 } 200 201 start2=s2; 202 if(length2==-1) { 203 limit2=NULL; 204 } else { 205 limit2=s2+length2; 206 } 207 208 level1=level2=0; 209 c1=c2=-1; 210 211 /* comparison loop */ 212 for(;;) { 213 /* 214 * here a code unit value of -1 means "get another code unit" 215 * below it will mean "this source is finished" 216 */ 217 218 if(c1<0) { 219 /* get next code unit from string 1, post-increment */ 220 for(;;) { 221 if(s1==limit1 || ((c1=*s1)==0 && (limit1==NULL || (options&_STRNCMP_STYLE)))) { 222 if(level1==0) { 223 c1=-1; 224 break; 225 } 226 } else { 227 ++s1; 228 break; 229 } 230 231 /* reached end of level buffer, pop one level */ 232 do { 233 --level1; 234 start1=stack1[level1].start; /*Not uninitialized*/ 235 } while(start1==NULL); 236 s1=stack1[level1].s; /*Not uninitialized*/ 237 limit1=stack1[level1].limit; /*Not uninitialized*/ 238 } 239 } 240 241 if(c2<0) { 242 /* get next code unit from string 2, post-increment */ 243 for(;;) { 244 if(s2==limit2 || ((c2=*s2)==0 && (limit2==NULL || (options&_STRNCMP_STYLE)))) { 245 if(level2==0) { 246 c2=-1; 247 break; 248 } 249 } else { 250 ++s2; 251 break; 252 } 253 254 /* reached end of level buffer, pop one level */ 255 do { 256 --level2; 257 start2=stack2[level2].start; /*Not uninitialized*/ 258 } while(start2==NULL); 259 s2=stack2[level2].s; /*Not uninitialized*/ 260 limit2=stack2[level2].limit; /*Not uninitialized*/ 261 } 262 } 263 264 /* 265 * compare c1 and c2 266 * either variable c1, c2 is -1 only if the corresponding string is finished 267 */ 268 if(c1==c2) { 269 if(c1<0) { 270 return 0; /* c1==c2==-1 indicating end of strings */ 271 } 272 c1=c2=-1; /* make us fetch new code units */ 273 continue; 274 } else if(c1<0) { 275 return -1; /* string 1 ends before string 2 */ 276 } else if(c2<0) { 277 return 1; /* string 2 ends before string 1 */ 278 } 279 /* c1!=c2 && c1>=0 && c2>=0 */ 280 281 /* get complete code points for c1, c2 for lookups if either is a surrogate */ 282 cp1=c1; 283 if(U_IS_SURROGATE(c1)) { 284 UChar c; 285 286 if(U_IS_SURROGATE_LEAD(c1)) { 287 if(s1!=limit1 && U16_IS_TRAIL(c=*s1)) { 288 /* advance ++s1; only below if cp1 decomposes/case-folds */ 289 cp1=U16_GET_SUPPLEMENTARY(c1, c); 290 } 291 } else /* isTrail(c1) */ { 292 if(start1<=(s1-2) && U16_IS_LEAD(c=*(s1-2))) { 293 cp1=U16_GET_SUPPLEMENTARY(c, c1); 294 } 295 } 296 } 297 298 cp2=c2; 299 if(U_IS_SURROGATE(c2)) { 300 UChar c; 301 302 if(U_IS_SURROGATE_LEAD(c2)) { 303 if(s2!=limit2 && U16_IS_TRAIL(c=*s2)) { 304 /* advance ++s2; only below if cp2 decomposes/case-folds */ 305 cp2=U16_GET_SUPPLEMENTARY(c2, c); 306 } 307 } else /* isTrail(c2) */ { 308 if(start2<=(s2-2) && U16_IS_LEAD(c=*(s2-2))) { 309 cp2=U16_GET_SUPPLEMENTARY(c, c2); 310 } 311 } 312 } 313 314 /* 315 * go down one level for each string 316 * continue with the main loop as soon as there is a real change 317 */ 318 319 if( level1==0 && (options&U_COMPARE_IGNORE_CASE) && 320 (length=ucase_toFullFolding(csp, (UChar32)cp1, &p, options))>=0 321 ) { 322 /* cp1 case-folds to the code point "length" or to p[length] */ 323 if(U_IS_SURROGATE(c1)) { 324 if(U_IS_SURROGATE_LEAD(c1)) { 325 /* advance beyond source surrogate pair if it case-folds */ 326 ++s1; 327 } else /* isTrail(c1) */ { 328 /* 329 * we got a supplementary code point when hitting its trail surrogate, 330 * therefore the lead surrogate must have been the same as in the other string; 331 * compare this decomposition with the lead surrogate in the other string 332 * remember that this simulates bulk text replacement: 333 * the decomposition would replace the entire code point 334 */ 335 --s2; 336 c2=*(s2-1); 337 } 338 } 339 340 /* push current level pointers */ 341 stack1[0].start=start1; 342 stack1[0].s=s1; 343 stack1[0].limit=limit1; 344 ++level1; 345 346 /* copy the folding result to fold1[] */ 347 if(length<=UCASE_MAX_STRING_LENGTH) { 348 u_memcpy(fold1, p, length); 349 } else { 350 int32_t i=0; 351 U16_APPEND_UNSAFE(fold1, i, length); 352 length=i; 353 } 354 355 /* set next level pointers to case folding */ 356 start1=s1=fold1; 357 limit1=fold1+length; 358 359 /* get ready to read from decomposition, continue with loop */ 360 c1=-1; 361 continue; 362 } 363 364 if( level2==0 && (options&U_COMPARE_IGNORE_CASE) && 365 (length=ucase_toFullFolding(csp, (UChar32)cp2, &p, options))>=0 366 ) { 367 /* cp2 case-folds to the code point "length" or to p[length] */ 368 if(U_IS_SURROGATE(c2)) { 369 if(U_IS_SURROGATE_LEAD(c2)) { 370 /* advance beyond source surrogate pair if it case-folds */ 371 ++s2; 372 } else /* isTrail(c2) */ { 373 /* 374 * we got a supplementary code point when hitting its trail surrogate, 375 * therefore the lead surrogate must have been the same as in the other string; 376 * compare this decomposition with the lead surrogate in the other string 377 * remember that this simulates bulk text replacement: 378 * the decomposition would replace the entire code point 379 */ 380 --s1; 381 c1=*(s1-1); 382 } 383 } 384 385 /* push current level pointers */ 386 stack2[0].start=start2; 387 stack2[0].s=s2; 388 stack2[0].limit=limit2; 389 ++level2; 390 391 /* copy the folding result to fold2[] */ 392 if(length<=UCASE_MAX_STRING_LENGTH) { 393 u_memcpy(fold2, p, length); 394 } else { 395 int32_t i=0; 396 U16_APPEND_UNSAFE(fold2, i, length); 397 length=i; 398 } 399 400 /* set next level pointers to case folding */ 401 start2=s2=fold2; 402 limit2=fold2+length; 403 404 /* get ready to read from decomposition, continue with loop */ 405 c2=-1; 406 continue; 407 } 408 409 if( level1<2 && (options&_COMPARE_EQUIV) && 410 0!=(p=nfcImpl->getDecomposition((UChar32)cp1, decomp1, length)) 411 ) { 412 /* cp1 decomposes into p[length] */ 413 if(U_IS_SURROGATE(c1)) { 414 if(U_IS_SURROGATE_LEAD(c1)) { 415 /* advance beyond source surrogate pair if it decomposes */ 416 ++s1; 417 } else /* isTrail(c1) */ { 418 /* 419 * we got a supplementary code point when hitting its trail surrogate, 420 * therefore the lead surrogate must have been the same as in the other string; 421 * compare this decomposition with the lead surrogate in the other string 422 * remember that this simulates bulk text replacement: 423 * the decomposition would replace the entire code point 424 */ 425 --s2; 426 c2=*(s2-1); 427 } 428 } 429 430 /* push current level pointers */ 431 stack1[level1].start=start1; 432 stack1[level1].s=s1; 433 stack1[level1].limit=limit1; 434 ++level1; 435 436 /* set empty intermediate level if skipped */ 437 if(level1<2) { 438 stack1[level1++].start=NULL; 439 } 440 441 /* set next level pointers to decomposition */ 442 start1=s1=p; 443 limit1=p+length; 444 445 /* get ready to read from decomposition, continue with loop */ 446 c1=-1; 447 continue; 448 } 449 450 if( level2<2 && (options&_COMPARE_EQUIV) && 451 0!=(p=nfcImpl->getDecomposition((UChar32)cp2, decomp2, length)) 452 ) { 453 /* cp2 decomposes into p[length] */ 454 if(U_IS_SURROGATE(c2)) { 455 if(U_IS_SURROGATE_LEAD(c2)) { 456 /* advance beyond source surrogate pair if it decomposes */ 457 ++s2; 458 } else /* isTrail(c2) */ { 459 /* 460 * we got a supplementary code point when hitting its trail surrogate, 461 * therefore the lead surrogate must have been the same as in the other string; 462 * compare this decomposition with the lead surrogate in the other string 463 * remember that this simulates bulk text replacement: 464 * the decomposition would replace the entire code point 465 */ 466 --s1; 467 c1=*(s1-1); 468 } 469 } 470 471 /* push current level pointers */ 472 stack2[level2].start=start2; 473 stack2[level2].s=s2; 474 stack2[level2].limit=limit2; 475 ++level2; 476 477 /* set empty intermediate level if skipped */ 478 if(level2<2) { 479 stack2[level2++].start=NULL; 480 } 481 482 /* set next level pointers to decomposition */ 483 start2=s2=p; 484 limit2=p+length; 485 486 /* get ready to read from decomposition, continue with loop */ 487 c2=-1; 488 continue; 489 } 490 491 /* 492 * no decomposition/case folding, max level for both sides: 493 * return difference result 494 * 495 * code point order comparison must not just return cp1-cp2 496 * because when single surrogates are present then the surrogate pairs 497 * that formed cp1 and cp2 may be from different string indexes 498 * 499 * example: { d800 d800 dc01 } vs. { d800 dc00 }, compare at second code units 500 * c1=d800 cp1=10001 c2=dc00 cp2=10000 501 * cp1-cp2>0 but c1-c2<0 and in fact in UTF-32 it is { d800 10001 } < { 10000 } 502 * 503 * therefore, use same fix-up as in ustring.c/uprv_strCompare() 504 * except: uprv_strCompare() fetches c=*s while this functions fetches c=*s++ 505 * so we have slightly different pointer/start/limit comparisons here 506 */ 507 508 if(c1>=0xd800 && c2>=0xd800 && (options&U_COMPARE_CODE_POINT_ORDER)) { 509 /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */ 510 if( 511 (c1<=0xdbff && s1!=limit1 && U16_IS_TRAIL(*s1)) || 512 (U16_IS_TRAIL(c1) && start1!=(s1-1) && U16_IS_LEAD(*(s1-2))) 513 ) { 514 /* part of a surrogate pair, leave >=d800 */ 515 } else { 516 /* BMP code point - may be surrogate code point - make <d800 */ 517 c1-=0x2800; 518 } 519 520 if( 521 (c2<=0xdbff && s2!=limit2 && U16_IS_TRAIL(*s2)) || 522 (U16_IS_TRAIL(c2) && start2!=(s2-1) && U16_IS_LEAD(*(s2-2))) 523 ) { 524 /* part of a surrogate pair, leave >=d800 */ 525 } else { 526 /* BMP code point - may be surrogate code point - make <d800 */ 527 c2-=0x2800; 528 } 529 } 530 531 return c1-c2; 532 } 533} 534 535static 536UBool _normalize(const Normalizer2 *n2, const UChar *s, int32_t length, 537 UnicodeString &normalized, UErrorCode *pErrorCode) { 538 UnicodeString str(length<0, s, length); 539 540 // check if s fulfill the conditions 541 int32_t spanQCYes=n2->spanQuickCheckYes(str, *pErrorCode); 542 if (U_FAILURE(*pErrorCode)) { 543 return FALSE; 544 } 545 /* 546 * ICU 2.4 had a further optimization: 547 * If both strings were not in FCD, then they were both NFD'ed, 548 * and the _COMPARE_EQUIV option was turned off. 549 * It is not entirely clear that this is valid with the current 550 * definition of the canonical caseless match. 551 * Therefore, ICU 2.6 removes that optimization. 552 */ 553 if(spanQCYes<str.length()) { 554 UnicodeString unnormalized=str.tempSubString(spanQCYes); 555 normalized.setTo(FALSE, str.getBuffer(), spanQCYes); 556 n2->normalizeSecondAndAppend(normalized, unnormalized, *pErrorCode); 557 if (U_SUCCESS(*pErrorCode)) { 558 return TRUE; 559 } 560 } 561 return FALSE; 562} 563 564U_CAPI int32_t U_EXPORT2 565unorm_compare(const UChar *s1, int32_t length1, 566 const UChar *s2, int32_t length2, 567 uint32_t options, 568 UErrorCode *pErrorCode) { 569 /* argument checking */ 570 if(U_FAILURE(*pErrorCode)) { 571 return 0; 572 } 573 if(s1==0 || length1<-1 || s2==0 || length2<-1) { 574 *pErrorCode=U_ILLEGAL_ARGUMENT_ERROR; 575 return 0; 576 } 577 578 UnicodeString fcd1, fcd2; 579 int32_t normOptions=(int32_t)(options>>UNORM_COMPARE_NORM_OPTIONS_SHIFT); 580 options|=_COMPARE_EQUIV; 581 582 /* 583 * UAX #21 Case Mappings, as fixed for Unicode version 4 584 * (see Jitterbug 2021), defines a canonical caseless match as 585 * 586 * A string X is a canonical caseless match 587 * for a string Y if and only if 588 * NFD(toCasefold(NFD(X))) = NFD(toCasefold(NFD(Y))) 589 * 590 * For better performance, we check for FCD (or let the caller tell us that 591 * both strings are in FCD) for the inner normalization. 592 * BasicNormalizerTest::FindFoldFCDExceptions() makes sure that 593 * case-folding preserves the FCD-ness of a string. 594 * The outer normalization is then only performed by unorm_cmpEquivFold() 595 * when there is a difference. 596 * 597 * Exception: When using the Turkic case-folding option, we do perform 598 * full NFD first. This is because in the Turkic case precomposed characters 599 * with 0049 capital I or 0069 small i fold differently whether they 600 * are first decomposed or not, so an FCD check - a check only for 601 * canonical order - is not sufficient. 602 */ 603 if(!(options&UNORM_INPUT_IS_FCD) || (options&U_FOLD_CASE_EXCLUDE_SPECIAL_I)) { 604 const Normalizer2 *n2; 605 if(options&U_FOLD_CASE_EXCLUDE_SPECIAL_I) { 606 n2=Normalizer2::getNFDInstance(*pErrorCode); 607 } else { 608 n2=Normalizer2Factory::getFCDInstance(*pErrorCode); 609 } 610 if (U_FAILURE(*pErrorCode)) { 611 return 0; 612 } 613 614 if(normOptions&UNORM_UNICODE_3_2) { 615 const UnicodeSet *uni32=uniset_getUnicode32Instance(*pErrorCode); 616 FilteredNormalizer2 fn2(*n2, *uni32); 617 if(_normalize(&fn2, s1, length1, fcd1, pErrorCode)) { 618 s1=fcd1.getBuffer(); 619 length1=fcd1.length(); 620 } 621 if(_normalize(&fn2, s2, length2, fcd2, pErrorCode)) { 622 s2=fcd2.getBuffer(); 623 length2=fcd2.length(); 624 } 625 } else { 626 if(_normalize(n2, s1, length1, fcd1, pErrorCode)) { 627 s1=fcd1.getBuffer(); 628 length1=fcd1.length(); 629 } 630 if(_normalize(n2, s2, length2, fcd2, pErrorCode)) { 631 s2=fcd2.getBuffer(); 632 length2=fcd2.length(); 633 } 634 } 635 } 636 637 if(U_SUCCESS(*pErrorCode)) { 638 return unorm_cmpEquivFold(s1, length1, s2, length2, options, pErrorCode); 639 } else { 640 return 0; 641 } 642} 643 644#endif /* #if !UCONFIG_NO_NORMALIZATION */ 645