1/******************************************************************** 2 * COPYRIGHT: 3 * Copyright (c) 1997-2009, International Business Machines Corporation and 4 * others. All Rights Reserved. 5 ********************************************************************/ 6/******************************************************************************** 7* 8* File CITERTST.C 9* 10* Modification History: 11* Date Name Description 12* Madhu Katragadda Ported for C API 13* 02/19/01 synwee Modified test case for new collation iterator 14*********************************************************************************/ 15/* 16 * Collation Iterator tests. 17 * (Let me reiterate my position...) 18 */ 19 20#include "unicode/utypes.h" 21 22#if !UCONFIG_NO_COLLATION 23 24#include "unicode/ucol.h" 25#include "unicode/uloc.h" 26#include "unicode/uchar.h" 27#include "unicode/ustring.h" 28#include "unicode/putil.h" 29#include "callcoll.h" 30#include "cmemory.h" 31#include "cintltst.h" 32#include "citertst.h" 33#include "ccolltst.h" 34#include "filestrm.h" 35#include "cstring.h" 36#include "ucol_imp.h" 37#include "ucol_tok.h" 38#include <stdio.h> 39 40extern uint8_t ucol_uprv_getCaseBits(const UChar *, uint32_t, UErrorCode *); 41 42void addCollIterTest(TestNode** root) 43{ 44 addTest(root, &TestPrevious, "tscoll/citertst/TestPrevious"); 45 addTest(root, &TestOffset, "tscoll/citertst/TestOffset"); 46 addTest(root, &TestSetText, "tscoll/citertst/TestSetText"); 47 addTest(root, &TestMaxExpansion, "tscoll/citertst/TestMaxExpansion"); 48 addTest(root, &TestUnicodeChar, "tscoll/citertst/TestUnicodeChar"); 49 addTest(root, &TestNormalizedUnicodeChar, 50 "tscoll/citertst/TestNormalizedUnicodeChar"); 51 addTest(root, &TestNormalization, "tscoll/citertst/TestNormalization"); 52 addTest(root, &TestBug672, "tscoll/citertst/TestBug672"); 53 addTest(root, &TestBug672Normalize, "tscoll/citertst/TestBug672Normalize"); 54 addTest(root, &TestSmallBuffer, "tscoll/citertst/TestSmallBuffer"); 55 addTest(root, &TestCEs, "tscoll/citertst/TestCEs"); 56 addTest(root, &TestDiscontiguos, "tscoll/citertst/TestDiscontiguos"); 57 addTest(root, &TestCEBufferOverflow, "tscoll/citertst/TestCEBufferOverflow"); 58 addTest(root, &TestCEValidity, "tscoll/citertst/TestCEValidity"); 59 addTest(root, &TestSortKeyValidity, "tscoll/citertst/TestSortKeyValidity"); 60} 61 62/* The locales we support */ 63 64static const char * LOCALES[] = {"en_AU", "en_BE", "en_CA"}; 65 66static void TestBug672() { 67 UErrorCode status = U_ZERO_ERROR; 68 UChar pattern[20]; 69 UChar text[50]; 70 int i; 71 int result[3][3]; 72 73 u_uastrcpy(pattern, "resume"); 74 u_uastrcpy(text, "Time to resume updating my resume."); 75 76 for (i = 0; i < 3; ++ i) { 77 UCollator *coll = ucol_open(LOCALES[i], &status); 78 UCollationElements *pitr = ucol_openElements(coll, pattern, -1, 79 &status); 80 UCollationElements *titer = ucol_openElements(coll, text, -1, 81 &status); 82 if (U_FAILURE(status)) { 83 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 84 myErrorName(status)); 85 return; 86 } 87 88 log_verbose("locale tested %s\n", LOCALES[i]); 89 90 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 91 U_SUCCESS(status)) { 92 } 93 if (U_FAILURE(status)) { 94 log_err("ERROR: reversing collation iterator :%s\n", 95 myErrorName(status)); 96 return; 97 } 98 ucol_reset(pitr); 99 100 ucol_setOffset(titer, u_strlen(pattern), &status); 101 if (U_FAILURE(status)) { 102 log_err("ERROR: setting offset in collator :%s\n", 103 myErrorName(status)); 104 return; 105 } 106 result[i][0] = ucol_getOffset(titer); 107 log_verbose("Text iterator set to offset %d\n", result[i][0]); 108 109 /* Use previous() */ 110 ucol_previous(titer, &status); 111 result[i][1] = ucol_getOffset(titer); 112 log_verbose("Current offset %d after previous\n", result[i][1]); 113 114 /* Add one to index */ 115 log_verbose("Adding one to current offset...\n"); 116 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 117 if (U_FAILURE(status)) { 118 log_err("ERROR: setting offset in collator :%s\n", 119 myErrorName(status)); 120 return; 121 } 122 result[i][2] = ucol_getOffset(titer); 123 log_verbose("Current offset in text = %d\n", result[i][2]); 124 ucol_closeElements(pitr); 125 ucol_closeElements(titer); 126 ucol_close(coll); 127 } 128 129 if (uprv_memcmp(result[0], result[1], 3) != 0 || 130 uprv_memcmp(result[1], result[2], 3) != 0) { 131 log_err("ERROR: Different locales have different offsets at the same character\n"); 132 } 133} 134 135 136 137/* Running this test with normalization enabled showed up a bug in the incremental 138 normalization code. */ 139static void TestBug672Normalize() { 140 UErrorCode status = U_ZERO_ERROR; 141 UChar pattern[20]; 142 UChar text[50]; 143 int i; 144 int result[3][3]; 145 146 u_uastrcpy(pattern, "resume"); 147 u_uastrcpy(text, "Time to resume updating my resume."); 148 149 for (i = 0; i < 3; ++ i) { 150 UCollator *coll = ucol_open(LOCALES[i], &status); 151 UCollationElements *pitr = NULL; 152 UCollationElements *titer = NULL; 153 154 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 155 156 pitr = ucol_openElements(coll, pattern, -1, &status); 157 titer = ucol_openElements(coll, text, -1, &status); 158 if (U_FAILURE(status)) { 159 log_err_status(status, "ERROR: in creation of either the collator or the collation iterator :%s\n", 160 myErrorName(status)); 161 return; 162 } 163 164 log_verbose("locale tested %s\n", LOCALES[i]); 165 166 while (ucol_next(pitr, &status) != UCOL_NULLORDER && 167 U_SUCCESS(status)) { 168 } 169 if (U_FAILURE(status)) { 170 log_err("ERROR: reversing collation iterator :%s\n", 171 myErrorName(status)); 172 return; 173 } 174 ucol_reset(pitr); 175 176 ucol_setOffset(titer, u_strlen(pattern), &status); 177 if (U_FAILURE(status)) { 178 log_err("ERROR: setting offset in collator :%s\n", 179 myErrorName(status)); 180 return; 181 } 182 result[i][0] = ucol_getOffset(titer); 183 log_verbose("Text iterator set to offset %d\n", result[i][0]); 184 185 /* Use previous() */ 186 ucol_previous(titer, &status); 187 result[i][1] = ucol_getOffset(titer); 188 log_verbose("Current offset %d after previous\n", result[i][1]); 189 190 /* Add one to index */ 191 log_verbose("Adding one to current offset...\n"); 192 ucol_setOffset(titer, ucol_getOffset(titer) + 1, &status); 193 if (U_FAILURE(status)) { 194 log_err("ERROR: setting offset in collator :%s\n", 195 myErrorName(status)); 196 return; 197 } 198 result[i][2] = ucol_getOffset(titer); 199 log_verbose("Current offset in text = %d\n", result[i][2]); 200 ucol_closeElements(pitr); 201 ucol_closeElements(titer); 202 ucol_close(coll); 203 } 204 205 if (uprv_memcmp(result[0], result[1], 3) != 0 || 206 uprv_memcmp(result[1], result[2], 3) != 0) { 207 log_err("ERROR: Different locales have different offsets at the same character\n"); 208 } 209} 210 211 212 213 214/** 215 * Test for CollationElementIterator previous and next for the whole set of 216 * unicode characters. 217 */ 218static void TestUnicodeChar() 219{ 220 UChar source[0x100]; 221 UCollator *en_us; 222 UCollationElements *iter; 223 UErrorCode status = U_ZERO_ERROR; 224 UChar codepoint; 225 226 UChar *test; 227 en_us = ucol_open("en_US", &status); 228 if (U_FAILURE(status)){ 229 log_err_status(status, "ERROR: in creation of collation data using ucol_open()\n %s\n", 230 myErrorName(status)); 231 return; 232 } 233 234 for (codepoint = 1; codepoint < 0xFFFE;) 235 { 236 test = source; 237 238 while (codepoint % 0xFF != 0) 239 { 240 if (u_isdefined(codepoint)) 241 *(test ++) = codepoint; 242 codepoint ++; 243 } 244 245 if (u_isdefined(codepoint)) 246 *(test ++) = codepoint; 247 248 if (codepoint != 0xFFFF) 249 codepoint ++; 250 251 *test = 0; 252 iter=ucol_openElements(en_us, source, u_strlen(source), &status); 253 if(U_FAILURE(status)){ 254 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 255 myErrorName(status)); 256 ucol_close(en_us); 257 return; 258 } 259 /* A basic test to see if it's working at all */ 260 log_verbose("codepoint testing %x\n", codepoint); 261 backAndForth(iter); 262 ucol_closeElements(iter); 263 264 /* null termination test */ 265 iter=ucol_openElements(en_us, source, -1, &status); 266 if(U_FAILURE(status)){ 267 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 268 myErrorName(status)); 269 ucol_close(en_us); 270 return; 271 } 272 /* A basic test to see if it's working at all */ 273 backAndForth(iter); 274 ucol_closeElements(iter); 275 } 276 277 ucol_close(en_us); 278} 279 280/** 281 * Test for CollationElementIterator previous and next for the whole set of 282 * unicode characters with normalization on. 283 */ 284static void TestNormalizedUnicodeChar() 285{ 286 UChar source[0x100]; 287 UCollator *th_th; 288 UCollationElements *iter; 289 UErrorCode status = U_ZERO_ERROR; 290 UChar codepoint; 291 292 UChar *test; 293 /* thai should have normalization on */ 294 th_th = ucol_open("th_TH", &status); 295 if (U_FAILURE(status)){ 296 log_err_status(status, "ERROR: in creation of thai collation using ucol_open()\n %s\n", 297 myErrorName(status)); 298 return; 299 } 300 301 for (codepoint = 1; codepoint < 0xFFFE;) 302 { 303 test = source; 304 305 while (codepoint % 0xFF != 0) 306 { 307 if (u_isdefined(codepoint)) 308 *(test ++) = codepoint; 309 codepoint ++; 310 } 311 312 if (u_isdefined(codepoint)) 313 *(test ++) = codepoint; 314 315 if (codepoint != 0xFFFF) 316 codepoint ++; 317 318 *test = 0; 319 iter=ucol_openElements(th_th, source, u_strlen(source), &status); 320 if(U_FAILURE(status)){ 321 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 322 myErrorName(status)); 323 ucol_close(th_th); 324 return; 325 } 326 327 backAndForth(iter); 328 ucol_closeElements(iter); 329 330 iter=ucol_openElements(th_th, source, -1, &status); 331 if(U_FAILURE(status)){ 332 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 333 myErrorName(status)); 334 ucol_close(th_th); 335 return; 336 } 337 338 backAndForth(iter); 339 ucol_closeElements(iter); 340 } 341 342 ucol_close(th_th); 343} 344 345/** 346* Test the incremental normalization 347*/ 348static void TestNormalization() 349{ 350 UErrorCode status = U_ZERO_ERROR; 351 const char *str = 352 "&a < \\u0300\\u0315 < A\\u0300\\u0315 < \\u0316\\u0315B < \\u0316\\u0300\\u0315"; 353 UCollator *coll; 354 UChar rule[50]; 355 int rulelen = u_unescape(str, rule, 50); 356 int count = 0; 357 const char *testdata[] = 358 {"\\u1ED9", "o\\u0323\\u0302", 359 "\\u0300\\u0315", "\\u0315\\u0300", 360 "A\\u0300\\u0315B", "A\\u0315\\u0300B", 361 "A\\u0316\\u0315B", "A\\u0315\\u0316B", 362 "\\u0316\\u0300\\u0315", "\\u0315\\u0300\\u0316", 363 "A\\u0316\\u0300\\u0315B", "A\\u0315\\u0300\\u0316B", 364 "\\u0316\\u0315\\u0300", "A\\u0316\\u0315\\u0300B"}; 365 int32_t srclen; 366 UChar source[10]; 367 UCollationElements *iter; 368 369 coll = ucol_openRules(rule, rulelen, UCOL_ON, UCOL_TERTIARY, NULL, &status); 370 ucol_setAttribute(coll, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 371 if (U_FAILURE(status)){ 372 log_err_status(status, "ERROR: in creation of collator using ucol_openRules()\n %s\n", 373 myErrorName(status)); 374 return; 375 } 376 377 srclen = u_unescape(testdata[0], source, 10); 378 iter = ucol_openElements(coll, source, srclen, &status); 379 backAndForth(iter); 380 ucol_closeElements(iter); 381 382 srclen = u_unescape(testdata[1], source, 10); 383 iter = ucol_openElements(coll, source, srclen, &status); 384 backAndForth(iter); 385 ucol_closeElements(iter); 386 387 while (count < 12) { 388 srclen = u_unescape(testdata[count], source, 10); 389 iter = ucol_openElements(coll, source, srclen, &status); 390 391 if (U_FAILURE(status)){ 392 log_err("ERROR: in creation of collator element iterator\n %s\n", 393 myErrorName(status)); 394 return; 395 } 396 backAndForth(iter); 397 ucol_closeElements(iter); 398 399 iter = ucol_openElements(coll, source, -1, &status); 400 401 if (U_FAILURE(status)){ 402 log_err("ERROR: in creation of collator element iterator\n %s\n", 403 myErrorName(status)); 404 return; 405 } 406 backAndForth(iter); 407 ucol_closeElements(iter); 408 count ++; 409 } 410 ucol_close(coll); 411} 412 413/** 414 * Test for CollationElementIterator.previous() 415 * 416 * @bug 4108758 - Make sure it works with contracting characters 417 * 418 */ 419static void TestPrevious() 420{ 421 UCollator *coll=NULL; 422 UChar rule[50]; 423 UChar *source; 424 UCollator *c1, *c2, *c3; 425 UCollationElements *iter; 426 UErrorCode status = U_ZERO_ERROR; 427 UChar test1[50]; 428 UChar test2[50]; 429 430 u_uastrcpy(test1, "What subset of all possible test cases?"); 431 u_uastrcpy(test2, "has the highest probability of detecting"); 432 coll = ucol_open("en_US", &status); 433 434 iter=ucol_openElements(coll, test1, u_strlen(test1), &status); 435 log_verbose("English locale testing back and forth\n"); 436 if(U_FAILURE(status)){ 437 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 438 myErrorName(status)); 439 ucol_close(coll); 440 return; 441 } 442 /* A basic test to see if it's working at all */ 443 backAndForth(iter); 444 ucol_closeElements(iter); 445 ucol_close(coll); 446 447 /* Test with a contracting character sequence */ 448 u_uastrcpy(rule, "&a,A < b,B < c,C, d,D < z,Z < ch,cH,Ch,CH"); 449 c1 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 450 451 log_verbose("Contraction rule testing back and forth with no normalization\n"); 452 453 if (c1 == NULL || U_FAILURE(status)) 454 { 455 log_err("Couldn't create a RuleBasedCollator with a contracting sequence\n %s\n", 456 myErrorName(status)); 457 return; 458 } 459 source=(UChar*)malloc(sizeof(UChar) * 20); 460 u_uastrcpy(source, "abchdcba"); 461 iter=ucol_openElements(c1, source, u_strlen(source), &status); 462 if(U_FAILURE(status)){ 463 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 464 myErrorName(status)); 465 return; 466 } 467 backAndForth(iter); 468 ucol_closeElements(iter); 469 ucol_close(c1); 470 471 /* Test with an expanding character sequence */ 472 u_uastrcpy(rule, "&a < b < c/abd < d"); 473 c2 = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL, &status); 474 log_verbose("Expansion rule testing back and forth with no normalization\n"); 475 if (c2 == NULL || U_FAILURE(status)) 476 { 477 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 478 myErrorName(status)); 479 return; 480 } 481 u_uastrcpy(source, "abcd"); 482 iter=ucol_openElements(c2, source, u_strlen(source), &status); 483 if(U_FAILURE(status)){ 484 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 485 myErrorName(status)); 486 return; 487 } 488 backAndForth(iter); 489 ucol_closeElements(iter); 490 ucol_close(c2); 491 /* Now try both */ 492 u_uastrcpy(rule, "&a < b < c/aba < d < z < ch"); 493 c3 = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, UCOL_DEFAULT_STRENGTH,NULL, &status); 494 log_verbose("Expansion/contraction rule testing back and forth with no normalization\n"); 495 496 if (c3 == NULL || U_FAILURE(status)) 497 { 498 log_err("Couldn't create a RuleBasedCollator with a contracting sequence.\n %s\n", 499 myErrorName(status)); 500 return; 501 } 502 u_uastrcpy(source, "abcdbchdc"); 503 iter=ucol_openElements(c3, source, u_strlen(source), &status); 504 if(U_FAILURE(status)){ 505 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 506 myErrorName(status)); 507 return; 508 } 509 backAndForth(iter); 510 ucol_closeElements(iter); 511 ucol_close(c3); 512 source[0] = 0x0e41; 513 source[1] = 0x0e02; 514 source[2] = 0x0e41; 515 source[3] = 0x0e02; 516 source[4] = 0x0e27; 517 source[5] = 0x61; 518 source[6] = 0x62; 519 source[7] = 0x63; 520 source[8] = 0; 521 522 coll = ucol_open("th_TH", &status); 523 log_verbose("Thai locale testing back and forth with normalization\n"); 524 iter=ucol_openElements(coll, source, u_strlen(source), &status); 525 if(U_FAILURE(status)){ 526 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 527 myErrorName(status)); 528 return; 529 } 530 backAndForth(iter); 531 ucol_closeElements(iter); 532 ucol_close(coll); 533 534 /* prev test */ 535 source[0] = 0x0061; 536 source[1] = 0x30CF; 537 source[2] = 0x3099; 538 source[3] = 0x30FC; 539 source[4] = 0; 540 541 coll = ucol_open("ja_JP", &status); 542 log_verbose("Japanese locale testing back and forth with normalization\n"); 543 iter=ucol_openElements(coll, source, u_strlen(source), &status); 544 if(U_FAILURE(status)){ 545 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 546 myErrorName(status)); 547 return; 548 } 549 backAndForth(iter); 550 ucol_closeElements(iter); 551 ucol_close(coll); 552 553 free(source); 554} 555 556/** 557 * Test for getOffset() and setOffset() 558 */ 559static void TestOffset() 560{ 561 UErrorCode status= U_ZERO_ERROR; 562 UCollator *en_us=NULL; 563 UCollationElements *iter, *pristine; 564 int32_t offset; 565 OrderAndOffset *orders; 566 int32_t orderLength=0; 567 int count = 0; 568 UChar test1[50]; 569 UChar test2[50]; 570 571 u_uastrcpy(test1, "What subset of all possible test cases?"); 572 u_uastrcpy(test2, "has the highest probability of detecting"); 573 en_us = ucol_open("en_US", &status); 574 log_verbose("Testing getOffset and setOffset for collations\n"); 575 iter = ucol_openElements(en_us, test1, u_strlen(test1), &status); 576 if(U_FAILURE(status)){ 577 log_err_status(status, "ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 578 myErrorName(status)); 579 ucol_close(en_us); 580 return; 581 } 582 583 /* testing boundaries */ 584 ucol_setOffset(iter, 0, &status); 585 if (U_FAILURE(status) || ucol_previous(iter, &status) != UCOL_NULLORDER) { 586 log_err("Error: After setting offset to 0, we should be at the end " 587 "of the backwards iteration"); 588 } 589 ucol_setOffset(iter, u_strlen(test1), &status); 590 if (U_FAILURE(status) || ucol_next(iter, &status) != UCOL_NULLORDER) { 591 log_err("Error: After setting offset to end of the string, we should " 592 "be at the end of the backwards iteration"); 593 } 594 595 /* Run all the way through the iterator, then get the offset */ 596 597 orders = getOrders(iter, &orderLength); 598 599 offset = ucol_getOffset(iter); 600 601 if (offset != u_strlen(test1)) 602 { 603 log_err("offset at end != length %d vs %d\n", offset, 604 u_strlen(test1) ); 605 } 606 607 /* Now set the offset back to the beginning and see if it works */ 608 pristine=ucol_openElements(en_us, test1, u_strlen(test1), &status); 609 if(U_FAILURE(status)){ 610 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 611 myErrorName(status)); 612 ucol_close(en_us); 613 return; 614 } 615 status = U_ZERO_ERROR; 616 617 ucol_setOffset(iter, 0, &status); 618 if (U_FAILURE(status)) 619 { 620 log_err("setOffset failed. %s\n", myErrorName(status)); 621 } 622 else 623 { 624 assertEqual(iter, pristine); 625 } 626 627 ucol_closeElements(pristine); 628 ucol_closeElements(iter); 629 free(orders); 630 631 /* testing offsets in normalization buffer */ 632 test1[0] = 0x61; 633 test1[1] = 0x300; 634 test1[2] = 0x316; 635 test1[3] = 0x62; 636 test1[4] = 0; 637 ucol_setAttribute(en_us, UCOL_NORMALIZATION_MODE, UCOL_ON, &status); 638 iter = ucol_openElements(en_us, test1, 4, &status); 639 if(U_FAILURE(status)){ 640 log_err("ERROR: in creation of collation element iterator using ucol_openElements()\n %s\n", 641 myErrorName(status)); 642 ucol_close(en_us); 643 return; 644 } 645 646 count = 0; 647 while (ucol_next(iter, &status) != UCOL_NULLORDER && 648 U_SUCCESS(status)) { 649 switch (count) { 650 case 0: 651 if (ucol_getOffset(iter) != 1) { 652 log_err("ERROR: Offset of iteration should be 1\n"); 653 } 654 break; 655 case 3: 656 if (ucol_getOffset(iter) != 4) { 657 log_err("ERROR: Offset of iteration should be 4\n"); 658 } 659 break; 660 default: 661 if (ucol_getOffset(iter) != 3) { 662 log_err("ERROR: Offset of iteration should be 3\n"); 663 } 664 } 665 count ++; 666 } 667 668 ucol_reset(iter); 669 count = 0; 670 while (ucol_previous(iter, &status) != UCOL_NULLORDER && 671 U_SUCCESS(status)) { 672 switch (count) { 673 case 0: 674 case 1: 675 if (ucol_getOffset(iter) != 3) { 676 log_err("ERROR: Offset of iteration should be 3\n"); 677 } 678 break; 679 case 2: 680 if (ucol_getOffset(iter) != 1) { 681 log_err("ERROR: Offset of iteration should be 1\n"); 682 } 683 break; 684 default: 685 if (ucol_getOffset(iter) != 0) { 686 log_err("ERROR: Offset of iteration should be 0\n"); 687 } 688 } 689 count ++; 690 } 691 692 if(U_FAILURE(status)){ 693 log_err("ERROR: in iterating collation elements %s\n", 694 myErrorName(status)); 695 } 696 697 ucol_closeElements(iter); 698 ucol_close(en_us); 699} 700 701/** 702 * Test for setText() 703 */ 704static void TestSetText() 705{ 706 int32_t c,i; 707 UErrorCode status = U_ZERO_ERROR; 708 UCollator *en_us=NULL; 709 UCollationElements *iter1, *iter2; 710 UChar test1[50]; 711 UChar test2[50]; 712 713 u_uastrcpy(test1, "What subset of all possible test cases?"); 714 u_uastrcpy(test2, "has the highest probability of detecting"); 715 en_us = ucol_open("en_US", &status); 716 log_verbose("testing setText for Collation elements\n"); 717 iter1=ucol_openElements(en_us, test1, u_strlen(test1), &status); 718 if(U_FAILURE(status)){ 719 log_err_status(status, "ERROR: in creation of collation element iterator1 using ucol_openElements()\n %s\n", 720 myErrorName(status)); 721 ucol_close(en_us); 722 return; 723 } 724 iter2=ucol_openElements(en_us, test2, u_strlen(test2), &status); 725 if(U_FAILURE(status)){ 726 log_err("ERROR: in creation of collation element iterator2 using ucol_openElements()\n %s\n", 727 myErrorName(status)); 728 ucol_close(en_us); 729 return; 730 } 731 732 /* Run through the second iterator just to exercise it */ 733 c = ucol_next(iter2, &status); 734 i = 0; 735 736 while ( ++i < 10 && (c != UCOL_NULLORDER)) 737 { 738 if (U_FAILURE(status)) 739 { 740 log_err("iter2->next() returned an error. %s\n", myErrorName(status)); 741 ucol_closeElements(iter2); 742 ucol_closeElements(iter1); 743 ucol_close(en_us); 744 return; 745 } 746 747 c = ucol_next(iter2, &status); 748 } 749 750 /* Now set it to point to the same string as the first iterator */ 751 ucol_setText(iter2, test1, u_strlen(test1), &status); 752 if (U_FAILURE(status)) 753 { 754 log_err("call to iter2->setText(test1) failed. %s\n", myErrorName(status)); 755 } 756 else 757 { 758 assertEqual(iter1, iter2); 759 } 760 761 /* Now set it to point to a null string with fake length*/ 762 ucol_setText(iter2, NULL, 2, &status); 763 if (U_FAILURE(status)) 764 { 765 log_err("call to iter2->setText(null) failed. %s\n", myErrorName(status)); 766 } 767 else 768 { 769 if (ucol_next(iter2, &status) != UCOL_NULLORDER) { 770 log_err("iter2 with null text expected to return UCOL_NULLORDER\n"); 771 } 772 } 773 774 ucol_closeElements(iter2); 775 ucol_closeElements(iter1); 776 ucol_close(en_us); 777} 778 779/** @bug 4108762 780 * Test for getMaxExpansion() 781 */ 782static void TestMaxExpansion() 783{ 784 UErrorCode status = U_ZERO_ERROR; 785 UCollator *coll ;/*= ucol_open("en_US", &status);*/ 786 UChar ch = 0; 787 UChar32 unassigned = 0xEFFFD; 788 UChar supplementary[2]; 789 uint32_t index = 0; 790 UBool isError = FALSE; 791 uint32_t sorder = 0; 792 UCollationElements *iter ;/*= ucol_openElements(coll, &ch, 1, &status);*/ 793 uint32_t temporder = 0; 794 795 UChar rule[256]; 796 u_uastrcpy(rule, "&a < ab < c/aba < d < z < ch"); 797 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 798 UCOL_DEFAULT_STRENGTH,NULL, &status); 799 if(U_SUCCESS(status) && coll) { 800 iter = ucol_openElements(coll, &ch, 1, &status); 801 802 while (ch < 0xFFFF && U_SUCCESS(status)) { 803 int count = 1; 804 uint32_t order; 805 int32_t size = 0; 806 807 ch ++; 808 809 ucol_setText(iter, &ch, 1, &status); 810 order = ucol_previous(iter, &status); 811 812 /* thai management */ 813 if (order == 0) 814 order = ucol_previous(iter, &status); 815 816 while (U_SUCCESS(status) && 817 ucol_previous(iter, &status) != UCOL_NULLORDER) { 818 count ++; 819 } 820 821 size = ucol_getMaxExpansion(iter, order); 822 if (U_FAILURE(status) || size < count) { 823 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 824 ch, count); 825 } 826 } 827 828 /* testing for exact max expansion */ 829 ch = 0; 830 while (ch < 0x61) { 831 uint32_t order; 832 int32_t size; 833 ucol_setText(iter, &ch, 1, &status); 834 order = ucol_previous(iter, &status); 835 size = ucol_getMaxExpansion(iter, order); 836 if (U_FAILURE(status) || size != 1) { 837 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 838 ch, 1); 839 } 840 ch ++; 841 } 842 843 ch = 0x63; 844 ucol_setText(iter, &ch, 1, &status); 845 temporder = ucol_previous(iter, &status); 846 847 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 3) { 848 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 849 ch, 3); 850 } 851 852 ch = 0x64; 853 ucol_setText(iter, &ch, 1, &status); 854 temporder = ucol_previous(iter, &status); 855 856 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 1) { 857 log_err("Failure at codepoint %d, maximum expansion count != %d\n", 858 ch, 3); 859 } 860 861 U16_APPEND(supplementary, index, 2, unassigned, isError); 862 ucol_setText(iter, supplementary, 2, &status); 863 sorder = ucol_previous(iter, &status); 864 865 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, sorder) != 2) { 866 log_err("Failure at codepoint %d, maximum expansion count < %d\n", 867 ch, 2); 868 } 869 870 /* testing jamo */ 871 ch = 0x1165; 872 873 ucol_setText(iter, &ch, 1, &status); 874 temporder = ucol_previous(iter, &status); 875 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) > 3) { 876 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 877 ch, 3); 878 } 879 880 ucol_closeElements(iter); 881 ucol_close(coll); 882 883 /* testing special jamo &a<\u1160 */ 884 rule[0] = 0x26; 885 rule[1] = 0x71; 886 rule[2] = 0x3c; 887 rule[3] = 0x1165; 888 rule[4] = 0x2f; 889 rule[5] = 0x71; 890 rule[6] = 0x71; 891 rule[7] = 0x71; 892 rule[8] = 0x71; 893 rule[9] = 0; 894 895 coll = ucol_openRules(rule, u_strlen(rule), UCOL_DEFAULT, 896 UCOL_DEFAULT_STRENGTH,NULL, &status); 897 iter = ucol_openElements(coll, &ch, 1, &status); 898 899 temporder = ucol_previous(iter, &status); 900 if (U_FAILURE(status) || ucol_getMaxExpansion(iter, temporder) != 6) { 901 log_err("Failure at codepoint %d, maximum expansion count > %d\n", 902 ch, 5); 903 } 904 905 ucol_closeElements(iter); 906 ucol_close(coll); 907 } else { 908 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 909 } 910 911} 912 913 914static void assertEqual(UCollationElements *i1, UCollationElements *i2) 915{ 916 int32_t c1, c2; 917 int32_t count = 0; 918 UErrorCode status = U_ZERO_ERROR; 919 920 do 921 { 922 c1 = ucol_next(i1, &status); 923 c2 = ucol_next(i2, &status); 924 925 if (c1 != c2) 926 { 927 log_err("Error in iteration %d assetEqual between\n %d and %d, they are not equal\n", count, c1, c2); 928 break; 929 } 930 931 count += 1; 932 } 933 while (c1 != UCOL_NULLORDER); 934} 935 936/** 937 * Testing iterators with extremely small buffers 938 */ 939static void TestSmallBuffer() 940{ 941 UErrorCode status = U_ZERO_ERROR; 942 UCollator *coll; 943 UCollationElements *testiter, 944 *iter; 945 int32_t count = 0; 946 OrderAndOffset *testorders, 947 *orders; 948 949 UChar teststr[500]; 950 UChar str[] = {0x300, 0x31A, 0}; 951 /* 952 creating a long string of decomposable characters, 953 since by default the writable buffer is of size 256 954 */ 955 while (count < 500) { 956 if ((count & 1) == 0) { 957 teststr[count ++] = 0x300; 958 } 959 else { 960 teststr[count ++] = 0x31A; 961 } 962 } 963 964 coll = ucol_open("th_TH", &status); 965 if(U_SUCCESS(status) && coll) { 966 testiter = ucol_openElements(coll, teststr, 500, &status); 967 iter = ucol_openElements(coll, str, 2, &status); 968 969 orders = getOrders(iter, &count); 970 if (count != 2) { 971 log_err("Error collation elements size is not 2 for \\u0300\\u031A\n"); 972 } 973 974 /* 975 this will rearrange the string data to 250 characters of 0x300 first then 976 250 characters of 0x031A 977 */ 978 testorders = getOrders(testiter, &count); 979 980 if (count != 500) { 981 log_err("Error decomposition does not give the right sized collation elements\n"); 982 } 983 984 while (count != 0) { 985 /* UCA collation element for 0x0F76 */ 986 if ((count > 250 && testorders[-- count].order != orders[1].order) || 987 (count <= 250 && testorders[-- count].order != orders[0].order)) { 988 log_err("Error decomposition does not give the right collation element at %d count\n", count); 989 break; 990 } 991 } 992 993 free(testorders); 994 free(orders); 995 996 ucol_reset(testiter); 997 /* ensures that the writable buffer was cleared */ 998 if (testiter->iteratordata_.writableBuffer != 999 testiter->iteratordata_.stackWritableBuffer) { 1000 log_err("Error Writable buffer in collation element iterator not reset\n"); 1001 } 1002 1003 /* ensures closing of elements done properly to clear writable buffer */ 1004 ucol_next(testiter, &status); 1005 ucol_next(testiter, &status); 1006 ucol_closeElements(testiter); 1007 ucol_closeElements(iter); 1008 ucol_close(coll); 1009 } else { 1010 log_err_status(status, "Couldn't open collator -> %s\n", u_errorName(status)); 1011 } 1012} 1013 1014/** 1015* Sniplets of code from genuca 1016*/ 1017static int32_t hex2num(char hex) { 1018 if(hex>='0' && hex <='9') { 1019 return hex-'0'; 1020 } else if(hex>='a' && hex<='f') { 1021 return hex-'a'+10; 1022 } else if(hex>='A' && hex<='F') { 1023 return hex-'A'+10; 1024 } else { 1025 return 0; 1026 } 1027} 1028 1029/** 1030* Getting codepoints from a string 1031* @param str character string contain codepoints seperated by space and ended 1032* by a semicolon 1033* @param codepoints array for storage, assuming size > 5 1034* @return position at the end of the codepoint section 1035*/ 1036static char * getCodePoints(char *str, UChar *codepoints, UChar *contextCPs) { 1037 char *pStartCP = str; 1038 char *pEndCP = str + 4; 1039 1040 *codepoints = (UChar)((hex2num(*pStartCP) << 12) | 1041 (hex2num(*(pStartCP + 1)) << 8) | 1042 (hex2num(*(pStartCP + 2)) << 4) | 1043 (hex2num(*(pStartCP + 3)))); 1044 if (*pEndCP == '|' || *(pEndCP+1) == '|') { 1045 /* pre-context rule */ 1046 pStartCP = pEndCP; 1047 while (*pStartCP==' ' || *pStartCP== '|' ) { 1048 pStartCP++; 1049 } 1050 pEndCP = pStartCP+4; 1051 *contextCPs = *codepoints; 1052 *(++codepoints) = (UChar)((hex2num(*pStartCP) << 12) | 1053 (hex2num(*(pStartCP + 1)) << 8) | 1054 (hex2num(*(pStartCP + 2)) << 4) | 1055 (hex2num(*(pStartCP + 3)))); 1056 contextCPs++; 1057 } 1058 *contextCPs = 0; 1059 codepoints ++; 1060 while (*pEndCP != ';') { 1061 pStartCP = pEndCP + 1; 1062 *codepoints = (UChar)((hex2num(*pStartCP) << 12) | 1063 (hex2num(*(pStartCP + 1)) << 8) | 1064 (hex2num(*(pStartCP + 2)) << 4) | 1065 (hex2num(*(pStartCP + 3)))); 1066 codepoints ++; 1067 pEndCP = pStartCP + 4; 1068 } 1069 *codepoints = 0; 1070 return pEndCP + 1; 1071} 1072 1073/** 1074* Sniplets of code from genuca 1075*/ 1076static int32_t 1077readElement(char **from, char *to, char separator, UErrorCode *status) 1078{ 1079 if (U_SUCCESS(*status)) { 1080 char buffer[1024]; 1081 int32_t i = 0; 1082 while (**from != separator) { 1083 if (**from != ' ') { 1084 *(buffer+i++) = **from; 1085 } 1086 (*from)++; 1087 } 1088 (*from)++; 1089 *(buffer + i) = 0; 1090 strcpy(to, buffer); 1091 return i/2; 1092 } 1093 1094 return 0; 1095} 1096 1097/** 1098* Sniplets of code from genuca 1099*/ 1100static uint32_t 1101getSingleCEValue(char *primary, char *secondary, char *tertiary, 1102 UErrorCode *status) 1103{ 1104 if (U_SUCCESS(*status)) { 1105 uint32_t value = 0; 1106 char primsave = '\0'; 1107 char secsave = '\0'; 1108 char tersave = '\0'; 1109 char *primend = primary+4; 1110 char *secend = secondary+2; 1111 char *terend = tertiary+2; 1112 uint32_t primvalue; 1113 uint32_t secvalue; 1114 uint32_t tervalue; 1115 1116 if (uprv_strlen(primary) > 4) { 1117 primsave = *primend; 1118 *primend = '\0'; 1119 } 1120 1121 if (uprv_strlen(secondary) > 2) { 1122 secsave = *secend; 1123 *secend = '\0'; 1124 } 1125 1126 if (uprv_strlen(tertiary) > 2) { 1127 tersave = *terend; 1128 *terend = '\0'; 1129 } 1130 1131 primvalue = (*primary!='\0')?uprv_strtoul(primary, &primend, 16):0; 1132 secvalue = (*secondary!='\0')?uprv_strtoul(secondary, &secend, 16):0; 1133 tervalue = (*tertiary!='\0')?uprv_strtoul(tertiary, &terend, 16):0; 1134 if(primvalue <= 0xFF) { 1135 primvalue <<= 8; 1136 } 1137 1138 value = ((primvalue << UCOL_PRIMARYORDERSHIFT) & UCOL_PRIMARYORDERMASK) 1139 | ((secvalue << UCOL_SECONDARYORDERSHIFT) & UCOL_SECONDARYORDERMASK) 1140 | (tervalue & UCOL_TERTIARYORDERMASK); 1141 1142 if(primsave!='\0') { 1143 *primend = primsave; 1144 } 1145 if(secsave!='\0') { 1146 *secend = secsave; 1147 } 1148 if(tersave!='\0') { 1149 *terend = tersave; 1150 } 1151 return value; 1152 } 1153 return 0; 1154} 1155 1156/** 1157* Getting collation elements generated from a string 1158* @param str character string contain collation elements contained in [] and 1159* seperated by space 1160* @param ce array for storage, assuming size > 20 1161* @param status error status 1162* @return position at the end of the codepoint section 1163*/ 1164static char * getCEs(char *str, uint32_t *ces, UErrorCode *status) { 1165 char *pStartCP = uprv_strchr(str, '['); 1166 int count = 0; 1167 char *pEndCP; 1168 char primary[100]; 1169 char secondary[100]; 1170 char tertiary[100]; 1171 1172 while (*pStartCP == '[') { 1173 uint32_t primarycount = 0; 1174 uint32_t secondarycount = 0; 1175 uint32_t tertiarycount = 0; 1176 uint32_t CEi = 1; 1177 pEndCP = strchr(pStartCP, ']'); 1178 if(pEndCP == NULL) { 1179 break; 1180 } 1181 pStartCP ++; 1182 1183 primarycount = readElement(&pStartCP, primary, ',', status); 1184 secondarycount = readElement(&pStartCP, secondary, ',', status); 1185 tertiarycount = readElement(&pStartCP, tertiary, ']', status); 1186 1187 /* I want to get the CEs entered right here, including continuation */ 1188 ces[count ++] = getSingleCEValue(primary, secondary, tertiary, status); 1189 if (U_FAILURE(*status)) { 1190 break; 1191 } 1192 1193 while (2 * CEi < primarycount || CEi < secondarycount || 1194 CEi < tertiarycount) { 1195 uint32_t value = UCOL_CONTINUATION_MARKER; /* Continuation marker */ 1196 if (2 * CEi < primarycount) { 1197 value |= ((hex2num(*(primary + 4 * CEi)) & 0xF) << 28); 1198 value |= ((hex2num(*(primary + 4 * CEi + 1)) & 0xF) << 24); 1199 } 1200 1201 if (2 * CEi + 1 < primarycount) { 1202 value |= ((hex2num(*(primary + 4 * CEi + 2)) & 0xF) << 20); 1203 value |= ((hex2num(*(primary + 4 * CEi + 3)) &0xF) << 16); 1204 } 1205 1206 if (CEi < secondarycount) { 1207 value |= ((hex2num(*(secondary + 2 * CEi)) & 0xF) << 12); 1208 value |= ((hex2num(*(secondary + 2 * CEi + 1)) & 0xF) << 8); 1209 } 1210 1211 if (CEi < tertiarycount) { 1212 value |= ((hex2num(*(tertiary + 2 * CEi)) & 0x3) << 4); 1213 value |= (hex2num(*(tertiary + 2 * CEi + 1)) & 0xF); 1214 } 1215 1216 CEi ++; 1217 ces[count ++] = value; 1218 } 1219 1220 pStartCP = pEndCP + 1; 1221 } 1222 ces[count] = 0; 1223 return pStartCP; 1224} 1225 1226/** 1227* Getting the FractionalUCA.txt file stream 1228*/ 1229static FileStream * getFractionalUCA(void) 1230{ 1231 char newPath[256]; 1232 char backupPath[256]; 1233 FileStream *result = NULL; 1234 1235 /* Look inside ICU_DATA first */ 1236 uprv_strcpy(newPath, ctest_dataSrcDir()); 1237 uprv_strcat(newPath, "unidata" U_FILE_SEP_STRING ); 1238 uprv_strcat(newPath, "FractionalUCA.txt"); 1239 1240 /* As a fallback, try to guess where the source data was located 1241 * at the time ICU was built, and look there. 1242 */ 1243#if defined (U_TOPSRCDIR) 1244 strcpy(backupPath, U_TOPSRCDIR U_FILE_SEP_STRING "data"); 1245#else 1246 { 1247 UErrorCode errorCode = U_ZERO_ERROR; 1248 strcpy(backupPath, loadTestData(&errorCode)); 1249 strcat(backupPath, U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING ".." U_FILE_SEP_STRING "data"); 1250 } 1251#endif 1252 strcat(backupPath, U_FILE_SEP_STRING "unidata" U_FILE_SEP_STRING "FractionalUCA.txt"); 1253 1254 result = T_FileStream_open(newPath, "rb"); 1255 1256 if (result == NULL) { 1257 result = T_FileStream_open(backupPath, "rb"); 1258 if (result == NULL) { 1259 log_err("Failed to open either %s or %s\n", newPath, backupPath); 1260 } 1261 } 1262 return result; 1263} 1264 1265/** 1266* Testing the CEs returned by the iterator 1267*/ 1268static void TestCEs() { 1269 FileStream *file = NULL; 1270 char line[1024]; 1271 char *str; 1272 UChar codepoints[10]; 1273 uint32_t ces[20]; 1274 UErrorCode status = U_ZERO_ERROR; 1275 UCollator *coll = ucol_open("", &status); 1276 uint32_t lineNo = 0; 1277 UChar contextCPs[5]; 1278 1279 if (U_FAILURE(status)) { 1280 log_err_status(status, "Error in opening root collator -> %s\n", u_errorName(status)); 1281 return; 1282 } 1283 1284 file = getFractionalUCA(); 1285 1286 if (file == NULL) { 1287 log_err("*** unable to open input FractionalUCA.txt file ***\n"); 1288 return; 1289 } 1290 1291 1292 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1293 int count = 0; 1294 UCollationElements *iter; 1295 int32_t preContextCeLen=0; 1296 lineNo++; 1297 /* skip this line if it is empty or a comment or is a return value 1298 or start of some variable section */ 1299 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1300 line[0] == 0x000D || line[0] == '[') { 1301 continue; 1302 } 1303 1304 str = getCodePoints(line, codepoints, contextCPs); 1305 1306 /* these are 'fake' codepoints in the fractional UCA, and are used just 1307 * for positioning of indirect values. They should not go through this 1308 * test. 1309 */ 1310 if(*codepoints == 0xFDD0) { 1311 continue; 1312 } 1313 if (*contextCPs != 0) { 1314 iter = ucol_openElements(coll, contextCPs, -1, &status); 1315 if (U_FAILURE(status)) { 1316 log_err("Error in opening collation elements\n"); 1317 break; 1318 } 1319 while((ces[preContextCeLen] = ucol_next(iter, &status)) != (uint32_t)UCOL_NULLORDER) { 1320 preContextCeLen++; 1321 } 1322 ucol_closeElements(iter); 1323 } 1324 1325 getCEs(str, ces+preContextCeLen, &status); 1326 if (U_FAILURE(status)) { 1327 log_err("Error in parsing collation elements in FractionalUCA.txt\n"); 1328 break; 1329 } 1330 iter = ucol_openElements(coll, codepoints, -1, &status); 1331 if (U_FAILURE(status)) { 1332 log_err("Error in opening collation elements\n"); 1333 break; 1334 } 1335 for (;;) { 1336 uint32_t ce = (uint32_t)ucol_next(iter, &status); 1337 if (ce == 0xFFFFFFFF) { 1338 ce = 0; 1339 } 1340 /* we now unconditionally reorder Thai/Lao prevowels, so this 1341 * test would fail if we don't skip here. 1342 */ 1343 if(UCOL_ISTHAIPREVOWEL(*codepoints) && ce == 0 && count == 0) { 1344 continue; 1345 } 1346 if (ce != ces[count] || U_FAILURE(status)) { 1347 log_err("Collation elements in FractionalUCA.txt and iterators do not match!\n"); 1348 break; 1349 } 1350 if (ces[count] == 0) { 1351 break; 1352 } 1353 count ++; 1354 } 1355 ucol_closeElements(iter); 1356 } 1357 1358 T_FileStream_close(file); 1359 ucol_close(coll); 1360} 1361 1362/** 1363* Testing the discontigous contractions 1364*/ 1365static void TestDiscontiguos() { 1366 const char *rulestr = 1367 "&z < AB < X\\u0300 < ABC < X\\u0300\\u0315"; 1368 UChar rule[50]; 1369 int rulelen = u_unescape(rulestr, rule, 50); 1370 const char *src[] = { 1371 "ADB", "ADBC", "A\\u0315B", "A\\u0315BC", 1372 /* base character blocked */ 1373 "XD\\u0300", "XD\\u0300\\u0315", 1374 /* non blocking combining character */ 1375 "X\\u0319\\u0300", "X\\u0319\\u0300\\u0315", 1376 /* blocking combining character */ 1377 "X\\u0314\\u0300", "X\\u0314\\u0300\\u0315", 1378 /* contraction prefix */ 1379 "ABDC", "AB\\u0315C","X\\u0300D\\u0315", "X\\u0300\\u0319\\u0315", 1380 "X\\u0300\\u031A\\u0315", 1381 /* ends not with a contraction character */ 1382 "X\\u0319\\u0300D", "X\\u0319\\u0300\\u0315D", "X\\u0300D\\u0315D", 1383 "X\\u0300\\u0319\\u0315D", "X\\u0300\\u031A\\u0315D" 1384 }; 1385 const char *tgt[] = { 1386 /* non blocking combining character */ 1387 "A D B", "A D BC", "A \\u0315 B", "A \\u0315 BC", 1388 /* base character blocked */ 1389 "X D \\u0300", "X D \\u0300\\u0315", 1390 /* non blocking combining character */ 1391 "X\\u0300 \\u0319", "X\\u0300\\u0315 \\u0319", 1392 /* blocking combining character */ 1393 "X \\u0314 \\u0300", "X \\u0314 \\u0300\\u0315", 1394 /* contraction prefix */ 1395 "AB DC", "AB \\u0315 C","X\\u0300 D \\u0315", "X\\u0300\\u0315 \\u0319", 1396 "X\\u0300 \\u031A \\u0315", 1397 /* ends not with a contraction character */ 1398 "X\\u0300 \\u0319D", "X\\u0300\\u0315 \\u0319D", "X\\u0300 D\\u0315D", 1399 "X\\u0300\\u0315 \\u0319D", "X\\u0300 \\u031A\\u0315D" 1400 }; 1401 int size = 20; 1402 UCollator *coll; 1403 UErrorCode status = U_ZERO_ERROR; 1404 int count = 0; 1405 UCollationElements *iter; 1406 UCollationElements *resultiter; 1407 1408 coll = ucol_openRules(rule, rulelen, UCOL_OFF, UCOL_DEFAULT_STRENGTH,NULL, &status); 1409 iter = ucol_openElements(coll, rule, 1, &status); 1410 resultiter = ucol_openElements(coll, rule, 1, &status); 1411 1412 if (U_FAILURE(status)) { 1413 log_err_status(status, "Error opening collation rules -> %s\n", u_errorName(status)); 1414 return; 1415 } 1416 1417 while (count < size) { 1418 UChar str[20]; 1419 UChar tstr[20]; 1420 int strLen = u_unescape(src[count], str, 20); 1421 UChar *s; 1422 1423 ucol_setText(iter, str, strLen, &status); 1424 if (U_FAILURE(status)) { 1425 log_err("Error opening collation iterator\n"); 1426 return; 1427 } 1428 1429 u_unescape(tgt[count], tstr, 20); 1430 s = tstr; 1431 1432 log_verbose("count %d\n", count); 1433 1434 for (;;) { 1435 uint32_t ce; 1436 UChar *e = u_strchr(s, 0x20); 1437 if (e == 0) { 1438 e = u_strchr(s, 0); 1439 } 1440 ucol_setText(resultiter, s, (int32_t)(e - s), &status); 1441 ce = ucol_next(resultiter, &status); 1442 if (U_FAILURE(status)) { 1443 log_err("Error manipulating collation iterator\n"); 1444 return; 1445 } 1446 while (ce != UCOL_NULLORDER) { 1447 if (ce != (uint32_t)ucol_next(iter, &status) || 1448 U_FAILURE(status)) { 1449 log_err("Discontiguos contraction test mismatch\n"); 1450 return; 1451 } 1452 ce = ucol_next(resultiter, &status); 1453 if (U_FAILURE(status)) { 1454 log_err("Error getting next collation element\n"); 1455 return; 1456 } 1457 } 1458 s = e + 1; 1459 if (*e == 0) { 1460 break; 1461 } 1462 } 1463 ucol_reset(iter); 1464 backAndForth(iter); 1465 count ++; 1466 } 1467 ucol_closeElements(resultiter); 1468 ucol_closeElements(iter); 1469 ucol_close(coll); 1470} 1471 1472static void TestCEBufferOverflow() 1473{ 1474 UChar str[UCOL_EXPAND_CE_BUFFER_SIZE + 1]; 1475 UErrorCode status = U_ZERO_ERROR; 1476 UChar rule[10]; 1477 UCollator *coll; 1478 UCollationElements *iter; 1479 1480 u_uastrcpy(rule, "&z < AB"); 1481 coll = ucol_openRules(rule, u_strlen(rule), UCOL_OFF, UCOL_DEFAULT_STRENGTH, NULL,&status); 1482 if (U_FAILURE(status)) { 1483 log_err_status(status, "Rule based collator not created for testing ce buffer overflow -> %s\n", u_errorName(status)); 1484 return; 1485 } 1486 1487 /* 0xDCDC is a trail surrogate hence deemed unsafe by the heuristic 1488 test. this will cause an overflow in getPrev */ 1489 str[0] = 0x0041; /* 'A' */ 1490 /*uprv_memset(str + 1, 0xE0, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE);*/ 1491 uprv_memset(str + 1, 0xDC, sizeof(UChar) * UCOL_EXPAND_CE_BUFFER_SIZE); 1492 str[UCOL_EXPAND_CE_BUFFER_SIZE] = 0x0042; /* 'B' */ 1493 iter = ucol_openElements(coll, str, UCOL_EXPAND_CE_BUFFER_SIZE + 1, 1494 &status); 1495 if (ucol_previous(iter, &status) == UCOL_NULLORDER || 1496 status == U_BUFFER_OVERFLOW_ERROR) { 1497 log_err("CE buffer should not overflow with long string of trail surrogates\n"); 1498 } 1499 ucol_closeElements(iter); 1500 ucol_close(coll); 1501} 1502 1503/** 1504* Byte bounds checks. Checks if each byte in data is between upper and lower 1505* inclusive. 1506*/ 1507static UBool checkByteBounds(uint32_t data, char upper, char lower) 1508{ 1509 int count = 4; 1510 while (count > 0) { 1511 char b = (char)(data & 0xFF); 1512 if (b > upper || b < lower) { 1513 return FALSE; 1514 } 1515 data = data >> 8; 1516 count --; 1517 } 1518 return TRUE; 1519} 1520 1521/** 1522* Determines case of the string of codepoints. 1523* If it is a multiple codepoints it has to treated as a contraction. 1524*/ 1525#if 0 1526static uint8_t getCase(const UChar *s, uint32_t len) { 1527 UBool lower = FALSE; 1528 UBool upper = FALSE; 1529 UBool title = FALSE; 1530 UErrorCode status = U_ZERO_ERROR; 1531 UChar str[256]; 1532 const UChar *ps = s; 1533 1534 if (len == 0) { 1535 return UCOL_LOWER_CASE; 1536 } 1537 1538 while (len > 0) { 1539 UChar c = *ps ++; 1540 1541 if (u_islower(c)) { 1542 lower = TRUE; 1543 } 1544 if (u_isupper(c)) { 1545 upper = TRUE; 1546 } 1547 if (u_istitle(c)) { 1548 title = TRUE; 1549 } 1550 1551 len --; 1552 } 1553 if ((lower && !upper && !title) || (!lower && !upper && !title)){ 1554 return UCOL_LOWER_CASE; 1555 } 1556 if (upper && !lower && !title) { 1557 return UCOL_UPPER_CASE; 1558 } 1559 /* mix of cases here */ 1560 /* len = unorm_normalize(s, len, UNORM_NFKD, 0, str, 256, &status); 1561 if (U_FAILURE(status)) { 1562 log_err("Error normalizing data string\n"); 1563 return UCOL_LOWER_CASE; 1564 }*/ 1565 1566 if ((title && len >= 2) || (lower && upper)) { 1567 return UCOL_MIXED_CASE; 1568 } 1569 if (u_isupper(s[0])) { 1570 return UCOL_UPPER_CASE; 1571 } 1572 return UCOL_LOWER_CASE; 1573} 1574#endif 1575 1576/** 1577* Checking collation element validity given the boundary arguments. 1578*/ 1579static UBool checkCEValidity(const UCollator *coll, const UChar *codepoints, 1580 int length, uint32_t primarymax, 1581 uint32_t secondarymax) 1582{ 1583 UErrorCode status = U_ZERO_ERROR; 1584 UCollationElements *iter = ucol_openElements(coll, codepoints, length, 1585 &status); 1586 uint32_t ce; 1587 UBool first = TRUE; 1588/* 1589 UBool upper = FALSE; 1590 UBool lower = FALSE; 1591*/ 1592 1593 if (U_FAILURE(status)) { 1594 log_err("Error creating iterator for testing validity\n"); 1595 } 1596 1597 ce = ucol_next(iter, &status); 1598 1599 while (ce != UCOL_NULLORDER) { 1600 if (ce != 0) { 1601 uint32_t primary = UCOL_PRIMARYORDER(ce); 1602 uint32_t secondary = UCOL_SECONDARYORDER(ce); 1603 uint32_t tertiary = UCOL_TERTIARYORDER(ce); 1604/* uint32_t scasebits = tertiary & 0xC0;*/ 1605 1606 if ((tertiary == 0 && secondary != 0) || 1607 (tertiary < 0xC0 && secondary == 0 && primary != 0)) { 1608 /* n-1th level is not zero when the nth level is 1609 except for continuations, this is wrong */ 1610 log_err("Lower level weight not 0 when high level weight is 0\n"); 1611 goto fail; 1612 } 1613 else { 1614 /* checks if any byte is illegal ie = 01 02 03. */ 1615 if (checkByteBounds(ce, 0x3, 0x1)) { 1616 log_err("Byte range in CE lies in illegal bounds 0x1 - 0x3\n"); 1617 goto fail; 1618 } 1619 } 1620 if ((primary != 0 && primary < primarymax) 1621 || ((primary & 0xFF) == 0xFF) || (((primary>>8) & 0xFF) == 0xFF) 1622 || ((primary & 0xFF) && ((primary & 0xFF) <= 0x03)) 1623 || (((primary>>8) & 0xFF) && ((primary>>8) & 0xFF) <= 0x03) 1624 || (primary >= 0xFE00 && !isContinuation(ce))) { 1625 log_err("UCA primary weight out of bounds: %04X for string starting with %04X\n", 1626 primary, codepoints[0]); 1627 goto fail; 1628 } 1629 /* case matching not done since data generated by ken */ 1630 if (first) { 1631 if (secondary >= 6 && secondary <= secondarymax) { 1632 log_err("Secondary weight out of range\n"); 1633 goto fail; 1634 } 1635 first = FALSE; 1636 } 1637 } 1638 ce = ucol_next(iter, &status); 1639 } 1640 ucol_closeElements(iter); 1641 return TRUE; 1642fail : 1643 ucol_closeElements(iter); 1644 return FALSE; 1645} 1646 1647static void TestCEValidity() 1648{ 1649 /* testing UCA collation elements */ 1650 UErrorCode status = U_ZERO_ERROR; 1651 /* en_US has no tailorings */ 1652 UCollator *coll = ucol_open("root", &status); 1653 /* tailored locales */ 1654 char locale[][11] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN", "zh__PINYIN"}; 1655 const char *loc; 1656 FileStream *file = NULL; 1657 char line[1024]; 1658 UChar codepoints[10]; 1659 int count = 0; 1660 int maxCount = 0; 1661 UChar contextCPs[3]; 1662 UParseError parseError; 1663 if (U_FAILURE(status)) { 1664 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1665 return; 1666 } 1667 log_verbose("Testing UCA elements\n"); 1668 file = getFractionalUCA(); 1669 if (file == NULL) { 1670 log_err("Fractional UCA data can not be opened\n"); 1671 return; 1672 } 1673 1674 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1675 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1676 line[0] == 0x000D || line[0] == '[') { 1677 continue; 1678 } 1679 1680 getCodePoints(line, codepoints, contextCPs); 1681 checkCEValidity(coll, codepoints, u_strlen(codepoints), 5, 86); 1682 } 1683 1684 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1685 codepoints[0] = 0; 1686 while (codepoints[0] < 0xFFFF) { 1687 if (u_isdefined((UChar32)codepoints[0])) { 1688 checkCEValidity(coll, codepoints, 1, 5, 86); 1689 } 1690 codepoints[0] ++; 1691 } 1692 1693 ucol_close(coll); 1694 1695 /* testing tailored collation elements */ 1696 log_verbose("Testing tailored elements\n"); 1697 if(QUICK) { 1698 maxCount = sizeof(locale)/sizeof(locale[0]); 1699 } else { 1700 maxCount = uloc_countAvailable(); 1701 } 1702 while (count < maxCount) { 1703 const UChar *rules = NULL, 1704 *current = NULL; 1705 UChar *rulesCopy = NULL; 1706 int32_t ruleLen = 0; 1707 1708 uint32_t chOffset = 0; 1709 uint32_t chLen = 0; 1710 uint32_t exOffset = 0; 1711 uint32_t exLen = 0; 1712 uint32_t prefixOffset = 0; 1713 uint32_t prefixLen = 0; 1714 UBool startOfRules = TRUE; 1715 UColOptionSet opts; 1716 1717 UColTokenParser src; 1718 uint32_t strength = 0; 1719 uint16_t specs = 0; 1720 if(QUICK) { 1721 loc = locale[count]; 1722 } else { 1723 loc = uloc_getAvailable(count); 1724 if(!hasCollationElements(loc)) { 1725 count++; 1726 continue; 1727 } 1728 } 1729 1730 log_verbose("Testing CEs for %s\n", loc); 1731 1732 coll = ucol_open(loc, &status); 1733 if (U_FAILURE(status)) { 1734 log_err("%s collator creation failed\n", loc); 1735 return; 1736 } 1737 1738 src.opts = &opts; 1739 rules = ucol_getRules(coll, &ruleLen); 1740 1741 if (ruleLen > 0) { 1742 rulesCopy = (UChar *)malloc((ruleLen + 1743 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1744 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1745 src.current = src.source = rulesCopy; 1746 src.end = rulesCopy + ruleLen; 1747 src.extraCurrent = src.end; 1748 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1749 1750 while ((current = ucol_tok_parseNextToken(&src, startOfRules, &parseError,&status)) != NULL) { 1751 strength = src.parsedToken.strength; 1752 chOffset = src.parsedToken.charsOffset; 1753 chLen = src.parsedToken.charsLen; 1754 exOffset = src.parsedToken.extensionOffset; 1755 exLen = src.parsedToken.extensionLen; 1756 prefixOffset = src.parsedToken.prefixOffset; 1757 prefixLen = src.parsedToken.prefixLen; 1758 specs = src.parsedToken.flags; 1759 1760 startOfRules = FALSE; 1761 uprv_memcpy(codepoints, src.source + chOffset, 1762 chLen * sizeof(UChar)); 1763 codepoints[chLen] = 0; 1764 checkCEValidity(coll, codepoints, chLen, 4, 85); 1765 } 1766 free(rulesCopy); 1767 } 1768 1769 ucol_close(coll); 1770 count ++; 1771 } 1772 T_FileStream_close(file); 1773} 1774 1775static void printSortKeyError(const UChar *codepoints, int length, 1776 uint8_t *sortkey, int sklen) 1777{ 1778 int count = 0; 1779 log_err("Sortkey not valid for "); 1780 while (length > 0) { 1781 log_err("0x%04x ", *codepoints); 1782 length --; 1783 codepoints ++; 1784 } 1785 log_err("\nSortkey : "); 1786 while (count < sklen) { 1787 log_err("0x%02x ", sortkey[count]); 1788 count ++; 1789 } 1790 log_err("\n"); 1791} 1792 1793/** 1794* Checking sort key validity for all levels 1795*/ 1796static UBool checkSortKeyValidity(UCollator *coll, 1797 const UChar *codepoints, 1798 int length) 1799{ 1800 UErrorCode status = U_ZERO_ERROR; 1801 UCollationStrength strength[5] = {UCOL_PRIMARY, UCOL_SECONDARY, 1802 UCOL_TERTIARY, UCOL_QUATERNARY, 1803 UCOL_IDENTICAL}; 1804 int strengthlen = 5; 1805 int index = 0; 1806 int caselevel = 0; 1807 1808 while (caselevel < 1) { 1809 if (caselevel == 0) { 1810 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_OFF, &status); 1811 } 1812 else { 1813 ucol_setAttribute(coll, UCOL_CASE_LEVEL, UCOL_ON, &status); 1814 } 1815 1816 while (index < strengthlen) { 1817 int count01 = 0; 1818 uint32_t count = 0; 1819 uint8_t sortkey[128]; 1820 uint32_t sklen; 1821 1822 ucol_setStrength(coll, strength[index]); 1823 sklen = ucol_getSortKey(coll, codepoints, length, sortkey, 128); 1824 while (sortkey[count] != 0) { 1825 if (sortkey[count] == 2 || (sortkey[count] == 3 && count01 > 0 && index != 4)) { 1826 printSortKeyError(codepoints, length, sortkey, sklen); 1827 return FALSE; 1828 } 1829 if (sortkey[count] == 1) { 1830 count01 ++; 1831 } 1832 count ++; 1833 } 1834 1835 if (count + 1 != sklen || (count01 != index + caselevel)) { 1836 printSortKeyError(codepoints, length, sortkey, sklen); 1837 return FALSE; 1838 } 1839 index ++; 1840 } 1841 caselevel ++; 1842 } 1843 return TRUE; 1844} 1845 1846static void TestSortKeyValidity(void) 1847{ 1848 /* testing UCA collation elements */ 1849 UErrorCode status = U_ZERO_ERROR; 1850 /* en_US has no tailorings */ 1851 UCollator *coll = ucol_open("en_US", &status); 1852 /* tailored locales */ 1853 char locale[][6] = {"fr_FR", "ko_KR", "sh_YU", "th_TH", "zh_CN"}; 1854 FileStream *file = NULL; 1855 char line[1024]; 1856 UChar codepoints[10]; 1857 int count = 0; 1858 UChar contextCPs[5]; 1859 UParseError parseError; 1860 if (U_FAILURE(status)) { 1861 log_err_status(status, "en_US collator creation failed -> %s\n", u_errorName(status)); 1862 return; 1863 } 1864 log_verbose("Testing UCA elements\n"); 1865 file = getFractionalUCA(); 1866 if (file == NULL) { 1867 log_err("Fractional UCA data can not be opened\n"); 1868 return; 1869 } 1870 1871 while (T_FileStream_readLine(file, line, sizeof(line)) != NULL) { 1872 if(line[0] == 0 || line[0] == '#' || line[0] == '\n' || 1873 line[0] == 0x000D || line[0] == '[') { 1874 continue; 1875 } 1876 1877 getCodePoints(line, codepoints, contextCPs); 1878 checkSortKeyValidity(coll, codepoints, u_strlen(codepoints)); 1879 } 1880 1881 log_verbose("Testing UCA elements for the whole range of unicode characters\n"); 1882 codepoints[0] = 0; 1883 1884 while (codepoints[0] < 0xFFFF) { 1885 if (u_isdefined((UChar32)codepoints[0])) { 1886 checkSortKeyValidity(coll, codepoints, 1); 1887 } 1888 codepoints[0] ++; 1889 } 1890 1891 ucol_close(coll); 1892 1893 /* testing tailored collation elements */ 1894 log_verbose("Testing tailored elements\n"); 1895 while (count < 5) { 1896 const UChar *rules = NULL, 1897 *current = NULL; 1898 UChar *rulesCopy = NULL; 1899 int32_t ruleLen = 0; 1900 1901 uint32_t chOffset = 0; 1902 uint32_t chLen = 0; 1903 uint32_t exOffset = 0; 1904 uint32_t exLen = 0; 1905 uint32_t prefixOffset = 0; 1906 uint32_t prefixLen = 0; 1907 UBool startOfRules = TRUE; 1908 UColOptionSet opts; 1909 1910 UColTokenParser src; 1911 uint32_t strength = 0; 1912 uint16_t specs = 0; 1913 1914 coll = ucol_open(locale[count], &status); 1915 if (U_FAILURE(status)) { 1916 log_err("%s collator creation failed\n", locale[count]); 1917 return; 1918 } 1919 1920 src.opts = &opts; 1921 rules = ucol_getRules(coll, &ruleLen); 1922 1923 if (ruleLen > 0) { 1924 rulesCopy = (UChar *)malloc((ruleLen + 1925 UCOL_TOK_EXTRA_RULE_SPACE_SIZE) * sizeof(UChar)); 1926 uprv_memcpy(rulesCopy, rules, ruleLen * sizeof(UChar)); 1927 src.current = src.source = rulesCopy; 1928 src.end = rulesCopy + ruleLen; 1929 src.extraCurrent = src.end; 1930 src.extraEnd = src.end + UCOL_TOK_EXTRA_RULE_SPACE_SIZE; 1931 1932 while ((current = ucol_tok_parseNextToken(&src, startOfRules,&parseError, &status)) != NULL) { 1933 strength = src.parsedToken.strength; 1934 chOffset = src.parsedToken.charsOffset; 1935 chLen = src.parsedToken.charsLen; 1936 exOffset = src.parsedToken.extensionOffset; 1937 exLen = src.parsedToken.extensionLen; 1938 prefixOffset = src.parsedToken.prefixOffset; 1939 prefixLen = src.parsedToken.prefixLen; 1940 specs = src.parsedToken.flags; 1941 1942 startOfRules = FALSE; 1943 uprv_memcpy(codepoints, src.source + chOffset, 1944 chLen * sizeof(UChar)); 1945 codepoints[chLen] = 0; 1946 checkSortKeyValidity(coll, codepoints, chLen); 1947 } 1948 free(rulesCopy); 1949 } 1950 1951 ucol_close(coll); 1952 count ++; 1953 } 1954 T_FileStream_close(file); 1955} 1956 1957#endif /* #if !UCONFIG_NO_COLLATION */ 1958