1/* 2********************************************************************** 3* Copyright (C) 2001-2014 IBM and others. All rights reserved. 4********************************************************************** 5* Date Name Description 6* 07/02/2001 synwee Creation. 7********************************************************************** 8*/ 9 10#include "unicode/utypes.h" 11 12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION 13 14#include "unicode/usearch.h" 15#include "unicode/ustring.h" 16#include "unicode/uchar.h" 17#include "unicode/utf16.h" 18#include "normalizer2impl.h" 19#include "usrchimp.h" 20#include "cmemory.h" 21#include "ucln_in.h" 22#include "uassert.h" 23#include "ustr_imp.h" 24 25U_NAMESPACE_USE 26 27// don't use Boyer-Moore 28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed) 29#define BOYER_MOORE 0 30 31#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0])) 32 33// internal definition --------------------------------------------------- 34 35#define LAST_BYTE_MASK_ 0xFF 36#define SECOND_LAST_BYTE_SHIFT_ 8 37#define SUPPLEMENTARY_MIN_VALUE_ 0x10000 38 39static const Normalizer2Impl *g_nfcImpl = NULL; 40 41// internal methods ------------------------------------------------- 42 43/** 44* Fast collation element iterator setOffset. 45* This function does not check for bounds. 46* @param coleiter collation element iterator 47* @param offset to set 48*/ 49static 50inline void setColEIterOffset(UCollationElements *elems, 51 int32_t offset) 52{ 53 // Note: Not "fast" any more after the 2013 collation rewrite. 54 // We do not want to expose more internals than necessary. 55 UErrorCode status = U_ZERO_ERROR; 56 ucol_setOffset(elems, offset, &status); 57} 58 59/** 60* Getting the mask for collation strength 61* @param strength collation strength 62* @return collation element mask 63*/ 64static 65inline uint32_t getMask(UCollationStrength strength) 66{ 67 switch (strength) 68 { 69 case UCOL_PRIMARY: 70 return UCOL_PRIMARYORDERMASK; 71 case UCOL_SECONDARY: 72 return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK; 73 default: 74 return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK | 75 UCOL_PRIMARYORDERMASK; 76 } 77} 78 79/** 80* This is to squeeze the 21bit ces into a 256 table 81* @param ce collation element 82* @return collapsed version of the collation element 83*/ 84static 85inline int hash(uint32_t ce) 86{ 87 // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work 88 // well with the new collation where most of the latin 1 characters 89 // are of the value xx000xxx. their hashes will most of the time be 0 90 // to be discussed on the hash algo. 91 return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_; 92} 93 94U_CDECL_BEGIN 95static UBool U_CALLCONV 96usearch_cleanup(void) { 97 g_nfcImpl = NULL; 98 return TRUE; 99} 100U_CDECL_END 101 102/** 103* Initializing the fcd tables. 104* Internal method, status assumed to be a success. 105* @param status output error if any, caller to check status before calling 106* method, status assumed to be success when passed in. 107*/ 108static 109inline void initializeFCD(UErrorCode *status) 110{ 111 if (g_nfcImpl == NULL) { 112 g_nfcImpl = Normalizer2Factory::getNFCImpl(*status); 113 ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup); 114 } 115} 116 117/** 118* Gets the fcd value for a character at the argument index. 119* This method takes into accounts of the supplementary characters. 120* @param str UTF16 string where character for fcd retrieval resides 121* @param offset position of the character whose fcd is to be retrieved, to be 122* overwritten with the next character position, taking 123* surrogate characters into consideration. 124* @param strlength length of the argument string 125* @return fcd value 126*/ 127static 128uint16_t getFCD(const UChar *str, int32_t *offset, 129 int32_t strlength) 130{ 131 const UChar *temp = str + *offset; 132 uint16_t result = g_nfcImpl->nextFCD16(temp, str + strlength); 133 *offset = (int32_t)(temp - str); 134 return result; 135} 136 137/** 138* Getting the modified collation elements taking into account the collation 139* attributes 140* @param strsrch string search data 141* @param sourcece 142* @return the modified collation element 143*/ 144static 145inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece) 146{ 147 // note for tertiary we can't use the collator->tertiaryMask, that 148 // is a preprocessed mask that takes into account case options. since 149 // we are only concerned with exact matches, we don't need that. 150 sourcece &= strsrch->ceMask; 151 152 if (strsrch->toShift) { 153 // alternate handling here, since only the 16 most significant digits 154 // is only used, we can safely do a compare without masking 155 // if the ce is a variable, we mask and get only the primary values 156 // no shifting to quartenary is required since all primary values 157 // less than variabletop will need to be masked off anyway. 158 if (strsrch->variableTop > sourcece) { 159 if (strsrch->strength >= UCOL_QUATERNARY) { 160 sourcece &= UCOL_PRIMARYORDERMASK; 161 } 162 else { 163 sourcece = UCOL_IGNORABLE; 164 } 165 } 166 } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) { 167 sourcece = 0xFFFF; 168 } 169 170 return sourcece; 171} 172 173/** 174* Allocate a memory and returns NULL if it failed. 175* Internal method, status assumed to be a success. 176* @param size to allocate 177* @param status output error if any, caller to check status before calling 178* method, status assumed to be success when passed in. 179* @return newly allocated array, NULL otherwise 180*/ 181static 182inline void * allocateMemory(uint32_t size, UErrorCode *status) 183{ 184 uint32_t *result = (uint32_t *)uprv_malloc(size); 185 if (result == NULL) { 186 *status = U_MEMORY_ALLOCATION_ERROR; 187 } 188 return result; 189} 190 191/** 192* Adds a uint32_t value to a destination array. 193* Creates a new array if we run out of space. The caller will have to 194* manually deallocate the newly allocated array. 195* Internal method, status assumed to be success, caller has to check status 196* before calling this method. destination not to be NULL and has at least 197* size destinationlength. 198* @param destination target array 199* @param offset destination offset to add value 200* @param destinationlength target array size, return value for the new size 201* @param value to be added 202* @param increments incremental size expected 203* @param status output error if any, caller to check status before calling 204* method, status assumed to be success when passed in. 205* @return new destination array, destination if there was no new allocation 206*/ 207static 208inline int32_t * addTouint32_tArray(int32_t *destination, 209 uint32_t offset, 210 uint32_t *destinationlength, 211 uint32_t value, 212 uint32_t increments, 213 UErrorCode *status) 214{ 215 uint32_t newlength = *destinationlength; 216 if (offset + 1 == newlength) { 217 newlength += increments; 218 int32_t *temp = (int32_t *)allocateMemory( 219 sizeof(int32_t) * newlength, status); 220 if (U_FAILURE(*status)) { 221 return NULL; 222 } 223 uprv_memcpy(temp, destination, sizeof(int32_t) * offset); 224 *destinationlength = newlength; 225 destination = temp; 226 } 227 destination[offset] = value; 228 return destination; 229} 230 231/** 232* Adds a uint64_t value to a destination array. 233* Creates a new array if we run out of space. The caller will have to 234* manually deallocate the newly allocated array. 235* Internal method, status assumed to be success, caller has to check status 236* before calling this method. destination not to be NULL and has at least 237* size destinationlength. 238* @param destination target array 239* @param offset destination offset to add value 240* @param destinationlength target array size, return value for the new size 241* @param value to be added 242* @param increments incremental size expected 243* @param status output error if any, caller to check status before calling 244* method, status assumed to be success when passed in. 245* @return new destination array, destination if there was no new allocation 246*/ 247static 248inline int64_t * addTouint64_tArray(int64_t *destination, 249 uint32_t offset, 250 uint32_t *destinationlength, 251 uint64_t value, 252 uint32_t increments, 253 UErrorCode *status) 254{ 255 uint32_t newlength = *destinationlength; 256 if (offset + 1 == newlength) { 257 newlength += increments; 258 int64_t *temp = (int64_t *)allocateMemory( 259 sizeof(int64_t) * newlength, status); 260 261 if (U_FAILURE(*status)) { 262 return NULL; 263 } 264 265 uprv_memcpy(temp, destination, sizeof(int64_t) * offset); 266 *destinationlength = newlength; 267 destination = temp; 268 } 269 270 destination[offset] = value; 271 272 return destination; 273} 274 275/** 276* Initializing the ce table for a pattern. 277* Stores non-ignorable collation keys. 278* Table size will be estimated by the size of the pattern text. Table 279* expansion will be perform as we go along. Adding 1 to ensure that the table 280* size definitely increases. 281* Internal method, status assumed to be a success. 282* @param strsrch string search data 283* @param status output error if any, caller to check status before calling 284* method, status assumed to be success when passed in. 285* @return total number of expansions 286*/ 287static 288inline uint16_t initializePatternCETable(UStringSearch *strsrch, 289 UErrorCode *status) 290{ 291 UPattern *pattern = &(strsrch->pattern); 292 uint32_t cetablesize = INITIAL_ARRAY_SIZE_; 293 int32_t *cetable = pattern->CEBuffer; 294 uint32_t patternlength = pattern->textLength; 295 UCollationElements *coleiter = strsrch->utilIter; 296 297 if (coleiter == NULL) { 298 coleiter = ucol_openElements(strsrch->collator, pattern->text, 299 patternlength, status); 300 // status will be checked in ucol_next(..) later and if it is an 301 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 302 // returned. 303 strsrch->utilIter = coleiter; 304 } 305 else { 306 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 307 } 308 if(U_FAILURE(*status)) { 309 return 0; 310 } 311 312 if (pattern->CE != cetable && pattern->CE) { 313 uprv_free(pattern->CE); 314 } 315 316 uint16_t offset = 0; 317 uint16_t result = 0; 318 int32_t ce; 319 320 while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER && 321 U_SUCCESS(*status)) { 322 uint32_t newce = getCE(strsrch, ce); 323 if (newce) { 324 int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize, 325 newce, 326 patternlength - ucol_getOffset(coleiter) + 1, 327 status); 328 if (U_FAILURE(*status)) { 329 return 0; 330 } 331 offset ++; 332 if (cetable != temp && cetable != pattern->CEBuffer) { 333 uprv_free(cetable); 334 } 335 cetable = temp; 336 } 337 result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 338 } 339 340 cetable[offset] = 0; 341 pattern->CE = cetable; 342 pattern->CELength = offset; 343 344 return result; 345} 346 347/** 348* Initializing the pce table for a pattern. 349* Stores non-ignorable collation keys. 350* Table size will be estimated by the size of the pattern text. Table 351* expansion will be perform as we go along. Adding 1 to ensure that the table 352* size definitely increases. 353* Internal method, status assumed to be a success. 354* @param strsrch string search data 355* @param status output error if any, caller to check status before calling 356* method, status assumed to be success when passed in. 357* @return total number of expansions 358*/ 359static 360inline uint16_t initializePatternPCETable(UStringSearch *strsrch, 361 UErrorCode *status) 362{ 363 UPattern *pattern = &(strsrch->pattern); 364 uint32_t pcetablesize = INITIAL_ARRAY_SIZE_; 365 int64_t *pcetable = pattern->PCEBuffer; 366 uint32_t patternlength = pattern->textLength; 367 UCollationElements *coleiter = strsrch->utilIter; 368 369 if (coleiter == NULL) { 370 coleiter = ucol_openElements(strsrch->collator, pattern->text, 371 patternlength, status); 372 // status will be checked in ucol_next(..) later and if it is an 373 // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be 374 // returned. 375 strsrch->utilIter = coleiter; 376 } else { 377 ucol_setText(coleiter, pattern->text, pattern->textLength, status); 378 } 379 if(U_FAILURE(*status)) { 380 return 0; 381 } 382 383 if (pattern->PCE != pcetable && pattern->PCE != NULL) { 384 uprv_free(pattern->PCE); 385 } 386 387 uint16_t offset = 0; 388 uint16_t result = 0; 389 int64_t pce; 390 391 icu::UCollationPCE iter(coleiter); 392 393 // ** Should processed CEs be signed or unsigned? 394 // ** (the rest of the code in this file seems to play fast-and-loose with 395 // ** whether a CE is signed or unsigned. For example, look at routine above this one.) 396 while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER && 397 U_SUCCESS(*status)) { 398 int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize, 399 pce, 400 patternlength - ucol_getOffset(coleiter) + 1, 401 status); 402 403 if (U_FAILURE(*status)) { 404 return 0; 405 } 406 407 offset += 1; 408 409 if (pcetable != temp && pcetable != pattern->PCEBuffer) { 410 uprv_free(pcetable); 411 } 412 413 pcetable = temp; 414 //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1); 415 } 416 417 pcetable[offset] = 0; 418 pattern->PCE = pcetable; 419 pattern->PCELength = offset; 420 421 return result; 422} 423 424/** 425* Initializes the pattern struct. 426* Internal method, status assumed to be success. 427* @param strsrch UStringSearch data storage 428* @param status output error if any, caller to check status before calling 429* method, status assumed to be success when passed in. 430* @return expansionsize the total expansion size of the pattern 431*/ 432static 433inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status) 434{ 435 if (U_FAILURE(*status)) { return 0; } 436 UPattern *pattern = &(strsrch->pattern); 437 const UChar *patterntext = pattern->text; 438 int32_t length = pattern->textLength; 439 int32_t index = 0; 440 441 // Since the strength is primary, accents are ignored in the pattern. 442 if (strsrch->strength == UCOL_PRIMARY) { 443 pattern->hasPrefixAccents = 0; 444 pattern->hasSuffixAccents = 0; 445 } else { 446 pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >> 447 SECOND_LAST_BYTE_SHIFT_; 448 index = length; 449 U16_BACK_1(patterntext, 0, index); 450 pattern->hasSuffixAccents = getFCD(patterntext, &index, length) & 451 LAST_BYTE_MASK_; 452 } 453 454 // ** HACK ** 455 if (strsrch->pattern.PCE != NULL) { 456 if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 457 uprv_free(strsrch->pattern.PCE); 458 } 459 460 strsrch->pattern.PCE = NULL; 461 } 462 463 // since intializePattern is an internal method status is a success. 464 return initializePatternCETable(strsrch, status); 465} 466 467/** 468* Initializing shift tables, with the default values. 469* If a corresponding default value is 0, the shift table is not set. 470* @param shift table for forwards shift 471* @param backshift table for backwards shift 472* @param cetable table containing pattern ce 473* @param cesize size of the pattern ces 474* @param expansionsize total size of the expansions 475* @param defaultforward the default forward value 476* @param defaultbackward the default backward value 477*/ 478static 479inline void setShiftTable(int16_t shift[], int16_t backshift[], 480 int32_t *cetable, int32_t cesize, 481 int16_t expansionsize, 482 int16_t defaultforward, 483 int16_t defaultbackward) 484{ 485 // estimate the value to shift. to do that we estimate the smallest 486 // number of characters to give the relevant ces, ie approximately 487 // the number of ces minus their expansion, since expansions can come 488 // from a character. 489 int32_t count; 490 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 491 shift[count] = defaultforward; 492 } 493 cesize --; // down to the last index 494 for (count = 0; count < cesize; count ++) { 495 // number of ces from right of array to the count 496 int temp = defaultforward - count - 1; 497 shift[hash(cetable[count])] = temp > 1 ? temp : 1; 498 } 499 shift[hash(cetable[cesize])] = 1; 500 // for ignorables we just shift by one. see test examples. 501 shift[hash(0)] = 1; 502 503 for (count = 0; count < MAX_TABLE_SIZE_; count ++) { 504 backshift[count] = defaultbackward; 505 } 506 for (count = cesize; count > 0; count --) { 507 // the original value count does not seem to work 508 backshift[hash(cetable[count])] = count > expansionsize ? 509 (int16_t)(count - expansionsize) : 1; 510 } 511 backshift[hash(cetable[0])] = 1; 512 backshift[hash(0)] = 1; 513} 514 515/** 516* Building of the pattern collation element list and the boyer moore strsrch 517* table. 518* The canonical match will only be performed after the default match fails. 519* For both cases we need to remember the size of the composed and decomposed 520* versions of the string. Since the Boyer-Moore shift calculations shifts by 521* a number of characters in the text and tries to match the pattern from that 522* offset, the shift value can not be too large in case we miss some 523* characters. To choose a right shift size, we estimate the NFC form of the 524* and use its size as a shift guide. The NFC form should be the small 525* possible representation of the pattern. Anyways, we'll err on the smaller 526* shift size. Hence the calculation for minlength. 527* Canonical match will be performed slightly differently. We'll split the 528* pattern into 3 parts, the prefix accents (PA), the middle string bounded by 529* the first and last base character (MS), the ending accents (EA). Matches 530* will be done on MS first, and only when we match MS then some processing 531* will be required for the prefix and end accents in order to determine if 532* they match PA and EA. Hence the default shift values 533* for the canonical match will take the size of either end's accent into 534* consideration. Forwards search will take the end accents into consideration 535* for the default shift values and the backwards search will take the prefix 536* accents into consideration. 537* If pattern has no non-ignorable ce, we return a illegal argument error. 538* Internal method, status assumed to be success. 539* @param strsrch UStringSearch data storage 540* @param status for output errors if it occurs, status is assumed to be a 541* success when it is passed in. 542*/ 543static 544inline void initialize(UStringSearch *strsrch, UErrorCode *status) 545{ 546 int16_t expandlength = initializePattern(strsrch, status); 547 if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) { 548 UPattern *pattern = &strsrch->pattern; 549 int32_t cesize = pattern->CELength; 550 551 int16_t minlength = cesize > expandlength 552 ? (int16_t)cesize - expandlength : 1; 553 pattern->defaultShiftSize = minlength; 554 setShiftTable(pattern->shift, pattern->backShift, pattern->CE, 555 cesize, expandlength, minlength, minlength); 556 return; 557 } 558 strsrch->pattern.defaultShiftSize = 0; 559} 560 561#if BOYER_MOORE 562/** 563* Check to make sure that the match length is at the end of the character by 564* using the breakiterator. 565* @param strsrch string search data 566* @param start target text start offset 567* @param end target text end offset 568*/ 569static 570void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/, 571 int32_t *end) 572{ 573#if !UCONFIG_NO_BREAK_ITERATION 574 UBreakIterator *breakiterator = strsrch->search->internalBreakIter; 575 if (breakiterator) { 576 int32_t matchend = *end; 577 //int32_t matchstart = *start; 578 579 if (!ubrk_isBoundary(breakiterator, matchend)) { 580 *end = ubrk_following(breakiterator, matchend); 581 } 582 583 /* Check the start of the matched text to make sure it doesn't have any accents 584 * before it. This code may not be necessary and so it is commented out */ 585 /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) { 586 *start = ubrk_preceding(breakiterator, matchstart); 587 }*/ 588 } 589#endif 590} 591 592/** 593* Determine whether the target text in UStringSearch bounded by the offset 594* start and end is one or more whole units of text as 595* determined by the breakiterator in UStringSearch. 596* @param strsrch string search data 597* @param start target text start offset 598* @param end target text end offset 599*/ 600static 601UBool isBreakUnit(const UStringSearch *strsrch, int32_t start, 602 int32_t end) 603{ 604#if !UCONFIG_NO_BREAK_ITERATION 605 UBreakIterator *breakiterator = strsrch->search->breakIter; 606 //TODO: Add here. 607 if (breakiterator) { 608 int32_t startindex = ubrk_first(breakiterator); 609 int32_t endindex = ubrk_last(breakiterator); 610 611 // out-of-range indexes are never boundary positions 612 if (start < startindex || start > endindex || 613 end < startindex || end > endindex) { 614 return FALSE; 615 } 616 // otherwise, we can use following() on the position before the 617 // specified one and return true of the position we get back is the 618 // one the user specified 619 UBool result = (start == startindex || 620 ubrk_following(breakiterator, start - 1) == start) && 621 (end == endindex || 622 ubrk_following(breakiterator, end - 1) == end); 623 if (result) { 624 // iterates the individual ces 625 UCollationElements *coleiter = strsrch->utilIter; 626 const UChar *text = strsrch->search->text + 627 start; 628 UErrorCode status = U_ZERO_ERROR; 629 ucol_setText(coleiter, text, end - start, &status); 630 for (int32_t count = 0; count < strsrch->pattern.CELength; 631 count ++) { 632 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 633 if (ce == UCOL_IGNORABLE) { 634 count --; 635 continue; 636 } 637 if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) { 638 return FALSE; 639 } 640 } 641 int32_t nextce = ucol_next(coleiter, &status); 642 while (ucol_getOffset(coleiter) == (end - start) 643 && getCE(strsrch, nextce) == UCOL_IGNORABLE) { 644 nextce = ucol_next(coleiter, &status); 645 } 646 if (ucol_getOffset(coleiter) == (end - start) 647 && nextce != UCOL_NULLORDER) { 648 // extra collation elements at the end of the match 649 return FALSE; 650 } 651 } 652 return result; 653 } 654#endif 655 return TRUE; 656} 657 658/** 659* Getting the next base character offset if current offset is an accent, 660* or the current offset if the current character contains a base character. 661* accents the following base character will be returned 662* @param text string 663* @param textoffset current offset 664* @param textlength length of text string 665* @return the next base character or the current offset 666* if the current character is contains a base character. 667*/ 668static 669inline int32_t getNextBaseOffset(const UChar *text, 670 int32_t textoffset, 671 int32_t textlength) 672{ 673 if (textoffset < textlength) { 674 int32_t temp = textoffset; 675 if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 676 while (temp < textlength) { 677 int32_t result = temp; 678 if ((getFCD(text, &temp, textlength) >> 679 SECOND_LAST_BYTE_SHIFT_) == 0) { 680 return result; 681 } 682 } 683 return textlength; 684 } 685 } 686 return textoffset; 687} 688 689/** 690* Gets the next base character offset depending on the string search pattern 691* data 692* @param strsrch string search data 693* @param textoffset current offset, one offset away from the last character 694* to search for. 695* @return start index of the next base character or the current offset 696* if the current character is contains a base character. 697*/ 698static 699inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch, 700 int32_t textoffset) 701{ 702 int32_t textlength = strsrch->search->textLength; 703 if (strsrch->pattern.hasSuffixAccents && 704 textoffset < textlength) { 705 int32_t temp = textoffset; 706 const UChar *text = strsrch->search->text; 707 U16_BACK_1(text, 0, temp); 708 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 709 return getNextBaseOffset(text, textoffset, textlength); 710 } 711 } 712 return textoffset; 713} 714 715/** 716* Shifting the collation element iterator position forward to prepare for 717* a following match. If the last character is a unsafe character, we'll only 718* shift by 1 to capture contractions, normalization etc. 719* Internal method, status assumed to be success. 720* @param text strsrch string search data 721* @param textoffset start text position to do search 722* @param ce the text ce which failed the match. 723* @param patternceindex index of the ce within the pattern ce buffer which 724* failed the match 725* @return final offset 726*/ 727static 728inline int32_t shiftForward(UStringSearch *strsrch, 729 int32_t textoffset, 730 int32_t ce, 731 int32_t patternceindex) 732{ 733 UPattern *pattern = &(strsrch->pattern); 734 if (ce != UCOL_NULLORDER) { 735 int32_t shift = pattern->shift[hash(ce)]; 736 // this is to adjust for characters in the middle of the 737 // substring for matching that failed. 738 int32_t adjust = pattern->CELength - patternceindex; 739 if (adjust > 1 && shift >= adjust) { 740 shift -= adjust - 1; 741 } 742 textoffset += shift; 743 } 744 else { 745 textoffset += pattern->defaultShiftSize; 746 } 747 748 textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset); 749 // check for unsafe characters 750 // * if it is the start or middle of a contraction: to be done after 751 // a initial match is found 752 // * thai or lao base consonant character: similar to contraction 753 // * high surrogate character: similar to contraction 754 // * next character is a accent: shift to the next base character 755 return textoffset; 756} 757#endif // #if BOYER_MOORE 758 759/** 760* sets match not found 761* @param strsrch string search data 762*/ 763static 764inline void setMatchNotFound(UStringSearch *strsrch) 765{ 766 // this method resets the match result regardless of the error status. 767 strsrch->search->matchedIndex = USEARCH_DONE; 768 strsrch->search->matchedLength = 0; 769 if (strsrch->search->isForwardSearching) { 770 setColEIterOffset(strsrch->textIter, strsrch->search->textLength); 771 } 772 else { 773 setColEIterOffset(strsrch->textIter, 0); 774 } 775} 776 777#if BOYER_MOORE 778/** 779* Gets the offset to the next safe point in text. 780* ie. not the middle of a contraction, swappable characters or supplementary 781* characters. 782* @param collator collation sata 783* @param text string to work with 784* @param textoffset offset in string 785* @param textlength length of text string 786* @return offset to the next safe character 787*/ 788static 789inline int32_t getNextSafeOffset(const UCollator *collator, 790 const UChar *text, 791 int32_t textoffset, 792 int32_t textlength) 793{ 794 int32_t result = textoffset; // first contraction character 795 while (result != textlength && ucol_unsafeCP(text[result], collator)) { 796 result ++; 797 } 798 return result; 799} 800 801/** 802* This checks for accents in the potential match started with a . 803* composite character. 804* This is really painful... we have to check that composite character do not 805* have any extra accents. We have to normalize the potential match and find 806* the immediate decomposed character before the match. 807* The first composite character would have been taken care of by the fcd 808* checks in checkForwardExactMatch. 809* This is the slow path after the fcd of the first character and 810* the last character has been checked by checkForwardExactMatch and we 811* determine that the potential match has extra non-ignorable preceding 812* ces. 813* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 814* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 815* Note here that accents checking are slow and cautioned in the API docs. 816* Internal method, status assumed to be a success, caller should check status 817* before calling this method 818* @param strsrch string search data 819* @param start index of the potential unfriendly composite character 820* @param end index of the potential unfriendly composite character 821* @param status output error status if any. 822* @return TRUE if there is non-ignorable accents before at the beginning 823* of the match, FALSE otherwise. 824*/ 825 826static 827UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start, 828 int32_t end, 829 UErrorCode *status) 830{ 831 UBool result = FALSE; 832 if (strsrch->pattern.hasPrefixAccents) { 833 int32_t length = end - start; 834 int32_t offset = 0; 835 const UChar *text = strsrch->search->text + start; 836 837 U16_FWD_1(text, offset, length); 838 // we are only concerned with the first composite character 839 if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) { 840 int32_t safeoffset = getNextSafeOffset(strsrch->collator, 841 text, 0, length); 842 if (safeoffset != length) { 843 safeoffset ++; 844 } 845 UChar *norm = NULL; 846 UChar buffer[INITIAL_ARRAY_SIZE_]; 847 int32_t size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, 848 buffer, INITIAL_ARRAY_SIZE_, 849 status); 850 if (U_FAILURE(*status)) { 851 return FALSE; 852 } 853 if (size >= INITIAL_ARRAY_SIZE_) { 854 norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar), 855 status); 856 // if allocation failed, status will be set to 857 // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally 858 // checks for it. 859 size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm, 860 size, status); 861 if (U_FAILURE(*status) && norm != NULL) { 862 uprv_free(norm); 863 return FALSE; 864 } 865 } 866 else { 867 norm = buffer; 868 } 869 870 UCollationElements *coleiter = strsrch->utilIter; 871 ucol_setText(coleiter, norm, size, status); 872 uint32_t firstce = strsrch->pattern.CE[0]; 873 UBool ignorable = TRUE; 874 uint32_t ce = UCOL_IGNORABLE; 875 while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) { 876 offset = ucol_getOffset(coleiter); 877 if (ce != firstce && ce != UCOL_IGNORABLE) { 878 ignorable = FALSE; 879 } 880 ce = ucol_next(coleiter, status); 881 } 882 UChar32 codepoint; 883 U16_PREV(norm, 0, offset, codepoint); 884 result = !ignorable && (u_getCombiningClass(codepoint) != 0); 885 886 if (norm != buffer) { 887 uprv_free(norm); 888 } 889 } 890 } 891 892 return result; 893} 894 895/** 896* Used by exact matches, checks if there are accents before the match. 897* This is really painful... we have to check that composite characters at 898* the start of the matches have to not have any extra accents. 899* We check the FCD of the character first, if it starts with an accent and 900* the first pattern ce does not match the first ce of the character, we bail. 901* Otherwise we try normalizing the first composite 902* character and find the immediate decomposed character before the match to 903* see if it is an non-ignorable accent. 904* Now normalizing the first composite character is enough because we ensure 905* that when the match is passed in here with extra beginning ces, the 906* first or last ce that match has to occur within the first character. 907* E.g. looking for \u0301 acute in \u01FA A ring above and acute, 908* checkExtraMatchAccent should fail since there is a middle ring in \u01FA 909* Note here that accents checking are slow and cautioned in the API docs. 910* @param strsrch string search data 911* @param start offset 912* @param end offset 913* @return TRUE if there are accents on either side of the match, 914* FALSE otherwise 915*/ 916static 917UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start, 918 int32_t end) 919{ 920 if (strsrch->pattern.hasPrefixAccents) { 921 UCollationElements *coleiter = strsrch->textIter; 922 UErrorCode status = U_ZERO_ERROR; 923 // we have been iterating forwards previously 924 uint32_t ignorable = TRUE; 925 int32_t firstce = strsrch->pattern.CE[0]; 926 927 setColEIterOffset(coleiter, start); 928 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 929 if (U_FAILURE(status)) { 930 return TRUE; 931 } 932 while (ce != firstce) { 933 if (ce != UCOL_IGNORABLE) { 934 ignorable = FALSE; 935 } 936 ce = getCE(strsrch, ucol_next(coleiter, &status)); 937 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 938 return TRUE; 939 } 940 } 941 if (!ignorable && inNormBuf(coleiter)) { 942 // within normalization buffer, discontiguous handled here 943 return TRUE; 944 } 945 946 // within text 947 int32_t temp = start; 948 // original code 949 // accent = (getFCD(strsrch->search->text, &temp, 950 // strsrch->search->textLength) 951 // >> SECOND_LAST_BYTE_SHIFT_); 952 // however this code does not work well with VC7 .net in release mode. 953 // maybe the inlines for getFCD combined with shifting has bugs in 954 // VC7. anyways this is a work around. 955 UBool accent = getFCD(strsrch->search->text, &temp, 956 strsrch->search->textLength) > 0xFF; 957 if (!accent) { 958 return checkExtraMatchAccents(strsrch, start, end, &status); 959 } 960 if (!ignorable) { 961 return TRUE; 962 } 963 if (start > 0) { 964 temp = start; 965 U16_BACK_1(strsrch->search->text, 0, temp); 966 if (getFCD(strsrch->search->text, &temp, 967 strsrch->search->textLength) & LAST_BYTE_MASK_) { 968 setColEIterOffset(coleiter, start); 969 ce = ucol_previous(coleiter, &status); 970 if (U_FAILURE(status) || 971 (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) { 972 return TRUE; 973 } 974 } 975 } 976 } 977 978 return FALSE; 979} 980 981/** 982* Used by exact matches, checks if there are accents bounding the match. 983* Note this is the initial boundary check. If the potential match 984* starts or ends with composite characters, the accents in those 985* characters will be determined later. 986* Not doing backwards iteration here, since discontiguos contraction for 987* backwards collation element iterator, use up too many characters. 988* E.g. looking for \u030A ring in \u01FA A ring above and acute, 989* should fail since there is a acute at the end of \u01FA 990* Note here that accents checking are slow and cautioned in the API docs. 991* @param strsrch string search data 992* @param start offset of match 993* @param end end offset of the match 994* @return TRUE if there are accents on either side of the match, 995* FALSE otherwise 996*/ 997static 998UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start, 999 int32_t end) 1000{ 1001 if (strsrch->pattern.hasSuffixAccents) { 1002 const UChar *text = strsrch->search->text; 1003 int32_t temp = end; 1004 int32_t textlength = strsrch->search->textLength; 1005 U16_BACK_1(text, 0, temp); 1006 if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) { 1007 int32_t firstce = strsrch->pattern.CE[0]; 1008 UCollationElements *coleiter = strsrch->textIter; 1009 UErrorCode status = U_ZERO_ERROR; 1010 int32_t ce; 1011 setColEIterOffset(coleiter, start); 1012 while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) { 1013 if (U_FAILURE(status) || ce == UCOL_NULLORDER) { 1014 return TRUE; 1015 } 1016 } 1017 int32_t count = 1; 1018 while (count < strsrch->pattern.CELength) { 1019 if (getCE(strsrch, ucol_next(coleiter, &status)) 1020 == UCOL_IGNORABLE) { 1021 // Thai can give an ignorable here. 1022 count --; 1023 } 1024 if (U_FAILURE(status)) { 1025 return TRUE; 1026 } 1027 count ++; 1028 } 1029 1030 ce = ucol_next(coleiter, &status); 1031 if (U_FAILURE(status)) { 1032 return TRUE; 1033 } 1034 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1035 ce = getCE(strsrch, ce); 1036 } 1037 if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) { 1038 if (ucol_getOffset(coleiter) <= end) { 1039 return TRUE; 1040 } 1041 if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) { 1042 return TRUE; 1043 } 1044 } 1045 } 1046 } 1047 return FALSE; 1048} 1049#endif // #if BOYER_MOORE 1050 1051/** 1052* Checks if the offset runs out of the text string 1053* @param offset 1054* @param textlength of the text string 1055* @return TRUE if offset is out of bounds, FALSE otherwise 1056*/ 1057static 1058inline UBool isOutOfBounds(int32_t textlength, int32_t offset) 1059{ 1060 return offset < 0 || offset > textlength; 1061} 1062 1063/** 1064* Checks for identical match 1065* @param strsrch string search data 1066* @param start offset of possible match 1067* @param end offset of possible match 1068* @return TRUE if identical match is found 1069*/ 1070static 1071inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start, 1072 int32_t end) 1073{ 1074 if (strsrch->strength != UCOL_IDENTICAL) { 1075 return TRUE; 1076 } 1077 1078 // Note: We could use Normalizer::compare() or similar, but for short strings 1079 // which may not be in FCD it might be faster to just NFD them. 1080 UErrorCode status = U_ZERO_ERROR; 1081 UnicodeString t2, p2; 1082 strsrch->nfd->normalize( 1083 UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status); 1084 strsrch->nfd->normalize( 1085 UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status); 1086 // return FALSE if NFD failed 1087 return U_SUCCESS(status) && t2 == p2; 1088} 1089 1090#if BOYER_MOORE 1091/** 1092* Checks to see if the match is repeated 1093* @param strsrch string search data 1094* @param start new match start index 1095* @param end new match end index 1096* @return TRUE if the the match is repeated, FALSE otherwise 1097*/ 1098static 1099inline UBool checkRepeatedMatch(UStringSearch *strsrch, 1100 int32_t start, 1101 int32_t end) 1102{ 1103 int32_t lastmatchindex = strsrch->search->matchedIndex; 1104 UBool result; 1105 if (lastmatchindex == USEARCH_DONE) { 1106 return FALSE; 1107 } 1108 if (strsrch->search->isForwardSearching) { 1109 result = start <= lastmatchindex; 1110 } 1111 else { 1112 result = start >= lastmatchindex; 1113 } 1114 if (!result && !strsrch->search->isOverlap) { 1115 if (strsrch->search->isForwardSearching) { 1116 result = start < lastmatchindex + strsrch->search->matchedLength; 1117 } 1118 else { 1119 result = end > lastmatchindex; 1120 } 1121 } 1122 return result; 1123} 1124 1125/** 1126* Gets the collation element iterator's current offset. 1127* @param coleiter collation element iterator 1128* @param forwards flag TRUE if we are moving in th forwards direction 1129* @return current offset 1130*/ 1131static 1132inline int32_t getColElemIterOffset(const UCollationElements *coleiter, 1133 UBool forwards) 1134{ 1135 int32_t result = ucol_getOffset(coleiter); 1136 // intricacies of the the backwards collation element iterator 1137 if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) { 1138 result ++; 1139 } 1140 return result; 1141} 1142 1143/** 1144* Checks match for contraction. 1145* If the match ends with a partial contraction we fail. 1146* If the match starts too far off (because of backwards iteration) we try to 1147* chip off the extra characters depending on whether a breakiterator has 1148* been used. 1149* Internal method, error assumed to be success, caller has to check status 1150* before calling this method. 1151* @param strsrch string search data 1152* @param start offset of potential match, to be modified if necessary 1153* @param end offset of potential match, to be modified if necessary 1154* @param status output error status if any 1155* @return TRUE if match passes the contraction test, FALSE otherwise 1156*/ 1157 1158static 1159UBool checkNextExactContractionMatch(UStringSearch *strsrch, 1160 int32_t *start, 1161 int32_t *end, UErrorCode *status) 1162{ 1163 UCollationElements *coleiter = strsrch->textIter; 1164 int32_t textlength = strsrch->search->textLength; 1165 int32_t temp = *start; 1166 const UCollator *collator = strsrch->collator; 1167 const UChar *text = strsrch->search->text; 1168 // This part checks if either ends of the match contains potential 1169 // contraction. If so we'll have to iterate through them 1170 // The start contraction needs to be checked since ucol_previous dumps 1171 // all characters till the first safe character into the buffer. 1172 // *start + 1 is used to test for the unsafe characters instead of *start 1173 // because ucol_prev takes all unsafe characters till the first safe 1174 // character ie *start. so by testing *start + 1, we can estimate if 1175 // excess prefix characters has been included in the potential search 1176 // results. 1177 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1178 (*start + 1 < textlength 1179 && ucol_unsafeCP(text[*start + 1], collator))) { 1180 int32_t expansion = getExpansionPrefix(coleiter); 1181 UBool expandflag = expansion > 0; 1182 setColEIterOffset(coleiter, *start); 1183 while (expansion > 0) { 1184 // getting rid of the redundant ce, caused by setOffset. 1185 // since backward contraction/expansion may have extra ces if we 1186 // are in the normalization buffer, hasAccentsBeforeMatch would 1187 // have taken care of it. 1188 // E.g. the character \u01FA will have an expansion of 3, but if 1189 // we are only looking for acute and ring \u030A and \u0301, we'll 1190 // have to skip the first ce in the expansion buffer. 1191 ucol_next(coleiter, status); 1192 if (U_FAILURE(*status)) { 1193 return FALSE; 1194 } 1195 if (ucol_getOffset(coleiter) != temp) { 1196 *start = temp; 1197 temp = ucol_getOffset(coleiter); 1198 } 1199 expansion --; 1200 } 1201 1202 int32_t *patternce = strsrch->pattern.CE; 1203 int32_t patterncelength = strsrch->pattern.CELength; 1204 int32_t count = 0; 1205 while (count < patterncelength) { 1206 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1207 if (ce == UCOL_IGNORABLE) { 1208 continue; 1209 } 1210 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1211 *start = temp; 1212 temp = ucol_getOffset(coleiter); 1213 } 1214 if (U_FAILURE(*status) || ce != patternce[count]) { 1215 (*end) ++; 1216 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1217 return FALSE; 1218 } 1219 count ++; 1220 } 1221 } 1222 return TRUE; 1223} 1224 1225/** 1226* Checks and sets the match information if found. 1227* Checks 1228* <ul> 1229* <li> the potential match does not repeat the previous match 1230* <li> boundaries are correct 1231* <li> exact matches has no extra accents 1232* <li> identical matchesb 1233* <li> potential match does not end in the middle of a contraction 1234* <\ul> 1235* Otherwise the offset will be shifted to the next character. 1236* Internal method, status assumed to be success, caller has to check status 1237* before calling this method. 1238* @param strsrch string search data 1239* @param textoffset offset in the collation element text. the returned value 1240* will be the truncated end offset of the match or the new start 1241* search offset. 1242* @param status output error status if any 1243* @return TRUE if the match is valid, FALSE otherwise 1244*/ 1245static 1246inline UBool checkNextExactMatch(UStringSearch *strsrch, 1247 int32_t *textoffset, UErrorCode *status) 1248{ 1249 UCollationElements *coleiter = strsrch->textIter; 1250 int32_t start = getColElemIterOffset(coleiter, FALSE); 1251 1252 if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) { 1253 return FALSE; 1254 } 1255 1256 // this totally matches, however we need to check if it is repeating 1257 if (!isBreakUnit(strsrch, start, *textoffset) || 1258 checkRepeatedMatch(strsrch, start, *textoffset) || 1259 hasAccentsBeforeMatch(strsrch, start, *textoffset) || 1260 !checkIdentical(strsrch, start, *textoffset) || 1261 hasAccentsAfterMatch(strsrch, start, *textoffset)) { 1262 1263 (*textoffset) ++; 1264 *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset); 1265 return FALSE; 1266 } 1267 1268 //Add breakiterator boundary check for primary strength search. 1269 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 1270 checkBreakBoundary(strsrch, &start, textoffset); 1271 } 1272 1273 // totally match, we will get rid of the ending ignorables. 1274 strsrch->search->matchedIndex = start; 1275 strsrch->search->matchedLength = *textoffset - start; 1276 return TRUE; 1277} 1278 1279/** 1280* Getting the previous base character offset, or the current offset if the 1281* current character is a base character 1282* @param text string 1283* @param textoffset one offset after the current character 1284* @return the offset of the next character after the base character or the first 1285* composed character with accents 1286*/ 1287static 1288inline int32_t getPreviousBaseOffset(const UChar *text, 1289 int32_t textoffset) 1290{ 1291 if (textoffset > 0) { 1292 for (;;) { 1293 int32_t result = textoffset; 1294 U16_BACK_1(text, 0, textoffset); 1295 int32_t temp = textoffset; 1296 uint16_t fcd = getFCD(text, &temp, result); 1297 if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) { 1298 if (fcd & LAST_BYTE_MASK_) { 1299 return textoffset; 1300 } 1301 return result; 1302 } 1303 if (textoffset == 0) { 1304 return 0; 1305 } 1306 } 1307 } 1308 return textoffset; 1309} 1310 1311/** 1312* Getting the indexes of the accents that are not blocked in the argument 1313* accent array 1314* @param accents array of accents in nfd terminated by a 0. 1315* @param accentsindex array of indexes of the accents that are not blocked 1316*/ 1317static 1318inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex) 1319{ 1320 int32_t index = 0; 1321 int32_t length = u_strlen(accents); 1322 UChar32 codepoint = 0; 1323 int cclass = 0; 1324 int result = 0; 1325 int32_t temp; 1326 while (index < length) { 1327 temp = index; 1328 U16_NEXT(accents, index, length, codepoint); 1329 if (u_getCombiningClass(codepoint) != cclass) { 1330 cclass = u_getCombiningClass(codepoint); 1331 accentsindex[result] = temp; 1332 result ++; 1333 } 1334 } 1335 accentsindex[result] = length; 1336 return result; 1337} 1338 1339/** 1340* Appends 3 UChar arrays to a destination array. 1341* Creates a new array if we run out of space. The caller will have to 1342* manually deallocate the newly allocated array. 1343* Internal method, status assumed to be success, caller has to check status 1344* before calling this method. destination not to be NULL and has at least 1345* size destinationlength. 1346* @param destination target array 1347* @param destinationlength target array size, returning the appended length 1348* @param source1 null-terminated first array 1349* @param source2 second array 1350* @param source2length length of seond array 1351* @param source3 null-terminated third array 1352* @param status error status if any 1353* @return new destination array, destination if there was no new allocation 1354*/ 1355static 1356inline UChar * addToUCharArray( UChar *destination, 1357 int32_t *destinationlength, 1358 const UChar *source1, 1359 const UChar *source2, 1360 int32_t source2length, 1361 const UChar *source3, 1362 UErrorCode *status) 1363{ 1364 int32_t source1length = source1 ? u_strlen(source1) : 0; 1365 int32_t source3length = source3 ? u_strlen(source3) : 0; 1366 if (*destinationlength < source1length + source2length + source3length + 1367 1) 1368 { 1369 destination = (UChar *)allocateMemory( 1370 (source1length + source2length + source3length + 1) * sizeof(UChar), 1371 status); 1372 // if error allocating memory, status will be 1373 // U_MEMORY_ALLOCATION_ERROR 1374 if (U_FAILURE(*status)) { 1375 *destinationlength = 0; 1376 return NULL; 1377 } 1378 } 1379 if (source1length != 0) { 1380 uprv_memcpy(destination, source1, sizeof(UChar) * source1length); 1381 } 1382 if (source2length != 0) { 1383 uprv_memcpy(destination + source1length, source2, 1384 sizeof(UChar) * source2length); 1385 } 1386 if (source3length != 0) { 1387 uprv_memcpy(destination + source1length + source2length, source3, 1388 sizeof(UChar) * source3length); 1389 } 1390 *destinationlength = source1length + source2length + source3length; 1391 return destination; 1392} 1393 1394/** 1395* Running through a collation element iterator to see if the contents matches 1396* pattern in string search data 1397* @param strsrch string search data 1398* @param coleiter collation element iterator 1399* @return TRUE if a match if found, FALSE otherwise 1400*/ 1401static 1402inline UBool checkCollationMatch(const UStringSearch *strsrch, 1403 UCollationElements *coleiter) 1404{ 1405 int patternceindex = strsrch->pattern.CELength; 1406 int32_t *patternce = strsrch->pattern.CE; 1407 UErrorCode status = U_ZERO_ERROR; 1408 while (patternceindex > 0) { 1409 int32_t ce = getCE(strsrch, ucol_next(coleiter, &status)); 1410 if (ce == UCOL_IGNORABLE) { 1411 continue; 1412 } 1413 if (U_FAILURE(status) || ce != *patternce) { 1414 return FALSE; 1415 } 1416 patternce ++; 1417 patternceindex --; 1418 } 1419 return TRUE; 1420} 1421 1422/** 1423* Rearranges the front accents to try matching. 1424* Prefix accents in the text will be grouped according to their combining 1425* class and the groups will be mixed and matched to try find the perfect 1426* match with the pattern. 1427* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1428* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1429* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1430* "\u0301\u0325". 1431* step 2: check if any of the generated substrings matches the pattern. 1432* Internal method, status is assumed to be success, caller has to check status 1433* before calling this method. 1434* @param strsrch string search match 1435* @param start first offset of the accents to start searching 1436* @param end start of the last accent set 1437* @param status output error status if any 1438* @return USEARCH_DONE if a match is not found, otherwise return the starting 1439* offset of the match. Note this start includes all preceding accents. 1440*/ 1441static 1442int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch, 1443 int32_t start, 1444 int32_t end, 1445 UErrorCode *status) 1446{ 1447 const UChar *text = strsrch->search->text; 1448 int32_t textlength = strsrch->search->textLength; 1449 int32_t tempstart = start; 1450 1451 if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) { 1452 // die... failed at a base character 1453 return USEARCH_DONE; 1454 } 1455 1456 int32_t offset = getNextBaseOffset(text, tempstart, textlength); 1457 start = getPreviousBaseOffset(text, tempstart); 1458 1459 UChar accents[INITIAL_ARRAY_SIZE_]; 1460 // normalizing the offensive string 1461 unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents, 1462 INITIAL_ARRAY_SIZE_, status); 1463 if (U_FAILURE(*status)) { 1464 return USEARCH_DONE; 1465 } 1466 1467 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1468 int32_t accentsize = getUnblockedAccentIndex(accents, 1469 accentsindex); 1470 int32_t count = (2 << (accentsize - 1)) - 1; 1471 UChar buffer[INITIAL_ARRAY_SIZE_]; 1472 UCollationElements *coleiter = strsrch->utilIter; 1473 while (U_SUCCESS(*status) && count > 0) { 1474 UChar *rearrange = strsrch->canonicalPrefixAccents; 1475 // copy the base characters 1476 for (int k = 0; k < accentsindex[0]; k ++) { 1477 *rearrange ++ = accents[k]; 1478 } 1479 // forming all possible canonical rearrangement by dropping 1480 // sets of accents 1481 for (int i = 0; i <= accentsize - 1; i ++) { 1482 int32_t mask = 1 << (accentsize - i - 1); 1483 if (count & mask) { 1484 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1485 *rearrange ++ = accents[j]; 1486 } 1487 } 1488 } 1489 *rearrange = 0; 1490 int32_t matchsize = INITIAL_ARRAY_SIZE_; 1491 UChar *match = addToUCharArray(buffer, &matchsize, 1492 strsrch->canonicalPrefixAccents, 1493 strsrch->search->text + offset, 1494 end - offset, 1495 strsrch->canonicalSuffixAccents, 1496 status); 1497 1498 // if status is a failure, ucol_setText does nothing. 1499 // run the collator iterator through this match 1500 ucol_setText(coleiter, match, matchsize, status); 1501 if (U_SUCCESS(*status)) { 1502 if (checkCollationMatch(strsrch, coleiter)) { 1503 if (match != buffer) { 1504 uprv_free(match); 1505 } 1506 return start; 1507 } 1508 } 1509 count --; 1510 } 1511 return USEARCH_DONE; 1512} 1513 1514/** 1515* Gets the offset to the safe point in text before textoffset. 1516* ie. not the middle of a contraction, swappable characters or supplementary 1517* characters. 1518* @param collator collation sata 1519* @param text string to work with 1520* @param textoffset offset in string 1521* @param textlength length of text string 1522* @return offset to the previous safe character 1523*/ 1524static 1525inline uint32_t getPreviousSafeOffset(const UCollator *collator, 1526 const UChar *text, 1527 int32_t textoffset) 1528{ 1529 int32_t result = textoffset; // first contraction character 1530 while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) { 1531 result --; 1532 } 1533 if (result != 0) { 1534 // the first contraction character is consider unsafe here 1535 result --; 1536 } 1537 return result; 1538} 1539 1540/** 1541* Cleaning up after we passed the safe zone 1542* @param strsrch string search data 1543* @param safetext safe text array 1544* @param safebuffer safe text buffer 1545* @param coleiter collation element iterator for safe text 1546*/ 1547static 1548inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext, 1549 UChar *safebuffer) 1550{ 1551 if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents) 1552 { 1553 uprv_free(safetext); 1554 } 1555} 1556 1557/** 1558* Take the rearranged end accents and tries matching. If match failed at 1559* a seperate preceding set of accents (seperated from the rearranged on by 1560* at least a base character) then we rearrange the preceding accents and 1561* tries matching again. 1562* We allow skipping of the ends of the accent set if the ces do not match. 1563* However if the failure is found before the accent set, it fails. 1564* Internal method, status assumed to be success, caller has to check status 1565* before calling this method. 1566* @param strsrch string search data 1567* @param textoffset of the start of the rearranged accent 1568* @param status output error status if any 1569* @return USEARCH_DONE if a match is not found, otherwise return the starting 1570* offset of the match. Note this start includes all preceding accents. 1571*/ 1572static 1573int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch, 1574 int32_t textoffset, 1575 UErrorCode *status) 1576{ 1577 const UChar *text = strsrch->search->text; 1578 const UCollator *collator = strsrch->collator; 1579 int32_t safelength = 0; 1580 UChar *safetext; 1581 int32_t safetextlength; 1582 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 1583 UCollationElements *coleiter = strsrch->utilIter; 1584 int32_t safeoffset = textoffset; 1585 1586 if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0], 1587 collator)) { 1588 safeoffset = getPreviousSafeOffset(collator, text, textoffset); 1589 safelength = textoffset - safeoffset; 1590 safetextlength = INITIAL_ARRAY_SIZE_; 1591 safetext = addToUCharArray(safebuffer, &safetextlength, NULL, 1592 text + safeoffset, safelength, 1593 strsrch->canonicalSuffixAccents, 1594 status); 1595 } 1596 else { 1597 safetextlength = u_strlen(strsrch->canonicalSuffixAccents); 1598 safetext = strsrch->canonicalSuffixAccents; 1599 } 1600 1601 // if status is a failure, ucol_setText does nothing 1602 ucol_setText(coleiter, safetext, safetextlength, status); 1603 // status checked in loop below 1604 1605 int32_t *ce = strsrch->pattern.CE; 1606 int32_t celength = strsrch->pattern.CELength; 1607 int ceindex = celength - 1; 1608 UBool isSafe = TRUE; // indication flag for position in safe zone 1609 1610 while (ceindex >= 0) { 1611 int32_t textce = ucol_previous(coleiter, status); 1612 if (U_FAILURE(*status)) { 1613 if (isSafe) { 1614 cleanUpSafeText(strsrch, safetext, safebuffer); 1615 } 1616 return USEARCH_DONE; 1617 } 1618 if (textce == UCOL_NULLORDER) { 1619 // check if we have passed the safe buffer 1620 if (coleiter == strsrch->textIter) { 1621 cleanUpSafeText(strsrch, safetext, safebuffer); 1622 return USEARCH_DONE; 1623 } 1624 cleanUpSafeText(strsrch, safetext, safebuffer); 1625 safetext = safebuffer; 1626 coleiter = strsrch->textIter; 1627 setColEIterOffset(coleiter, safeoffset); 1628 // status checked at the start of the loop 1629 isSafe = FALSE; 1630 continue; 1631 } 1632 textce = getCE(strsrch, textce); 1633 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 1634 // do the beginning stuff 1635 int32_t failedoffset = getColElemIterOffset(coleiter, FALSE); 1636 if (isSafe && failedoffset >= safelength) { 1637 // alas... no hope. failed at rearranged accent set 1638 cleanUpSafeText(strsrch, safetext, safebuffer); 1639 return USEARCH_DONE; 1640 } 1641 else { 1642 if (isSafe) { 1643 failedoffset += safeoffset; 1644 cleanUpSafeText(strsrch, safetext, safebuffer); 1645 } 1646 1647 // try rearranging the front accents 1648 int32_t result = doNextCanonicalPrefixMatch(strsrch, 1649 failedoffset, textoffset, status); 1650 if (result != USEARCH_DONE) { 1651 // if status is a failure, ucol_setOffset does nothing 1652 setColEIterOffset(strsrch->textIter, result); 1653 } 1654 if (U_FAILURE(*status)) { 1655 return USEARCH_DONE; 1656 } 1657 return result; 1658 } 1659 } 1660 if (textce == ce[ceindex]) { 1661 ceindex --; 1662 } 1663 } 1664 // set offset here 1665 if (isSafe) { 1666 int32_t result = getColElemIterOffset(coleiter, FALSE); 1667 // sets the text iterator here with the correct expansion and offset 1668 int32_t leftoverces = getExpansionPrefix(coleiter); 1669 cleanUpSafeText(strsrch, safetext, safebuffer); 1670 if (result >= safelength) { 1671 result = textoffset; 1672 } 1673 else { 1674 result += safeoffset; 1675 } 1676 setColEIterOffset(strsrch->textIter, result); 1677 strsrch->textIter->iteratordata_.toReturn = 1678 setExpansionPrefix(strsrch->textIter, leftoverces); 1679 return result; 1680 } 1681 1682 return ucol_getOffset(coleiter); 1683} 1684 1685/** 1686* Trying out the substring and sees if it can be a canonical match. 1687* This will try normalizing the end accents and arranging them into canonical 1688* equivalents and check their corresponding ces with the pattern ce. 1689* Suffix accents in the text will be grouped according to their combining 1690* class and the groups will be mixed and matched to try find the perfect 1691* match with the pattern. 1692* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 1693* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 1694* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 1695* "\u0301\u0325". 1696* step 2: check if any of the generated substrings matches the pattern. 1697* Internal method, status assumed to be success, caller has to check status 1698* before calling this method. 1699* @param strsrch string search data 1700* @param textoffset end offset in the collation element text that ends with 1701* the accents to be rearranged 1702* @param status error status if any 1703* @return TRUE if the match is valid, FALSE otherwise 1704*/ 1705static 1706UBool doNextCanonicalMatch(UStringSearch *strsrch, 1707 int32_t textoffset, 1708 UErrorCode *status) 1709{ 1710 const UChar *text = strsrch->search->text; 1711 int32_t temp = textoffset; 1712 U16_BACK_1(text, 0, temp); 1713 if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) { 1714 UCollationElements *coleiter = strsrch->textIter; 1715 int32_t offset = getColElemIterOffset(coleiter, FALSE); 1716 if (strsrch->pattern.hasPrefixAccents) { 1717 offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset, 1718 status); 1719 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 1720 setColEIterOffset(coleiter, offset); 1721 return TRUE; 1722 } 1723 } 1724 return FALSE; 1725 } 1726 1727 if (!strsrch->pattern.hasSuffixAccents) { 1728 return FALSE; 1729 } 1730 1731 UChar accents[INITIAL_ARRAY_SIZE_]; 1732 // offset to the last base character in substring to search 1733 int32_t baseoffset = getPreviousBaseOffset(text, textoffset); 1734 // normalizing the offensive string 1735 unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD, 1736 0, accents, INITIAL_ARRAY_SIZE_, status); 1737 // status checked in loop below 1738 1739 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 1740 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 1741 1742 // 2 power n - 1 plus the full set of accents 1743 int32_t count = (2 << (size - 1)) - 1; 1744 while (U_SUCCESS(*status) && count > 0) { 1745 UChar *rearrange = strsrch->canonicalSuffixAccents; 1746 // copy the base characters 1747 for (int k = 0; k < accentsindex[0]; k ++) { 1748 *rearrange ++ = accents[k]; 1749 } 1750 // forming all possible canonical rearrangement by dropping 1751 // sets of accents 1752 for (int i = 0; i <= size - 1; i ++) { 1753 int32_t mask = 1 << (size - i - 1); 1754 if (count & mask) { 1755 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 1756 *rearrange ++ = accents[j]; 1757 } 1758 } 1759 } 1760 *rearrange = 0; 1761 int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset, 1762 status); 1763 if (offset != USEARCH_DONE) { 1764 return TRUE; // match found 1765 } 1766 count --; 1767 } 1768 return FALSE; 1769} 1770 1771/** 1772* Gets the previous base character offset depending on the string search 1773* pattern data 1774* @param strsrch string search data 1775* @param textoffset current offset, current character 1776* @return the offset of the next character after this base character or itself 1777* if it is a composed character with accents 1778*/ 1779static 1780inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch, 1781 int32_t textoffset) 1782{ 1783 if (strsrch->pattern.hasPrefixAccents && textoffset > 0) { 1784 const UChar *text = strsrch->search->text; 1785 int32_t offset = textoffset; 1786 if (getFCD(text, &offset, strsrch->search->textLength) >> 1787 SECOND_LAST_BYTE_SHIFT_) { 1788 return getPreviousBaseOffset(text, textoffset); 1789 } 1790 } 1791 return textoffset; 1792} 1793 1794/** 1795* Checks match for contraction. 1796* If the match ends with a partial contraction we fail. 1797* If the match starts too far off (because of backwards iteration) we try to 1798* chip off the extra characters 1799* Internal method, status assumed to be success, caller has to check status 1800* before calling this method. 1801* @param strsrch string search data 1802* @param start offset of potential match, to be modified if necessary 1803* @param end offset of potential match, to be modified if necessary 1804* @param status output error status if any 1805* @return TRUE if match passes the contraction test, FALSE otherwise 1806*/ 1807static 1808UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch, 1809 int32_t *start, 1810 int32_t *end, 1811 UErrorCode *status) 1812{ 1813 UCollationElements *coleiter = strsrch->textIter; 1814 int32_t textlength = strsrch->search->textLength; 1815 int32_t temp = *start; 1816 const UCollator *collator = strsrch->collator; 1817 const UChar *text = strsrch->search->text; 1818 // This part checks if either ends of the match contains potential 1819 // contraction. If so we'll have to iterate through them 1820 if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) || 1821 (*start + 1 < textlength 1822 && ucol_unsafeCP(text[*start + 1], collator))) { 1823 int32_t expansion = getExpansionPrefix(coleiter); 1824 UBool expandflag = expansion > 0; 1825 setColEIterOffset(coleiter, *start); 1826 while (expansion > 0) { 1827 // getting rid of the redundant ce, caused by setOffset. 1828 // since backward contraction/expansion may have extra ces if we 1829 // are in the normalization buffer, hasAccentsBeforeMatch would 1830 // have taken care of it. 1831 // E.g. the character \u01FA will have an expansion of 3, but if 1832 // we are only looking for acute and ring \u030A and \u0301, we'll 1833 // have to skip the first ce in the expansion buffer. 1834 ucol_next(coleiter, status); 1835 if (U_FAILURE(*status)) { 1836 return FALSE; 1837 } 1838 if (ucol_getOffset(coleiter) != temp) { 1839 *start = temp; 1840 temp = ucol_getOffset(coleiter); 1841 } 1842 expansion --; 1843 } 1844 1845 int32_t *patternce = strsrch->pattern.CE; 1846 int32_t patterncelength = strsrch->pattern.CELength; 1847 int32_t count = 0; 1848 int32_t textlength = strsrch->search->textLength; 1849 while (count < patterncelength) { 1850 int32_t ce = getCE(strsrch, ucol_next(coleiter, status)); 1851 // status checked below, note that if status is a failure 1852 // ucol_next returns UCOL_NULLORDER 1853 if (ce == UCOL_IGNORABLE) { 1854 continue; 1855 } 1856 if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) { 1857 *start = temp; 1858 temp = ucol_getOffset(coleiter); 1859 } 1860 1861 if (count == 0 && ce != patternce[0]) { 1862 // accents may have extra starting ces, this occurs when a 1863 // pure accent pattern is matched without rearrangement 1864 // text \u0325\u0300 and looking for \u0300 1865 int32_t expected = patternce[0]; 1866 if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) { 1867 ce = getCE(strsrch, ucol_next(coleiter, status)); 1868 while (U_SUCCESS(*status) && ce != expected && 1869 ce != UCOL_NULLORDER && 1870 ucol_getOffset(coleiter) <= *end) { 1871 ce = getCE(strsrch, ucol_next(coleiter, status)); 1872 } 1873 } 1874 } 1875 if (U_FAILURE(*status) || ce != patternce[count]) { 1876 (*end) ++; 1877 *end = getNextUStringSearchBaseOffset(strsrch, *end); 1878 return FALSE; 1879 } 1880 count ++; 1881 } 1882 } 1883 return TRUE; 1884} 1885 1886/** 1887* Checks and sets the match information if found. 1888* Checks 1889* <ul> 1890* <li> the potential match does not repeat the previous match 1891* <li> boundaries are correct 1892* <li> potential match does not end in the middle of a contraction 1893* <li> identical matches 1894* <\ul> 1895* Otherwise the offset will be shifted to the next character. 1896* Internal method, status assumed to be success, caller has to check the 1897* status before calling this method. 1898* @param strsrch string search data 1899* @param textoffset offset in the collation element text. the returned value 1900* will be the truncated end offset of the match or the new start 1901* search offset. 1902* @param status output error status if any 1903* @return TRUE if the match is valid, FALSE otherwise 1904*/ 1905static 1906inline UBool checkNextCanonicalMatch(UStringSearch *strsrch, 1907 int32_t *textoffset, 1908 UErrorCode *status) 1909{ 1910 // to ensure that the start and ends are not composite characters 1911 UCollationElements *coleiter = strsrch->textIter; 1912 // if we have a canonical accent match 1913 if ((strsrch->pattern.hasSuffixAccents && 1914 strsrch->canonicalSuffixAccents[0]) || 1915 (strsrch->pattern.hasPrefixAccents && 1916 strsrch->canonicalPrefixAccents[0])) { 1917 strsrch->search->matchedIndex = getPreviousUStringSearchBaseOffset( 1918 strsrch, 1919 ucol_getOffset(coleiter)); 1920 strsrch->search->matchedLength = *textoffset - 1921 strsrch->search->matchedIndex; 1922 return TRUE; 1923 } 1924 1925 int32_t start = getColElemIterOffset(coleiter, FALSE); 1926 if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset, 1927 status) || U_FAILURE(*status)) { 1928 return FALSE; 1929 } 1930 1931 start = getPreviousUStringSearchBaseOffset(strsrch, start); 1932 // this totally matches, however we need to check if it is repeating 1933 if (checkRepeatedMatch(strsrch, start, *textoffset) || 1934 !isBreakUnit(strsrch, start, *textoffset) || 1935 !checkIdentical(strsrch, start, *textoffset)) { 1936 (*textoffset) ++; 1937 *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset, 1938 strsrch->search->textLength); 1939 return FALSE; 1940 } 1941 1942 strsrch->search->matchedIndex = start; 1943 strsrch->search->matchedLength = *textoffset - start; 1944 return TRUE; 1945} 1946 1947/** 1948* Shifting the collation element iterator position forward to prepare for 1949* a preceding match. If the first character is a unsafe character, we'll only 1950* shift by 1 to capture contractions, normalization etc. 1951* Internal method, status assumed to be success, caller has to check status 1952* before calling this method. 1953* @param text strsrch string search data 1954* @param textoffset start text position to do search 1955* @param ce the text ce which failed the match. 1956* @param patternceindex index of the ce within the pattern ce buffer which 1957* failed the match 1958* @return final offset 1959*/ 1960static 1961inline int32_t reverseShift(UStringSearch *strsrch, 1962 int32_t textoffset, 1963 int32_t ce, 1964 int32_t patternceindex) 1965{ 1966 if (strsrch->search->isOverlap) { 1967 if (textoffset != strsrch->search->textLength) { 1968 textoffset --; 1969 } 1970 else { 1971 textoffset -= strsrch->pattern.defaultShiftSize; 1972 } 1973 } 1974 else { 1975 if (ce != UCOL_NULLORDER) { 1976 int32_t shift = strsrch->pattern.backShift[hash(ce)]; 1977 1978 // this is to adjust for characters in the middle of the substring 1979 // for matching that failed. 1980 int32_t adjust = patternceindex; 1981 if (adjust > 1 && shift > adjust) { 1982 shift -= adjust - 1; 1983 } 1984 textoffset -= shift; 1985 } 1986 else { 1987 textoffset -= strsrch->pattern.defaultShiftSize; 1988 } 1989 } 1990 textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset); 1991 return textoffset; 1992} 1993 1994/** 1995* Checks match for contraction. 1996* If the match starts with a partial contraction we fail. 1997* Internal method, status assumed to be success, caller has to check status 1998* before calling this method. 1999* @param strsrch string search data 2000* @param start offset of potential match, to be modified if necessary 2001* @param end offset of potential match, to be modified if necessary 2002* @param status output error status if any 2003* @return TRUE if match passes the contraction test, FALSE otherwise 2004*/ 2005static 2006UBool checkPreviousExactContractionMatch(UStringSearch *strsrch, 2007 int32_t *start, 2008 int32_t *end, UErrorCode *status) 2009{ 2010 UCollationElements *coleiter = strsrch->textIter; 2011 int32_t textlength = strsrch->search->textLength; 2012 int32_t temp = *end; 2013 const UCollator *collator = strsrch->collator; 2014 const UChar *text = strsrch->search->text; 2015 // This part checks if either if the start of the match contains potential 2016 // contraction. If so we'll have to iterate through them 2017 // Since we used ucol_next while previously looking for the potential 2018 // match, this guarantees that our end will not be a partial contraction, 2019 // or a partial supplementary character. 2020 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2021 int32_t expansion = getExpansionSuffix(coleiter); 2022 UBool expandflag = expansion > 0; 2023 setColEIterOffset(coleiter, *end); 2024 while (U_SUCCESS(*status) && expansion > 0) { 2025 // getting rid of the redundant ce 2026 // since forward contraction/expansion may have extra ces 2027 // if we are in the normalization buffer, hasAccentsBeforeMatch 2028 // would have taken care of it. 2029 // E.g. the character \u01FA will have an expansion of 3, but if 2030 // we are only looking for A ring A\u030A, we'll have to skip the 2031 // last ce in the expansion buffer 2032 ucol_previous(coleiter, status); 2033 if (U_FAILURE(*status)) { 2034 return FALSE; 2035 } 2036 if (ucol_getOffset(coleiter) != temp) { 2037 *end = temp; 2038 temp = ucol_getOffset(coleiter); 2039 } 2040 expansion --; 2041 } 2042 2043 int32_t *patternce = strsrch->pattern.CE; 2044 int32_t patterncelength = strsrch->pattern.CELength; 2045 int32_t count = patterncelength; 2046 while (count > 0) { 2047 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2048 // status checked below, note that if status is a failure 2049 // ucol_previous returns UCOL_NULLORDER 2050 if (ce == UCOL_IGNORABLE) { 2051 continue; 2052 } 2053 if (expandflag && count == 0 && 2054 getColElemIterOffset(coleiter, FALSE) != temp) { 2055 *end = temp; 2056 temp = ucol_getOffset(coleiter); 2057 } 2058 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2059 (*start) --; 2060 *start = getPreviousBaseOffset(text, *start); 2061 return FALSE; 2062 } 2063 count --; 2064 } 2065 } 2066 return TRUE; 2067} 2068 2069/** 2070* Checks and sets the match information if found. 2071* Checks 2072* <ul> 2073* <li> the current match does not repeat the last match 2074* <li> boundaries are correct 2075* <li> exact matches has no extra accents 2076* <li> identical matches 2077* <\ul> 2078* Otherwise the offset will be shifted to the preceding character. 2079* Internal method, status assumed to be success, caller has to check status 2080* before calling this method. 2081* @param strsrch string search data 2082* @param collator 2083* @param coleiter collation element iterator 2084* @param text string 2085* @param textoffset offset in the collation element text. the returned value 2086* will be the truncated start offset of the match or the new start 2087* search offset. 2088* @param status output error status if any 2089* @return TRUE if the match is valid, FALSE otherwise 2090*/ 2091static 2092inline UBool checkPreviousExactMatch(UStringSearch *strsrch, 2093 int32_t *textoffset, 2094 UErrorCode *status) 2095{ 2096 // to ensure that the start and ends are not composite characters 2097 int32_t end = ucol_getOffset(strsrch->textIter); 2098 if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status) 2099 || U_FAILURE(*status)) { 2100 return FALSE; 2101 } 2102 2103 // this totally matches, however we need to check if it is repeating 2104 // the old match 2105 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2106 !isBreakUnit(strsrch, *textoffset, end) || 2107 hasAccentsBeforeMatch(strsrch, *textoffset, end) || 2108 !checkIdentical(strsrch, *textoffset, end) || 2109 hasAccentsAfterMatch(strsrch, *textoffset, end)) { 2110 (*textoffset) --; 2111 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2112 *textoffset); 2113 return FALSE; 2114 } 2115 2116 //Add breakiterator boundary check for primary strength search. 2117 if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) { 2118 checkBreakBoundary(strsrch, textoffset, &end); 2119 } 2120 2121 strsrch->search->matchedIndex = *textoffset; 2122 strsrch->search->matchedLength = end - *textoffset; 2123 return TRUE; 2124} 2125 2126/** 2127* Rearranges the end accents to try matching. 2128* Suffix accents in the text will be grouped according to their combining 2129* class and the groups will be mixed and matched to try find the perfect 2130* match with the pattern. 2131* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2132* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2133* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2134* "\u0301\u0325". 2135* step 2: check if any of the generated substrings matches the pattern. 2136* Internal method, status assumed to be success, user has to check status 2137* before calling this method. 2138* @param strsrch string search match 2139* @param start offset of the first base character 2140* @param end start of the last accent set 2141* @param status only error status if any 2142* @return USEARCH_DONE if a match is not found, otherwise return the ending 2143* offset of the match. Note this start includes all following accents. 2144*/ 2145static 2146int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch, 2147 int32_t start, 2148 int32_t end, 2149 UErrorCode *status) 2150{ 2151 const UChar *text = strsrch->search->text; 2152 int32_t tempend = end; 2153 2154 U16_BACK_1(text, 0, tempend); 2155 if (!(getFCD(text, &tempend, strsrch->search->textLength) & 2156 LAST_BYTE_MASK_)) { 2157 // die... failed at a base character 2158 return USEARCH_DONE; 2159 } 2160 end = getNextBaseOffset(text, end, strsrch->search->textLength); 2161 2162 if (U_SUCCESS(*status)) { 2163 UChar accents[INITIAL_ARRAY_SIZE_]; 2164 int32_t offset = getPreviousBaseOffset(text, end); 2165 // normalizing the offensive string 2166 unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents, 2167 INITIAL_ARRAY_SIZE_, status); 2168 2169 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2170 int32_t accentsize = getUnblockedAccentIndex(accents, 2171 accentsindex); 2172 int32_t count = (2 << (accentsize - 1)) - 1; 2173 UChar buffer[INITIAL_ARRAY_SIZE_]; 2174 UCollationElements *coleiter = strsrch->utilIter; 2175 while (U_SUCCESS(*status) && count > 0) { 2176 UChar *rearrange = strsrch->canonicalSuffixAccents; 2177 // copy the base characters 2178 for (int k = 0; k < accentsindex[0]; k ++) { 2179 *rearrange ++ = accents[k]; 2180 } 2181 // forming all possible canonical rearrangement by dropping 2182 // sets of accents 2183 for (int i = 0; i <= accentsize - 1; i ++) { 2184 int32_t mask = 1 << (accentsize - i - 1); 2185 if (count & mask) { 2186 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2187 *rearrange ++ = accents[j]; 2188 } 2189 } 2190 } 2191 *rearrange = 0; 2192 int32_t matchsize = INITIAL_ARRAY_SIZE_; 2193 UChar *match = addToUCharArray(buffer, &matchsize, 2194 strsrch->canonicalPrefixAccents, 2195 strsrch->search->text + start, 2196 offset - start, 2197 strsrch->canonicalSuffixAccents, 2198 status); 2199 2200 // run the collator iterator through this match 2201 // if status is a failure ucol_setText does nothing 2202 ucol_setText(coleiter, match, matchsize, status); 2203 if (U_SUCCESS(*status)) { 2204 if (checkCollationMatch(strsrch, coleiter)) { 2205 if (match != buffer) { 2206 uprv_free(match); 2207 } 2208 return end; 2209 } 2210 } 2211 count --; 2212 } 2213 } 2214 return USEARCH_DONE; 2215} 2216 2217/** 2218* Take the rearranged start accents and tries matching. If match failed at 2219* a seperate following set of accents (seperated from the rearranged on by 2220* at least a base character) then we rearrange the preceding accents and 2221* tries matching again. 2222* We allow skipping of the ends of the accent set if the ces do not match. 2223* However if the failure is found before the accent set, it fails. 2224* Internal method, status assumed to be success, caller has to check status 2225* before calling this method. 2226* @param strsrch string search data 2227* @param textoffset of the ends of the rearranged accent 2228* @param status output error status if any 2229* @return USEARCH_DONE if a match is not found, otherwise return the ending 2230* offset of the match. Note this start includes all following accents. 2231*/ 2232static 2233int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch, 2234 int32_t textoffset, 2235 UErrorCode *status) 2236{ 2237 const UChar *text = strsrch->search->text; 2238 const UCollator *collator = strsrch->collator; 2239 int32_t safelength = 0; 2240 UChar *safetext; 2241 int32_t safetextlength; 2242 UChar safebuffer[INITIAL_ARRAY_SIZE_]; 2243 int32_t safeoffset = textoffset; 2244 2245 if (textoffset && 2246 ucol_unsafeCP(strsrch->canonicalPrefixAccents[ 2247 u_strlen(strsrch->canonicalPrefixAccents) - 1 2248 ], collator)) { 2249 safeoffset = getNextSafeOffset(collator, text, textoffset, 2250 strsrch->search->textLength); 2251 safelength = safeoffset - textoffset; 2252 safetextlength = INITIAL_ARRAY_SIZE_; 2253 safetext = addToUCharArray(safebuffer, &safetextlength, 2254 strsrch->canonicalPrefixAccents, 2255 text + textoffset, safelength, 2256 NULL, status); 2257 } 2258 else { 2259 safetextlength = u_strlen(strsrch->canonicalPrefixAccents); 2260 safetext = strsrch->canonicalPrefixAccents; 2261 } 2262 2263 UCollationElements *coleiter = strsrch->utilIter; 2264 // if status is a failure, ucol_setText does nothing 2265 ucol_setText(coleiter, safetext, safetextlength, status); 2266 // status checked in loop below 2267 2268 int32_t *ce = strsrch->pattern.CE; 2269 int32_t celength = strsrch->pattern.CELength; 2270 int ceindex = 0; 2271 UBool isSafe = TRUE; // safe zone indication flag for position 2272 int32_t prefixlength = u_strlen(strsrch->canonicalPrefixAccents); 2273 2274 while (ceindex < celength) { 2275 int32_t textce = ucol_next(coleiter, status); 2276 if (U_FAILURE(*status)) { 2277 if (isSafe) { 2278 cleanUpSafeText(strsrch, safetext, safebuffer); 2279 } 2280 return USEARCH_DONE; 2281 } 2282 if (textce == UCOL_NULLORDER) { 2283 // check if we have passed the safe buffer 2284 if (coleiter == strsrch->textIter) { 2285 cleanUpSafeText(strsrch, safetext, safebuffer); 2286 return USEARCH_DONE; 2287 } 2288 cleanUpSafeText(strsrch, safetext, safebuffer); 2289 safetext = safebuffer; 2290 coleiter = strsrch->textIter; 2291 setColEIterOffset(coleiter, safeoffset); 2292 // status checked at the start of the loop 2293 isSafe = FALSE; 2294 continue; 2295 } 2296 textce = getCE(strsrch, textce); 2297 if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) { 2298 // do the beginning stuff 2299 int32_t failedoffset = ucol_getOffset(coleiter); 2300 if (isSafe && failedoffset <= prefixlength) { 2301 // alas... no hope. failed at rearranged accent set 2302 cleanUpSafeText(strsrch, safetext, safebuffer); 2303 return USEARCH_DONE; 2304 } 2305 else { 2306 if (isSafe) { 2307 failedoffset = safeoffset - failedoffset; 2308 cleanUpSafeText(strsrch, safetext, safebuffer); 2309 } 2310 2311 // try rearranging the end accents 2312 int32_t result = doPreviousCanonicalSuffixMatch(strsrch, 2313 textoffset, failedoffset, status); 2314 if (result != USEARCH_DONE) { 2315 // if status is a failure, ucol_setOffset does nothing 2316 setColEIterOffset(strsrch->textIter, result); 2317 } 2318 if (U_FAILURE(*status)) { 2319 return USEARCH_DONE; 2320 } 2321 return result; 2322 } 2323 } 2324 if (textce == ce[ceindex]) { 2325 ceindex ++; 2326 } 2327 } 2328 // set offset here 2329 if (isSafe) { 2330 int32_t result = ucol_getOffset(coleiter); 2331 // sets the text iterator here with the correct expansion and offset 2332 int32_t leftoverces = getExpansionSuffix(coleiter); 2333 cleanUpSafeText(strsrch, safetext, safebuffer); 2334 if (result <= prefixlength) { 2335 result = textoffset; 2336 } 2337 else { 2338 result = textoffset + (safeoffset - result); 2339 } 2340 setColEIterOffset(strsrch->textIter, result); 2341 setExpansionSuffix(strsrch->textIter, leftoverces); 2342 return result; 2343 } 2344 2345 return ucol_getOffset(coleiter); 2346} 2347 2348/** 2349* Trying out the substring and sees if it can be a canonical match. 2350* This will try normalizing the starting accents and arranging them into 2351* canonical equivalents and check their corresponding ces with the pattern ce. 2352* Prefix accents in the text will be grouped according to their combining 2353* class and the groups will be mixed and matched to try find the perfect 2354* match with the pattern. 2355* So for instance looking for "\u0301" in "\u030A\u0301\u0325" 2356* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings 2357* "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325", 2358* "\u0301\u0325". 2359* step 2: check if any of the generated substrings matches the pattern. 2360* Internal method, status assumed to be success, caller has to check status 2361* before calling this method. 2362* @param strsrch string search data 2363* @param textoffset start offset in the collation element text that starts 2364* with the accents to be rearranged 2365* @param status output error status if any 2366* @return TRUE if the match is valid, FALSE otherwise 2367*/ 2368static 2369UBool doPreviousCanonicalMatch(UStringSearch *strsrch, 2370 int32_t textoffset, 2371 UErrorCode *status) 2372{ 2373 const UChar *text = strsrch->search->text; 2374 int32_t temp = textoffset; 2375 int32_t textlength = strsrch->search->textLength; 2376 if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) { 2377 UCollationElements *coleiter = strsrch->textIter; 2378 int32_t offset = ucol_getOffset(coleiter); 2379 if (strsrch->pattern.hasSuffixAccents) { 2380 offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset, 2381 offset, status); 2382 if (U_SUCCESS(*status) && offset != USEARCH_DONE) { 2383 setColEIterOffset(coleiter, offset); 2384 return TRUE; 2385 } 2386 } 2387 return FALSE; 2388 } 2389 2390 if (!strsrch->pattern.hasPrefixAccents) { 2391 return FALSE; 2392 } 2393 2394 UChar accents[INITIAL_ARRAY_SIZE_]; 2395 // offset to the last base character in substring to search 2396 int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength); 2397 // normalizing the offensive string 2398 unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD, 2399 0, accents, INITIAL_ARRAY_SIZE_, status); 2400 // status checked in loop 2401 2402 int32_t accentsindex[INITIAL_ARRAY_SIZE_]; 2403 int32_t size = getUnblockedAccentIndex(accents, accentsindex); 2404 2405 // 2 power n - 1 plus the full set of accents 2406 int32_t count = (2 << (size - 1)) - 1; 2407 while (U_SUCCESS(*status) && count > 0) { 2408 UChar *rearrange = strsrch->canonicalPrefixAccents; 2409 // copy the base characters 2410 for (int k = 0; k < accentsindex[0]; k ++) { 2411 *rearrange ++ = accents[k]; 2412 } 2413 // forming all possible canonical rearrangement by dropping 2414 // sets of accents 2415 for (int i = 0; i <= size - 1; i ++) { 2416 int32_t mask = 1 << (size - i - 1); 2417 if (count & mask) { 2418 for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) { 2419 *rearrange ++ = accents[j]; 2420 } 2421 } 2422 } 2423 *rearrange = 0; 2424 int32_t offset = doPreviousCanonicalPrefixMatch(strsrch, 2425 baseoffset, status); 2426 if (offset != USEARCH_DONE) { 2427 return TRUE; // match found 2428 } 2429 count --; 2430 } 2431 return FALSE; 2432} 2433 2434/** 2435* Checks match for contraction. 2436* If the match starts with a partial contraction we fail. 2437* Internal method, status assumed to be success, caller has to check status 2438* before calling this method. 2439* @param strsrch string search data 2440* @param start offset of potential match, to be modified if necessary 2441* @param end offset of potential match, to be modified if necessary 2442* @param status only error status if any 2443* @return TRUE if match passes the contraction test, FALSE otherwise 2444*/ 2445static 2446UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch, 2447 int32_t *start, 2448 int32_t *end, UErrorCode *status) 2449{ 2450 UCollationElements *coleiter = strsrch->textIter; 2451 int32_t textlength = strsrch->search->textLength; 2452 int32_t temp = *end; 2453 const UCollator *collator = strsrch->collator; 2454 const UChar *text = strsrch->search->text; 2455 // This part checks if either if the start of the match contains potential 2456 // contraction. If so we'll have to iterate through them 2457 // Since we used ucol_next while previously looking for the potential 2458 // match, this guarantees that our end will not be a partial contraction, 2459 // or a partial supplementary character. 2460 if (*start < textlength && ucol_unsafeCP(text[*start], collator)) { 2461 int32_t expansion = getExpansionSuffix(coleiter); 2462 UBool expandflag = expansion > 0; 2463 setColEIterOffset(coleiter, *end); 2464 while (expansion > 0) { 2465 // getting rid of the redundant ce 2466 // since forward contraction/expansion may have extra ces 2467 // if we are in the normalization buffer, hasAccentsBeforeMatch 2468 // would have taken care of it. 2469 // E.g. the character \u01FA will have an expansion of 3, but if 2470 // we are only looking for A ring A\u030A, we'll have to skip the 2471 // last ce in the expansion buffer 2472 ucol_previous(coleiter, status); 2473 if (U_FAILURE(*status)) { 2474 return FALSE; 2475 } 2476 if (ucol_getOffset(coleiter) != temp) { 2477 *end = temp; 2478 temp = ucol_getOffset(coleiter); 2479 } 2480 expansion --; 2481 } 2482 2483 int32_t *patternce = strsrch->pattern.CE; 2484 int32_t patterncelength = strsrch->pattern.CELength; 2485 int32_t count = patterncelength; 2486 while (count > 0) { 2487 int32_t ce = getCE(strsrch, ucol_previous(coleiter, status)); 2488 // status checked below, note that if status is a failure 2489 // ucol_previous returns UCOL_NULLORDER 2490 if (ce == UCOL_IGNORABLE) { 2491 continue; 2492 } 2493 if (expandflag && count == 0 && 2494 getColElemIterOffset(coleiter, FALSE) != temp) { 2495 *end = temp; 2496 temp = ucol_getOffset(coleiter); 2497 } 2498 if (count == patterncelength && 2499 ce != patternce[patterncelength - 1]) { 2500 // accents may have extra starting ces, this occurs when a 2501 // pure accent pattern is matched without rearrangement 2502 int32_t expected = patternce[patterncelength - 1]; 2503 U16_BACK_1(text, 0, *end); 2504 if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) { 2505 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2506 while (U_SUCCESS(*status) && ce != expected && 2507 ce != UCOL_NULLORDER && 2508 ucol_getOffset(coleiter) <= *start) { 2509 ce = getCE(strsrch, ucol_previous(coleiter, status)); 2510 } 2511 } 2512 } 2513 if (U_FAILURE(*status) || ce != patternce[count - 1]) { 2514 (*start) --; 2515 *start = getPreviousBaseOffset(text, *start); 2516 return FALSE; 2517 } 2518 count --; 2519 } 2520 } 2521 return TRUE; 2522} 2523 2524/** 2525* Checks and sets the match information if found. 2526* Checks 2527* <ul> 2528* <li> the potential match does not repeat the previous match 2529* <li> boundaries are correct 2530* <li> potential match does not end in the middle of a contraction 2531* <li> identical matches 2532* <\ul> 2533* Otherwise the offset will be shifted to the next character. 2534* Internal method, status assumed to be success, caller has to check status 2535* before calling this method. 2536* @param strsrch string search data 2537* @param textoffset offset in the collation element text. the returned value 2538* will be the truncated start offset of the match or the new start 2539* search offset. 2540* @param status only error status if any 2541* @return TRUE if the match is valid, FALSE otherwise 2542*/ 2543static 2544inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch, 2545 int32_t *textoffset, 2546 UErrorCode *status) 2547{ 2548 // to ensure that the start and ends are not composite characters 2549 UCollationElements *coleiter = strsrch->textIter; 2550 // if we have a canonical accent match 2551 if ((strsrch->pattern.hasSuffixAccents && 2552 strsrch->canonicalSuffixAccents[0]) || 2553 (strsrch->pattern.hasPrefixAccents && 2554 strsrch->canonicalPrefixAccents[0])) { 2555 strsrch->search->matchedIndex = *textoffset; 2556 strsrch->search->matchedLength = 2557 getNextUStringSearchBaseOffset(strsrch, 2558 getColElemIterOffset(coleiter, FALSE)) 2559 - *textoffset; 2560 return TRUE; 2561 } 2562 2563 int32_t end = ucol_getOffset(coleiter); 2564 if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end, 2565 status) || 2566 U_FAILURE(*status)) { 2567 return FALSE; 2568 } 2569 2570 end = getNextUStringSearchBaseOffset(strsrch, end); 2571 // this totally matches, however we need to check if it is repeating 2572 if (checkRepeatedMatch(strsrch, *textoffset, end) || 2573 !isBreakUnit(strsrch, *textoffset, end) || 2574 !checkIdentical(strsrch, *textoffset, end)) { 2575 (*textoffset) --; 2576 *textoffset = getPreviousBaseOffset(strsrch->search->text, 2577 *textoffset); 2578 return FALSE; 2579 } 2580 2581 strsrch->search->matchedIndex = *textoffset; 2582 strsrch->search->matchedLength = end - *textoffset; 2583 return TRUE; 2584} 2585#endif // #if BOYER_MOORE 2586 2587// constructors and destructor ------------------------------------------- 2588 2589U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern, 2590 int32_t patternlength, 2591 const UChar *text, 2592 int32_t textlength, 2593 const char *locale, 2594 UBreakIterator *breakiter, 2595 UErrorCode *status) 2596{ 2597 if (U_FAILURE(*status)) { 2598 return NULL; 2599 } 2600#if UCONFIG_NO_BREAK_ITERATION 2601 if (breakiter != NULL) { 2602 *status = U_UNSUPPORTED_ERROR; 2603 return NULL; 2604 } 2605#endif 2606 if (locale) { 2607 // ucol_open internally checks for status 2608 UCollator *collator = ucol_open(locale, status); 2609 // pattern, text checks are done in usearch_openFromCollator 2610 UStringSearch *result = usearch_openFromCollator(pattern, 2611 patternlength, text, textlength, 2612 collator, breakiter, status); 2613 2614 if (result == NULL || U_FAILURE(*status)) { 2615 if (collator) { 2616 ucol_close(collator); 2617 } 2618 return NULL; 2619 } 2620 else { 2621 result->ownCollator = TRUE; 2622 } 2623 return result; 2624 } 2625 *status = U_ILLEGAL_ARGUMENT_ERROR; 2626 return NULL; 2627} 2628 2629U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator( 2630 const UChar *pattern, 2631 int32_t patternlength, 2632 const UChar *text, 2633 int32_t textlength, 2634 const UCollator *collator, 2635 UBreakIterator *breakiter, 2636 UErrorCode *status) 2637{ 2638 if (U_FAILURE(*status)) { 2639 return NULL; 2640 } 2641#if UCONFIG_NO_BREAK_ITERATION 2642 if (breakiter != NULL) { 2643 *status = U_UNSUPPORTED_ERROR; 2644 return NULL; 2645 } 2646#endif 2647 if (pattern == NULL || text == NULL || collator == NULL) { 2648 *status = U_ILLEGAL_ARGUMENT_ERROR; 2649 return NULL; 2650 } 2651 2652 // string search does not really work when numeric collation is turned on 2653 if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) { 2654 *status = U_UNSUPPORTED_ERROR; 2655 return NULL; 2656 } 2657 2658 if (U_SUCCESS(*status)) { 2659 initializeFCD(status); 2660 if (U_FAILURE(*status)) { 2661 return NULL; 2662 } 2663 2664 UStringSearch *result; 2665 if (textlength == -1) { 2666 textlength = u_strlen(text); 2667 } 2668 if (patternlength == -1) { 2669 patternlength = u_strlen(pattern); 2670 } 2671 if (textlength <= 0 || patternlength <= 0) { 2672 *status = U_ILLEGAL_ARGUMENT_ERROR; 2673 return NULL; 2674 } 2675 2676 result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch)); 2677 if (result == NULL) { 2678 *status = U_MEMORY_ALLOCATION_ERROR; 2679 return NULL; 2680 } 2681 2682 result->collator = collator; 2683 result->strength = ucol_getStrength(collator); 2684 result->ceMask = getMask(result->strength); 2685 result->toShift = 2686 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 2687 UCOL_SHIFTED; 2688 result->variableTop = ucol_getVariableTop(collator, status); 2689 2690 result->nfd = Normalizer2Factory::getNFDInstance(*status); 2691 2692 if (U_FAILURE(*status)) { 2693 uprv_free(result); 2694 return NULL; 2695 } 2696 2697 result->search = (USearch *)uprv_malloc(sizeof(USearch)); 2698 if (result->search == NULL) { 2699 *status = U_MEMORY_ALLOCATION_ERROR; 2700 uprv_free(result); 2701 return NULL; 2702 } 2703 2704 result->search->text = text; 2705 result->search->textLength = textlength; 2706 2707 result->pattern.text = pattern; 2708 result->pattern.textLength = patternlength; 2709 result->pattern.CE = NULL; 2710 result->pattern.PCE = NULL; 2711 2712 result->search->breakIter = breakiter; 2713#if !UCONFIG_NO_BREAK_ITERATION 2714 result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status); 2715 if (breakiter) { 2716 ubrk_setText(breakiter, text, textlength, status); 2717 } 2718#endif 2719 2720 result->ownCollator = FALSE; 2721 result->search->matchedLength = 0; 2722 result->search->matchedIndex = USEARCH_DONE; 2723 result->utilIter = NULL; 2724 result->textIter = ucol_openElements(collator, text, 2725 textlength, status); 2726 result->textProcessedIter = NULL; 2727 if (U_FAILURE(*status)) { 2728 usearch_close(result); 2729 return NULL; 2730 } 2731 2732 result->search->isOverlap = FALSE; 2733 result->search->isCanonicalMatch = FALSE; 2734 result->search->elementComparisonType = 0; 2735 result->search->isForwardSearching = TRUE; 2736 result->search->reset = TRUE; 2737 2738 initialize(result, status); 2739 2740 if (U_FAILURE(*status)) { 2741 usearch_close(result); 2742 return NULL; 2743 } 2744 2745 return result; 2746 } 2747 return NULL; 2748} 2749 2750U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch) 2751{ 2752 if (strsrch) { 2753 if (strsrch->pattern.CE != strsrch->pattern.CEBuffer && 2754 strsrch->pattern.CE) { 2755 uprv_free(strsrch->pattern.CE); 2756 } 2757 2758 if (strsrch->pattern.PCE != NULL && 2759 strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) { 2760 uprv_free(strsrch->pattern.PCE); 2761 } 2762 2763 delete strsrch->textProcessedIter; 2764 ucol_closeElements(strsrch->textIter); 2765 ucol_closeElements(strsrch->utilIter); 2766 2767 if (strsrch->ownCollator && strsrch->collator) { 2768 ucol_close((UCollator *)strsrch->collator); 2769 } 2770 2771#if !UCONFIG_NO_BREAK_ITERATION 2772 if (strsrch->search->internalBreakIter) { 2773 ubrk_close(strsrch->search->internalBreakIter); 2774 } 2775#endif 2776 2777 uprv_free(strsrch->search); 2778 uprv_free(strsrch); 2779 } 2780} 2781 2782namespace { 2783 2784UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) { 2785 if (U_FAILURE(*status)) { return FALSE; } 2786 if (strsrch->textProcessedIter == NULL) { 2787 strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter); 2788 if (strsrch->textProcessedIter == NULL) { 2789 *status = U_MEMORY_ALLOCATION_ERROR; 2790 return FALSE; 2791 } 2792 } else { 2793 strsrch->textProcessedIter->init(strsrch->textIter); 2794 } 2795 return TRUE; 2796} 2797 2798} 2799 2800// set and get methods -------------------------------------------------- 2801 2802U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch, 2803 int32_t position, 2804 UErrorCode *status) 2805{ 2806 if (U_SUCCESS(*status) && strsrch) { 2807 if (isOutOfBounds(strsrch->search->textLength, position)) { 2808 *status = U_INDEX_OUTOFBOUNDS_ERROR; 2809 } 2810 else { 2811 setColEIterOffset(strsrch->textIter, position); 2812 } 2813 strsrch->search->matchedIndex = USEARCH_DONE; 2814 strsrch->search->matchedLength = 0; 2815 strsrch->search->reset = FALSE; 2816 } 2817} 2818 2819U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch) 2820{ 2821 if (strsrch) { 2822 int32_t result = ucol_getOffset(strsrch->textIter); 2823 if (isOutOfBounds(strsrch->search->textLength, result)) { 2824 return USEARCH_DONE; 2825 } 2826 return result; 2827 } 2828 return USEARCH_DONE; 2829} 2830 2831U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch, 2832 USearchAttribute attribute, 2833 USearchAttributeValue value, 2834 UErrorCode *status) 2835{ 2836 if (U_SUCCESS(*status) && strsrch) { 2837 switch (attribute) 2838 { 2839 case USEARCH_OVERLAP : 2840 strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE); 2841 break; 2842 case USEARCH_CANONICAL_MATCH : 2843 strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE : 2844 FALSE); 2845 break; 2846 case USEARCH_ELEMENT_COMPARISON : 2847 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2848 strsrch->search->elementComparisonType = (int16_t)value; 2849 } else { 2850 strsrch->search->elementComparisonType = 0; 2851 } 2852 break; 2853 case USEARCH_ATTRIBUTE_COUNT : 2854 default: 2855 *status = U_ILLEGAL_ARGUMENT_ERROR; 2856 } 2857 } 2858 if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) { 2859 *status = U_ILLEGAL_ARGUMENT_ERROR; 2860 } 2861} 2862 2863U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute( 2864 const UStringSearch *strsrch, 2865 USearchAttribute attribute) 2866{ 2867 if (strsrch) { 2868 switch (attribute) { 2869 case USEARCH_OVERLAP : 2870 return (strsrch->search->isOverlap == TRUE ? USEARCH_ON : 2871 USEARCH_OFF); 2872 case USEARCH_CANONICAL_MATCH : 2873 return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON : 2874 USEARCH_OFF); 2875 case USEARCH_ELEMENT_COMPARISON : 2876 { 2877 int16_t value = strsrch->search->elementComparisonType; 2878 if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) { 2879 return (USearchAttributeValue)value; 2880 } else { 2881 return USEARCH_STANDARD_ELEMENT_COMPARISON; 2882 } 2883 } 2884 case USEARCH_ATTRIBUTE_COUNT : 2885 return USEARCH_DEFAULT; 2886 } 2887 } 2888 return USEARCH_DEFAULT; 2889} 2890 2891U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart( 2892 const UStringSearch *strsrch) 2893{ 2894 if (strsrch == NULL) { 2895 return USEARCH_DONE; 2896 } 2897 return strsrch->search->matchedIndex; 2898} 2899 2900 2901U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch, 2902 UChar *result, 2903 int32_t resultCapacity, 2904 UErrorCode *status) 2905{ 2906 if (U_FAILURE(*status)) { 2907 return USEARCH_DONE; 2908 } 2909 if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 && 2910 result == NULL)) { 2911 *status = U_ILLEGAL_ARGUMENT_ERROR; 2912 return USEARCH_DONE; 2913 } 2914 2915 int32_t copylength = strsrch->search->matchedLength; 2916 int32_t copyindex = strsrch->search->matchedIndex; 2917 if (copyindex == USEARCH_DONE) { 2918 u_terminateUChars(result, resultCapacity, 0, status); 2919 return USEARCH_DONE; 2920 } 2921 2922 if (resultCapacity < copylength) { 2923 copylength = resultCapacity; 2924 } 2925 if (copylength > 0) { 2926 uprv_memcpy(result, strsrch->search->text + copyindex, 2927 copylength * sizeof(UChar)); 2928 } 2929 return u_terminateUChars(result, resultCapacity, 2930 strsrch->search->matchedLength, status); 2931} 2932 2933U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength( 2934 const UStringSearch *strsrch) 2935{ 2936 if (strsrch) { 2937 return strsrch->search->matchedLength; 2938 } 2939 return USEARCH_DONE; 2940} 2941 2942#if !UCONFIG_NO_BREAK_ITERATION 2943 2944U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch *strsrch, 2945 UBreakIterator *breakiter, 2946 UErrorCode *status) 2947{ 2948 if (U_SUCCESS(*status) && strsrch) { 2949 strsrch->search->breakIter = breakiter; 2950 if (breakiter) { 2951 ubrk_setText(breakiter, strsrch->search->text, 2952 strsrch->search->textLength, status); 2953 } 2954 } 2955} 2956 2957U_CAPI const UBreakIterator* U_EXPORT2 2958usearch_getBreakIterator(const UStringSearch *strsrch) 2959{ 2960 if (strsrch) { 2961 return strsrch->search->breakIter; 2962 } 2963 return NULL; 2964} 2965 2966#endif 2967 2968U_CAPI void U_EXPORT2 usearch_setText( UStringSearch *strsrch, 2969 const UChar *text, 2970 int32_t textlength, 2971 UErrorCode *status) 2972{ 2973 if (U_SUCCESS(*status)) { 2974 if (strsrch == NULL || text == NULL || textlength < -1 || 2975 textlength == 0) { 2976 *status = U_ILLEGAL_ARGUMENT_ERROR; 2977 } 2978 else { 2979 if (textlength == -1) { 2980 textlength = u_strlen(text); 2981 } 2982 strsrch->search->text = text; 2983 strsrch->search->textLength = textlength; 2984 ucol_setText(strsrch->textIter, text, textlength, status); 2985 strsrch->search->matchedIndex = USEARCH_DONE; 2986 strsrch->search->matchedLength = 0; 2987 strsrch->search->reset = TRUE; 2988#if !UCONFIG_NO_BREAK_ITERATION 2989 if (strsrch->search->breakIter != NULL) { 2990 ubrk_setText(strsrch->search->breakIter, text, 2991 textlength, status); 2992 } 2993 ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status); 2994#endif 2995 } 2996 } 2997} 2998 2999U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch, 3000 int32_t *length) 3001{ 3002 if (strsrch) { 3003 *length = strsrch->search->textLength; 3004 return strsrch->search->text; 3005 } 3006 return NULL; 3007} 3008 3009U_CAPI void U_EXPORT2 usearch_setCollator( UStringSearch *strsrch, 3010 const UCollator *collator, 3011 UErrorCode *status) 3012{ 3013 if (U_SUCCESS(*status)) { 3014 if (collator == NULL) { 3015 *status = U_ILLEGAL_ARGUMENT_ERROR; 3016 return; 3017 } 3018 3019 if (strsrch) { 3020 delete strsrch->textProcessedIter; 3021 strsrch->textProcessedIter = NULL; 3022 ucol_closeElements(strsrch->textIter); 3023 ucol_closeElements(strsrch->utilIter); 3024 strsrch->textIter = strsrch->utilIter = NULL; 3025 if (strsrch->ownCollator && (strsrch->collator != collator)) { 3026 ucol_close((UCollator *)strsrch->collator); 3027 strsrch->ownCollator = FALSE; 3028 } 3029 strsrch->collator = collator; 3030 strsrch->strength = ucol_getStrength(collator); 3031 strsrch->ceMask = getMask(strsrch->strength); 3032#if !UCONFIG_NO_BREAK_ITERATION 3033 ubrk_close(strsrch->search->internalBreakIter); 3034 strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status), 3035 strsrch->search->text, strsrch->search->textLength, status); 3036#endif 3037 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3038 strsrch->toShift = 3039 ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) == 3040 UCOL_SHIFTED; 3041 // if status is a failure, ucol_getVariableTop returns 0 3042 strsrch->variableTop = ucol_getVariableTop(collator, status); 3043 strsrch->textIter = ucol_openElements(collator, 3044 strsrch->search->text, 3045 strsrch->search->textLength, 3046 status); 3047 strsrch->utilIter = ucol_openElements( 3048 collator, strsrch->pattern.text, strsrch->pattern.textLength, status); 3049 // initialize() _after_ setting the iterators for the new collator. 3050 initialize(strsrch, status); 3051 } 3052 3053 // **** are these calls needed? 3054 // **** we call uprv_init_pce in initializePatternPCETable 3055 // **** and the CEBuffer constructor... 3056#if 0 3057 uprv_init_pce(strsrch->textIter); 3058 uprv_init_pce(strsrch->utilIter); 3059#endif 3060 } 3061} 3062 3063U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch) 3064{ 3065 if (strsrch) { 3066 return (UCollator *)strsrch->collator; 3067 } 3068 return NULL; 3069} 3070 3071U_CAPI void U_EXPORT2 usearch_setPattern( UStringSearch *strsrch, 3072 const UChar *pattern, 3073 int32_t patternlength, 3074 UErrorCode *status) 3075{ 3076 if (U_SUCCESS(*status)) { 3077 if (strsrch == NULL || pattern == NULL) { 3078 *status = U_ILLEGAL_ARGUMENT_ERROR; 3079 } 3080 else { 3081 if (patternlength == -1) { 3082 patternlength = u_strlen(pattern); 3083 } 3084 if (patternlength == 0) { 3085 *status = U_ILLEGAL_ARGUMENT_ERROR; 3086 return; 3087 } 3088 strsrch->pattern.text = pattern; 3089 strsrch->pattern.textLength = patternlength; 3090 initialize(strsrch, status); 3091 } 3092 } 3093} 3094 3095U_CAPI const UChar* U_EXPORT2 3096usearch_getPattern(const UStringSearch *strsrch, 3097 int32_t *length) 3098{ 3099 if (strsrch) { 3100 *length = strsrch->pattern.textLength; 3101 return strsrch->pattern.text; 3102 } 3103 return NULL; 3104} 3105 3106// miscellanous methods -------------------------------------------------- 3107 3108U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch, 3109 UErrorCode *status) 3110{ 3111 if (strsrch && U_SUCCESS(*status)) { 3112 strsrch->search->isForwardSearching = TRUE; 3113 usearch_setOffset(strsrch, 0, status); 3114 if (U_SUCCESS(*status)) { 3115 return usearch_next(strsrch, status); 3116 } 3117 } 3118 return USEARCH_DONE; 3119} 3120 3121U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch, 3122 int32_t position, 3123 UErrorCode *status) 3124{ 3125 if (strsrch && U_SUCCESS(*status)) { 3126 strsrch->search->isForwardSearching = TRUE; 3127 // position checked in usearch_setOffset 3128 usearch_setOffset(strsrch, position, status); 3129 if (U_SUCCESS(*status)) { 3130 return usearch_next(strsrch, status); 3131 } 3132 } 3133 return USEARCH_DONE; 3134} 3135 3136U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch, 3137 UErrorCode *status) 3138{ 3139 if (strsrch && U_SUCCESS(*status)) { 3140 strsrch->search->isForwardSearching = FALSE; 3141 usearch_setOffset(strsrch, strsrch->search->textLength, status); 3142 if (U_SUCCESS(*status)) { 3143 return usearch_previous(strsrch, status); 3144 } 3145 } 3146 return USEARCH_DONE; 3147} 3148 3149U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch, 3150 int32_t position, 3151 UErrorCode *status) 3152{ 3153 if (strsrch && U_SUCCESS(*status)) { 3154 strsrch->search->isForwardSearching = FALSE; 3155 // position checked in usearch_setOffset 3156 usearch_setOffset(strsrch, position, status); 3157 if (U_SUCCESS(*status)) { 3158 return usearch_previous(strsrch, status); 3159 } 3160 } 3161 return USEARCH_DONE; 3162} 3163 3164/** 3165* If a direction switch is required, we'll count the number of ces till the 3166* beginning of the collation element iterator and iterate forwards that 3167* number of times. This is so that we get to the correct point within the 3168* string to continue the search in. Imagine when we are in the middle of the 3169* normalization buffer when the change in direction is request. arrrgghh.... 3170* After searching the offset within the collation element iterator will be 3171* shifted to the start of the match. If a match is not found, the offset would 3172* have been set to the end of the text string in the collation element 3173* iterator. 3174* Okay, here's my take on normalization buffer. The only time when there can 3175* be 2 matches within the same normalization is when the pattern is consists 3176* of all accents. But since the offset returned is from the text string, we 3177* should not confuse the caller by returning the second match within the 3178* same normalization buffer. If we do, the 2 results will have the same match 3179* offsets, and that'll be confusing. I'll return the next match that doesn't 3180* fall within the same normalization buffer. Note this does not affect the 3181* results of matches spanning the text and the normalization buffer. 3182* The position to start searching is taken from the collation element 3183* iterator. Callers of this API would have to set the offset in the collation 3184* element iterator before using this method. 3185*/ 3186U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch, 3187 UErrorCode *status) 3188{ 3189 if (U_SUCCESS(*status) && strsrch) { 3190 // note offset is either equivalent to the start of the previous match 3191 // or is set by the user 3192 int32_t offset = usearch_getOffset(strsrch); 3193 USearch *search = strsrch->search; 3194 search->reset = FALSE; 3195 int32_t textlength = search->textLength; 3196 if (search->isForwardSearching) { 3197#if BOYER_MOORE 3198 if (offset == textlength 3199 || (!search->isOverlap && 3200 (offset + strsrch->pattern.defaultShiftSize > textlength || 3201 (search->matchedIndex != USEARCH_DONE && 3202 offset + search->matchedLength >= textlength)))) { 3203 // not enough characters to match 3204 setMatchNotFound(strsrch); 3205 return USEARCH_DONE; 3206 } 3207#else 3208 if (offset == textlength || 3209 (! search->isOverlap && 3210 (search->matchedIndex != USEARCH_DONE && 3211 offset + search->matchedLength > textlength))) { 3212 // not enough characters to match 3213 setMatchNotFound(strsrch); 3214 return USEARCH_DONE; 3215 } 3216#endif 3217 } 3218 else { 3219 // switching direction. 3220 // if matchedIndex == USEARCH_DONE, it means that either a 3221 // setOffset has been called or that previous ran off the text 3222 // string. the iterator would have been set to offset 0 if a 3223 // match is not found. 3224 search->isForwardSearching = TRUE; 3225 if (search->matchedIndex != USEARCH_DONE) { 3226 // there's no need to set the collation element iterator 3227 // the next call to next will set the offset. 3228 return search->matchedIndex; 3229 } 3230 } 3231 3232 if (U_SUCCESS(*status)) { 3233 if (strsrch->pattern.CELength == 0) { 3234 if (search->matchedIndex == USEARCH_DONE) { 3235 search->matchedIndex = offset; 3236 } 3237 else { // moves by codepoints 3238 U16_FWD_1(search->text, search->matchedIndex, textlength); 3239 } 3240 3241 search->matchedLength = 0; 3242 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3243 // status checked below 3244 if (search->matchedIndex == textlength) { 3245 search->matchedIndex = USEARCH_DONE; 3246 } 3247 } 3248 else { 3249 if (search->matchedLength > 0) { 3250 // if matchlength is 0 we are at the start of the iteration 3251 if (search->isOverlap) { 3252 ucol_setOffset(strsrch->textIter, offset + 1, status); 3253 } 3254 else { 3255 ucol_setOffset(strsrch->textIter, 3256 offset + search->matchedLength, status); 3257 } 3258 } 3259 else { 3260 // for boundary check purposes. this will ensure that the 3261 // next match will not preceed the current offset 3262 // note search->matchedIndex will always be set to something 3263 // in the code 3264 search->matchedIndex = offset - 1; 3265 } 3266 3267 if (search->isCanonicalMatch) { 3268 // can't use exact here since extra accents are allowed. 3269 usearch_handleNextCanonical(strsrch, status); 3270 } 3271 else { 3272 usearch_handleNextExact(strsrch, status); 3273 } 3274 } 3275 3276 if (U_FAILURE(*status)) { 3277 return USEARCH_DONE; 3278 } 3279 3280#if !BOYER_MOORE 3281 if (search->matchedIndex == USEARCH_DONE) { 3282 ucol_setOffset(strsrch->textIter, search->textLength, status); 3283 } else { 3284 ucol_setOffset(strsrch->textIter, search->matchedIndex, status); 3285 } 3286#endif 3287 3288 return search->matchedIndex; 3289 } 3290 } 3291 return USEARCH_DONE; 3292} 3293 3294U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch, 3295 UErrorCode *status) 3296{ 3297 if (U_SUCCESS(*status) && strsrch) { 3298 int32_t offset; 3299 USearch *search = strsrch->search; 3300 if (search->reset) { 3301 offset = search->textLength; 3302 search->isForwardSearching = FALSE; 3303 search->reset = FALSE; 3304 setColEIterOffset(strsrch->textIter, offset); 3305 } 3306 else { 3307 offset = usearch_getOffset(strsrch); 3308 } 3309 3310 int32_t matchedindex = search->matchedIndex; 3311 if (search->isForwardSearching == TRUE) { 3312 // switching direction. 3313 // if matchedIndex == USEARCH_DONE, it means that either a 3314 // setOffset has been called or that next ran off the text 3315 // string. the iterator would have been set to offset textLength if 3316 // a match is not found. 3317 search->isForwardSearching = FALSE; 3318 if (matchedindex != USEARCH_DONE) { 3319 return matchedindex; 3320 } 3321 } 3322 else { 3323#if BOYER_MOORE 3324 if (offset == 0 || matchedindex == 0 || 3325 (!search->isOverlap && 3326 (offset < strsrch->pattern.defaultShiftSize || 3327 (matchedindex != USEARCH_DONE && 3328 matchedindex < strsrch->pattern.defaultShiftSize)))) { 3329 // not enough characters to match 3330 setMatchNotFound(strsrch); 3331 return USEARCH_DONE; 3332 } 3333#else 3334 // Could check pattern length, but the 3335 // linear search will do the right thing 3336 if (offset == 0 || matchedindex == 0) { 3337 setMatchNotFound(strsrch); 3338 return USEARCH_DONE; 3339 } 3340#endif 3341 } 3342 3343 if (U_SUCCESS(*status)) { 3344 if (strsrch->pattern.CELength == 0) { 3345 search->matchedIndex = 3346 (matchedindex == USEARCH_DONE ? offset : matchedindex); 3347 if (search->matchedIndex == 0) { 3348 setMatchNotFound(strsrch); 3349 // status checked below 3350 } 3351 else { // move by codepoints 3352 U16_BACK_1(search->text, 0, search->matchedIndex); 3353 setColEIterOffset(strsrch->textIter, search->matchedIndex); 3354 // status checked below 3355 search->matchedLength = 0; 3356 } 3357 } 3358 else { 3359 if (strsrch->search->isCanonicalMatch) { 3360 // can't use exact here since extra accents are allowed. 3361 usearch_handlePreviousCanonical(strsrch, status); 3362 // status checked below 3363 } 3364 else { 3365 usearch_handlePreviousExact(strsrch, status); 3366 // status checked below 3367 } 3368 } 3369 3370 if (U_FAILURE(*status)) { 3371 return USEARCH_DONE; 3372 } 3373 3374 return search->matchedIndex; 3375 } 3376 } 3377 return USEARCH_DONE; 3378} 3379 3380 3381 3382U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch) 3383{ 3384 /* 3385 reset is setting the attributes that are already in 3386 string search, hence all attributes in the collator should 3387 be retrieved without any problems 3388 */ 3389 if (strsrch) { 3390 UErrorCode status = U_ZERO_ERROR; 3391 UBool sameCollAttribute = TRUE; 3392 uint32_t ceMask; 3393 UBool shift; 3394 uint32_t varTop; 3395 3396 // **** hack to deal w/ how processed CEs encode quaternary **** 3397 UCollationStrength newStrength = ucol_getStrength(strsrch->collator); 3398 if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) || 3399 (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) { 3400 sameCollAttribute = FALSE; 3401 } 3402 3403 strsrch->strength = ucol_getStrength(strsrch->collator); 3404 ceMask = getMask(strsrch->strength); 3405 if (strsrch->ceMask != ceMask) { 3406 strsrch->ceMask = ceMask; 3407 sameCollAttribute = FALSE; 3408 } 3409 3410 // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT 3411 shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING, 3412 &status) == UCOL_SHIFTED; 3413 if (strsrch->toShift != shift) { 3414 strsrch->toShift = shift; 3415 sameCollAttribute = FALSE; 3416 } 3417 3418 // if status is a failure, ucol_getVariableTop returns 0 3419 varTop = ucol_getVariableTop(strsrch->collator, &status); 3420 if (strsrch->variableTop != varTop) { 3421 strsrch->variableTop = varTop; 3422 sameCollAttribute = FALSE; 3423 } 3424 if (!sameCollAttribute) { 3425 initialize(strsrch, &status); 3426 } 3427 ucol_setText(strsrch->textIter, strsrch->search->text, 3428 strsrch->search->textLength, 3429 &status); 3430 strsrch->search->matchedLength = 0; 3431 strsrch->search->matchedIndex = USEARCH_DONE; 3432 strsrch->search->isOverlap = FALSE; 3433 strsrch->search->isCanonicalMatch = FALSE; 3434 strsrch->search->elementComparisonType = 0; 3435 strsrch->search->isForwardSearching = TRUE; 3436 strsrch->search->reset = TRUE; 3437 } 3438} 3439 3440// 3441// CEI Collation Element + source text index. 3442// These structs are kept in the circular buffer. 3443// 3444struct CEI { 3445 int64_t ce; 3446 int32_t lowIndex; 3447 int32_t highIndex; 3448}; 3449 3450U_NAMESPACE_BEGIN 3451 3452namespace { 3453// 3454// CEBuffer A circular buffer of CEs from the text being searched. 3455// 3456#define DEFAULT_CEBUFFER_SIZE 96 3457#define CEBUFFER_EXTRA 32 3458// Some typical max values to make buffer size more reasonable for asymmetric search. 3459// #8694 is for a better long-term solution to allocation of this buffer. 3460#define MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8 3461#define MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3 3462#define MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186)) 3463struct CEBuffer { 3464 CEI defBuf[DEFAULT_CEBUFFER_SIZE]; 3465 CEI *buf; 3466 int32_t bufSize; 3467 int32_t firstIx; 3468 int32_t limitIx; 3469 UCollationElements *ceIter; 3470 UStringSearch *strSearch; 3471 3472 3473 3474 CEBuffer(UStringSearch *ss, UErrorCode *status); 3475 ~CEBuffer(); 3476 const CEI *get(int32_t index); 3477 const CEI *getPrevious(int32_t index); 3478}; 3479 3480 3481CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) { 3482 buf = defBuf; 3483 strSearch = ss; 3484 bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA; 3485 if (ss->search->elementComparisonType != 0) { 3486 const UChar * patText = ss->pattern.text; 3487 if (patText) { 3488 const UChar * patTextLimit = patText + ss->pattern.textLength; 3489 while ( patText < patTextLimit ) { 3490 UChar c = *patText++; 3491 if (MIGHT_BE_JAMO_L(c)) { 3492 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L; 3493 } else { 3494 // No check for surrogates, we might allocate slightly more buffer than necessary. 3495 bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER; 3496 } 3497 } 3498 } 3499 } 3500 ceIter = ss->textIter; 3501 firstIx = 0; 3502 limitIx = 0; 3503 3504 if (!initTextProcessedIter(ss, status)) { return; } 3505 3506 if (bufSize>DEFAULT_CEBUFFER_SIZE) { 3507 buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI)); 3508 if (buf == NULL) { 3509 *status = U_MEMORY_ALLOCATION_ERROR; 3510 } 3511 } 3512} 3513 3514// TODO: add a reset or init function so that allocated 3515// buffers can be retained & reused. 3516 3517CEBuffer::~CEBuffer() { 3518 if (buf != defBuf) { 3519 uprv_free(buf); 3520 } 3521} 3522 3523 3524// Get the CE with the specified index. 3525// Index must be in the range 3526// n-history_size < index < n+1 3527// where n is the largest index to have been fetched by some previous call to this function. 3528// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3529// 3530const CEI *CEBuffer::get(int32_t index) { 3531 int i = index % bufSize; 3532 3533 if (index>=firstIx && index<limitIx) { 3534 // The request was for an entry already in our buffer. 3535 // Just return it. 3536 return &buf[i]; 3537 } 3538 3539 // Caller is requesting a new, never accessed before, CE. 3540 // Verify that it is the next one in sequence, which is all 3541 // that is allowed. 3542 if (index != limitIx) { 3543 U_ASSERT(FALSE); 3544 3545 return NULL; 3546 } 3547 3548 // Manage the circular CE buffer indexing 3549 limitIx++; 3550 3551 if (limitIx - firstIx >= bufSize) { 3552 // The buffer is full, knock out the lowest-indexed entry. 3553 firstIx++; 3554 } 3555 3556 UErrorCode status = U_ZERO_ERROR; 3557 3558 buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3559 3560 return &buf[i]; 3561} 3562 3563// Get the CE with the specified index. 3564// Index must be in the range 3565// n-history_size < index < n+1 3566// where n is the largest index to have been fetched by some previous call to this function. 3567// The CE value will be UCOL__PROCESSED_NULLORDER at end of input. 3568// 3569const CEI *CEBuffer::getPrevious(int32_t index) { 3570 int i = index % bufSize; 3571 3572 if (index>=firstIx && index<limitIx) { 3573 // The request was for an entry already in our buffer. 3574 // Just return it. 3575 return &buf[i]; 3576 } 3577 3578 // Caller is requesting a new, never accessed before, CE. 3579 // Verify that it is the next one in sequence, which is all 3580 // that is allowed. 3581 if (index != limitIx) { 3582 U_ASSERT(FALSE); 3583 3584 return NULL; 3585 } 3586 3587 // Manage the circular CE buffer indexing 3588 limitIx++; 3589 3590 if (limitIx - firstIx >= bufSize) { 3591 // The buffer is full, knock out the lowest-indexed entry. 3592 firstIx++; 3593 } 3594 3595 UErrorCode status = U_ZERO_ERROR; 3596 3597 buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status); 3598 3599 return &buf[i]; 3600} 3601 3602} 3603 3604U_NAMESPACE_END 3605 3606 3607// #define USEARCH_DEBUG 3608 3609#ifdef USEARCH_DEBUG 3610#include <stdio.h> 3611#include <stdlib.h> 3612#endif 3613 3614/* 3615 * Find the next break boundary after startIndex. If the UStringSearch object 3616 * has an external break iterator, use that. Otherwise use the internal character 3617 * break iterator. 3618 */ 3619static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) { 3620#if 0 3621 const UChar *text = strsrch->search->text; 3622 int32_t textLen = strsrch->search->textLength; 3623 3624 U_ASSERT(startIndex>=0); 3625 U_ASSERT(startIndex<=textLen); 3626 3627 if (startIndex >= textLen) { 3628 return startIndex; 3629 } 3630 3631 UChar32 c; 3632 int32_t i = startIndex; 3633 U16_NEXT(text, i, textLen, c); 3634 3635 // If we are on a control character, stop without looking for combining marks. 3636 // Control characters do not combine. 3637 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3638 if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) { 3639 return i; 3640 } 3641 3642 // The initial character was not a control, and can thus accept trailing 3643 // combining characters. Advance over however many of them there are. 3644 int32_t indexOfLastCharChecked; 3645 for (;;) { 3646 indexOfLastCharChecked = i; 3647 if (i>=textLen) { 3648 break; 3649 } 3650 U16_NEXT(text, i, textLen, c); 3651 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3652 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3653 break; 3654 } 3655 } 3656 return indexOfLastCharChecked; 3657#elif !UCONFIG_NO_BREAK_ITERATION 3658 UBreakIterator *breakiterator = strsrch->search->breakIter; 3659 3660 if (breakiterator == NULL) { 3661 breakiterator = strsrch->search->internalBreakIter; 3662 } 3663 3664 if (breakiterator != NULL) { 3665 return ubrk_following(breakiterator, startIndex); 3666 } 3667 3668 return startIndex; 3669#else 3670 // **** or should we use the original code? **** 3671 return startIndex; 3672#endif 3673 3674} 3675 3676/* 3677 * Returns TRUE if index is on a break boundary. If the UStringSearch 3678 * has an external break iterator, test using that, otherwise test 3679 * using the internal character break iterator. 3680 */ 3681static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) { 3682#if 0 3683 const UChar *text = strsrch->search->text; 3684 int32_t textLen = strsrch->search->textLength; 3685 3686 U_ASSERT(index>=0); 3687 U_ASSERT(index<=textLen); 3688 3689 if (index>=textLen || index<=0) { 3690 return TRUE; 3691 } 3692 3693 // If the character at the current index is not a GRAPHEME_EXTEND 3694 // then we can not be within a combining sequence. 3695 UChar32 c; 3696 U16_GET(text, 0, index, textLen, c); 3697 int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3698 if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) { 3699 return TRUE; 3700 } 3701 3702 // We are at a combining mark. If the preceding character is anything 3703 // except a CONTROL, CR or LF, we are in a combining sequence. 3704 U16_PREV(text, 0, index, c); 3705 gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK); 3706 UBool combining = !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR); 3707 return !combining; 3708#elif !UCONFIG_NO_BREAK_ITERATION 3709 UBreakIterator *breakiterator = strsrch->search->breakIter; 3710 3711 if (breakiterator == NULL) { 3712 breakiterator = strsrch->search->internalBreakIter; 3713 } 3714 3715 return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index)); 3716#else 3717 // **** or use the original code? **** 3718 return TRUE; 3719#endif 3720} 3721 3722#if 0 3723static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end) 3724{ 3725#if !UCONFIG_NO_BREAK_ITERATION 3726 UBreakIterator *breakiterator = strsrch->search->breakIter; 3727 3728 if (breakiterator != NULL) { 3729 int32_t startindex = ubrk_first(breakiterator); 3730 int32_t endindex = ubrk_last(breakiterator); 3731 3732 // out-of-range indexes are never boundary positions 3733 if (start < startindex || start > endindex || 3734 end < startindex || end > endindex) { 3735 return FALSE; 3736 } 3737 3738 return ubrk_isBoundary(breakiterator, start) && 3739 ubrk_isBoundary(breakiterator, end); 3740 } 3741#endif 3742 3743 return TRUE; 3744} 3745#endif 3746 3747typedef enum { 3748 U_CE_MATCH = -1, 3749 U_CE_NO_MATCH = 0, 3750 U_CE_SKIP_TARG, 3751 U_CE_SKIP_PATN 3752} UCompareCEsResult; 3753#define U_CE_LEVEL2_BASE 0x00000005 3754#define U_CE_LEVEL3_BASE 0x00050000 3755 3756static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) { 3757 if (targCE == patCE) { 3758 return U_CE_MATCH; 3759 } 3760 if (compareType == 0) { 3761 return U_CE_NO_MATCH; 3762 } 3763 3764 int64_t targCEshifted = targCE >> 32; 3765 int64_t patCEshifted = patCE >> 32; 3766 int64_t mask; 3767 3768 mask = 0xFFFF0000; 3769 int32_t targLev1 = (int32_t)(targCEshifted & mask); 3770 int32_t patLev1 = (int32_t)(patCEshifted & mask); 3771 if ( targLev1 != patLev1 ) { 3772 if ( targLev1 == 0 ) { 3773 return U_CE_SKIP_TARG; 3774 } 3775 if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3776 return U_CE_SKIP_PATN; 3777 } 3778 return U_CE_NO_MATCH; 3779 } 3780 3781 mask = 0x0000FFFF; 3782 int32_t targLev2 = (int32_t)(targCEshifted & mask); 3783 int32_t patLev2 = (int32_t)(patCEshifted & mask); 3784 if ( targLev2 != patLev2 ) { 3785 if ( targLev2 == 0 ) { 3786 return U_CE_SKIP_TARG; 3787 } 3788 if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) { 3789 return U_CE_SKIP_PATN; 3790 } 3791 return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )? 3792 U_CE_MATCH: U_CE_NO_MATCH; 3793 } 3794 3795 mask = 0xFFFF0000; 3796 int32_t targLev3 = (int32_t)(targCE & mask); 3797 int32_t patLev3 = (int32_t)(patCE & mask); 3798 if ( targLev3 != patLev3 ) { 3799 return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )? 3800 U_CE_MATCH: U_CE_NO_MATCH; 3801 } 3802 3803 return U_CE_MATCH; 3804} 3805 3806#if BOYER_MOORE 3807// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s 3808#endif 3809 3810U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch *strsrch, 3811 int32_t startIdx, 3812 int32_t *matchStart, 3813 int32_t *matchLimit, 3814 UErrorCode *status) 3815{ 3816 if (U_FAILURE(*status)) { 3817 return FALSE; 3818 } 3819 3820 // TODO: reject search patterns beginning with a combining char. 3821 3822#ifdef USEARCH_DEBUG 3823 if (getenv("USEARCH_DEBUG") != NULL) { 3824 printf("Pattern CEs\n"); 3825 for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 3826 printf(" %8x", strsrch->pattern.CE[ii]); 3827 } 3828 printf("\n"); 3829 } 3830 3831#endif 3832 // Input parameter sanity check. 3833 // TODO: should input indicies clip to the text length 3834 // in the same way that UText does. 3835 if(strsrch->pattern.CELength == 0 || 3836 startIdx < 0 || 3837 startIdx > strsrch->search->textLength || 3838 strsrch->pattern.CE == NULL) { 3839 *status = U_ILLEGAL_ARGUMENT_ERROR; 3840 return FALSE; 3841 } 3842 3843 if (strsrch->pattern.PCE == NULL) { 3844 initializePatternPCETable(strsrch, status); 3845 } 3846 3847 ucol_setOffset(strsrch->textIter, startIdx, status); 3848 CEBuffer ceb(strsrch, status); 3849 3850 3851 int32_t targetIx = 0; 3852 const CEI *targetCEI = NULL; 3853 int32_t patIx; 3854 UBool found; 3855 3856 int32_t mStart = -1; 3857 int32_t mLimit = -1; 3858 int32_t minLimit; 3859 int32_t maxLimit; 3860 3861 3862 3863 // Outer loop moves over match starting positions in the 3864 // target CE space. 3865 // Here we see the target as a sequence of collation elements, resulting from the following: 3866 // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied 3867 // (for example, digraphs such as IJ may be broken into two characters). 3868 // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next 3869 // 16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these 3870 // fields that are for strengths below that of the collator are set to 0. If this makes the int64_t 3871 // CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary), 3872 // then the CE is deleted, so the following code sees only CEs that are relevant. 3873 // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text. 3874 // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text 3875 // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER). 3876 // 3877 for(targetIx=0; ; targetIx++) 3878 { 3879 found = TRUE; 3880 // Inner loop checks for a match beginning at each 3881 // position from the outer loop. 3882 int32_t targetIxOffset = 0; 3883 int64_t patCE = 0; 3884 // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer 3885 // (compared to the last CE fetched for the previous targetIx value) as we need to go 3886 // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK. 3887 const CEI *firstCEI = ceb.get(targetIx); 3888 if (firstCEI == NULL) { 3889 *status = U_INTERNAL_PROGRAM_ERROR; 3890 found = FALSE; 3891 break; 3892 } 3893 3894 for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) { 3895 patCE = strsrch->pattern.PCE[patIx]; 3896 targetCEI = ceb.get(targetIx+patIx+targetIxOffset); 3897 // Compare CE from target string with CE from the pattern. 3898 // Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input, 3899 // which will fail the compare, below. 3900 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 3901 if ( ceMatch == U_CE_NO_MATCH ) { 3902 found = FALSE; 3903 break; 3904 } else if ( ceMatch > U_CE_NO_MATCH ) { 3905 if ( ceMatch == U_CE_SKIP_TARG ) { 3906 // redo with same patCE, next targCE 3907 patIx--; 3908 targetIxOffset++; 3909 } else { // ceMatch == U_CE_SKIP_PATN 3910 // redo with same targCE, next patCE 3911 targetIxOffset--; 3912 } 3913 } 3914 } 3915 targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far 3916 3917 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 3918 // No match at this targetIx. Try again at the next. 3919 continue; 3920 } 3921 3922 if (!found) { 3923 // No match at all, we have run off the end of the target text. 3924 break; 3925 } 3926 3927 3928 // We have found a match in CE space. 3929 // Now determine the bounds in string index space. 3930 // There still is a chance of match failure if the CE range not correspond to 3931 // an acceptable character range. 3932 // 3933 const CEI *lastCEI = ceb.get(targetIx + targetIxOffset - 1); 3934 3935 mStart = firstCEI->lowIndex; 3936 minLimit = lastCEI->lowIndex; 3937 3938 // Look at the CE following the match. If it is UCOL_NULLORDER the match 3939 // extended to the end of input, and the match is good. 3940 3941 // Look at the high and low indices of the CE following the match. If 3942 // they are the same it means one of two things: 3943 // 1. The match extended to the last CE from the target text, which is OK, or 3944 // 2. The last CE that was part of the match is in an expansion that extends 3945 // to the first CE after the match. In this case, we reject the match. 3946 const CEI *nextCEI = 0; 3947 if (strsrch->search->elementComparisonType == 0) { 3948 nextCEI = ceb.get(targetIx + targetIxOffset); 3949 maxLimit = nextCEI->lowIndex; 3950 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 3951 found = FALSE; 3952 } 3953 } else { 3954 for ( ; ; ++targetIxOffset ) { 3955 nextCEI = ceb.get(targetIx + targetIxOffset); 3956 maxLimit = nextCEI->lowIndex; 3957 // If we are at the end of the target too, match succeeds 3958 if ( nextCEI->ce == UCOL_PROCESSED_NULLORDER ) { 3959 break; 3960 } 3961 // As long as the next CE has primary weight of 0, 3962 // it is part of the last target element matched by the pattern; 3963 // make sure it can be part of a match with the last patCE 3964 if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) { 3965 UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType); 3966 if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) { 3967 found = FALSE; 3968 break; 3969 } 3970 // If lowIndex == highIndex, this target CE is part of an expansion of the last matched 3971 // target element, but it has non-zero primary weight => match fails 3972 } else if ( nextCEI->lowIndex == nextCEI->highIndex ) { 3973 found = false; 3974 break; 3975 // Else the target CE is not part of an expansion of the last matched element, match succeeds 3976 } else { 3977 break; 3978 } 3979 } 3980 } 3981 3982 3983 // Check for the start of the match being within a combining sequence. 3984 // This can happen if the pattern itself begins with a combining char, and 3985 // the match found combining marks in the target text that were attached 3986 // to something else. 3987 // This type of match should be rejected for not completely consuming a 3988 // combining sequence. 3989 if (!isBreakBoundary(strsrch, mStart)) { 3990 found = FALSE; 3991 } 3992 3993 // Check for the start of the match being within an Collation Element Expansion, 3994 // meaning that the first char of the match is only partially matched. 3995 // With exapnsions, the first CE will report the index of the source 3996 // character, and all subsequent (expansions) CEs will report the source index of the 3997 // _following_ character. 3998 int32_t secondIx = firstCEI->highIndex; 3999 if (mStart == secondIx) { 4000 found = FALSE; 4001 } 4002 4003 // Advance the match end position to the first acceptable match boundary. 4004 // This advances the index over any combining charcters. 4005 mLimit = maxLimit; 4006 if (minLimit < maxLimit) { 4007 // When the last CE's low index is same with its high index, the CE is likely 4008 // a part of expansion. In this case, the index is located just after the 4009 // character corresponding to the CEs compared above. If the index is right 4010 // at the break boundary, move the position to the next boundary will result 4011 // incorrect match length when there are ignorable characters exist between 4012 // the position and the next character produces CE(s). See ticket#8482. 4013 if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) { 4014 mLimit = minLimit; 4015 } else { 4016 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4017 if (nba >= lastCEI->highIndex) { 4018 mLimit = nba; 4019 } 4020 } 4021 } 4022 4023 #ifdef USEARCH_DEBUG 4024 if (getenv("USEARCH_DEBUG") != NULL) { 4025 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4026 } 4027 #endif 4028 4029 // If advancing to the end of a combining sequence in character indexing space 4030 // advanced us beyond the end of the match in CE space, reject this match. 4031 if (mLimit > maxLimit) { 4032 found = FALSE; 4033 } 4034 4035 if (!isBreakBoundary(strsrch, mLimit)) { 4036 found = FALSE; 4037 } 4038 4039 if (! checkIdentical(strsrch, mStart, mLimit)) { 4040 found = FALSE; 4041 } 4042 4043 if (found) { 4044 break; 4045 } 4046 } 4047 4048 #ifdef USEARCH_DEBUG 4049 if (getenv("USEARCH_DEBUG") != NULL) { 4050 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4051 int32_t lastToPrint = ceb.limitIx+2; 4052 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4053 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4054 } 4055 printf("\n%s\n", found? "match found" : "no match"); 4056 } 4057 #endif 4058 4059 // All Done. Store back the match bounds to the caller. 4060 // 4061 if (found==FALSE) { 4062 mLimit = -1; 4063 mStart = -1; 4064 } 4065 4066 if (matchStart != NULL) { 4067 *matchStart= mStart; 4068 } 4069 4070 if (matchLimit != NULL) { 4071 *matchLimit = mLimit; 4072 } 4073 4074 return found; 4075} 4076 4077U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch *strsrch, 4078 int32_t startIdx, 4079 int32_t *matchStart, 4080 int32_t *matchLimit, 4081 UErrorCode *status) 4082{ 4083 if (U_FAILURE(*status)) { 4084 return FALSE; 4085 } 4086 4087 // TODO: reject search patterns beginning with a combining char. 4088 4089#ifdef USEARCH_DEBUG 4090 if (getenv("USEARCH_DEBUG") != NULL) { 4091 printf("Pattern CEs\n"); 4092 for (int ii=0; ii<strsrch->pattern.CELength; ii++) { 4093 printf(" %8x", strsrch->pattern.CE[ii]); 4094 } 4095 printf("\n"); 4096 } 4097 4098#endif 4099 // Input parameter sanity check. 4100 // TODO: should input indicies clip to the text length 4101 // in the same way that UText does. 4102 if(strsrch->pattern.CELength == 0 || 4103 startIdx < 0 || 4104 startIdx > strsrch->search->textLength || 4105 strsrch->pattern.CE == NULL) { 4106 *status = U_ILLEGAL_ARGUMENT_ERROR; 4107 return FALSE; 4108 } 4109 4110 if (strsrch->pattern.PCE == NULL) { 4111 initializePatternPCETable(strsrch, status); 4112 } 4113 4114 CEBuffer ceb(strsrch, status); 4115 int32_t targetIx = 0; 4116 4117 /* 4118 * Pre-load the buffer with the CE's for the grapheme 4119 * after our starting position so that we're sure that 4120 * we can look at the CE following the match when we 4121 * check the match boundaries. 4122 * 4123 * This will also pre-fetch the first CE that we'll 4124 * consider for the match. 4125 */ 4126 if (startIdx < strsrch->search->textLength) { 4127 UBreakIterator *bi = strsrch->search->internalBreakIter; 4128 int32_t next = ubrk_following(bi, startIdx); 4129 4130 ucol_setOffset(strsrch->textIter, next, status); 4131 4132 for (targetIx = 0; ; targetIx += 1) { 4133 if (ceb.getPrevious(targetIx)->lowIndex < startIdx) { 4134 break; 4135 } 4136 } 4137 } else { 4138 ucol_setOffset(strsrch->textIter, startIdx, status); 4139 } 4140 4141 4142 const CEI *targetCEI = NULL; 4143 int32_t patIx; 4144 UBool found; 4145 4146 int32_t limitIx = targetIx; 4147 int32_t mStart = -1; 4148 int32_t mLimit = -1; 4149 int32_t minLimit; 4150 int32_t maxLimit; 4151 4152 4153 4154 // Outer loop moves over match starting positions in the 4155 // target CE space. 4156 // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order). 4157 // But patIx is 0 at the beginning of the pattern and increases toward the end. 4158 // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern 4159 // and the beginning of the base text. 4160 for(targetIx = limitIx; ; targetIx += 1) 4161 { 4162 found = TRUE; 4163 // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer 4164 // (compared to the last CE fetched for the previous targetIx value) as we need to go 4165 // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK. 4166 const CEI *lastCEI = ceb.getPrevious(targetIx); 4167 if (lastCEI == NULL) { 4168 *status = U_INTERNAL_PROGRAM_ERROR; 4169 found = FALSE; 4170 break; 4171 } 4172 // Inner loop checks for a match beginning at each 4173 // position from the outer loop. 4174 int32_t targetIxOffset = 0; 4175 for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) { 4176 int64_t patCE = strsrch->pattern.PCE[patIx]; 4177 4178 targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset); 4179 // Compare CE from target string with CE from the pattern. 4180 // Note that the target CE will be UCOL_NULLORDER if we reach the end of input, 4181 // which will fail the compare, below. 4182 UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType); 4183 if ( ceMatch == U_CE_NO_MATCH ) { 4184 found = FALSE; 4185 break; 4186 } else if ( ceMatch > U_CE_NO_MATCH ) { 4187 if ( ceMatch == U_CE_SKIP_TARG ) { 4188 // redo with same patCE, next targCE 4189 patIx++; 4190 targetIxOffset++; 4191 } else { // ceMatch == U_CE_SKIP_PATN 4192 // redo with same targCE, next patCE 4193 targetIxOffset--; 4194 } 4195 } 4196 } 4197 4198 if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) { 4199 // No match at this targetIx. Try again at the next. 4200 continue; 4201 } 4202 4203 if (!found) { 4204 // No match at all, we have run off the end of the target text. 4205 break; 4206 } 4207 4208 4209 // We have found a match in CE space. 4210 // Now determine the bounds in string index space. 4211 // There still is a chance of match failure if the CE range not correspond to 4212 // an acceptable character range. 4213 // 4214 const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset); 4215 mStart = firstCEI->lowIndex; 4216 4217 // Check for the start of the match being within a combining sequence. 4218 // This can happen if the pattern itself begins with a combining char, and 4219 // the match found combining marks in the target text that were attached 4220 // to something else. 4221 // This type of match should be rejected for not completely consuming a 4222 // combining sequence. 4223 if (!isBreakBoundary(strsrch, mStart)) { 4224 found = FALSE; 4225 } 4226 4227 // Look at the high index of the first CE in the match. If it's the same as the 4228 // low index, the first CE in the match is in the middle of an expansion. 4229 if (mStart == firstCEI->highIndex) { 4230 found = FALSE; 4231 } 4232 4233 4234 minLimit = lastCEI->lowIndex; 4235 4236 if (targetIx > 0) { 4237 // Look at the CE following the match. If it is UCOL_NULLORDER the match 4238 // extended to the end of input, and the match is good. 4239 4240 // Look at the high and low indices of the CE following the match. If 4241 // they are the same it means one of two things: 4242 // 1. The match extended to the last CE from the target text, which is OK, or 4243 // 2. The last CE that was part of the match is in an expansion that extends 4244 // to the first CE after the match. In this case, we reject the match. 4245 const CEI *nextCEI = ceb.getPrevious(targetIx - 1); 4246 4247 if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) { 4248 found = FALSE; 4249 } 4250 4251 mLimit = maxLimit = nextCEI->lowIndex; 4252 4253 // Advance the match end position to the first acceptable match boundary. 4254 // This advances the index over any combining charcters. 4255 if (minLimit < maxLimit) { 4256 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4257 4258 if (nba >= lastCEI->highIndex) { 4259 mLimit = nba; 4260 } 4261 } 4262 4263 // If advancing to the end of a combining sequence in character indexing space 4264 // advanced us beyond the end of the match in CE space, reject this match. 4265 if (mLimit > maxLimit) { 4266 found = FALSE; 4267 } 4268 4269 // Make sure the end of the match is on a break boundary 4270 if (!isBreakBoundary(strsrch, mLimit)) { 4271 found = FALSE; 4272 } 4273 4274 } else { 4275 // No non-ignorable CEs after this point. 4276 // The maximum position is detected by boundary after 4277 // the last non-ignorable CE. Combining sequence 4278 // across the start index will be truncated. 4279 int32_t nba = nextBoundaryAfter(strsrch, minLimit); 4280 mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx; 4281 } 4282 4283 #ifdef USEARCH_DEBUG 4284 if (getenv("USEARCH_DEBUG") != NULL) { 4285 printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit); 4286 } 4287 #endif 4288 4289 4290 if (! checkIdentical(strsrch, mStart, mLimit)) { 4291 found = FALSE; 4292 } 4293 4294 if (found) { 4295 break; 4296 } 4297 } 4298 4299 #ifdef USEARCH_DEBUG 4300 if (getenv("USEARCH_DEBUG") != NULL) { 4301 printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx); 4302 int32_t lastToPrint = ceb.limitIx+2; 4303 for (int ii=ceb.firstIx; ii<lastToPrint; ii++) { 4304 printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex); 4305 } 4306 printf("\n%s\n", found? "match found" : "no match"); 4307 } 4308 #endif 4309 4310 // All Done. Store back the match bounds to the caller. 4311 // 4312 if (found==FALSE) { 4313 mLimit = -1; 4314 mStart = -1; 4315 } 4316 4317 if (matchStart != NULL) { 4318 *matchStart= mStart; 4319 } 4320 4321 if (matchLimit != NULL) { 4322 *matchLimit = mLimit; 4323 } 4324 4325 return found; 4326} 4327 4328// internal use methods declared in usrchimp.h ----------------------------- 4329 4330UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status) 4331{ 4332 if (U_FAILURE(*status)) { 4333 setMatchNotFound(strsrch); 4334 return FALSE; 4335 } 4336 4337#if BOYER_MOORE 4338 UCollationElements *coleiter = strsrch->textIter; 4339 int32_t textlength = strsrch->search->textLength; 4340 int32_t *patternce = strsrch->pattern.CE; 4341 int32_t patterncelength = strsrch->pattern.CELength; 4342 int32_t textoffset = ucol_getOffset(coleiter); 4343 4344 // status used in setting coleiter offset, since offset is checked in 4345 // shiftForward before setting the coleiter offset, status never 4346 // a failure 4347 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4348 patterncelength); 4349 while (textoffset <= textlength) 4350 { 4351 uint32_t patternceindex = patterncelength - 1; 4352 int32_t targetce; 4353 UBool found = FALSE; 4354 int32_t lastce = UCOL_NULLORDER; 4355 4356 setColEIterOffset(coleiter, textoffset); 4357 4358 for (;;) { 4359 // finding the last pattern ce match, imagine composite characters 4360 // for example: search for pattern A in text \u00C0 4361 // we'll have to skip \u0300 the grave first before we get to A 4362 targetce = ucol_previous(coleiter, status); 4363 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4364 found = FALSE; 4365 break; 4366 } 4367 targetce = getCE(strsrch, targetce); 4368 if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) { 4369 // this is for the text \u0315\u0300 that requires 4370 // normalization and pattern \u0300, where \u0315 is ignorable 4371 continue; 4372 } 4373 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4374 lastce = targetce; 4375 } 4376 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4377 if (targetce == patternce[patternceindex]) { 4378 // the first ce can be a contraction 4379 found = TRUE; 4380 break; 4381 } 4382 if (!hasExpansion(coleiter)) { 4383 found = FALSE; 4384 break; 4385 } 4386 } 4387 4388 //targetce = lastce; 4389 4390 while (found && patternceindex > 0) { 4391 lastce = targetce; 4392 targetce = ucol_previous(coleiter, status); 4393 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4394 found = FALSE; 4395 break; 4396 } 4397 targetce = getCE(strsrch, targetce); 4398 if (targetce == UCOL_IGNORABLE) { 4399 continue; 4400 } 4401 4402 patternceindex --; 4403 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4404 found = found && targetce == patternce[patternceindex]; 4405 } 4406 4407 targetce = lastce; 4408 4409 if (!found) { 4410 if (U_FAILURE(*status)) { 4411 break; 4412 } 4413 textoffset = shiftForward(strsrch, textoffset, lastce, 4414 patternceindex); 4415 // status checked at loop. 4416 patternceindex = patterncelength; 4417 continue; 4418 } 4419 4420 if (checkNextExactMatch(strsrch, &textoffset, status)) { 4421 // status checked in ucol_setOffset 4422 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4423 return TRUE; 4424 } 4425 } 4426 setMatchNotFound(strsrch); 4427 return FALSE; 4428#else 4429 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4430 int32_t start = -1; 4431 int32_t end = -1; 4432 4433 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4434 strsrch->search->matchedIndex = start; 4435 strsrch->search->matchedLength = end - start; 4436 return TRUE; 4437 } else { 4438 setMatchNotFound(strsrch); 4439 return FALSE; 4440 } 4441#endif 4442} 4443 4444UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status) 4445{ 4446 if (U_FAILURE(*status)) { 4447 setMatchNotFound(strsrch); 4448 return FALSE; 4449 } 4450 4451#if BOYER_MOORE 4452 UCollationElements *coleiter = strsrch->textIter; 4453 int32_t textlength = strsrch->search->textLength; 4454 int32_t *patternce = strsrch->pattern.CE; 4455 int32_t patterncelength = strsrch->pattern.CELength; 4456 int32_t textoffset = ucol_getOffset(coleiter); 4457 UBool hasPatternAccents = 4458 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4459 4460 textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER, 4461 patterncelength); 4462 strsrch->canonicalPrefixAccents[0] = 0; 4463 strsrch->canonicalSuffixAccents[0] = 0; 4464 4465 while (textoffset <= textlength) 4466 { 4467 int32_t patternceindex = patterncelength - 1; 4468 int32_t targetce; 4469 UBool found = FALSE; 4470 int32_t lastce = UCOL_NULLORDER; 4471 4472 setColEIterOffset(coleiter, textoffset); 4473 4474 for (;;) { 4475 // finding the last pattern ce match, imagine composite characters 4476 // for example: search for pattern A in text \u00C0 4477 // we'll have to skip \u0300 the grave first before we get to A 4478 targetce = ucol_previous(coleiter, status); 4479 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4480 found = FALSE; 4481 break; 4482 } 4483 targetce = getCE(strsrch, targetce); 4484 if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) { 4485 lastce = targetce; 4486 } 4487 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4488 if (targetce == patternce[patternceindex]) { 4489 // the first ce can be a contraction 4490 found = TRUE; 4491 break; 4492 } 4493 if (!hasExpansion(coleiter)) { 4494 found = FALSE; 4495 break; 4496 } 4497 } 4498 4499 while (found && patternceindex > 0) { 4500 targetce = ucol_previous(coleiter, status); 4501 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4502 found = FALSE; 4503 break; 4504 } 4505 targetce = getCE(strsrch, targetce); 4506 if (targetce == UCOL_IGNORABLE) { 4507 continue; 4508 } 4509 4510 patternceindex --; 4511 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4512 found = found && targetce == patternce[patternceindex]; 4513 } 4514 4515 // initializing the rearranged accent array 4516 if (hasPatternAccents && !found) { 4517 strsrch->canonicalPrefixAccents[0] = 0; 4518 strsrch->canonicalSuffixAccents[0] = 0; 4519 if (U_FAILURE(*status)) { 4520 break; 4521 } 4522 found = doNextCanonicalMatch(strsrch, textoffset, status); 4523 } 4524 4525 if (!found) { 4526 if (U_FAILURE(*status)) { 4527 break; 4528 } 4529 textoffset = shiftForward(strsrch, textoffset, lastce, 4530 patternceindex); 4531 // status checked at loop 4532 patternceindex = patterncelength; 4533 continue; 4534 } 4535 4536 if (checkNextCanonicalMatch(strsrch, &textoffset, status)) { 4537 setColEIterOffset(coleiter, strsrch->search->matchedIndex); 4538 return TRUE; 4539 } 4540 } 4541 setMatchNotFound(strsrch); 4542 return FALSE; 4543#else 4544 int32_t textOffset = ucol_getOffset(strsrch->textIter); 4545 int32_t start = -1; 4546 int32_t end = -1; 4547 4548 if (usearch_search(strsrch, textOffset, &start, &end, status)) { 4549 strsrch->search->matchedIndex = start; 4550 strsrch->search->matchedLength = end - start; 4551 return TRUE; 4552 } else { 4553 setMatchNotFound(strsrch); 4554 return FALSE; 4555 } 4556#endif 4557} 4558 4559UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status) 4560{ 4561 if (U_FAILURE(*status)) { 4562 setMatchNotFound(strsrch); 4563 return FALSE; 4564 } 4565 4566#if BOYER_MOORE 4567 UCollationElements *coleiter = strsrch->textIter; 4568 int32_t *patternce = strsrch->pattern.CE; 4569 int32_t patterncelength = strsrch->pattern.CELength; 4570 int32_t textoffset = ucol_getOffset(coleiter); 4571 4572 // shifting it check for setting offset 4573 // if setOffset is called previously or there was no previous match, we 4574 // leave the offset as it is. 4575 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4576 textoffset = strsrch->search->matchedIndex; 4577 } 4578 4579 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4580 patterncelength); 4581 4582 while (textoffset >= 0) 4583 { 4584 int32_t patternceindex = 1; 4585 int32_t targetce; 4586 UBool found = FALSE; 4587 int32_t firstce = UCOL_NULLORDER; 4588 4589 // if status is a failure, ucol_setOffset does nothing 4590 setColEIterOffset(coleiter, textoffset); 4591 4592 for (;;) { 4593 // finding the first pattern ce match, imagine composite 4594 // characters. for example: search for pattern \u0300 in text 4595 // \u00C0, we'll have to skip A first before we get to 4596 // \u0300 the grave accent 4597 targetce = ucol_next(coleiter, status); 4598 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4599 found = FALSE; 4600 break; 4601 } 4602 targetce = getCE(strsrch, targetce); 4603 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4604 firstce = targetce; 4605 } 4606 if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) { 4607 continue; 4608 } 4609 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4610 if (targetce == patternce[0]) { 4611 found = TRUE; 4612 break; 4613 } 4614 if (!hasExpansion(coleiter)) { 4615 // checking for accents in composite character 4616 found = FALSE; 4617 break; 4618 } 4619 } 4620 4621 //targetce = firstce; 4622 4623 while (found && (patternceindex < patterncelength)) { 4624 firstce = targetce; 4625 targetce = ucol_next(coleiter, status); 4626 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4627 found = FALSE; 4628 break; 4629 } 4630 targetce = getCE(strsrch, targetce); 4631 if (targetce == UCOL_IGNORABLE) { 4632 continue; 4633 } 4634 4635 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4636 found = found && targetce == patternce[patternceindex]; 4637 patternceindex ++; 4638 } 4639 4640 targetce = firstce; 4641 4642 if (!found) { 4643 if (U_FAILURE(*status)) { 4644 break; 4645 } 4646 4647 textoffset = reverseShift(strsrch, textoffset, targetce, 4648 patternceindex); 4649 patternceindex = 0; 4650 continue; 4651 } 4652 4653 if (checkPreviousExactMatch(strsrch, &textoffset, status)) { 4654 setColEIterOffset(coleiter, textoffset); 4655 return TRUE; 4656 } 4657 } 4658 setMatchNotFound(strsrch); 4659 return FALSE; 4660#else 4661 int32_t textOffset; 4662 4663 if (strsrch->search->isOverlap) { 4664 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4665 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4666 } else { 4667 // move the start position at the end of possible match 4668 initializePatternPCETable(strsrch, status); 4669 if (!initTextProcessedIter(strsrch, status)) { 4670 setMatchNotFound(strsrch); 4671 return FALSE; 4672 } 4673 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 4674 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4675 if (pce == UCOL_PROCESSED_NULLORDER) { 4676 // at the end of the text 4677 break; 4678 } 4679 } 4680 if (U_FAILURE(*status)) { 4681 setMatchNotFound(strsrch); 4682 return FALSE; 4683 } 4684 textOffset = ucol_getOffset(strsrch->textIter); 4685 } 4686 } else { 4687 textOffset = ucol_getOffset(strsrch->textIter); 4688 } 4689 4690 int32_t start = -1; 4691 int32_t end = -1; 4692 4693 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4694 strsrch->search->matchedIndex = start; 4695 strsrch->search->matchedLength = end - start; 4696 return TRUE; 4697 } else { 4698 setMatchNotFound(strsrch); 4699 return FALSE; 4700 } 4701#endif 4702} 4703 4704UBool usearch_handlePreviousCanonical(UStringSearch *strsrch, 4705 UErrorCode *status) 4706{ 4707 if (U_FAILURE(*status)) { 4708 setMatchNotFound(strsrch); 4709 return FALSE; 4710 } 4711 4712#if BOYER_MOORE 4713 UCollationElements *coleiter = strsrch->textIter; 4714 int32_t *patternce = strsrch->pattern.CE; 4715 int32_t patterncelength = strsrch->pattern.CELength; 4716 int32_t textoffset = ucol_getOffset(coleiter); 4717 UBool hasPatternAccents = 4718 strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents; 4719 4720 // shifting it check for setting offset 4721 // if setOffset is called previously or there was no previous match, we 4722 // leave the offset as it is. 4723 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4724 textoffset = strsrch->search->matchedIndex; 4725 } 4726 4727 textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER, 4728 patterncelength); 4729 strsrch->canonicalPrefixAccents[0] = 0; 4730 strsrch->canonicalSuffixAccents[0] = 0; 4731 4732 while (textoffset >= 0) 4733 { 4734 int32_t patternceindex = 1; 4735 int32_t targetce; 4736 UBool found = FALSE; 4737 int32_t firstce = UCOL_NULLORDER; 4738 4739 setColEIterOffset(coleiter, textoffset); 4740 for (;;) { 4741 // finding the first pattern ce match, imagine composite 4742 // characters. for example: search for pattern \u0300 in text 4743 // \u00C0, we'll have to skip A first before we get to 4744 // \u0300 the grave accent 4745 targetce = ucol_next(coleiter, status); 4746 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4747 found = FALSE; 4748 break; 4749 } 4750 targetce = getCE(strsrch, targetce); 4751 if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) { 4752 firstce = targetce; 4753 } 4754 4755 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4756 if (targetce == patternce[0]) { 4757 // the first ce can be a contraction 4758 found = TRUE; 4759 break; 4760 } 4761 if (!hasExpansion(coleiter)) { 4762 // checking for accents in composite character 4763 found = FALSE; 4764 break; 4765 } 4766 } 4767 4768 targetce = firstce; 4769 4770 while (found && patternceindex < patterncelength) { 4771 targetce = ucol_next(coleiter, status); 4772 if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) { 4773 found = FALSE; 4774 break; 4775 } 4776 targetce = getCE(strsrch, targetce); 4777 if (targetce == UCOL_IGNORABLE) { 4778 continue; 4779 } 4780 4781 // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s 4782 found = found && targetce == patternce[patternceindex]; 4783 patternceindex ++; 4784 } 4785 4786 // initializing the rearranged accent array 4787 if (hasPatternAccents && !found) { 4788 strsrch->canonicalPrefixAccents[0] = 0; 4789 strsrch->canonicalSuffixAccents[0] = 0; 4790 if (U_FAILURE(*status)) { 4791 break; 4792 } 4793 found = doPreviousCanonicalMatch(strsrch, textoffset, status); 4794 } 4795 4796 if (!found) { 4797 if (U_FAILURE(*status)) { 4798 break; 4799 } 4800 textoffset = reverseShift(strsrch, textoffset, targetce, 4801 patternceindex); 4802 patternceindex = 0; 4803 continue; 4804 } 4805 4806 if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) { 4807 setColEIterOffset(coleiter, textoffset); 4808 return TRUE; 4809 } 4810 } 4811 setMatchNotFound(strsrch); 4812 return FALSE; 4813#else 4814 int32_t textOffset; 4815 4816 if (strsrch->search->isOverlap) { 4817 if (strsrch->search->matchedIndex != USEARCH_DONE) { 4818 textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1; 4819 } else { 4820 // move the start position at the end of possible match 4821 initializePatternPCETable(strsrch, status); 4822 if (!initTextProcessedIter(strsrch, status)) { 4823 setMatchNotFound(strsrch); 4824 return FALSE; 4825 } 4826 for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) { 4827 int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status); 4828 if (pce == UCOL_PROCESSED_NULLORDER) { 4829 // at the end of the text 4830 break; 4831 } 4832 } 4833 if (U_FAILURE(*status)) { 4834 setMatchNotFound(strsrch); 4835 return FALSE; 4836 } 4837 textOffset = ucol_getOffset(strsrch->textIter); 4838 } 4839 } else { 4840 textOffset = ucol_getOffset(strsrch->textIter); 4841 } 4842 4843 int32_t start = -1; 4844 int32_t end = -1; 4845 4846 if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) { 4847 strsrch->search->matchedIndex = start; 4848 strsrch->search->matchedLength = end - start; 4849 return TRUE; 4850 } else { 4851 setMatchNotFound(strsrch); 4852 return FALSE; 4853 } 4854#endif 4855} 4856 4857#endif /* #if !UCONFIG_NO_COLLATION */ 4858