1/*
2**********************************************************************
3*   Copyright (C) 2001-2014 IBM and others. All rights reserved.
4**********************************************************************
5*   Date        Name        Description
6*  07/02/2001   synwee      Creation.
7**********************************************************************
8*/
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
13
14#include "unicode/usearch.h"
15#include "unicode/ustring.h"
16#include "unicode/uchar.h"
17#include "unicode/utf16.h"
18#include "normalizer2impl.h"
19#include "usrchimp.h"
20#include "cmemory.h"
21#include "ucln_in.h"
22#include "uassert.h"
23#include "ustr_imp.h"
24
25U_NAMESPACE_USE
26
27// don't use Boyer-Moore
28// (and if we decide to turn this on again there are several new TODOs that will need to be addressed)
29#define BOYER_MOORE 0
30
31#define LENGTHOF(array) (int32_t)(sizeof(array)/sizeof((array)[0]))
32
33// internal definition ---------------------------------------------------
34
35#define LAST_BYTE_MASK_          0xFF
36#define SECOND_LAST_BYTE_SHIFT_  8
37#define SUPPLEMENTARY_MIN_VALUE_ 0x10000
38
39static const Normalizer2Impl *g_nfcImpl = NULL;
40
41// internal methods -------------------------------------------------
42
43/**
44* Fast collation element iterator setOffset.
45* This function does not check for bounds.
46* @param coleiter collation element iterator
47* @param offset to set
48*/
49static
50inline void setColEIterOffset(UCollationElements *elems,
51                      int32_t             offset)
52{
53    // Note: Not "fast" any more after the 2013 collation rewrite.
54    // We do not want to expose more internals than necessary.
55    UErrorCode status = U_ZERO_ERROR;
56    ucol_setOffset(elems, offset, &status);
57}
58
59/**
60* Getting the mask for collation strength
61* @param strength collation strength
62* @return collation element mask
63*/
64static
65inline uint32_t getMask(UCollationStrength strength)
66{
67    switch (strength)
68    {
69    case UCOL_PRIMARY:
70        return UCOL_PRIMARYORDERMASK;
71    case UCOL_SECONDARY:
72        return UCOL_SECONDARYORDERMASK | UCOL_PRIMARYORDERMASK;
73    default:
74        return UCOL_TERTIARYORDERMASK | UCOL_SECONDARYORDERMASK |
75               UCOL_PRIMARYORDERMASK;
76    }
77}
78
79/**
80* This is to squeeze the 21bit ces into a 256 table
81* @param ce collation element
82* @return collapsed version of the collation element
83*/
84static
85inline int hash(uint32_t ce)
86{
87    // the old value UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_ does not work
88    // well with the new collation where most of the latin 1 characters
89    // are of the value xx000xxx. their hashes will most of the time be 0
90    // to be discussed on the hash algo.
91    return UCOL_PRIMARYORDER(ce) % MAX_TABLE_SIZE_;
92}
93
94U_CDECL_BEGIN
95static UBool U_CALLCONV
96usearch_cleanup(void) {
97    g_nfcImpl = NULL;
98    return TRUE;
99}
100U_CDECL_END
101
102/**
103* Initializing the fcd tables.
104* Internal method, status assumed to be a success.
105* @param status output error if any, caller to check status before calling
106*               method, status assumed to be success when passed in.
107*/
108static
109inline void initializeFCD(UErrorCode *status)
110{
111    if (g_nfcImpl == NULL) {
112        g_nfcImpl = Normalizer2Factory::getNFCImpl(*status);
113        ucln_i18n_registerCleanup(UCLN_I18N_USEARCH, usearch_cleanup);
114    }
115}
116
117/**
118* Gets the fcd value for a character at the argument index.
119* This method takes into accounts of the supplementary characters.
120* @param str UTF16 string where character for fcd retrieval resides
121* @param offset position of the character whose fcd is to be retrieved, to be
122*               overwritten with the next character position, taking
123*               surrogate characters into consideration.
124* @param strlength length of the argument string
125* @return fcd value
126*/
127static
128uint16_t getFCD(const UChar   *str, int32_t *offset,
129                             int32_t  strlength)
130{
131    const UChar *temp = str + *offset;
132    uint16_t    result = g_nfcImpl->nextFCD16(temp, str + strlength);
133    *offset = (int32_t)(temp - str);
134    return result;
135}
136
137/**
138* Getting the modified collation elements taking into account the collation
139* attributes
140* @param strsrch string search data
141* @param sourcece
142* @return the modified collation element
143*/
144static
145inline int32_t getCE(const UStringSearch *strsrch, uint32_t sourcece)
146{
147    // note for tertiary we can't use the collator->tertiaryMask, that
148    // is a preprocessed mask that takes into account case options. since
149    // we are only concerned with exact matches, we don't need that.
150    sourcece &= strsrch->ceMask;
151
152    if (strsrch->toShift) {
153        // alternate handling here, since only the 16 most significant digits
154        // is only used, we can safely do a compare without masking
155        // if the ce is a variable, we mask and get only the primary values
156        // no shifting to quartenary is required since all primary values
157        // less than variabletop will need to be masked off anyway.
158        if (strsrch->variableTop > sourcece) {
159            if (strsrch->strength >= UCOL_QUATERNARY) {
160                sourcece &= UCOL_PRIMARYORDERMASK;
161            }
162            else {
163                sourcece = UCOL_IGNORABLE;
164            }
165        }
166    } else if (strsrch->strength >= UCOL_QUATERNARY && sourcece == UCOL_IGNORABLE) {
167        sourcece = 0xFFFF;
168    }
169
170    return sourcece;
171}
172
173/**
174* Allocate a memory and returns NULL if it failed.
175* Internal method, status assumed to be a success.
176* @param size to allocate
177* @param status output error if any, caller to check status before calling
178*               method, status assumed to be success when passed in.
179* @return newly allocated array, NULL otherwise
180*/
181static
182inline void * allocateMemory(uint32_t size, UErrorCode *status)
183{
184    uint32_t *result = (uint32_t *)uprv_malloc(size);
185    if (result == NULL) {
186        *status = U_MEMORY_ALLOCATION_ERROR;
187    }
188    return result;
189}
190
191/**
192* Adds a uint32_t value to a destination array.
193* Creates a new array if we run out of space. The caller will have to
194* manually deallocate the newly allocated array.
195* Internal method, status assumed to be success, caller has to check status
196* before calling this method. destination not to be NULL and has at least
197* size destinationlength.
198* @param destination target array
199* @param offset destination offset to add value
200* @param destinationlength target array size, return value for the new size
201* @param value to be added
202* @param increments incremental size expected
203* @param status output error if any, caller to check status before calling
204*               method, status assumed to be success when passed in.
205* @return new destination array, destination if there was no new allocation
206*/
207static
208inline int32_t * addTouint32_tArray(int32_t    *destination,
209                                    uint32_t    offset,
210                                    uint32_t   *destinationlength,
211                                    uint32_t    value,
212                                    uint32_t    increments,
213                                    UErrorCode *status)
214{
215    uint32_t newlength = *destinationlength;
216    if (offset + 1 == newlength) {
217        newlength += increments;
218        int32_t *temp = (int32_t *)allocateMemory(
219                                         sizeof(int32_t) * newlength, status);
220        if (U_FAILURE(*status)) {
221            return NULL;
222        }
223        uprv_memcpy(temp, destination, sizeof(int32_t) * offset);
224        *destinationlength = newlength;
225        destination        = temp;
226    }
227    destination[offset] = value;
228    return destination;
229}
230
231/**
232* Adds a uint64_t value to a destination array.
233* Creates a new array if we run out of space. The caller will have to
234* manually deallocate the newly allocated array.
235* Internal method, status assumed to be success, caller has to check status
236* before calling this method. destination not to be NULL and has at least
237* size destinationlength.
238* @param destination target array
239* @param offset destination offset to add value
240* @param destinationlength target array size, return value for the new size
241* @param value to be added
242* @param increments incremental size expected
243* @param status output error if any, caller to check status before calling
244*               method, status assumed to be success when passed in.
245* @return new destination array, destination if there was no new allocation
246*/
247static
248inline int64_t * addTouint64_tArray(int64_t    *destination,
249                                    uint32_t    offset,
250                                    uint32_t   *destinationlength,
251                                    uint64_t    value,
252                                    uint32_t    increments,
253                                    UErrorCode *status)
254{
255    uint32_t newlength = *destinationlength;
256    if (offset + 1 == newlength) {
257        newlength += increments;
258        int64_t *temp = (int64_t *)allocateMemory(
259                                         sizeof(int64_t) * newlength, status);
260
261        if (U_FAILURE(*status)) {
262            return NULL;
263        }
264
265        uprv_memcpy(temp, destination, sizeof(int64_t) * offset);
266        *destinationlength = newlength;
267        destination        = temp;
268    }
269
270    destination[offset] = value;
271
272    return destination;
273}
274
275/**
276* Initializing the ce table for a pattern.
277* Stores non-ignorable collation keys.
278* Table size will be estimated by the size of the pattern text. Table
279* expansion will be perform as we go along. Adding 1 to ensure that the table
280* size definitely increases.
281* Internal method, status assumed to be a success.
282* @param strsrch string search data
283* @param status output error if any, caller to check status before calling
284*               method, status assumed to be success when passed in.
285* @return total number of expansions
286*/
287static
288inline uint16_t initializePatternCETable(UStringSearch *strsrch,
289                                         UErrorCode    *status)
290{
291    UPattern *pattern            = &(strsrch->pattern);
292    uint32_t  cetablesize        = INITIAL_ARRAY_SIZE_;
293    int32_t  *cetable            = pattern->CEBuffer;
294    uint32_t  patternlength      = pattern->textLength;
295    UCollationElements *coleiter = strsrch->utilIter;
296
297    if (coleiter == NULL) {
298        coleiter = ucol_openElements(strsrch->collator, pattern->text,
299                                     patternlength, status);
300        // status will be checked in ucol_next(..) later and if it is an
301        // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
302        // returned.
303        strsrch->utilIter = coleiter;
304    }
305    else {
306        ucol_setText(coleiter, pattern->text, pattern->textLength, status);
307    }
308    if(U_FAILURE(*status)) {
309        return 0;
310    }
311
312    if (pattern->CE != cetable && pattern->CE) {
313        uprv_free(pattern->CE);
314    }
315
316    uint16_t  offset      = 0;
317    uint16_t  result      = 0;
318    int32_t   ce;
319
320    while ((ce = ucol_next(coleiter, status)) != UCOL_NULLORDER &&
321           U_SUCCESS(*status)) {
322        uint32_t newce = getCE(strsrch, ce);
323        if (newce) {
324            int32_t *temp = addTouint32_tArray(cetable, offset, &cetablesize,
325                                  newce,
326                                  patternlength - ucol_getOffset(coleiter) + 1,
327                                  status);
328            if (U_FAILURE(*status)) {
329                return 0;
330            }
331            offset ++;
332            if (cetable != temp && cetable != pattern->CEBuffer) {
333                uprv_free(cetable);
334            }
335            cetable = temp;
336        }
337        result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
338    }
339
340    cetable[offset]   = 0;
341    pattern->CE       = cetable;
342    pattern->CELength = offset;
343
344    return result;
345}
346
347/**
348* Initializing the pce table for a pattern.
349* Stores non-ignorable collation keys.
350* Table size will be estimated by the size of the pattern text. Table
351* expansion will be perform as we go along. Adding 1 to ensure that the table
352* size definitely increases.
353* Internal method, status assumed to be a success.
354* @param strsrch string search data
355* @param status output error if any, caller to check status before calling
356*               method, status assumed to be success when passed in.
357* @return total number of expansions
358*/
359static
360inline uint16_t initializePatternPCETable(UStringSearch *strsrch,
361                                          UErrorCode    *status)
362{
363    UPattern *pattern            = &(strsrch->pattern);
364    uint32_t  pcetablesize       = INITIAL_ARRAY_SIZE_;
365    int64_t  *pcetable           = pattern->PCEBuffer;
366    uint32_t  patternlength      = pattern->textLength;
367    UCollationElements *coleiter = strsrch->utilIter;
368
369    if (coleiter == NULL) {
370        coleiter = ucol_openElements(strsrch->collator, pattern->text,
371                                     patternlength, status);
372        // status will be checked in ucol_next(..) later and if it is an
373        // error UCOL_NULLORDER the result of ucol_next(..) and 0 will be
374        // returned.
375        strsrch->utilIter = coleiter;
376    } else {
377        ucol_setText(coleiter, pattern->text, pattern->textLength, status);
378    }
379    if(U_FAILURE(*status)) {
380        return 0;
381    }
382
383    if (pattern->PCE != pcetable && pattern->PCE != NULL) {
384        uprv_free(pattern->PCE);
385    }
386
387    uint16_t  offset = 0;
388    uint16_t  result = 0;
389    int64_t   pce;
390
391    icu::UCollationPCE iter(coleiter);
392
393    // ** Should processed CEs be signed or unsigned?
394    // ** (the rest of the code in this file seems to play fast-and-loose with
395    // **  whether a CE is signed or unsigned. For example, look at routine above this one.)
396    while ((pce = iter.nextProcessed(NULL, NULL, status)) != UCOL_PROCESSED_NULLORDER &&
397           U_SUCCESS(*status)) {
398        int64_t *temp = addTouint64_tArray(pcetable, offset, &pcetablesize,
399                              pce,
400                              patternlength - ucol_getOffset(coleiter) + 1,
401                              status);
402
403        if (U_FAILURE(*status)) {
404            return 0;
405        }
406
407        offset += 1;
408
409        if (pcetable != temp && pcetable != pattern->PCEBuffer) {
410            uprv_free(pcetable);
411        }
412
413        pcetable = temp;
414        //result += (uint16_t)(ucol_getMaxExpansion(coleiter, ce) - 1);
415    }
416
417    pcetable[offset]   = 0;
418    pattern->PCE       = pcetable;
419    pattern->PCELength = offset;
420
421    return result;
422}
423
424/**
425* Initializes the pattern struct.
426* Internal method, status assumed to be success.
427* @param strsrch UStringSearch data storage
428* @param status output error if any, caller to check status before calling
429*               method, status assumed to be success when passed in.
430* @return expansionsize the total expansion size of the pattern
431*/
432static
433inline int16_t initializePattern(UStringSearch *strsrch, UErrorCode *status)
434{
435    if (U_FAILURE(*status)) { return 0; }
436          UPattern   *pattern     = &(strsrch->pattern);
437    const UChar      *patterntext = pattern->text;
438          int32_t     length      = pattern->textLength;
439          int32_t index       = 0;
440
441    // Since the strength is primary, accents are ignored in the pattern.
442    if (strsrch->strength == UCOL_PRIMARY) {
443        pattern->hasPrefixAccents = 0;
444        pattern->hasSuffixAccents = 0;
445    } else {
446        pattern->hasPrefixAccents = getFCD(patterntext, &index, length) >>
447                                                         SECOND_LAST_BYTE_SHIFT_;
448        index = length;
449        U16_BACK_1(patterntext, 0, index);
450        pattern->hasSuffixAccents = getFCD(patterntext, &index, length) &
451                                                                 LAST_BYTE_MASK_;
452    }
453
454    // ** HACK **
455    if (strsrch->pattern.PCE != NULL) {
456        if (strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
457            uprv_free(strsrch->pattern.PCE);
458        }
459
460        strsrch->pattern.PCE = NULL;
461    }
462
463    // since intializePattern is an internal method status is a success.
464    return initializePatternCETable(strsrch, status);
465}
466
467/**
468* Initializing shift tables, with the default values.
469* If a corresponding default value is 0, the shift table is not set.
470* @param shift table for forwards shift
471* @param backshift table for backwards shift
472* @param cetable table containing pattern ce
473* @param cesize size of the pattern ces
474* @param expansionsize total size of the expansions
475* @param defaultforward the default forward value
476* @param defaultbackward the default backward value
477*/
478static
479inline void setShiftTable(int16_t   shift[], int16_t backshift[],
480                          int32_t  *cetable, int32_t cesize,
481                          int16_t   expansionsize,
482                          int16_t   defaultforward,
483                          int16_t   defaultbackward)
484{
485    // estimate the value to shift. to do that we estimate the smallest
486    // number of characters to give the relevant ces, ie approximately
487    // the number of ces minus their expansion, since expansions can come
488    // from a character.
489    int32_t count;
490    for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
491        shift[count] = defaultforward;
492    }
493    cesize --; // down to the last index
494    for (count = 0; count < cesize; count ++) {
495        // number of ces from right of array to the count
496        int temp = defaultforward - count - 1;
497        shift[hash(cetable[count])] = temp > 1 ? temp : 1;
498    }
499    shift[hash(cetable[cesize])] = 1;
500    // for ignorables we just shift by one. see test examples.
501    shift[hash(0)] = 1;
502
503    for (count = 0; count < MAX_TABLE_SIZE_; count ++) {
504        backshift[count] = defaultbackward;
505    }
506    for (count = cesize; count > 0; count --) {
507        // the original value count does not seem to work
508        backshift[hash(cetable[count])] = count > expansionsize ?
509                                          (int16_t)(count - expansionsize) : 1;
510    }
511    backshift[hash(cetable[0])] = 1;
512    backshift[hash(0)] = 1;
513}
514
515/**
516* Building of the pattern collation element list and the boyer moore strsrch
517* table.
518* The canonical match will only be performed after the default match fails.
519* For both cases we need to remember the size of the composed and decomposed
520* versions of the string. Since the Boyer-Moore shift calculations shifts by
521* a number of characters in the text and tries to match the pattern from that
522* offset, the shift value can not be too large in case we miss some
523* characters. To choose a right shift size, we estimate the NFC form of the
524* and use its size as a shift guide. The NFC form should be the small
525* possible representation of the pattern. Anyways, we'll err on the smaller
526* shift size. Hence the calculation for minlength.
527* Canonical match will be performed slightly differently. We'll split the
528* pattern into 3 parts, the prefix accents (PA), the middle string bounded by
529* the first and last base character (MS), the ending accents (EA). Matches
530* will be done on MS first, and only when we match MS then some processing
531* will be required for the prefix and end accents in order to determine if
532* they match PA and EA. Hence the default shift values
533* for the canonical match will take the size of either end's accent into
534* consideration. Forwards search will take the end accents into consideration
535* for the default shift values and the backwards search will take the prefix
536* accents into consideration.
537* If pattern has no non-ignorable ce, we return a illegal argument error.
538* Internal method, status assumed to be success.
539* @param strsrch UStringSearch data storage
540* @param status  for output errors if it occurs, status is assumed to be a
541*                success when it is passed in.
542*/
543static
544inline void initialize(UStringSearch *strsrch, UErrorCode *status)
545{
546    int16_t expandlength  = initializePattern(strsrch, status);
547    if (U_SUCCESS(*status) && strsrch->pattern.CELength > 0) {
548        UPattern *pattern = &strsrch->pattern;
549        int32_t   cesize  = pattern->CELength;
550
551        int16_t minlength = cesize > expandlength
552                            ? (int16_t)cesize - expandlength : 1;
553        pattern->defaultShiftSize    = minlength;
554        setShiftTable(pattern->shift, pattern->backShift, pattern->CE,
555                      cesize, expandlength, minlength, minlength);
556        return;
557    }
558    strsrch->pattern.defaultShiftSize = 0;
559}
560
561#if BOYER_MOORE
562/**
563* Check to make sure that the match length is at the end of the character by
564* using the breakiterator.
565* @param strsrch string search data
566* @param start target text start offset
567* @param end target text end offset
568*/
569static
570void checkBreakBoundary(const UStringSearch *strsrch, int32_t * /*start*/,
571                               int32_t *end)
572{
573#if !UCONFIG_NO_BREAK_ITERATION
574    UBreakIterator *breakiterator = strsrch->search->internalBreakIter;
575    if (breakiterator) {
576        int32_t matchend = *end;
577        //int32_t matchstart = *start;
578
579        if (!ubrk_isBoundary(breakiterator, matchend)) {
580            *end = ubrk_following(breakiterator, matchend);
581        }
582
583        /* Check the start of the matched text to make sure it doesn't have any accents
584         * before it.  This code may not be necessary and so it is commented out */
585        /*if (!ubrk_isBoundary(breakiterator, matchstart) && !ubrk_isBoundary(breakiterator, matchstart-1)) {
586            *start = ubrk_preceding(breakiterator, matchstart);
587        }*/
588    }
589#endif
590}
591
592/**
593* Determine whether the target text in UStringSearch bounded by the offset
594* start and end is one or more whole units of text as
595* determined by the breakiterator in UStringSearch.
596* @param strsrch string search data
597* @param start target text start offset
598* @param end target text end offset
599*/
600static
601UBool isBreakUnit(const UStringSearch *strsrch, int32_t start,
602                               int32_t    end)
603{
604#if !UCONFIG_NO_BREAK_ITERATION
605    UBreakIterator *breakiterator = strsrch->search->breakIter;
606    //TODO: Add here.
607    if (breakiterator) {
608        int32_t startindex = ubrk_first(breakiterator);
609        int32_t endindex   = ubrk_last(breakiterator);
610
611        // out-of-range indexes are never boundary positions
612        if (start < startindex || start > endindex ||
613            end < startindex || end > endindex) {
614            return FALSE;
615        }
616        // otherwise, we can use following() on the position before the
617        // specified one and return true of the position we get back is the
618        // one the user specified
619        UBool result = (start == startindex ||
620                ubrk_following(breakiterator, start - 1) == start) &&
621               (end == endindex ||
622                ubrk_following(breakiterator, end - 1) == end);
623        if (result) {
624            // iterates the individual ces
625                  UCollationElements *coleiter  = strsrch->utilIter;
626            const UChar              *text      = strsrch->search->text +
627                                                                      start;
628                  UErrorCode          status    = U_ZERO_ERROR;
629            ucol_setText(coleiter, text, end - start, &status);
630            for (int32_t count = 0; count < strsrch->pattern.CELength;
631                 count ++) {
632                int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
633                if (ce == UCOL_IGNORABLE) {
634                    count --;
635                    continue;
636                }
637                if (U_FAILURE(status) || ce != strsrch->pattern.CE[count]) {
638                    return FALSE;
639                }
640            }
641            int32_t nextce = ucol_next(coleiter, &status);
642            while (ucol_getOffset(coleiter) == (end - start)
643                   && getCE(strsrch, nextce) == UCOL_IGNORABLE) {
644                nextce = ucol_next(coleiter, &status);
645            }
646            if (ucol_getOffset(coleiter) == (end - start)
647                && nextce != UCOL_NULLORDER) {
648                // extra collation elements at the end of the match
649                return FALSE;
650            }
651        }
652        return result;
653    }
654#endif
655    return TRUE;
656}
657
658/**
659* Getting the next base character offset if current offset is an accent,
660* or the current offset if the current character contains a base character.
661* accents the following base character will be returned
662* @param text string
663* @param textoffset current offset
664* @param textlength length of text string
665* @return the next base character or the current offset
666*         if the current character is contains a base character.
667*/
668static
669inline int32_t getNextBaseOffset(const UChar       *text,
670                                           int32_t  textoffset,
671                                           int32_t      textlength)
672{
673    if (textoffset < textlength) {
674        int32_t temp = textoffset;
675        if (getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
676            while (temp < textlength) {
677                int32_t result = temp;
678                if ((getFCD(text, &temp, textlength) >>
679                     SECOND_LAST_BYTE_SHIFT_) == 0) {
680                    return result;
681                }
682            }
683            return textlength;
684        }
685    }
686    return textoffset;
687}
688
689/**
690* Gets the next base character offset depending on the string search pattern
691* data
692* @param strsrch string search data
693* @param textoffset current offset, one offset away from the last character
694*                   to search for.
695* @return start index of the next base character or the current offset
696*         if the current character is contains a base character.
697*/
698static
699inline int32_t getNextUStringSearchBaseOffset(UStringSearch *strsrch,
700                                                  int32_t    textoffset)
701{
702    int32_t textlength = strsrch->search->textLength;
703    if (strsrch->pattern.hasSuffixAccents &&
704        textoffset < textlength) {
705              int32_t  temp       = textoffset;
706        const UChar       *text       = strsrch->search->text;
707        U16_BACK_1(text, 0, temp);
708        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
709            return getNextBaseOffset(text, textoffset, textlength);
710        }
711    }
712    return textoffset;
713}
714
715/**
716* Shifting the collation element iterator position forward to prepare for
717* a following match. If the last character is a unsafe character, we'll only
718* shift by 1 to capture contractions, normalization etc.
719* Internal method, status assumed to be success.
720* @param text strsrch string search data
721* @param textoffset start text position to do search
722* @param ce the text ce which failed the match.
723* @param patternceindex index of the ce within the pattern ce buffer which
724*        failed the match
725* @return final offset
726*/
727static
728inline int32_t shiftForward(UStringSearch *strsrch,
729                                int32_t    textoffset,
730                                int32_t       ce,
731                                int32_t        patternceindex)
732{
733    UPattern *pattern = &(strsrch->pattern);
734    if (ce != UCOL_NULLORDER) {
735        int32_t shift = pattern->shift[hash(ce)];
736        // this is to adjust for characters in the middle of the
737        // substring for matching that failed.
738        int32_t adjust = pattern->CELength - patternceindex;
739        if (adjust > 1 && shift >= adjust) {
740            shift -= adjust - 1;
741        }
742        textoffset += shift;
743    }
744    else {
745        textoffset += pattern->defaultShiftSize;
746    }
747
748    textoffset = getNextUStringSearchBaseOffset(strsrch, textoffset);
749    // check for unsafe characters
750    // * if it is the start or middle of a contraction: to be done after
751    //   a initial match is found
752    // * thai or lao base consonant character: similar to contraction
753    // * high surrogate character: similar to contraction
754    // * next character is a accent: shift to the next base character
755    return textoffset;
756}
757#endif // #if BOYER_MOORE
758
759/**
760* sets match not found
761* @param strsrch string search data
762*/
763static
764inline void setMatchNotFound(UStringSearch *strsrch)
765{
766    // this method resets the match result regardless of the error status.
767    strsrch->search->matchedIndex = USEARCH_DONE;
768    strsrch->search->matchedLength = 0;
769    if (strsrch->search->isForwardSearching) {
770        setColEIterOffset(strsrch->textIter, strsrch->search->textLength);
771    }
772    else {
773        setColEIterOffset(strsrch->textIter, 0);
774    }
775}
776
777#if BOYER_MOORE
778/**
779* Gets the offset to the next safe point in text.
780* ie. not the middle of a contraction, swappable characters or supplementary
781* characters.
782* @param collator collation sata
783* @param text string to work with
784* @param textoffset offset in string
785* @param textlength length of text string
786* @return offset to the next safe character
787*/
788static
789inline int32_t getNextSafeOffset(const UCollator   *collator,
790                                     const UChar       *text,
791                                           int32_t  textoffset,
792                                           int32_t      textlength)
793{
794    int32_t result = textoffset; // first contraction character
795    while (result != textlength && ucol_unsafeCP(text[result], collator)) {
796        result ++;
797    }
798    return result;
799}
800
801/**
802* This checks for accents in the potential match started with a .
803* composite character.
804* This is really painful... we have to check that composite character do not
805* have any extra accents. We have to normalize the potential match and find
806* the immediate decomposed character before the match.
807* The first composite character would have been taken care of by the fcd
808* checks in checkForwardExactMatch.
809* This is the slow path after the fcd of the first character and
810* the last character has been checked by checkForwardExactMatch and we
811* determine that the potential match has extra non-ignorable preceding
812* ces.
813* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
814* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
815* Note here that accents checking are slow and cautioned in the API docs.
816* Internal method, status assumed to be a success, caller should check status
817* before calling this method
818* @param strsrch string search data
819* @param start index of the potential unfriendly composite character
820* @param end index of the potential unfriendly composite character
821* @param status output error status if any.
822* @return TRUE if there is non-ignorable accents before at the beginning
823*              of the match, FALSE otherwise.
824*/
825
826static
827UBool checkExtraMatchAccents(const UStringSearch *strsrch, int32_t start,
828                                   int32_t    end,
829                                   UErrorCode    *status)
830{
831    UBool result = FALSE;
832    if (strsrch->pattern.hasPrefixAccents) {
833              int32_t  length = end - start;
834              int32_t  offset = 0;
835        const UChar       *text   = strsrch->search->text + start;
836
837        U16_FWD_1(text, offset, length);
838        // we are only concerned with the first composite character
839        if (unorm_quickCheck(text, offset, UNORM_NFD, status) == UNORM_NO) {
840            int32_t safeoffset = getNextSafeOffset(strsrch->collator,
841                                                       text, 0, length);
842            if (safeoffset != length) {
843                safeoffset ++;
844            }
845            UChar   *norm = NULL;
846            UChar    buffer[INITIAL_ARRAY_SIZE_];
847            int32_t  size = unorm_normalize(text, safeoffset, UNORM_NFD, 0,
848                                            buffer, INITIAL_ARRAY_SIZE_,
849                                            status);
850            if (U_FAILURE(*status)) {
851                return FALSE;
852            }
853            if (size >= INITIAL_ARRAY_SIZE_) {
854                norm = (UChar *)allocateMemory((size + 1) * sizeof(UChar),
855                                               status);
856                // if allocation failed, status will be set to
857                // U_MEMORY_ALLOCATION_ERROR and unorm_normalize internally
858                // checks for it.
859                size = unorm_normalize(text, safeoffset, UNORM_NFD, 0, norm,
860                                       size, status);
861                if (U_FAILURE(*status) && norm != NULL) {
862                    uprv_free(norm);
863                    return FALSE;
864                }
865            }
866            else {
867                norm = buffer;
868            }
869
870            UCollationElements *coleiter  = strsrch->utilIter;
871            ucol_setText(coleiter, norm, size, status);
872            uint32_t            firstce   = strsrch->pattern.CE[0];
873            UBool               ignorable = TRUE;
874            uint32_t            ce        = UCOL_IGNORABLE;
875            while (U_SUCCESS(*status) && ce != firstce && ce != (uint32_t)UCOL_NULLORDER) {
876                offset = ucol_getOffset(coleiter);
877                if (ce != firstce && ce != UCOL_IGNORABLE) {
878                    ignorable = FALSE;
879                }
880                ce = ucol_next(coleiter, status);
881            }
882            UChar32 codepoint;
883            U16_PREV(norm, 0, offset, codepoint);
884            result = !ignorable && (u_getCombiningClass(codepoint) != 0);
885
886            if (norm != buffer) {
887                uprv_free(norm);
888            }
889        }
890    }
891
892    return result;
893}
894
895/**
896* Used by exact matches, checks if there are accents before the match.
897* This is really painful... we have to check that composite characters at
898* the start of the matches have to not have any extra accents.
899* We check the FCD of the character first, if it starts with an accent and
900* the first pattern ce does not match the first ce of the character, we bail.
901* Otherwise we try normalizing the first composite
902* character and find the immediate decomposed character before the match to
903* see if it is an non-ignorable accent.
904* Now normalizing the first composite character is enough because we ensure
905* that when the match is passed in here with extra beginning ces, the
906* first or last ce that match has to occur within the first character.
907* E.g. looking for \u0301 acute in \u01FA A ring above and acute,
908* checkExtraMatchAccent should fail since there is a middle ring in \u01FA
909* Note here that accents checking are slow and cautioned in the API docs.
910* @param strsrch string search data
911* @param start offset
912* @param end offset
913* @return TRUE if there are accents on either side of the match,
914*         FALSE otherwise
915*/
916static
917UBool hasAccentsBeforeMatch(const UStringSearch *strsrch, int32_t start,
918                                  int32_t    end)
919{
920    if (strsrch->pattern.hasPrefixAccents) {
921        UCollationElements *coleiter  = strsrch->textIter;
922        UErrorCode          status    = U_ZERO_ERROR;
923        // we have been iterating forwards previously
924        uint32_t            ignorable = TRUE;
925        int32_t             firstce   = strsrch->pattern.CE[0];
926
927        setColEIterOffset(coleiter, start);
928        int32_t ce  = getCE(strsrch, ucol_next(coleiter, &status));
929        if (U_FAILURE(status)) {
930            return TRUE;
931        }
932        while (ce != firstce) {
933            if (ce != UCOL_IGNORABLE) {
934                ignorable = FALSE;
935            }
936            ce = getCE(strsrch, ucol_next(coleiter, &status));
937            if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
938                return TRUE;
939            }
940        }
941        if (!ignorable && inNormBuf(coleiter)) {
942            // within normalization buffer, discontiguous handled here
943            return TRUE;
944        }
945
946        // within text
947        int32_t temp = start;
948        // original code
949        // accent = (getFCD(strsrch->search->text, &temp,
950        //                  strsrch->search->textLength)
951        //            >> SECOND_LAST_BYTE_SHIFT_);
952        // however this code does not work well with VC7 .net in release mode.
953        // maybe the inlines for getFCD combined with shifting has bugs in
954        // VC7. anyways this is a work around.
955        UBool accent = getFCD(strsrch->search->text, &temp,
956                              strsrch->search->textLength) > 0xFF;
957        if (!accent) {
958            return checkExtraMatchAccents(strsrch, start, end, &status);
959        }
960        if (!ignorable) {
961            return TRUE;
962        }
963        if (start > 0) {
964            temp = start;
965            U16_BACK_1(strsrch->search->text, 0, temp);
966            if (getFCD(strsrch->search->text, &temp,
967                       strsrch->search->textLength) & LAST_BYTE_MASK_) {
968                setColEIterOffset(coleiter, start);
969                ce = ucol_previous(coleiter, &status);
970                if (U_FAILURE(status) ||
971                    (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE)) {
972                    return TRUE;
973                }
974            }
975        }
976    }
977
978    return FALSE;
979}
980
981/**
982* Used by exact matches, checks if there are accents bounding the match.
983* Note this is the initial boundary check. If the potential match
984* starts or ends with composite characters, the accents in those
985* characters will be determined later.
986* Not doing backwards iteration here, since discontiguos contraction for
987* backwards collation element iterator, use up too many characters.
988* E.g. looking for \u030A ring in \u01FA A ring above and acute,
989* should fail since there is a acute at the end of \u01FA
990* Note here that accents checking are slow and cautioned in the API docs.
991* @param strsrch string search data
992* @param start offset of match
993* @param end end offset of the match
994* @return TRUE if there are accents on either side of the match,
995*         FALSE otherwise
996*/
997static
998UBool hasAccentsAfterMatch(const UStringSearch *strsrch, int32_t start,
999                                 int32_t    end)
1000{
1001    if (strsrch->pattern.hasSuffixAccents) {
1002        const UChar       *text       = strsrch->search->text;
1003              int32_t  temp       = end;
1004              int32_t      textlength = strsrch->search->textLength;
1005        U16_BACK_1(text, 0, temp);
1006        if (getFCD(text, &temp, textlength) & LAST_BYTE_MASK_) {
1007            int32_t             firstce  = strsrch->pattern.CE[0];
1008            UCollationElements *coleiter = strsrch->textIter;
1009            UErrorCode          status   = U_ZERO_ERROR;
1010            int32_t ce;
1011            setColEIterOffset(coleiter, start);
1012            while ((ce = getCE(strsrch, ucol_next(coleiter, &status))) != firstce) {
1013                if (U_FAILURE(status) || ce == UCOL_NULLORDER) {
1014                    return TRUE;
1015                }
1016            }
1017            int32_t count = 1;
1018            while (count < strsrch->pattern.CELength) {
1019                if (getCE(strsrch, ucol_next(coleiter, &status))
1020                    == UCOL_IGNORABLE) {
1021                    // Thai can give an ignorable here.
1022                    count --;
1023                }
1024                if (U_FAILURE(status)) {
1025                    return TRUE;
1026                }
1027                count ++;
1028            }
1029
1030            ce = ucol_next(coleiter, &status);
1031            if (U_FAILURE(status)) {
1032                return TRUE;
1033            }
1034            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1035                ce = getCE(strsrch, ce);
1036            }
1037            if (ce != UCOL_NULLORDER && ce != UCOL_IGNORABLE) {
1038                if (ucol_getOffset(coleiter) <= end) {
1039                    return TRUE;
1040                }
1041                if (getFCD(text, &end, textlength) >> SECOND_LAST_BYTE_SHIFT_) {
1042                    return TRUE;
1043                }
1044            }
1045        }
1046    }
1047    return FALSE;
1048}
1049#endif // #if BOYER_MOORE
1050
1051/**
1052* Checks if the offset runs out of the text string
1053* @param offset
1054* @param textlength of the text string
1055* @return TRUE if offset is out of bounds, FALSE otherwise
1056*/
1057static
1058inline UBool isOutOfBounds(int32_t textlength, int32_t offset)
1059{
1060    return offset < 0 || offset > textlength;
1061}
1062
1063/**
1064* Checks for identical match
1065* @param strsrch string search data
1066* @param start offset of possible match
1067* @param end offset of possible match
1068* @return TRUE if identical match is found
1069*/
1070static
1071inline UBool checkIdentical(const UStringSearch *strsrch, int32_t start,
1072                                  int32_t    end)
1073{
1074    if (strsrch->strength != UCOL_IDENTICAL) {
1075        return TRUE;
1076    }
1077
1078    // Note: We could use Normalizer::compare() or similar, but for short strings
1079    // which may not be in FCD it might be faster to just NFD them.
1080    UErrorCode status = U_ZERO_ERROR;
1081    UnicodeString t2, p2;
1082    strsrch->nfd->normalize(
1083        UnicodeString(FALSE, strsrch->search->text + start, end - start), t2, status);
1084    strsrch->nfd->normalize(
1085        UnicodeString(FALSE, strsrch->pattern.text, strsrch->pattern.textLength), p2, status);
1086    // return FALSE if NFD failed
1087    return U_SUCCESS(status) && t2 == p2;
1088}
1089
1090#if BOYER_MOORE
1091/**
1092* Checks to see if the match is repeated
1093* @param strsrch string search data
1094* @param start new match start index
1095* @param end new match end index
1096* @return TRUE if the the match is repeated, FALSE otherwise
1097*/
1098static
1099inline UBool checkRepeatedMatch(UStringSearch *strsrch,
1100                                int32_t    start,
1101                                int32_t    end)
1102{
1103    int32_t lastmatchindex = strsrch->search->matchedIndex;
1104    UBool       result;
1105    if (lastmatchindex == USEARCH_DONE) {
1106        return FALSE;
1107    }
1108    if (strsrch->search->isForwardSearching) {
1109        result = start <= lastmatchindex;
1110    }
1111    else {
1112        result = start >= lastmatchindex;
1113    }
1114    if (!result && !strsrch->search->isOverlap) {
1115        if (strsrch->search->isForwardSearching) {
1116            result = start < lastmatchindex + strsrch->search->matchedLength;
1117        }
1118        else {
1119            result = end > lastmatchindex;
1120        }
1121    }
1122    return result;
1123}
1124
1125/**
1126* Gets the collation element iterator's current offset.
1127* @param coleiter collation element iterator
1128* @param forwards flag TRUE if we are moving in th forwards direction
1129* @return current offset
1130*/
1131static
1132inline int32_t getColElemIterOffset(const UCollationElements *coleiter,
1133                                              UBool               forwards)
1134{
1135    int32_t result = ucol_getOffset(coleiter);
1136    // intricacies of the the backwards collation element iterator
1137    if (FALSE && !forwards && inNormBuf(coleiter) && !isFCDPointerNull(coleiter)) {
1138        result ++;
1139    }
1140    return result;
1141}
1142
1143/**
1144* Checks match for contraction.
1145* If the match ends with a partial contraction we fail.
1146* If the match starts too far off (because of backwards iteration) we try to
1147* chip off the extra characters depending on whether a breakiterator has
1148* been used.
1149* Internal method, error assumed to be success, caller has to check status
1150* before calling this method.
1151* @param strsrch string search data
1152* @param start offset of potential match, to be modified if necessary
1153* @param end offset of potential match, to be modified if necessary
1154* @param status output error status if any
1155* @return TRUE if match passes the contraction test, FALSE otherwise
1156*/
1157
1158static
1159UBool checkNextExactContractionMatch(UStringSearch *strsrch,
1160                                     int32_t   *start,
1161                                     int32_t   *end, UErrorCode  *status)
1162{
1163          UCollationElements *coleiter   = strsrch->textIter;
1164          int32_t             textlength = strsrch->search->textLength;
1165          int32_t             temp       = *start;
1166    const UCollator          *collator   = strsrch->collator;
1167    const UChar              *text       = strsrch->search->text;
1168    // This part checks if either ends of the match contains potential
1169    // contraction. If so we'll have to iterate through them
1170    // The start contraction needs to be checked since ucol_previous dumps
1171    // all characters till the first safe character into the buffer.
1172    // *start + 1 is used to test for the unsafe characters instead of *start
1173    // because ucol_prev takes all unsafe characters till the first safe
1174    // character ie *start. so by testing *start + 1, we can estimate if
1175    // excess prefix characters has been included in the potential search
1176    // results.
1177    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1178        (*start + 1 < textlength
1179         && ucol_unsafeCP(text[*start + 1], collator))) {
1180        int32_t expansion  = getExpansionPrefix(coleiter);
1181        UBool   expandflag = expansion > 0;
1182        setColEIterOffset(coleiter, *start);
1183        while (expansion > 0) {
1184            // getting rid of the redundant ce, caused by setOffset.
1185            // since backward contraction/expansion may have extra ces if we
1186            // are in the normalization buffer, hasAccentsBeforeMatch would
1187            // have taken care of it.
1188            // E.g. the character \u01FA will have an expansion of 3, but if
1189            // we are only looking for acute and ring \u030A and \u0301, we'll
1190            // have to skip the first ce in the expansion buffer.
1191            ucol_next(coleiter, status);
1192            if (U_FAILURE(*status)) {
1193                return FALSE;
1194            }
1195            if (ucol_getOffset(coleiter) != temp) {
1196                *start = temp;
1197                temp  = ucol_getOffset(coleiter);
1198            }
1199            expansion --;
1200        }
1201
1202        int32_t  *patternce       = strsrch->pattern.CE;
1203        int32_t   patterncelength = strsrch->pattern.CELength;
1204        int32_t   count           = 0;
1205        while (count < patterncelength) {
1206            int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1207            if (ce == UCOL_IGNORABLE) {
1208                continue;
1209            }
1210            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1211                *start = temp;
1212                temp   = ucol_getOffset(coleiter);
1213            }
1214            if (U_FAILURE(*status) || ce != patternce[count]) {
1215                (*end) ++;
1216                *end = getNextUStringSearchBaseOffset(strsrch, *end);
1217                return FALSE;
1218            }
1219            count ++;
1220        }
1221    }
1222    return TRUE;
1223}
1224
1225/**
1226* Checks and sets the match information if found.
1227* Checks
1228* <ul>
1229* <li> the potential match does not repeat the previous match
1230* <li> boundaries are correct
1231* <li> exact matches has no extra accents
1232* <li> identical matchesb
1233* <li> potential match does not end in the middle of a contraction
1234* <\ul>
1235* Otherwise the offset will be shifted to the next character.
1236* Internal method, status assumed to be success, caller has to check status
1237* before calling this method.
1238* @param strsrch string search data
1239* @param textoffset offset in the collation element text. the returned value
1240*        will be the truncated end offset of the match or the new start
1241*        search offset.
1242* @param status output error status if any
1243* @return TRUE if the match is valid, FALSE otherwise
1244*/
1245static
1246inline UBool checkNextExactMatch(UStringSearch *strsrch,
1247                                 int32_t   *textoffset, UErrorCode *status)
1248{
1249    UCollationElements *coleiter = strsrch->textIter;
1250    int32_t         start    = getColElemIterOffset(coleiter, FALSE);
1251
1252    if (!checkNextExactContractionMatch(strsrch, &start, textoffset, status)) {
1253        return FALSE;
1254    }
1255
1256    // this totally matches, however we need to check if it is repeating
1257    if (!isBreakUnit(strsrch, start, *textoffset) ||
1258        checkRepeatedMatch(strsrch, start, *textoffset) ||
1259        hasAccentsBeforeMatch(strsrch, start, *textoffset) ||
1260        !checkIdentical(strsrch, start, *textoffset) ||
1261        hasAccentsAfterMatch(strsrch, start, *textoffset)) {
1262
1263        (*textoffset) ++;
1264        *textoffset = getNextUStringSearchBaseOffset(strsrch, *textoffset);
1265        return FALSE;
1266    }
1267
1268    //Add breakiterator boundary check for primary strength search.
1269    if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
1270        checkBreakBoundary(strsrch, &start, textoffset);
1271    }
1272
1273    // totally match, we will get rid of the ending ignorables.
1274    strsrch->search->matchedIndex  = start;
1275    strsrch->search->matchedLength = *textoffset - start;
1276    return TRUE;
1277}
1278
1279/**
1280* Getting the previous base character offset, or the current offset if the
1281* current character is a base character
1282* @param text string
1283* @param textoffset one offset after the current character
1284* @return the offset of the next character after the base character or the first
1285*         composed character with accents
1286*/
1287static
1288inline int32_t getPreviousBaseOffset(const UChar       *text,
1289                                               int32_t  textoffset)
1290{
1291    if (textoffset > 0) {
1292        for (;;) {
1293            int32_t result = textoffset;
1294            U16_BACK_1(text, 0, textoffset);
1295            int32_t temp = textoffset;
1296            uint16_t fcd = getFCD(text, &temp, result);
1297            if ((fcd >> SECOND_LAST_BYTE_SHIFT_) == 0) {
1298                if (fcd & LAST_BYTE_MASK_) {
1299                    return textoffset;
1300                }
1301                return result;
1302            }
1303            if (textoffset == 0) {
1304                return 0;
1305            }
1306        }
1307    }
1308    return textoffset;
1309}
1310
1311/**
1312* Getting the indexes of the accents that are not blocked in the argument
1313* accent array
1314* @param accents array of accents in nfd terminated by a 0.
1315* @param accentsindex array of indexes of the accents that are not blocked
1316*/
1317static
1318inline int getUnblockedAccentIndex(UChar *accents, int32_t *accentsindex)
1319{
1320    int32_t index     = 0;
1321    int32_t     length    = u_strlen(accents);
1322    UChar32     codepoint = 0;
1323    int         cclass    = 0;
1324    int         result    = 0;
1325    int32_t temp;
1326    while (index < length) {
1327        temp = index;
1328        U16_NEXT(accents, index, length, codepoint);
1329        if (u_getCombiningClass(codepoint) != cclass) {
1330            cclass        = u_getCombiningClass(codepoint);
1331            accentsindex[result] = temp;
1332            result ++;
1333        }
1334    }
1335    accentsindex[result] = length;
1336    return result;
1337}
1338
1339/**
1340* Appends 3 UChar arrays to a destination array.
1341* Creates a new array if we run out of space. The caller will have to
1342* manually deallocate the newly allocated array.
1343* Internal method, status assumed to be success, caller has to check status
1344* before calling this method. destination not to be NULL and has at least
1345* size destinationlength.
1346* @param destination target array
1347* @param destinationlength target array size, returning the appended length
1348* @param source1 null-terminated first array
1349* @param source2 second array
1350* @param source2length length of seond array
1351* @param source3 null-terminated third array
1352* @param status error status if any
1353* @return new destination array, destination if there was no new allocation
1354*/
1355static
1356inline UChar * addToUCharArray(      UChar      *destination,
1357                                     int32_t    *destinationlength,
1358                               const UChar      *source1,
1359                               const UChar      *source2,
1360                                     int32_t     source2length,
1361                               const UChar      *source3,
1362                                     UErrorCode *status)
1363{
1364    int32_t source1length = source1 ? u_strlen(source1) : 0;
1365    int32_t source3length = source3 ? u_strlen(source3) : 0;
1366    if (*destinationlength < source1length + source2length + source3length +
1367                                                                           1)
1368    {
1369        destination = (UChar *)allocateMemory(
1370          (source1length + source2length + source3length + 1) * sizeof(UChar),
1371          status);
1372        // if error allocating memory, status will be
1373        // U_MEMORY_ALLOCATION_ERROR
1374        if (U_FAILURE(*status)) {
1375            *destinationlength = 0;
1376            return NULL;
1377        }
1378    }
1379    if (source1length != 0) {
1380        uprv_memcpy(destination, source1, sizeof(UChar) * source1length);
1381    }
1382    if (source2length != 0) {
1383        uprv_memcpy(destination + source1length, source2,
1384                    sizeof(UChar) * source2length);
1385    }
1386    if (source3length != 0) {
1387        uprv_memcpy(destination + source1length + source2length, source3,
1388                    sizeof(UChar) * source3length);
1389    }
1390    *destinationlength = source1length + source2length + source3length;
1391    return destination;
1392}
1393
1394/**
1395* Running through a collation element iterator to see if the contents matches
1396* pattern in string search data
1397* @param strsrch string search data
1398* @param coleiter collation element iterator
1399* @return TRUE if a match if found, FALSE otherwise
1400*/
1401static
1402inline UBool checkCollationMatch(const UStringSearch      *strsrch,
1403                                       UCollationElements *coleiter)
1404{
1405    int         patternceindex = strsrch->pattern.CELength;
1406    int32_t    *patternce      = strsrch->pattern.CE;
1407    UErrorCode  status = U_ZERO_ERROR;
1408    while (patternceindex > 0) {
1409        int32_t ce = getCE(strsrch, ucol_next(coleiter, &status));
1410        if (ce == UCOL_IGNORABLE) {
1411            continue;
1412        }
1413        if (U_FAILURE(status) || ce != *patternce) {
1414            return FALSE;
1415        }
1416        patternce ++;
1417        patternceindex --;
1418    }
1419    return TRUE;
1420}
1421
1422/**
1423* Rearranges the front accents to try matching.
1424* Prefix accents in the text will be grouped according to their combining
1425* class and the groups will be mixed and matched to try find the perfect
1426* match with the pattern.
1427* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1428* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1429*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1430*         "\u0301\u0325".
1431* step 2: check if any of the generated substrings matches the pattern.
1432* Internal method, status is assumed to be success, caller has to check status
1433* before calling this method.
1434* @param strsrch string search match
1435* @param start first offset of the accents to start searching
1436* @param end start of the last accent set
1437* @param status output error status if any
1438* @return USEARCH_DONE if a match is not found, otherwise return the starting
1439*         offset of the match. Note this start includes all preceding accents.
1440*/
1441static
1442int32_t doNextCanonicalPrefixMatch(UStringSearch *strsrch,
1443                                       int32_t    start,
1444                                       int32_t    end,
1445                                       UErrorCode    *status)
1446{
1447    const UChar       *text       = strsrch->search->text;
1448          int32_t      textlength = strsrch->search->textLength;
1449          int32_t  tempstart  = start;
1450
1451    if ((getFCD(text, &tempstart, textlength) & LAST_BYTE_MASK_) == 0) {
1452        // die... failed at a base character
1453        return USEARCH_DONE;
1454    }
1455
1456    int32_t offset = getNextBaseOffset(text, tempstart, textlength);
1457    start = getPreviousBaseOffset(text, tempstart);
1458
1459    UChar       accents[INITIAL_ARRAY_SIZE_];
1460    // normalizing the offensive string
1461    unorm_normalize(text + start, offset - start, UNORM_NFD, 0, accents,
1462                    INITIAL_ARRAY_SIZE_, status);
1463    if (U_FAILURE(*status)) {
1464        return USEARCH_DONE;
1465    }
1466
1467    int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
1468    int32_t         accentsize = getUnblockedAccentIndex(accents,
1469                                                                 accentsindex);
1470    int32_t         count      = (2 << (accentsize - 1)) - 1;
1471    UChar               buffer[INITIAL_ARRAY_SIZE_];
1472    UCollationElements *coleiter   = strsrch->utilIter;
1473    while (U_SUCCESS(*status) && count > 0) {
1474        UChar *rearrange = strsrch->canonicalPrefixAccents;
1475        // copy the base characters
1476        for (int k = 0; k < accentsindex[0]; k ++) {
1477            *rearrange ++ = accents[k];
1478        }
1479        // forming all possible canonical rearrangement by dropping
1480        // sets of accents
1481        for (int i = 0; i <= accentsize - 1; i ++) {
1482            int32_t mask = 1 << (accentsize - i - 1);
1483            if (count & mask) {
1484                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1485                    *rearrange ++ = accents[j];
1486                }
1487            }
1488        }
1489        *rearrange = 0;
1490        int32_t  matchsize = INITIAL_ARRAY_SIZE_;
1491        UChar   *match     = addToUCharArray(buffer, &matchsize,
1492                                           strsrch->canonicalPrefixAccents,
1493                                           strsrch->search->text + offset,
1494                                           end - offset,
1495                                           strsrch->canonicalSuffixAccents,
1496                                           status);
1497
1498        // if status is a failure, ucol_setText does nothing.
1499        // run the collator iterator through this match
1500        ucol_setText(coleiter, match, matchsize, status);
1501        if (U_SUCCESS(*status)) {
1502            if (checkCollationMatch(strsrch, coleiter)) {
1503                if (match != buffer) {
1504                    uprv_free(match);
1505                }
1506                return start;
1507            }
1508        }
1509        count --;
1510    }
1511    return USEARCH_DONE;
1512}
1513
1514/**
1515* Gets the offset to the safe point in text before textoffset.
1516* ie. not the middle of a contraction, swappable characters or supplementary
1517* characters.
1518* @param collator collation sata
1519* @param text string to work with
1520* @param textoffset offset in string
1521* @param textlength length of text string
1522* @return offset to the previous safe character
1523*/
1524static
1525inline uint32_t getPreviousSafeOffset(const UCollator   *collator,
1526                                      const UChar       *text,
1527                                            int32_t  textoffset)
1528{
1529    int32_t result = textoffset; // first contraction character
1530    while (result != 0 && ucol_unsafeCP(text[result - 1], collator)) {
1531        result --;
1532    }
1533    if (result != 0) {
1534        // the first contraction character is consider unsafe here
1535        result --;
1536    }
1537    return result;
1538}
1539
1540/**
1541* Cleaning up after we passed the safe zone
1542* @param strsrch string search data
1543* @param safetext safe text array
1544* @param safebuffer safe text buffer
1545* @param coleiter collation element iterator for safe text
1546*/
1547static
1548inline void cleanUpSafeText(const UStringSearch *strsrch, UChar *safetext,
1549                                  UChar         *safebuffer)
1550{
1551    if (safetext != safebuffer && safetext != strsrch->canonicalSuffixAccents)
1552    {
1553       uprv_free(safetext);
1554    }
1555}
1556
1557/**
1558* Take the rearranged end accents and tries matching. If match failed at
1559* a seperate preceding set of accents (seperated from the rearranged on by
1560* at least a base character) then we rearrange the preceding accents and
1561* tries matching again.
1562* We allow skipping of the ends of the accent set if the ces do not match.
1563* However if the failure is found before the accent set, it fails.
1564* Internal method, status assumed to be success, caller has to check status
1565* before calling this method.
1566* @param strsrch string search data
1567* @param textoffset of the start of the rearranged accent
1568* @param status output error status if any
1569* @return USEARCH_DONE if a match is not found, otherwise return the starting
1570*         offset of the match. Note this start includes all preceding accents.
1571*/
1572static
1573int32_t doNextCanonicalSuffixMatch(UStringSearch *strsrch,
1574                                       int32_t    textoffset,
1575                                       UErrorCode    *status)
1576{
1577    const UChar              *text           = strsrch->search->text;
1578    const UCollator          *collator       = strsrch->collator;
1579          int32_t             safelength     = 0;
1580          UChar              *safetext;
1581          int32_t             safetextlength;
1582          UChar               safebuffer[INITIAL_ARRAY_SIZE_];
1583          UCollationElements *coleiter       = strsrch->utilIter;
1584          int32_t         safeoffset     = textoffset;
1585
1586    if (textoffset != 0 && ucol_unsafeCP(strsrch->canonicalSuffixAccents[0],
1587                                         collator)) {
1588        safeoffset     = getPreviousSafeOffset(collator, text, textoffset);
1589        safelength     = textoffset - safeoffset;
1590        safetextlength = INITIAL_ARRAY_SIZE_;
1591        safetext       = addToUCharArray(safebuffer, &safetextlength, NULL,
1592                                         text + safeoffset, safelength,
1593                                         strsrch->canonicalSuffixAccents,
1594                                         status);
1595    }
1596    else {
1597        safetextlength = u_strlen(strsrch->canonicalSuffixAccents);
1598        safetext       = strsrch->canonicalSuffixAccents;
1599    }
1600
1601    // if status is a failure, ucol_setText does nothing
1602    ucol_setText(coleiter, safetext, safetextlength, status);
1603    // status checked in loop below
1604
1605    int32_t  *ce        = strsrch->pattern.CE;
1606    int32_t   celength  = strsrch->pattern.CELength;
1607    int       ceindex   = celength - 1;
1608    UBool     isSafe    = TRUE; // indication flag for position in safe zone
1609
1610    while (ceindex >= 0) {
1611        int32_t textce = ucol_previous(coleiter, status);
1612        if (U_FAILURE(*status)) {
1613            if (isSafe) {
1614                cleanUpSafeText(strsrch, safetext, safebuffer);
1615            }
1616            return USEARCH_DONE;
1617        }
1618        if (textce == UCOL_NULLORDER) {
1619            // check if we have passed the safe buffer
1620            if (coleiter == strsrch->textIter) {
1621                cleanUpSafeText(strsrch, safetext, safebuffer);
1622                return USEARCH_DONE;
1623            }
1624            cleanUpSafeText(strsrch, safetext, safebuffer);
1625            safetext = safebuffer;
1626            coleiter = strsrch->textIter;
1627            setColEIterOffset(coleiter, safeoffset);
1628            // status checked at the start of the loop
1629            isSafe = FALSE;
1630            continue;
1631        }
1632        textce = getCE(strsrch, textce);
1633        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
1634            // do the beginning stuff
1635            int32_t failedoffset = getColElemIterOffset(coleiter, FALSE);
1636            if (isSafe && failedoffset >= safelength) {
1637                // alas... no hope. failed at rearranged accent set
1638                cleanUpSafeText(strsrch, safetext, safebuffer);
1639                return USEARCH_DONE;
1640            }
1641            else {
1642                if (isSafe) {
1643                    failedoffset += safeoffset;
1644                    cleanUpSafeText(strsrch, safetext, safebuffer);
1645                }
1646
1647                // try rearranging the front accents
1648                int32_t result = doNextCanonicalPrefixMatch(strsrch,
1649                                        failedoffset, textoffset, status);
1650                if (result != USEARCH_DONE) {
1651                    // if status is a failure, ucol_setOffset does nothing
1652                    setColEIterOffset(strsrch->textIter, result);
1653                }
1654                if (U_FAILURE(*status)) {
1655                    return USEARCH_DONE;
1656                }
1657                return result;
1658            }
1659        }
1660        if (textce == ce[ceindex]) {
1661            ceindex --;
1662        }
1663    }
1664    // set offset here
1665    if (isSafe) {
1666        int32_t result     = getColElemIterOffset(coleiter, FALSE);
1667        // sets the text iterator here with the correct expansion and offset
1668        int32_t    leftoverces = getExpansionPrefix(coleiter);
1669        cleanUpSafeText(strsrch, safetext, safebuffer);
1670        if (result >= safelength) {
1671            result = textoffset;
1672        }
1673        else {
1674            result += safeoffset;
1675        }
1676        setColEIterOffset(strsrch->textIter, result);
1677        strsrch->textIter->iteratordata_.toReturn =
1678                       setExpansionPrefix(strsrch->textIter, leftoverces);
1679        return result;
1680    }
1681
1682    return ucol_getOffset(coleiter);
1683}
1684
1685/**
1686* Trying out the substring and sees if it can be a canonical match.
1687* This will try normalizing the end accents and arranging them into canonical
1688* equivalents and check their corresponding ces with the pattern ce.
1689* Suffix accents in the text will be grouped according to their combining
1690* class and the groups will be mixed and matched to try find the perfect
1691* match with the pattern.
1692* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
1693* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
1694*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
1695*         "\u0301\u0325".
1696* step 2: check if any of the generated substrings matches the pattern.
1697* Internal method, status assumed to be success, caller has to check status
1698* before calling this method.
1699* @param strsrch string search data
1700* @param textoffset end offset in the collation element text that ends with
1701*                   the accents to be rearranged
1702* @param status error status if any
1703* @return TRUE if the match is valid, FALSE otherwise
1704*/
1705static
1706UBool doNextCanonicalMatch(UStringSearch *strsrch,
1707                           int32_t    textoffset,
1708                           UErrorCode    *status)
1709{
1710    const UChar       *text = strsrch->search->text;
1711          int32_t  temp = textoffset;
1712    U16_BACK_1(text, 0, temp);
1713    if ((getFCD(text, &temp, textoffset) & LAST_BYTE_MASK_) == 0) {
1714        UCollationElements *coleiter = strsrch->textIter;
1715        int32_t         offset   = getColElemIterOffset(coleiter, FALSE);
1716        if (strsrch->pattern.hasPrefixAccents) {
1717            offset = doNextCanonicalPrefixMatch(strsrch, offset, textoffset,
1718                                                status);
1719            if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
1720                setColEIterOffset(coleiter, offset);
1721                return TRUE;
1722            }
1723        }
1724        return FALSE;
1725    }
1726
1727    if (!strsrch->pattern.hasSuffixAccents) {
1728        return FALSE;
1729    }
1730
1731    UChar       accents[INITIAL_ARRAY_SIZE_];
1732    // offset to the last base character in substring to search
1733    int32_t baseoffset = getPreviousBaseOffset(text, textoffset);
1734    // normalizing the offensive string
1735    unorm_normalize(text + baseoffset, textoffset - baseoffset, UNORM_NFD,
1736                               0, accents, INITIAL_ARRAY_SIZE_, status);
1737    // status checked in loop below
1738
1739    int32_t accentsindex[INITIAL_ARRAY_SIZE_];
1740    int32_t size = getUnblockedAccentIndex(accents, accentsindex);
1741
1742    // 2 power n - 1 plus the full set of accents
1743    int32_t  count = (2 << (size - 1)) - 1;
1744    while (U_SUCCESS(*status) && count > 0) {
1745        UChar *rearrange = strsrch->canonicalSuffixAccents;
1746        // copy the base characters
1747        for (int k = 0; k < accentsindex[0]; k ++) {
1748            *rearrange ++ = accents[k];
1749        }
1750        // forming all possible canonical rearrangement by dropping
1751        // sets of accents
1752        for (int i = 0; i <= size - 1; i ++) {
1753            int32_t mask = 1 << (size - i - 1);
1754            if (count & mask) {
1755                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
1756                    *rearrange ++ = accents[j];
1757                }
1758            }
1759        }
1760        *rearrange = 0;
1761        int32_t offset = doNextCanonicalSuffixMatch(strsrch, baseoffset,
1762                                                        status);
1763        if (offset != USEARCH_DONE) {
1764            return TRUE; // match found
1765        }
1766        count --;
1767    }
1768    return FALSE;
1769}
1770
1771/**
1772* Gets the previous base character offset depending on the string search
1773* pattern data
1774* @param strsrch string search data
1775* @param textoffset current offset, current character
1776* @return the offset of the next character after this base character or itself
1777*         if it is a composed character with accents
1778*/
1779static
1780inline int32_t getPreviousUStringSearchBaseOffset(UStringSearch *strsrch,
1781                                                      int32_t textoffset)
1782{
1783    if (strsrch->pattern.hasPrefixAccents && textoffset > 0) {
1784        const UChar       *text = strsrch->search->text;
1785              int32_t  offset = textoffset;
1786        if (getFCD(text, &offset, strsrch->search->textLength) >>
1787                                                   SECOND_LAST_BYTE_SHIFT_) {
1788            return getPreviousBaseOffset(text, textoffset);
1789        }
1790    }
1791    return textoffset;
1792}
1793
1794/**
1795* Checks match for contraction.
1796* If the match ends with a partial contraction we fail.
1797* If the match starts too far off (because of backwards iteration) we try to
1798* chip off the extra characters
1799* Internal method, status assumed to be success, caller has to check status
1800* before calling this method.
1801* @param strsrch string search data
1802* @param start offset of potential match, to be modified if necessary
1803* @param end offset of potential match, to be modified if necessary
1804* @param status output error status if any
1805* @return TRUE if match passes the contraction test, FALSE otherwise
1806*/
1807static
1808UBool checkNextCanonicalContractionMatch(UStringSearch *strsrch,
1809                                         int32_t   *start,
1810                                         int32_t   *end,
1811                                         UErrorCode    *status)
1812{
1813          UCollationElements *coleiter   = strsrch->textIter;
1814          int32_t             textlength = strsrch->search->textLength;
1815          int32_t         temp       = *start;
1816    const UCollator          *collator   = strsrch->collator;
1817    const UChar              *text       = strsrch->search->text;
1818    // This part checks if either ends of the match contains potential
1819    // contraction. If so we'll have to iterate through them
1820    if ((*end < textlength && ucol_unsafeCP(text[*end], collator)) ||
1821        (*start + 1 < textlength
1822         && ucol_unsafeCP(text[*start + 1], collator))) {
1823        int32_t expansion  = getExpansionPrefix(coleiter);
1824        UBool   expandflag = expansion > 0;
1825        setColEIterOffset(coleiter, *start);
1826        while (expansion > 0) {
1827            // getting rid of the redundant ce, caused by setOffset.
1828            // since backward contraction/expansion may have extra ces if we
1829            // are in the normalization buffer, hasAccentsBeforeMatch would
1830            // have taken care of it.
1831            // E.g. the character \u01FA will have an expansion of 3, but if
1832            // we are only looking for acute and ring \u030A and \u0301, we'll
1833            // have to skip the first ce in the expansion buffer.
1834            ucol_next(coleiter, status);
1835            if (U_FAILURE(*status)) {
1836                return FALSE;
1837            }
1838            if (ucol_getOffset(coleiter) != temp) {
1839                *start = temp;
1840                temp  = ucol_getOffset(coleiter);
1841            }
1842            expansion --;
1843        }
1844
1845        int32_t  *patternce       = strsrch->pattern.CE;
1846        int32_t   patterncelength = strsrch->pattern.CELength;
1847        int32_t   count           = 0;
1848        int32_t   textlength      = strsrch->search->textLength;
1849        while (count < patterncelength) {
1850            int32_t ce = getCE(strsrch, ucol_next(coleiter, status));
1851            // status checked below, note that if status is a failure
1852            // ucol_next returns UCOL_NULLORDER
1853            if (ce == UCOL_IGNORABLE) {
1854                continue;
1855            }
1856            if (expandflag && count == 0 && ucol_getOffset(coleiter) != temp) {
1857                *start = temp;
1858                temp   = ucol_getOffset(coleiter);
1859            }
1860
1861            if (count == 0 && ce != patternce[0]) {
1862                // accents may have extra starting ces, this occurs when a
1863                // pure accent pattern is matched without rearrangement
1864                // text \u0325\u0300 and looking for \u0300
1865                int32_t expected = patternce[0];
1866                if (getFCD(text, start, textlength) & LAST_BYTE_MASK_) {
1867                    ce = getCE(strsrch, ucol_next(coleiter, status));
1868                    while (U_SUCCESS(*status) && ce != expected &&
1869                           ce != UCOL_NULLORDER &&
1870                           ucol_getOffset(coleiter) <= *end) {
1871                        ce = getCE(strsrch, ucol_next(coleiter, status));
1872                    }
1873                }
1874            }
1875            if (U_FAILURE(*status) || ce != patternce[count]) {
1876                (*end) ++;
1877                *end = getNextUStringSearchBaseOffset(strsrch, *end);
1878                return FALSE;
1879            }
1880            count ++;
1881        }
1882    }
1883    return TRUE;
1884}
1885
1886/**
1887* Checks and sets the match information if found.
1888* Checks
1889* <ul>
1890* <li> the potential match does not repeat the previous match
1891* <li> boundaries are correct
1892* <li> potential match does not end in the middle of a contraction
1893* <li> identical matches
1894* <\ul>
1895* Otherwise the offset will be shifted to the next character.
1896* Internal method, status assumed to be success, caller has to check the
1897* status before calling this method.
1898* @param strsrch string search data
1899* @param textoffset offset in the collation element text. the returned value
1900*        will be the truncated end offset of the match or the new start
1901*        search offset.
1902* @param status output error status if any
1903* @return TRUE if the match is valid, FALSE otherwise
1904*/
1905static
1906inline UBool checkNextCanonicalMatch(UStringSearch *strsrch,
1907                                     int32_t   *textoffset,
1908                                     UErrorCode    *status)
1909{
1910    // to ensure that the start and ends are not composite characters
1911    UCollationElements *coleiter = strsrch->textIter;
1912    // if we have a canonical accent match
1913    if ((strsrch->pattern.hasSuffixAccents &&
1914        strsrch->canonicalSuffixAccents[0]) ||
1915        (strsrch->pattern.hasPrefixAccents &&
1916        strsrch->canonicalPrefixAccents[0])) {
1917        strsrch->search->matchedIndex  = getPreviousUStringSearchBaseOffset(
1918                                                    strsrch,
1919                                                    ucol_getOffset(coleiter));
1920        strsrch->search->matchedLength = *textoffset -
1921                                                strsrch->search->matchedIndex;
1922        return TRUE;
1923    }
1924
1925    int32_t start = getColElemIterOffset(coleiter, FALSE);
1926    if (!checkNextCanonicalContractionMatch(strsrch, &start, textoffset,
1927                                            status) || U_FAILURE(*status)) {
1928        return FALSE;
1929    }
1930
1931    start = getPreviousUStringSearchBaseOffset(strsrch, start);
1932    // this totally matches, however we need to check if it is repeating
1933    if (checkRepeatedMatch(strsrch, start, *textoffset) ||
1934        !isBreakUnit(strsrch, start, *textoffset) ||
1935        !checkIdentical(strsrch, start, *textoffset)) {
1936        (*textoffset) ++;
1937        *textoffset = getNextBaseOffset(strsrch->search->text, *textoffset,
1938                                        strsrch->search->textLength);
1939        return FALSE;
1940    }
1941
1942    strsrch->search->matchedIndex  = start;
1943    strsrch->search->matchedLength = *textoffset - start;
1944    return TRUE;
1945}
1946
1947/**
1948* Shifting the collation element iterator position forward to prepare for
1949* a preceding match. If the first character is a unsafe character, we'll only
1950* shift by 1 to capture contractions, normalization etc.
1951* Internal method, status assumed to be success, caller has to check status
1952* before calling this method.
1953* @param text strsrch string search data
1954* @param textoffset start text position to do search
1955* @param ce the text ce which failed the match.
1956* @param patternceindex index of the ce within the pattern ce buffer which
1957*        failed the match
1958* @return final offset
1959*/
1960static
1961inline int32_t reverseShift(UStringSearch *strsrch,
1962                                int32_t    textoffset,
1963                                int32_t       ce,
1964                                int32_t        patternceindex)
1965{
1966    if (strsrch->search->isOverlap) {
1967        if (textoffset != strsrch->search->textLength) {
1968            textoffset --;
1969        }
1970        else {
1971            textoffset -= strsrch->pattern.defaultShiftSize;
1972        }
1973    }
1974    else {
1975        if (ce != UCOL_NULLORDER) {
1976            int32_t shift = strsrch->pattern.backShift[hash(ce)];
1977
1978            // this is to adjust for characters in the middle of the substring
1979            // for matching that failed.
1980            int32_t adjust = patternceindex;
1981            if (adjust > 1 && shift > adjust) {
1982                shift -= adjust - 1;
1983            }
1984            textoffset -= shift;
1985        }
1986        else {
1987            textoffset -= strsrch->pattern.defaultShiftSize;
1988        }
1989    }
1990    textoffset = getPreviousUStringSearchBaseOffset(strsrch, textoffset);
1991    return textoffset;
1992}
1993
1994/**
1995* Checks match for contraction.
1996* If the match starts with a partial contraction we fail.
1997* Internal method, status assumed to be success, caller has to check status
1998* before calling this method.
1999* @param strsrch string search data
2000* @param start offset of potential match, to be modified if necessary
2001* @param end offset of potential match, to be modified if necessary
2002* @param status output error status if any
2003* @return TRUE if match passes the contraction test, FALSE otherwise
2004*/
2005static
2006UBool checkPreviousExactContractionMatch(UStringSearch *strsrch,
2007                                     int32_t   *start,
2008                                     int32_t   *end, UErrorCode  *status)
2009{
2010          UCollationElements *coleiter   = strsrch->textIter;
2011          int32_t             textlength = strsrch->search->textLength;
2012          int32_t             temp       = *end;
2013    const UCollator          *collator   = strsrch->collator;
2014    const UChar              *text       = strsrch->search->text;
2015    // This part checks if either if the start of the match contains potential
2016    // contraction. If so we'll have to iterate through them
2017    // Since we used ucol_next while previously looking for the potential
2018    // match, this guarantees that our end will not be a partial contraction,
2019    // or a partial supplementary character.
2020    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2021        int32_t expansion  = getExpansionSuffix(coleiter);
2022        UBool   expandflag = expansion > 0;
2023        setColEIterOffset(coleiter, *end);
2024        while (U_SUCCESS(*status) && expansion > 0) {
2025            // getting rid of the redundant ce
2026            // since forward contraction/expansion may have extra ces
2027            // if we are in the normalization buffer, hasAccentsBeforeMatch
2028            // would have taken care of it.
2029            // E.g. the character \u01FA will have an expansion of 3, but if
2030            // we are only looking for A ring A\u030A, we'll have to skip the
2031            // last ce in the expansion buffer
2032            ucol_previous(coleiter, status);
2033            if (U_FAILURE(*status)) {
2034                return FALSE;
2035            }
2036            if (ucol_getOffset(coleiter) != temp) {
2037                *end = temp;
2038                temp  = ucol_getOffset(coleiter);
2039            }
2040            expansion --;
2041        }
2042
2043        int32_t  *patternce       = strsrch->pattern.CE;
2044        int32_t   patterncelength = strsrch->pattern.CELength;
2045        int32_t   count           = patterncelength;
2046        while (count > 0) {
2047            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2048            // status checked below, note that if status is a failure
2049            // ucol_previous returns UCOL_NULLORDER
2050            if (ce == UCOL_IGNORABLE) {
2051                continue;
2052            }
2053            if (expandflag && count == 0 &&
2054                getColElemIterOffset(coleiter, FALSE) != temp) {
2055                *end = temp;
2056                temp  = ucol_getOffset(coleiter);
2057            }
2058            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2059                (*start) --;
2060                *start = getPreviousBaseOffset(text, *start);
2061                return FALSE;
2062            }
2063            count --;
2064        }
2065    }
2066    return TRUE;
2067}
2068
2069/**
2070* Checks and sets the match information if found.
2071* Checks
2072* <ul>
2073* <li> the current match does not repeat the last match
2074* <li> boundaries are correct
2075* <li> exact matches has no extra accents
2076* <li> identical matches
2077* <\ul>
2078* Otherwise the offset will be shifted to the preceding character.
2079* Internal method, status assumed to be success, caller has to check status
2080* before calling this method.
2081* @param strsrch string search data
2082* @param collator
2083* @param coleiter collation element iterator
2084* @param text string
2085* @param textoffset offset in the collation element text. the returned value
2086*        will be the truncated start offset of the match or the new start
2087*        search offset.
2088* @param status output error status if any
2089* @return TRUE if the match is valid, FALSE otherwise
2090*/
2091static
2092inline UBool checkPreviousExactMatch(UStringSearch *strsrch,
2093                                     int32_t   *textoffset,
2094                                     UErrorCode    *status)
2095{
2096    // to ensure that the start and ends are not composite characters
2097    int32_t end = ucol_getOffset(strsrch->textIter);
2098    if (!checkPreviousExactContractionMatch(strsrch, textoffset, &end, status)
2099        || U_FAILURE(*status)) {
2100            return FALSE;
2101    }
2102
2103    // this totally matches, however we need to check if it is repeating
2104    // the old match
2105    if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2106        !isBreakUnit(strsrch, *textoffset, end) ||
2107        hasAccentsBeforeMatch(strsrch, *textoffset, end) ||
2108        !checkIdentical(strsrch, *textoffset, end) ||
2109        hasAccentsAfterMatch(strsrch, *textoffset, end)) {
2110        (*textoffset) --;
2111        *textoffset = getPreviousBaseOffset(strsrch->search->text,
2112                                            *textoffset);
2113        return FALSE;
2114    }
2115
2116    //Add breakiterator boundary check for primary strength search.
2117    if (!strsrch->search->breakIter && strsrch->strength == UCOL_PRIMARY) {
2118        checkBreakBoundary(strsrch, textoffset, &end);
2119    }
2120
2121    strsrch->search->matchedIndex = *textoffset;
2122    strsrch->search->matchedLength = end - *textoffset;
2123    return TRUE;
2124}
2125
2126/**
2127* Rearranges the end accents to try matching.
2128* Suffix accents in the text will be grouped according to their combining
2129* class and the groups will be mixed and matched to try find the perfect
2130* match with the pattern.
2131* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2132* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2133*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2134*         "\u0301\u0325".
2135* step 2: check if any of the generated substrings matches the pattern.
2136* Internal method, status assumed to be success, user has to check status
2137* before calling this method.
2138* @param strsrch string search match
2139* @param start offset of the first base character
2140* @param end start of the last accent set
2141* @param status only error status if any
2142* @return USEARCH_DONE if a match is not found, otherwise return the ending
2143*         offset of the match. Note this start includes all following accents.
2144*/
2145static
2146int32_t doPreviousCanonicalSuffixMatch(UStringSearch *strsrch,
2147                                           int32_t    start,
2148                                           int32_t    end,
2149                                           UErrorCode    *status)
2150{
2151    const UChar       *text       = strsrch->search->text;
2152          int32_t  tempend    = end;
2153
2154    U16_BACK_1(text, 0, tempend);
2155    if (!(getFCD(text, &tempend, strsrch->search->textLength) &
2156                                                           LAST_BYTE_MASK_)) {
2157        // die... failed at a base character
2158        return USEARCH_DONE;
2159    }
2160    end = getNextBaseOffset(text, end, strsrch->search->textLength);
2161
2162    if (U_SUCCESS(*status)) {
2163        UChar       accents[INITIAL_ARRAY_SIZE_];
2164        int32_t offset = getPreviousBaseOffset(text, end);
2165        // normalizing the offensive string
2166        unorm_normalize(text + offset, end - offset, UNORM_NFD, 0, accents,
2167                        INITIAL_ARRAY_SIZE_, status);
2168
2169        int32_t         accentsindex[INITIAL_ARRAY_SIZE_];
2170        int32_t         accentsize = getUnblockedAccentIndex(accents,
2171                                                         accentsindex);
2172        int32_t         count      = (2 << (accentsize - 1)) - 1;
2173        UChar               buffer[INITIAL_ARRAY_SIZE_];
2174        UCollationElements *coleiter = strsrch->utilIter;
2175        while (U_SUCCESS(*status) && count > 0) {
2176            UChar *rearrange = strsrch->canonicalSuffixAccents;
2177            // copy the base characters
2178            for (int k = 0; k < accentsindex[0]; k ++) {
2179                *rearrange ++ = accents[k];
2180            }
2181            // forming all possible canonical rearrangement by dropping
2182            // sets of accents
2183            for (int i = 0; i <= accentsize - 1; i ++) {
2184                int32_t mask = 1 << (accentsize - i - 1);
2185                if (count & mask) {
2186                    for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2187                        *rearrange ++ = accents[j];
2188                    }
2189                }
2190            }
2191            *rearrange = 0;
2192            int32_t  matchsize = INITIAL_ARRAY_SIZE_;
2193            UChar   *match     = addToUCharArray(buffer, &matchsize,
2194                                           strsrch->canonicalPrefixAccents,
2195                                           strsrch->search->text + start,
2196                                           offset - start,
2197                                           strsrch->canonicalSuffixAccents,
2198                                           status);
2199
2200            // run the collator iterator through this match
2201            // if status is a failure ucol_setText does nothing
2202            ucol_setText(coleiter, match, matchsize, status);
2203            if (U_SUCCESS(*status)) {
2204                if (checkCollationMatch(strsrch, coleiter)) {
2205                    if (match != buffer) {
2206                        uprv_free(match);
2207                    }
2208                    return end;
2209                }
2210            }
2211            count --;
2212        }
2213    }
2214    return USEARCH_DONE;
2215}
2216
2217/**
2218* Take the rearranged start accents and tries matching. If match failed at
2219* a seperate following set of accents (seperated from the rearranged on by
2220* at least a base character) then we rearrange the preceding accents and
2221* tries matching again.
2222* We allow skipping of the ends of the accent set if the ces do not match.
2223* However if the failure is found before the accent set, it fails.
2224* Internal method, status assumed to be success, caller has to check status
2225* before calling this method.
2226* @param strsrch string search data
2227* @param textoffset of the ends of the rearranged accent
2228* @param status output error status if any
2229* @return USEARCH_DONE if a match is not found, otherwise return the ending
2230*         offset of the match. Note this start includes all following accents.
2231*/
2232static
2233int32_t doPreviousCanonicalPrefixMatch(UStringSearch *strsrch,
2234                                           int32_t    textoffset,
2235                                           UErrorCode    *status)
2236{
2237    const UChar       *text       = strsrch->search->text;
2238    const UCollator   *collator   = strsrch->collator;
2239          int32_t      safelength = 0;
2240          UChar       *safetext;
2241          int32_t      safetextlength;
2242          UChar        safebuffer[INITIAL_ARRAY_SIZE_];
2243          int32_t  safeoffset = textoffset;
2244
2245    if (textoffset &&
2246        ucol_unsafeCP(strsrch->canonicalPrefixAccents[
2247                                 u_strlen(strsrch->canonicalPrefixAccents) - 1
2248                                         ], collator)) {
2249        safeoffset     = getNextSafeOffset(collator, text, textoffset,
2250                                           strsrch->search->textLength);
2251        safelength     = safeoffset - textoffset;
2252        safetextlength = INITIAL_ARRAY_SIZE_;
2253        safetext       = addToUCharArray(safebuffer, &safetextlength,
2254                                         strsrch->canonicalPrefixAccents,
2255                                         text + textoffset, safelength,
2256                                         NULL, status);
2257    }
2258    else {
2259        safetextlength = u_strlen(strsrch->canonicalPrefixAccents);
2260        safetext       = strsrch->canonicalPrefixAccents;
2261    }
2262
2263    UCollationElements *coleiter = strsrch->utilIter;
2264     // if status is a failure, ucol_setText does nothing
2265    ucol_setText(coleiter, safetext, safetextlength, status);
2266    // status checked in loop below
2267
2268    int32_t  *ce           = strsrch->pattern.CE;
2269    int32_t   celength     = strsrch->pattern.CELength;
2270    int       ceindex      = 0;
2271    UBool     isSafe       = TRUE; // safe zone indication flag for position
2272    int32_t   prefixlength = u_strlen(strsrch->canonicalPrefixAccents);
2273
2274    while (ceindex < celength) {
2275        int32_t textce = ucol_next(coleiter, status);
2276        if (U_FAILURE(*status)) {
2277            if (isSafe) {
2278                cleanUpSafeText(strsrch, safetext, safebuffer);
2279            }
2280            return USEARCH_DONE;
2281        }
2282        if (textce == UCOL_NULLORDER) {
2283            // check if we have passed the safe buffer
2284            if (coleiter == strsrch->textIter) {
2285                cleanUpSafeText(strsrch, safetext, safebuffer);
2286                return USEARCH_DONE;
2287            }
2288            cleanUpSafeText(strsrch, safetext, safebuffer);
2289            safetext = safebuffer;
2290            coleiter = strsrch->textIter;
2291            setColEIterOffset(coleiter, safeoffset);
2292            // status checked at the start of the loop
2293            isSafe = FALSE;
2294            continue;
2295        }
2296        textce = getCE(strsrch, textce);
2297        if (textce != UCOL_IGNORABLE && textce != ce[ceindex]) {
2298            // do the beginning stuff
2299            int32_t failedoffset = ucol_getOffset(coleiter);
2300            if (isSafe && failedoffset <= prefixlength) {
2301                // alas... no hope. failed at rearranged accent set
2302                cleanUpSafeText(strsrch, safetext, safebuffer);
2303                return USEARCH_DONE;
2304            }
2305            else {
2306                if (isSafe) {
2307                    failedoffset = safeoffset - failedoffset;
2308                    cleanUpSafeText(strsrch, safetext, safebuffer);
2309                }
2310
2311                // try rearranging the end accents
2312                int32_t result = doPreviousCanonicalSuffixMatch(strsrch,
2313                                        textoffset, failedoffset, status);
2314                if (result != USEARCH_DONE) {
2315                    // if status is a failure, ucol_setOffset does nothing
2316                    setColEIterOffset(strsrch->textIter, result);
2317                }
2318                if (U_FAILURE(*status)) {
2319                    return USEARCH_DONE;
2320                }
2321                return result;
2322            }
2323        }
2324        if (textce == ce[ceindex]) {
2325            ceindex ++;
2326        }
2327    }
2328    // set offset here
2329    if (isSafe) {
2330        int32_t result      = ucol_getOffset(coleiter);
2331        // sets the text iterator here with the correct expansion and offset
2332        int32_t     leftoverces = getExpansionSuffix(coleiter);
2333        cleanUpSafeText(strsrch, safetext, safebuffer);
2334        if (result <= prefixlength) {
2335            result = textoffset;
2336        }
2337        else {
2338            result = textoffset + (safeoffset - result);
2339        }
2340        setColEIterOffset(strsrch->textIter, result);
2341        setExpansionSuffix(strsrch->textIter, leftoverces);
2342        return result;
2343    }
2344
2345    return ucol_getOffset(coleiter);
2346}
2347
2348/**
2349* Trying out the substring and sees if it can be a canonical match.
2350* This will try normalizing the starting accents and arranging them into
2351* canonical equivalents and check their corresponding ces with the pattern ce.
2352* Prefix accents in the text will be grouped according to their combining
2353* class and the groups will be mixed and matched to try find the perfect
2354* match with the pattern.
2355* So for instance looking for "\u0301" in "\u030A\u0301\u0325"
2356* step 1: split "\u030A\u0301" into 6 other type of potential accent substrings
2357*         "\u030A", "\u0301", "\u0325", "\u030A\u0301", "\u030A\u0325",
2358*         "\u0301\u0325".
2359* step 2: check if any of the generated substrings matches the pattern.
2360* Internal method, status assumed to be success, caller has to check status
2361* before calling this method.
2362* @param strsrch string search data
2363* @param textoffset start offset in the collation element text that starts
2364*                   with the accents to be rearranged
2365* @param status output error status if any
2366* @return TRUE if the match is valid, FALSE otherwise
2367*/
2368static
2369UBool doPreviousCanonicalMatch(UStringSearch *strsrch,
2370                               int32_t    textoffset,
2371                               UErrorCode    *status)
2372{
2373    const UChar       *text       = strsrch->search->text;
2374          int32_t  temp       = textoffset;
2375          int32_t      textlength = strsrch->search->textLength;
2376    if ((getFCD(text, &temp, textlength) >> SECOND_LAST_BYTE_SHIFT_) == 0) {
2377        UCollationElements *coleiter = strsrch->textIter;
2378        int32_t         offset   = ucol_getOffset(coleiter);
2379        if (strsrch->pattern.hasSuffixAccents) {
2380            offset = doPreviousCanonicalSuffixMatch(strsrch, textoffset,
2381                                                    offset, status);
2382            if (U_SUCCESS(*status) && offset != USEARCH_DONE) {
2383                setColEIterOffset(coleiter, offset);
2384                return TRUE;
2385            }
2386        }
2387        return FALSE;
2388    }
2389
2390    if (!strsrch->pattern.hasPrefixAccents) {
2391        return FALSE;
2392    }
2393
2394    UChar       accents[INITIAL_ARRAY_SIZE_];
2395    // offset to the last base character in substring to search
2396    int32_t baseoffset = getNextBaseOffset(text, textoffset, textlength);
2397    // normalizing the offensive string
2398    unorm_normalize(text + textoffset, baseoffset - textoffset, UNORM_NFD,
2399                               0, accents, INITIAL_ARRAY_SIZE_, status);
2400    // status checked in loop
2401
2402    int32_t accentsindex[INITIAL_ARRAY_SIZE_];
2403    int32_t size = getUnblockedAccentIndex(accents, accentsindex);
2404
2405    // 2 power n - 1 plus the full set of accents
2406    int32_t  count = (2 << (size - 1)) - 1;
2407    while (U_SUCCESS(*status) && count > 0) {
2408        UChar *rearrange = strsrch->canonicalPrefixAccents;
2409        // copy the base characters
2410        for (int k = 0; k < accentsindex[0]; k ++) {
2411            *rearrange ++ = accents[k];
2412        }
2413        // forming all possible canonical rearrangement by dropping
2414        // sets of accents
2415        for (int i = 0; i <= size - 1; i ++) {
2416            int32_t mask = 1 << (size - i - 1);
2417            if (count & mask) {
2418                for (int j = accentsindex[i]; j < accentsindex[i + 1]; j ++) {
2419                    *rearrange ++ = accents[j];
2420                }
2421            }
2422        }
2423        *rearrange = 0;
2424        int32_t offset = doPreviousCanonicalPrefixMatch(strsrch,
2425                                                          baseoffset, status);
2426        if (offset != USEARCH_DONE) {
2427            return TRUE; // match found
2428        }
2429        count --;
2430    }
2431    return FALSE;
2432}
2433
2434/**
2435* Checks match for contraction.
2436* If the match starts with a partial contraction we fail.
2437* Internal method, status assumed to be success, caller has to check status
2438* before calling this method.
2439* @param strsrch string search data
2440* @param start offset of potential match, to be modified if necessary
2441* @param end offset of potential match, to be modified if necessary
2442* @param status only error status if any
2443* @return TRUE if match passes the contraction test, FALSE otherwise
2444*/
2445static
2446UBool checkPreviousCanonicalContractionMatch(UStringSearch *strsrch,
2447                                     int32_t   *start,
2448                                     int32_t   *end, UErrorCode  *status)
2449{
2450          UCollationElements *coleiter   = strsrch->textIter;
2451          int32_t             textlength = strsrch->search->textLength;
2452          int32_t         temp       = *end;
2453    const UCollator          *collator   = strsrch->collator;
2454    const UChar              *text       = strsrch->search->text;
2455    // This part checks if either if the start of the match contains potential
2456    // contraction. If so we'll have to iterate through them
2457    // Since we used ucol_next while previously looking for the potential
2458    // match, this guarantees that our end will not be a partial contraction,
2459    // or a partial supplementary character.
2460    if (*start < textlength && ucol_unsafeCP(text[*start], collator)) {
2461        int32_t expansion  = getExpansionSuffix(coleiter);
2462        UBool   expandflag = expansion > 0;
2463        setColEIterOffset(coleiter, *end);
2464        while (expansion > 0) {
2465            // getting rid of the redundant ce
2466            // since forward contraction/expansion may have extra ces
2467            // if we are in the normalization buffer, hasAccentsBeforeMatch
2468            // would have taken care of it.
2469            // E.g. the character \u01FA will have an expansion of 3, but if
2470            // we are only looking for A ring A\u030A, we'll have to skip the
2471            // last ce in the expansion buffer
2472            ucol_previous(coleiter, status);
2473            if (U_FAILURE(*status)) {
2474                return FALSE;
2475            }
2476            if (ucol_getOffset(coleiter) != temp) {
2477                *end = temp;
2478                temp  = ucol_getOffset(coleiter);
2479            }
2480            expansion --;
2481        }
2482
2483        int32_t  *patternce       = strsrch->pattern.CE;
2484        int32_t   patterncelength = strsrch->pattern.CELength;
2485        int32_t   count           = patterncelength;
2486        while (count > 0) {
2487            int32_t ce = getCE(strsrch, ucol_previous(coleiter, status));
2488            // status checked below, note that if status is a failure
2489            // ucol_previous returns UCOL_NULLORDER
2490            if (ce == UCOL_IGNORABLE) {
2491                continue;
2492            }
2493            if (expandflag && count == 0 &&
2494                getColElemIterOffset(coleiter, FALSE) != temp) {
2495                *end = temp;
2496                temp  = ucol_getOffset(coleiter);
2497            }
2498            if (count == patterncelength &&
2499                ce != patternce[patterncelength - 1]) {
2500                // accents may have extra starting ces, this occurs when a
2501                // pure accent pattern is matched without rearrangement
2502                int32_t    expected = patternce[patterncelength - 1];
2503                U16_BACK_1(text, 0, *end);
2504                if (getFCD(text, end, textlength) & LAST_BYTE_MASK_) {
2505                    ce = getCE(strsrch, ucol_previous(coleiter, status));
2506                    while (U_SUCCESS(*status) && ce != expected &&
2507                           ce != UCOL_NULLORDER &&
2508                           ucol_getOffset(coleiter) <= *start) {
2509                        ce = getCE(strsrch, ucol_previous(coleiter, status));
2510                    }
2511                }
2512            }
2513            if (U_FAILURE(*status) || ce != patternce[count - 1]) {
2514                (*start) --;
2515                *start = getPreviousBaseOffset(text, *start);
2516                return FALSE;
2517            }
2518            count --;
2519        }
2520    }
2521    return TRUE;
2522}
2523
2524/**
2525* Checks and sets the match information if found.
2526* Checks
2527* <ul>
2528* <li> the potential match does not repeat the previous match
2529* <li> boundaries are correct
2530* <li> potential match does not end in the middle of a contraction
2531* <li> identical matches
2532* <\ul>
2533* Otherwise the offset will be shifted to the next character.
2534* Internal method, status assumed to be success, caller has to check status
2535* before calling this method.
2536* @param strsrch string search data
2537* @param textoffset offset in the collation element text. the returned value
2538*        will be the truncated start offset of the match or the new start
2539*        search offset.
2540* @param status only error status if any
2541* @return TRUE if the match is valid, FALSE otherwise
2542*/
2543static
2544inline UBool checkPreviousCanonicalMatch(UStringSearch *strsrch,
2545                                         int32_t   *textoffset,
2546                                         UErrorCode    *status)
2547{
2548    // to ensure that the start and ends are not composite characters
2549    UCollationElements *coleiter = strsrch->textIter;
2550    // if we have a canonical accent match
2551    if ((strsrch->pattern.hasSuffixAccents &&
2552        strsrch->canonicalSuffixAccents[0]) ||
2553        (strsrch->pattern.hasPrefixAccents &&
2554        strsrch->canonicalPrefixAccents[0])) {
2555        strsrch->search->matchedIndex  = *textoffset;
2556        strsrch->search->matchedLength =
2557            getNextUStringSearchBaseOffset(strsrch,
2558                                      getColElemIterOffset(coleiter, FALSE))
2559            - *textoffset;
2560        return TRUE;
2561    }
2562
2563    int32_t end = ucol_getOffset(coleiter);
2564    if (!checkPreviousCanonicalContractionMatch(strsrch, textoffset, &end,
2565                                                status) ||
2566         U_FAILURE(*status)) {
2567        return FALSE;
2568    }
2569
2570    end = getNextUStringSearchBaseOffset(strsrch, end);
2571    // this totally matches, however we need to check if it is repeating
2572    if (checkRepeatedMatch(strsrch, *textoffset, end) ||
2573        !isBreakUnit(strsrch, *textoffset, end) ||
2574        !checkIdentical(strsrch, *textoffset, end)) {
2575        (*textoffset) --;
2576        *textoffset = getPreviousBaseOffset(strsrch->search->text,
2577                                            *textoffset);
2578        return FALSE;
2579    }
2580
2581    strsrch->search->matchedIndex  = *textoffset;
2582    strsrch->search->matchedLength = end - *textoffset;
2583    return TRUE;
2584}
2585#endif // #if BOYER_MOORE
2586
2587// constructors and destructor -------------------------------------------
2588
2589U_CAPI UStringSearch * U_EXPORT2 usearch_open(const UChar *pattern,
2590                                          int32_t         patternlength,
2591                                    const UChar          *text,
2592                                          int32_t         textlength,
2593                                    const char           *locale,
2594                                          UBreakIterator *breakiter,
2595                                          UErrorCode     *status)
2596{
2597    if (U_FAILURE(*status)) {
2598        return NULL;
2599    }
2600#if UCONFIG_NO_BREAK_ITERATION
2601    if (breakiter != NULL) {
2602        *status = U_UNSUPPORTED_ERROR;
2603        return NULL;
2604    }
2605#endif
2606    if (locale) {
2607        // ucol_open internally checks for status
2608        UCollator     *collator = ucol_open(locale, status);
2609        // pattern, text checks are done in usearch_openFromCollator
2610        UStringSearch *result   = usearch_openFromCollator(pattern,
2611                                              patternlength, text, textlength,
2612                                              collator, breakiter, status);
2613
2614        if (result == NULL || U_FAILURE(*status)) {
2615            if (collator) {
2616                ucol_close(collator);
2617            }
2618            return NULL;
2619        }
2620        else {
2621            result->ownCollator = TRUE;
2622        }
2623        return result;
2624    }
2625    *status = U_ILLEGAL_ARGUMENT_ERROR;
2626    return NULL;
2627}
2628
2629U_CAPI UStringSearch * U_EXPORT2 usearch_openFromCollator(
2630                                  const UChar          *pattern,
2631                                        int32_t         patternlength,
2632                                  const UChar          *text,
2633                                        int32_t         textlength,
2634                                  const UCollator      *collator,
2635                                        UBreakIterator *breakiter,
2636                                        UErrorCode     *status)
2637{
2638    if (U_FAILURE(*status)) {
2639        return NULL;
2640    }
2641#if UCONFIG_NO_BREAK_ITERATION
2642    if (breakiter != NULL) {
2643        *status = U_UNSUPPORTED_ERROR;
2644        return NULL;
2645    }
2646#endif
2647    if (pattern == NULL || text == NULL || collator == NULL) {
2648        *status = U_ILLEGAL_ARGUMENT_ERROR;
2649        return NULL;
2650    }
2651
2652    // string search does not really work when numeric collation is turned on
2653    if(ucol_getAttribute(collator, UCOL_NUMERIC_COLLATION, status) == UCOL_ON) {
2654        *status = U_UNSUPPORTED_ERROR;
2655        return NULL;
2656    }
2657
2658    if (U_SUCCESS(*status)) {
2659        initializeFCD(status);
2660        if (U_FAILURE(*status)) {
2661            return NULL;
2662        }
2663
2664        UStringSearch *result;
2665        if (textlength == -1) {
2666            textlength = u_strlen(text);
2667        }
2668        if (patternlength == -1) {
2669            patternlength = u_strlen(pattern);
2670        }
2671        if (textlength <= 0 || patternlength <= 0) {
2672            *status = U_ILLEGAL_ARGUMENT_ERROR;
2673            return NULL;
2674        }
2675
2676        result = (UStringSearch *)uprv_malloc(sizeof(UStringSearch));
2677        if (result == NULL) {
2678            *status = U_MEMORY_ALLOCATION_ERROR;
2679            return NULL;
2680        }
2681
2682        result->collator    = collator;
2683        result->strength    = ucol_getStrength(collator);
2684        result->ceMask      = getMask(result->strength);
2685        result->toShift     =
2686             ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
2687                                                            UCOL_SHIFTED;
2688        result->variableTop = ucol_getVariableTop(collator, status);
2689
2690        result->nfd         = Normalizer2Factory::getNFDInstance(*status);
2691
2692        if (U_FAILURE(*status)) {
2693            uprv_free(result);
2694            return NULL;
2695        }
2696
2697        result->search             = (USearch *)uprv_malloc(sizeof(USearch));
2698        if (result->search == NULL) {
2699            *status = U_MEMORY_ALLOCATION_ERROR;
2700            uprv_free(result);
2701            return NULL;
2702        }
2703
2704        result->search->text       = text;
2705        result->search->textLength = textlength;
2706
2707        result->pattern.text       = pattern;
2708        result->pattern.textLength = patternlength;
2709        result->pattern.CE         = NULL;
2710        result->pattern.PCE        = NULL;
2711
2712        result->search->breakIter  = breakiter;
2713#if !UCONFIG_NO_BREAK_ITERATION
2714        result->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(result->collator, ULOC_VALID_LOCALE, status), text, textlength, status);
2715        if (breakiter) {
2716            ubrk_setText(breakiter, text, textlength, status);
2717        }
2718#endif
2719
2720        result->ownCollator           = FALSE;
2721        result->search->matchedLength = 0;
2722        result->search->matchedIndex  = USEARCH_DONE;
2723        result->utilIter              = NULL;
2724        result->textIter              = ucol_openElements(collator, text,
2725                                                          textlength, status);
2726        result->textProcessedIter     = NULL;
2727        if (U_FAILURE(*status)) {
2728            usearch_close(result);
2729            return NULL;
2730        }
2731
2732        result->search->isOverlap          = FALSE;
2733        result->search->isCanonicalMatch   = FALSE;
2734        result->search->elementComparisonType = 0;
2735        result->search->isForwardSearching = TRUE;
2736        result->search->reset              = TRUE;
2737
2738        initialize(result, status);
2739
2740        if (U_FAILURE(*status)) {
2741            usearch_close(result);
2742            return NULL;
2743        }
2744
2745        return result;
2746    }
2747    return NULL;
2748}
2749
2750U_CAPI void U_EXPORT2 usearch_close(UStringSearch *strsrch)
2751{
2752    if (strsrch) {
2753        if (strsrch->pattern.CE != strsrch->pattern.CEBuffer &&
2754            strsrch->pattern.CE) {
2755            uprv_free(strsrch->pattern.CE);
2756        }
2757
2758        if (strsrch->pattern.PCE != NULL &&
2759            strsrch->pattern.PCE != strsrch->pattern.PCEBuffer) {
2760            uprv_free(strsrch->pattern.PCE);
2761        }
2762
2763        delete strsrch->textProcessedIter;
2764        ucol_closeElements(strsrch->textIter);
2765        ucol_closeElements(strsrch->utilIter);
2766
2767        if (strsrch->ownCollator && strsrch->collator) {
2768            ucol_close((UCollator *)strsrch->collator);
2769        }
2770
2771#if !UCONFIG_NO_BREAK_ITERATION
2772        if (strsrch->search->internalBreakIter) {
2773            ubrk_close(strsrch->search->internalBreakIter);
2774        }
2775#endif
2776
2777        uprv_free(strsrch->search);
2778        uprv_free(strsrch);
2779    }
2780}
2781
2782namespace {
2783
2784UBool initTextProcessedIter(UStringSearch *strsrch, UErrorCode *status) {
2785    if (U_FAILURE(*status)) { return FALSE; }
2786    if (strsrch->textProcessedIter == NULL) {
2787        strsrch->textProcessedIter = new icu::UCollationPCE(strsrch->textIter);
2788        if (strsrch->textProcessedIter == NULL) {
2789            *status = U_MEMORY_ALLOCATION_ERROR;
2790            return FALSE;
2791        }
2792    } else {
2793        strsrch->textProcessedIter->init(strsrch->textIter);
2794    }
2795    return TRUE;
2796}
2797
2798}
2799
2800// set and get methods --------------------------------------------------
2801
2802U_CAPI void U_EXPORT2 usearch_setOffset(UStringSearch *strsrch,
2803                                        int32_t    position,
2804                                        UErrorCode    *status)
2805{
2806    if (U_SUCCESS(*status) && strsrch) {
2807        if (isOutOfBounds(strsrch->search->textLength, position)) {
2808            *status = U_INDEX_OUTOFBOUNDS_ERROR;
2809        }
2810        else {
2811            setColEIterOffset(strsrch->textIter, position);
2812        }
2813        strsrch->search->matchedIndex  = USEARCH_DONE;
2814        strsrch->search->matchedLength = 0;
2815        strsrch->search->reset         = FALSE;
2816    }
2817}
2818
2819U_CAPI int32_t U_EXPORT2 usearch_getOffset(const UStringSearch *strsrch)
2820{
2821    if (strsrch) {
2822        int32_t result = ucol_getOffset(strsrch->textIter);
2823        if (isOutOfBounds(strsrch->search->textLength, result)) {
2824            return USEARCH_DONE;
2825        }
2826        return result;
2827    }
2828    return USEARCH_DONE;
2829}
2830
2831U_CAPI void U_EXPORT2 usearch_setAttribute(UStringSearch *strsrch,
2832                                 USearchAttribute attribute,
2833                                 USearchAttributeValue value,
2834                                 UErrorCode *status)
2835{
2836    if (U_SUCCESS(*status) && strsrch) {
2837        switch (attribute)
2838        {
2839        case USEARCH_OVERLAP :
2840            strsrch->search->isOverlap = (value == USEARCH_ON ? TRUE : FALSE);
2841            break;
2842        case USEARCH_CANONICAL_MATCH :
2843            strsrch->search->isCanonicalMatch = (value == USEARCH_ON ? TRUE :
2844                                                                      FALSE);
2845            break;
2846        case USEARCH_ELEMENT_COMPARISON :
2847            if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2848                strsrch->search->elementComparisonType = (int16_t)value;
2849            } else {
2850                strsrch->search->elementComparisonType = 0;
2851            }
2852            break;
2853        case USEARCH_ATTRIBUTE_COUNT :
2854        default:
2855            *status = U_ILLEGAL_ARGUMENT_ERROR;
2856        }
2857    }
2858    if (value == USEARCH_ATTRIBUTE_VALUE_COUNT) {
2859        *status = U_ILLEGAL_ARGUMENT_ERROR;
2860    }
2861}
2862
2863U_CAPI USearchAttributeValue U_EXPORT2 usearch_getAttribute(
2864                                                const UStringSearch *strsrch,
2865                                                USearchAttribute attribute)
2866{
2867    if (strsrch) {
2868        switch (attribute) {
2869        case USEARCH_OVERLAP :
2870            return (strsrch->search->isOverlap == TRUE ? USEARCH_ON :
2871                                                        USEARCH_OFF);
2872        case USEARCH_CANONICAL_MATCH :
2873            return (strsrch->search->isCanonicalMatch == TRUE ? USEARCH_ON :
2874                                                               USEARCH_OFF);
2875        case USEARCH_ELEMENT_COMPARISON :
2876            {
2877                int16_t value = strsrch->search->elementComparisonType;
2878                if (value == USEARCH_PATTERN_BASE_WEIGHT_IS_WILDCARD || value == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD) {
2879                    return (USearchAttributeValue)value;
2880                } else {
2881                    return USEARCH_STANDARD_ELEMENT_COMPARISON;
2882                }
2883            }
2884        case USEARCH_ATTRIBUTE_COUNT :
2885            return USEARCH_DEFAULT;
2886        }
2887    }
2888    return USEARCH_DEFAULT;
2889}
2890
2891U_CAPI int32_t U_EXPORT2 usearch_getMatchedStart(
2892                                                const UStringSearch *strsrch)
2893{
2894    if (strsrch == NULL) {
2895        return USEARCH_DONE;
2896    }
2897    return strsrch->search->matchedIndex;
2898}
2899
2900
2901U_CAPI int32_t U_EXPORT2 usearch_getMatchedText(const UStringSearch *strsrch,
2902                                            UChar         *result,
2903                                            int32_t        resultCapacity,
2904                                            UErrorCode    *status)
2905{
2906    if (U_FAILURE(*status)) {
2907        return USEARCH_DONE;
2908    }
2909    if (strsrch == NULL || resultCapacity < 0 || (resultCapacity > 0 &&
2910        result == NULL)) {
2911        *status = U_ILLEGAL_ARGUMENT_ERROR;
2912        return USEARCH_DONE;
2913    }
2914
2915    int32_t     copylength = strsrch->search->matchedLength;
2916    int32_t copyindex  = strsrch->search->matchedIndex;
2917    if (copyindex == USEARCH_DONE) {
2918        u_terminateUChars(result, resultCapacity, 0, status);
2919        return USEARCH_DONE;
2920    }
2921
2922    if (resultCapacity < copylength) {
2923        copylength = resultCapacity;
2924    }
2925    if (copylength > 0) {
2926        uprv_memcpy(result, strsrch->search->text + copyindex,
2927                    copylength * sizeof(UChar));
2928    }
2929    return u_terminateUChars(result, resultCapacity,
2930                             strsrch->search->matchedLength, status);
2931}
2932
2933U_CAPI int32_t U_EXPORT2 usearch_getMatchedLength(
2934                                              const UStringSearch *strsrch)
2935{
2936    if (strsrch) {
2937        return strsrch->search->matchedLength;
2938    }
2939    return USEARCH_DONE;
2940}
2941
2942#if !UCONFIG_NO_BREAK_ITERATION
2943
2944U_CAPI void U_EXPORT2 usearch_setBreakIterator(UStringSearch  *strsrch,
2945                                               UBreakIterator *breakiter,
2946                                               UErrorCode     *status)
2947{
2948    if (U_SUCCESS(*status) && strsrch) {
2949        strsrch->search->breakIter = breakiter;
2950        if (breakiter) {
2951            ubrk_setText(breakiter, strsrch->search->text,
2952                         strsrch->search->textLength, status);
2953        }
2954    }
2955}
2956
2957U_CAPI const UBreakIterator* U_EXPORT2
2958usearch_getBreakIterator(const UStringSearch *strsrch)
2959{
2960    if (strsrch) {
2961        return strsrch->search->breakIter;
2962    }
2963    return NULL;
2964}
2965
2966#endif
2967
2968U_CAPI void U_EXPORT2 usearch_setText(      UStringSearch *strsrch,
2969                                      const UChar         *text,
2970                                            int32_t        textlength,
2971                                            UErrorCode    *status)
2972{
2973    if (U_SUCCESS(*status)) {
2974        if (strsrch == NULL || text == NULL || textlength < -1 ||
2975            textlength == 0) {
2976            *status = U_ILLEGAL_ARGUMENT_ERROR;
2977        }
2978        else {
2979            if (textlength == -1) {
2980                textlength = u_strlen(text);
2981            }
2982            strsrch->search->text       = text;
2983            strsrch->search->textLength = textlength;
2984            ucol_setText(strsrch->textIter, text, textlength, status);
2985            strsrch->search->matchedIndex  = USEARCH_DONE;
2986            strsrch->search->matchedLength = 0;
2987            strsrch->search->reset         = TRUE;
2988#if !UCONFIG_NO_BREAK_ITERATION
2989            if (strsrch->search->breakIter != NULL) {
2990                ubrk_setText(strsrch->search->breakIter, text,
2991                             textlength, status);
2992            }
2993            ubrk_setText(strsrch->search->internalBreakIter, text, textlength, status);
2994#endif
2995        }
2996    }
2997}
2998
2999U_CAPI const UChar * U_EXPORT2 usearch_getText(const UStringSearch *strsrch,
3000                                                     int32_t       *length)
3001{
3002    if (strsrch) {
3003        *length = strsrch->search->textLength;
3004        return strsrch->search->text;
3005    }
3006    return NULL;
3007}
3008
3009U_CAPI void U_EXPORT2 usearch_setCollator(      UStringSearch *strsrch,
3010                                          const UCollator     *collator,
3011                                                UErrorCode    *status)
3012{
3013    if (U_SUCCESS(*status)) {
3014        if (collator == NULL) {
3015            *status = U_ILLEGAL_ARGUMENT_ERROR;
3016            return;
3017        }
3018
3019        if (strsrch) {
3020            delete strsrch->textProcessedIter;
3021            strsrch->textProcessedIter = NULL;
3022            ucol_closeElements(strsrch->textIter);
3023            ucol_closeElements(strsrch->utilIter);
3024            strsrch->textIter = strsrch->utilIter = NULL;
3025            if (strsrch->ownCollator && (strsrch->collator != collator)) {
3026                ucol_close((UCollator *)strsrch->collator);
3027                strsrch->ownCollator = FALSE;
3028            }
3029            strsrch->collator    = collator;
3030            strsrch->strength    = ucol_getStrength(collator);
3031            strsrch->ceMask      = getMask(strsrch->strength);
3032#if !UCONFIG_NO_BREAK_ITERATION
3033            ubrk_close(strsrch->search->internalBreakIter);
3034            strsrch->search->internalBreakIter = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(collator, ULOC_VALID_LOCALE, status),
3035                                                     strsrch->search->text, strsrch->search->textLength, status);
3036#endif
3037            // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3038            strsrch->toShift     =
3039               ucol_getAttribute(collator, UCOL_ALTERNATE_HANDLING, status) ==
3040                                                                UCOL_SHIFTED;
3041            // if status is a failure, ucol_getVariableTop returns 0
3042            strsrch->variableTop = ucol_getVariableTop(collator, status);
3043            strsrch->textIter = ucol_openElements(collator,
3044                                      strsrch->search->text,
3045                                      strsrch->search->textLength,
3046                                      status);
3047            strsrch->utilIter = ucol_openElements(
3048                    collator, strsrch->pattern.text, strsrch->pattern.textLength, status);
3049            // initialize() _after_ setting the iterators for the new collator.
3050            initialize(strsrch, status);
3051        }
3052
3053        // **** are these calls needed?
3054        // **** we call uprv_init_pce in initializePatternPCETable
3055        // **** and the CEBuffer constructor...
3056#if 0
3057        uprv_init_pce(strsrch->textIter);
3058        uprv_init_pce(strsrch->utilIter);
3059#endif
3060    }
3061}
3062
3063U_CAPI UCollator * U_EXPORT2 usearch_getCollator(const UStringSearch *strsrch)
3064{
3065    if (strsrch) {
3066        return (UCollator *)strsrch->collator;
3067    }
3068    return NULL;
3069}
3070
3071U_CAPI void U_EXPORT2 usearch_setPattern(      UStringSearch *strsrch,
3072                                         const UChar         *pattern,
3073                                               int32_t        patternlength,
3074                                               UErrorCode    *status)
3075{
3076    if (U_SUCCESS(*status)) {
3077        if (strsrch == NULL || pattern == NULL) {
3078            *status = U_ILLEGAL_ARGUMENT_ERROR;
3079        }
3080        else {
3081            if (patternlength == -1) {
3082                patternlength = u_strlen(pattern);
3083            }
3084            if (patternlength == 0) {
3085                *status = U_ILLEGAL_ARGUMENT_ERROR;
3086                return;
3087            }
3088            strsrch->pattern.text       = pattern;
3089            strsrch->pattern.textLength = patternlength;
3090            initialize(strsrch, status);
3091        }
3092    }
3093}
3094
3095U_CAPI const UChar* U_EXPORT2
3096usearch_getPattern(const UStringSearch *strsrch,
3097                   int32_t       *length)
3098{
3099    if (strsrch) {
3100        *length = strsrch->pattern.textLength;
3101        return strsrch->pattern.text;
3102    }
3103    return NULL;
3104}
3105
3106// miscellanous methods --------------------------------------------------
3107
3108U_CAPI int32_t U_EXPORT2 usearch_first(UStringSearch *strsrch,
3109                                           UErrorCode    *status)
3110{
3111    if (strsrch && U_SUCCESS(*status)) {
3112        strsrch->search->isForwardSearching = TRUE;
3113        usearch_setOffset(strsrch, 0, status);
3114        if (U_SUCCESS(*status)) {
3115            return usearch_next(strsrch, status);
3116        }
3117    }
3118    return USEARCH_DONE;
3119}
3120
3121U_CAPI int32_t U_EXPORT2 usearch_following(UStringSearch *strsrch,
3122                                               int32_t    position,
3123                                               UErrorCode    *status)
3124{
3125    if (strsrch && U_SUCCESS(*status)) {
3126        strsrch->search->isForwardSearching = TRUE;
3127        // position checked in usearch_setOffset
3128        usearch_setOffset(strsrch, position, status);
3129        if (U_SUCCESS(*status)) {
3130            return usearch_next(strsrch, status);
3131        }
3132    }
3133    return USEARCH_DONE;
3134}
3135
3136U_CAPI int32_t U_EXPORT2 usearch_last(UStringSearch *strsrch,
3137                                          UErrorCode    *status)
3138{
3139    if (strsrch && U_SUCCESS(*status)) {
3140        strsrch->search->isForwardSearching = FALSE;
3141        usearch_setOffset(strsrch, strsrch->search->textLength, status);
3142        if (U_SUCCESS(*status)) {
3143            return usearch_previous(strsrch, status);
3144        }
3145    }
3146    return USEARCH_DONE;
3147}
3148
3149U_CAPI int32_t U_EXPORT2 usearch_preceding(UStringSearch *strsrch,
3150                                               int32_t    position,
3151                                               UErrorCode    *status)
3152{
3153    if (strsrch && U_SUCCESS(*status)) {
3154        strsrch->search->isForwardSearching = FALSE;
3155        // position checked in usearch_setOffset
3156        usearch_setOffset(strsrch, position, status);
3157        if (U_SUCCESS(*status)) {
3158            return usearch_previous(strsrch, status);
3159        }
3160    }
3161    return USEARCH_DONE;
3162}
3163
3164/**
3165* If a direction switch is required, we'll count the number of ces till the
3166* beginning of the collation element iterator and iterate forwards that
3167* number of times. This is so that we get to the correct point within the
3168* string to continue the search in. Imagine when we are in the middle of the
3169* normalization buffer when the change in direction is request. arrrgghh....
3170* After searching the offset within the collation element iterator will be
3171* shifted to the start of the match. If a match is not found, the offset would
3172* have been set to the end of the text string in the collation element
3173* iterator.
3174* Okay, here's my take on normalization buffer. The only time when there can
3175* be 2 matches within the same normalization is when the pattern is consists
3176* of all accents. But since the offset returned is from the text string, we
3177* should not confuse the caller by returning the second match within the
3178* same normalization buffer. If we do, the 2 results will have the same match
3179* offsets, and that'll be confusing. I'll return the next match that doesn't
3180* fall within the same normalization buffer. Note this does not affect the
3181* results of matches spanning the text and the normalization buffer.
3182* The position to start searching is taken from the collation element
3183* iterator. Callers of this API would have to set the offset in the collation
3184* element iterator before using this method.
3185*/
3186U_CAPI int32_t U_EXPORT2 usearch_next(UStringSearch *strsrch,
3187                                          UErrorCode    *status)
3188{
3189    if (U_SUCCESS(*status) && strsrch) {
3190        // note offset is either equivalent to the start of the previous match
3191        // or is set by the user
3192        int32_t      offset       = usearch_getOffset(strsrch);
3193        USearch     *search       = strsrch->search;
3194        search->reset             = FALSE;
3195        int32_t      textlength   = search->textLength;
3196        if (search->isForwardSearching) {
3197#if BOYER_MOORE
3198            if (offset == textlength
3199                || (!search->isOverlap &&
3200                    (offset + strsrch->pattern.defaultShiftSize > textlength ||
3201                    (search->matchedIndex != USEARCH_DONE &&
3202                     offset + search->matchedLength >= textlength)))) {
3203                // not enough characters to match
3204                setMatchNotFound(strsrch);
3205                return USEARCH_DONE;
3206            }
3207#else
3208            if (offset == textlength ||
3209                (! search->isOverlap &&
3210                (search->matchedIndex != USEARCH_DONE &&
3211                offset + search->matchedLength > textlength))) {
3212                    // not enough characters to match
3213                    setMatchNotFound(strsrch);
3214                    return USEARCH_DONE;
3215            }
3216#endif
3217        }
3218        else {
3219            // switching direction.
3220            // if matchedIndex == USEARCH_DONE, it means that either a
3221            // setOffset has been called or that previous ran off the text
3222            // string. the iterator would have been set to offset 0 if a
3223            // match is not found.
3224            search->isForwardSearching = TRUE;
3225            if (search->matchedIndex != USEARCH_DONE) {
3226                // there's no need to set the collation element iterator
3227                // the next call to next will set the offset.
3228                return search->matchedIndex;
3229            }
3230        }
3231
3232        if (U_SUCCESS(*status)) {
3233            if (strsrch->pattern.CELength == 0) {
3234                if (search->matchedIndex == USEARCH_DONE) {
3235                    search->matchedIndex = offset;
3236                }
3237                else { // moves by codepoints
3238                    U16_FWD_1(search->text, search->matchedIndex, textlength);
3239                }
3240
3241                search->matchedLength = 0;
3242                setColEIterOffset(strsrch->textIter, search->matchedIndex);
3243                // status checked below
3244                if (search->matchedIndex == textlength) {
3245                    search->matchedIndex = USEARCH_DONE;
3246                }
3247            }
3248            else {
3249                if (search->matchedLength > 0) {
3250                    // if matchlength is 0 we are at the start of the iteration
3251                    if (search->isOverlap) {
3252                        ucol_setOffset(strsrch->textIter, offset + 1, status);
3253                    }
3254                    else {
3255                        ucol_setOffset(strsrch->textIter,
3256                                       offset + search->matchedLength, status);
3257                    }
3258                }
3259                else {
3260                    // for boundary check purposes. this will ensure that the
3261                    // next match will not preceed the current offset
3262                    // note search->matchedIndex will always be set to something
3263                    // in the code
3264                    search->matchedIndex = offset - 1;
3265                }
3266
3267                if (search->isCanonicalMatch) {
3268                    // can't use exact here since extra accents are allowed.
3269                    usearch_handleNextCanonical(strsrch, status);
3270                }
3271                else {
3272                    usearch_handleNextExact(strsrch, status);
3273                }
3274            }
3275
3276            if (U_FAILURE(*status)) {
3277                return USEARCH_DONE;
3278            }
3279
3280#if !BOYER_MOORE
3281            if (search->matchedIndex == USEARCH_DONE) {
3282                ucol_setOffset(strsrch->textIter, search->textLength, status);
3283            } else {
3284                ucol_setOffset(strsrch->textIter, search->matchedIndex, status);
3285            }
3286#endif
3287
3288            return search->matchedIndex;
3289        }
3290    }
3291    return USEARCH_DONE;
3292}
3293
3294U_CAPI int32_t U_EXPORT2 usearch_previous(UStringSearch *strsrch,
3295                                              UErrorCode *status)
3296{
3297    if (U_SUCCESS(*status) && strsrch) {
3298        int32_t offset;
3299        USearch *search = strsrch->search;
3300        if (search->reset) {
3301            offset                     = search->textLength;
3302            search->isForwardSearching = FALSE;
3303            search->reset              = FALSE;
3304            setColEIterOffset(strsrch->textIter, offset);
3305        }
3306        else {
3307            offset = usearch_getOffset(strsrch);
3308        }
3309
3310        int32_t matchedindex = search->matchedIndex;
3311        if (search->isForwardSearching == TRUE) {
3312            // switching direction.
3313            // if matchedIndex == USEARCH_DONE, it means that either a
3314            // setOffset has been called or that next ran off the text
3315            // string. the iterator would have been set to offset textLength if
3316            // a match is not found.
3317            search->isForwardSearching = FALSE;
3318            if (matchedindex != USEARCH_DONE) {
3319                return matchedindex;
3320            }
3321        }
3322        else {
3323#if BOYER_MOORE
3324            if (offset == 0 || matchedindex == 0 ||
3325                (!search->isOverlap &&
3326                    (offset < strsrch->pattern.defaultShiftSize ||
3327                    (matchedindex != USEARCH_DONE &&
3328                    matchedindex < strsrch->pattern.defaultShiftSize)))) {
3329                // not enough characters to match
3330                setMatchNotFound(strsrch);
3331                return USEARCH_DONE;
3332            }
3333#else
3334            // Could check pattern length, but the
3335            // linear search will do the right thing
3336            if (offset == 0 || matchedindex == 0) {
3337                setMatchNotFound(strsrch);
3338                return USEARCH_DONE;
3339            }
3340#endif
3341        }
3342
3343        if (U_SUCCESS(*status)) {
3344            if (strsrch->pattern.CELength == 0) {
3345                search->matchedIndex =
3346                      (matchedindex == USEARCH_DONE ? offset : matchedindex);
3347                if (search->matchedIndex == 0) {
3348                    setMatchNotFound(strsrch);
3349                    // status checked below
3350                }
3351                else { // move by codepoints
3352                    U16_BACK_1(search->text, 0, search->matchedIndex);
3353                    setColEIterOffset(strsrch->textIter, search->matchedIndex);
3354                    // status checked below
3355                    search->matchedLength = 0;
3356                }
3357            }
3358            else {
3359                if (strsrch->search->isCanonicalMatch) {
3360                    // can't use exact here since extra accents are allowed.
3361                    usearch_handlePreviousCanonical(strsrch, status);
3362                    // status checked below
3363                }
3364                else {
3365                    usearch_handlePreviousExact(strsrch, status);
3366                    // status checked below
3367                }
3368            }
3369
3370            if (U_FAILURE(*status)) {
3371                return USEARCH_DONE;
3372            }
3373
3374            return search->matchedIndex;
3375        }
3376    }
3377    return USEARCH_DONE;
3378}
3379
3380
3381
3382U_CAPI void U_EXPORT2 usearch_reset(UStringSearch *strsrch)
3383{
3384    /*
3385    reset is setting the attributes that are already in
3386    string search, hence all attributes in the collator should
3387    be retrieved without any problems
3388    */
3389    if (strsrch) {
3390        UErrorCode status            = U_ZERO_ERROR;
3391        UBool      sameCollAttribute = TRUE;
3392        uint32_t   ceMask;
3393        UBool      shift;
3394        uint32_t   varTop;
3395
3396        // **** hack to deal w/ how processed CEs encode quaternary ****
3397        UCollationStrength newStrength = ucol_getStrength(strsrch->collator);
3398        if ((strsrch->strength < UCOL_QUATERNARY && newStrength >= UCOL_QUATERNARY) ||
3399            (strsrch->strength >= UCOL_QUATERNARY && newStrength < UCOL_QUATERNARY)) {
3400                sameCollAttribute = FALSE;
3401        }
3402
3403        strsrch->strength    = ucol_getStrength(strsrch->collator);
3404        ceMask = getMask(strsrch->strength);
3405        if (strsrch->ceMask != ceMask) {
3406            strsrch->ceMask = ceMask;
3407            sameCollAttribute = FALSE;
3408        }
3409
3410        // if status is a failure, ucol_getAttribute returns UCOL_DEFAULT
3411        shift = ucol_getAttribute(strsrch->collator, UCOL_ALTERNATE_HANDLING,
3412                                  &status) == UCOL_SHIFTED;
3413        if (strsrch->toShift != shift) {
3414            strsrch->toShift  = shift;
3415            sameCollAttribute = FALSE;
3416        }
3417
3418        // if status is a failure, ucol_getVariableTop returns 0
3419        varTop = ucol_getVariableTop(strsrch->collator, &status);
3420        if (strsrch->variableTop != varTop) {
3421            strsrch->variableTop = varTop;
3422            sameCollAttribute    = FALSE;
3423        }
3424        if (!sameCollAttribute) {
3425            initialize(strsrch, &status);
3426        }
3427        ucol_setText(strsrch->textIter, strsrch->search->text,
3428                              strsrch->search->textLength,
3429                              &status);
3430        strsrch->search->matchedLength      = 0;
3431        strsrch->search->matchedIndex       = USEARCH_DONE;
3432        strsrch->search->isOverlap          = FALSE;
3433        strsrch->search->isCanonicalMatch   = FALSE;
3434        strsrch->search->elementComparisonType = 0;
3435        strsrch->search->isForwardSearching = TRUE;
3436        strsrch->search->reset              = TRUE;
3437    }
3438}
3439
3440//
3441//  CEI  Collation Element + source text index.
3442//       These structs are kept in the circular buffer.
3443//
3444struct  CEI {
3445    int64_t ce;
3446    int32_t lowIndex;
3447    int32_t highIndex;
3448};
3449
3450U_NAMESPACE_BEGIN
3451
3452namespace {
3453//
3454//  CEBuffer   A circular buffer of CEs from the text being searched.
3455//
3456#define   DEFAULT_CEBUFFER_SIZE 96
3457#define   CEBUFFER_EXTRA 32
3458// Some typical max values to make buffer size more reasonable for asymmetric search.
3459// #8694 is for a better long-term solution to allocation of this buffer.
3460#define   MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L 8
3461#define   MAX_TARGET_IGNORABLES_PER_PAT_OTHER 3
3462#define   MIGHT_BE_JAMO_L(c) ((c >= 0x1100 && c <= 0x115E) || (c >= 0x3131 && c <= 0x314E) || (c >= 0x3165 && c <= 0x3186))
3463struct CEBuffer {
3464    CEI                  defBuf[DEFAULT_CEBUFFER_SIZE];
3465    CEI                 *buf;
3466    int32_t              bufSize;
3467    int32_t              firstIx;
3468    int32_t              limitIx;
3469    UCollationElements  *ceIter;
3470    UStringSearch       *strSearch;
3471
3472
3473
3474               CEBuffer(UStringSearch *ss, UErrorCode *status);
3475               ~CEBuffer();
3476   const CEI   *get(int32_t index);
3477   const CEI   *getPrevious(int32_t index);
3478};
3479
3480
3481CEBuffer::CEBuffer(UStringSearch *ss, UErrorCode *status) {
3482    buf = defBuf;
3483    strSearch = ss;
3484    bufSize = ss->pattern.PCELength + CEBUFFER_EXTRA;
3485    if (ss->search->elementComparisonType != 0) {
3486        const UChar * patText = ss->pattern.text;
3487        if (patText) {
3488            const UChar * patTextLimit = patText + ss->pattern.textLength;
3489            while ( patText < patTextLimit ) {
3490                UChar c = *patText++;
3491                if (MIGHT_BE_JAMO_L(c)) {
3492                    bufSize += MAX_TARGET_IGNORABLES_PER_PAT_JAMO_L;
3493                } else {
3494                    // No check for surrogates, we might allocate slightly more buffer than necessary.
3495                    bufSize += MAX_TARGET_IGNORABLES_PER_PAT_OTHER;
3496                }
3497            }
3498        }
3499    }
3500    ceIter    = ss->textIter;
3501    firstIx = 0;
3502    limitIx = 0;
3503
3504    if (!initTextProcessedIter(ss, status)) { return; }
3505
3506    if (bufSize>DEFAULT_CEBUFFER_SIZE) {
3507        buf = (CEI *)uprv_malloc(bufSize * sizeof(CEI));
3508        if (buf == NULL) {
3509            *status = U_MEMORY_ALLOCATION_ERROR;
3510        }
3511    }
3512}
3513
3514// TODO: add a reset or init function so that allocated
3515//       buffers can be retained & reused.
3516
3517CEBuffer::~CEBuffer() {
3518    if (buf != defBuf) {
3519        uprv_free(buf);
3520    }
3521}
3522
3523
3524// Get the CE with the specified index.
3525//   Index must be in the range
3526//          n-history_size < index < n+1
3527//   where n is the largest index to have been fetched by some previous call to this function.
3528//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3529//
3530const CEI *CEBuffer::get(int32_t index) {
3531    int i = index % bufSize;
3532
3533    if (index>=firstIx && index<limitIx) {
3534        // The request was for an entry already in our buffer.
3535        //  Just return it.
3536        return &buf[i];
3537    }
3538
3539    // Caller is requesting a new, never accessed before, CE.
3540    //   Verify that it is the next one in sequence, which is all
3541    //   that is allowed.
3542    if (index != limitIx) {
3543        U_ASSERT(FALSE);
3544
3545        return NULL;
3546    }
3547
3548    // Manage the circular CE buffer indexing
3549    limitIx++;
3550
3551    if (limitIx - firstIx >= bufSize) {
3552        // The buffer is full, knock out the lowest-indexed entry.
3553        firstIx++;
3554    }
3555
3556    UErrorCode status = U_ZERO_ERROR;
3557
3558    buf[i].ce = strSearch->textProcessedIter->nextProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3559
3560    return &buf[i];
3561}
3562
3563// Get the CE with the specified index.
3564//   Index must be in the range
3565//          n-history_size < index < n+1
3566//   where n is the largest index to have been fetched by some previous call to this function.
3567//   The CE value will be UCOL__PROCESSED_NULLORDER at end of input.
3568//
3569const CEI *CEBuffer::getPrevious(int32_t index) {
3570    int i = index % bufSize;
3571
3572    if (index>=firstIx && index<limitIx) {
3573        // The request was for an entry already in our buffer.
3574        //  Just return it.
3575        return &buf[i];
3576    }
3577
3578    // Caller is requesting a new, never accessed before, CE.
3579    //   Verify that it is the next one in sequence, which is all
3580    //   that is allowed.
3581    if (index != limitIx) {
3582        U_ASSERT(FALSE);
3583
3584        return NULL;
3585    }
3586
3587    // Manage the circular CE buffer indexing
3588    limitIx++;
3589
3590    if (limitIx - firstIx >= bufSize) {
3591        // The buffer is full, knock out the lowest-indexed entry.
3592        firstIx++;
3593    }
3594
3595    UErrorCode status = U_ZERO_ERROR;
3596
3597    buf[i].ce = strSearch->textProcessedIter->previousProcessed(&buf[i].lowIndex, &buf[i].highIndex, &status);
3598
3599    return &buf[i];
3600}
3601
3602}
3603
3604U_NAMESPACE_END
3605
3606
3607// #define USEARCH_DEBUG
3608
3609#ifdef USEARCH_DEBUG
3610#include <stdio.h>
3611#include <stdlib.h>
3612#endif
3613
3614/*
3615 * Find the next break boundary after startIndex. If the UStringSearch object
3616 * has an external break iterator, use that. Otherwise use the internal character
3617 * break iterator.
3618 */
3619static int32_t nextBoundaryAfter(UStringSearch *strsrch, int32_t startIndex) {
3620#if 0
3621    const UChar *text = strsrch->search->text;
3622    int32_t textLen   = strsrch->search->textLength;
3623
3624    U_ASSERT(startIndex>=0);
3625    U_ASSERT(startIndex<=textLen);
3626
3627    if (startIndex >= textLen) {
3628        return startIndex;
3629    }
3630
3631    UChar32  c;
3632    int32_t  i = startIndex;
3633    U16_NEXT(text, i, textLen, c);
3634
3635    // If we are on a control character, stop without looking for combining marks.
3636    //    Control characters do not combine.
3637    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3638    if (gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR) {
3639        return i;
3640    }
3641
3642    // The initial character was not a control, and can thus accept trailing
3643    //   combining characters.  Advance over however many of them there are.
3644    int32_t  indexOfLastCharChecked;
3645    for (;;) {
3646        indexOfLastCharChecked = i;
3647        if (i>=textLen) {
3648            break;
3649        }
3650        U16_NEXT(text, i, textLen, c);
3651        gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3652        if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3653            break;
3654        }
3655    }
3656    return indexOfLastCharChecked;
3657#elif !UCONFIG_NO_BREAK_ITERATION
3658    UBreakIterator *breakiterator = strsrch->search->breakIter;
3659
3660    if (breakiterator == NULL) {
3661        breakiterator = strsrch->search->internalBreakIter;
3662    }
3663
3664    if (breakiterator != NULL) {
3665        return ubrk_following(breakiterator, startIndex);
3666    }
3667
3668    return startIndex;
3669#else
3670    // **** or should we use the original code? ****
3671    return startIndex;
3672#endif
3673
3674}
3675
3676/*
3677 * Returns TRUE if index is on a break boundary. If the UStringSearch
3678 * has an external break iterator, test using that, otherwise test
3679 * using the internal character break iterator.
3680 */
3681static UBool isBreakBoundary(UStringSearch *strsrch, int32_t index) {
3682#if 0
3683    const UChar *text = strsrch->search->text;
3684    int32_t textLen   = strsrch->search->textLength;
3685
3686    U_ASSERT(index>=0);
3687    U_ASSERT(index<=textLen);
3688
3689    if (index>=textLen || index<=0) {
3690        return TRUE;
3691    }
3692
3693    // If the character at the current index is not a GRAPHEME_EXTEND
3694    //    then we can not be within a combining sequence.
3695    UChar32  c;
3696    U16_GET(text, 0, index, textLen, c);
3697    int32_t gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3698    if (gcProperty != U_GCB_EXTEND && gcProperty != U_GCB_SPACING_MARK) {
3699        return TRUE;
3700    }
3701
3702    // We are at a combining mark.  If the preceding character is anything
3703    //   except a CONTROL, CR or LF, we are in a combining sequence.
3704    U16_PREV(text, 0, index, c);
3705    gcProperty = u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
3706    UBool combining =  !(gcProperty==U_GCB_CONTROL || gcProperty==U_GCB_LF || gcProperty==U_GCB_CR);
3707    return !combining;
3708#elif !UCONFIG_NO_BREAK_ITERATION
3709    UBreakIterator *breakiterator = strsrch->search->breakIter;
3710
3711    if (breakiterator == NULL) {
3712        breakiterator = strsrch->search->internalBreakIter;
3713    }
3714
3715    return (breakiterator != NULL && ubrk_isBoundary(breakiterator, index));
3716#else
3717    // **** or use the original code? ****
3718    return TRUE;
3719#endif
3720}
3721
3722#if 0
3723static UBool onBreakBoundaries(const UStringSearch *strsrch, int32_t start, int32_t end)
3724{
3725#if !UCONFIG_NO_BREAK_ITERATION
3726    UBreakIterator *breakiterator = strsrch->search->breakIter;
3727
3728    if (breakiterator != NULL) {
3729        int32_t startindex = ubrk_first(breakiterator);
3730        int32_t endindex   = ubrk_last(breakiterator);
3731
3732        // out-of-range indexes are never boundary positions
3733        if (start < startindex || start > endindex ||
3734            end < startindex || end > endindex) {
3735            return FALSE;
3736        }
3737
3738        return ubrk_isBoundary(breakiterator, start) &&
3739               ubrk_isBoundary(breakiterator, end);
3740    }
3741#endif
3742
3743    return TRUE;
3744}
3745#endif
3746
3747typedef enum {
3748    U_CE_MATCH = -1,
3749    U_CE_NO_MATCH = 0,
3750    U_CE_SKIP_TARG,
3751    U_CE_SKIP_PATN
3752} UCompareCEsResult;
3753#define U_CE_LEVEL2_BASE 0x00000005
3754#define U_CE_LEVEL3_BASE 0x00050000
3755
3756static UCompareCEsResult compareCE64s(int64_t targCE, int64_t patCE, int16_t compareType) {
3757    if (targCE == patCE) {
3758        return U_CE_MATCH;
3759    }
3760    if (compareType == 0) {
3761        return U_CE_NO_MATCH;
3762    }
3763
3764    int64_t targCEshifted = targCE >> 32;
3765    int64_t patCEshifted = patCE >> 32;
3766    int64_t mask;
3767
3768    mask = 0xFFFF0000;
3769    int32_t targLev1 = (int32_t)(targCEshifted & mask);
3770    int32_t patLev1 = (int32_t)(patCEshifted & mask);
3771    if ( targLev1 != patLev1 ) {
3772        if ( targLev1 == 0 ) {
3773            return U_CE_SKIP_TARG;
3774        }
3775        if ( patLev1 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3776            return U_CE_SKIP_PATN;
3777        }
3778        return U_CE_NO_MATCH;
3779    }
3780
3781    mask = 0x0000FFFF;
3782    int32_t targLev2 = (int32_t)(targCEshifted & mask);
3783    int32_t patLev2 = (int32_t)(patCEshifted & mask);
3784    if ( targLev2 != patLev2 ) {
3785        if ( targLev2 == 0 ) {
3786            return U_CE_SKIP_TARG;
3787        }
3788        if ( patLev2 == 0 && compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD ) {
3789            return U_CE_SKIP_PATN;
3790        }
3791        return (patLev2 == U_CE_LEVEL2_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev2 == U_CE_LEVEL2_BASE) )?
3792            U_CE_MATCH: U_CE_NO_MATCH;
3793    }
3794
3795    mask = 0xFFFF0000;
3796    int32_t targLev3 = (int32_t)(targCE & mask);
3797    int32_t patLev3 = (int32_t)(patCE & mask);
3798    if ( targLev3 != patLev3 ) {
3799        return (patLev3 == U_CE_LEVEL3_BASE || (compareType == USEARCH_ANY_BASE_WEIGHT_IS_WILDCARD && targLev3 == U_CE_LEVEL3_BASE) )?
3800            U_CE_MATCH: U_CE_NO_MATCH;
3801   }
3802
3803    return U_CE_MATCH;
3804}
3805
3806#if BOYER_MOORE
3807// TODO: #if BOYER_MOORE, need 32-bit version of compareCE64s
3808#endif
3809
3810U_CAPI UBool U_EXPORT2 usearch_search(UStringSearch  *strsrch,
3811                                       int32_t        startIdx,
3812                                       int32_t        *matchStart,
3813                                       int32_t        *matchLimit,
3814                                       UErrorCode     *status)
3815{
3816    if (U_FAILURE(*status)) {
3817        return FALSE;
3818    }
3819
3820    // TODO:  reject search patterns beginning with a combining char.
3821
3822#ifdef USEARCH_DEBUG
3823    if (getenv("USEARCH_DEBUG") != NULL) {
3824        printf("Pattern CEs\n");
3825        for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
3826            printf(" %8x", strsrch->pattern.CE[ii]);
3827        }
3828        printf("\n");
3829    }
3830
3831#endif
3832    // Input parameter sanity check.
3833    //  TODO:  should input indicies clip to the text length
3834    //         in the same way that UText does.
3835    if(strsrch->pattern.CELength == 0         ||
3836       startIdx < 0                           ||
3837       startIdx > strsrch->search->textLength ||
3838       strsrch->pattern.CE == NULL) {
3839           *status = U_ILLEGAL_ARGUMENT_ERROR;
3840           return FALSE;
3841    }
3842
3843    if (strsrch->pattern.PCE == NULL) {
3844        initializePatternPCETable(strsrch, status);
3845    }
3846
3847    ucol_setOffset(strsrch->textIter, startIdx, status);
3848    CEBuffer ceb(strsrch, status);
3849
3850
3851    int32_t    targetIx = 0;
3852    const CEI *targetCEI = NULL;
3853    int32_t    patIx;
3854    UBool      found;
3855
3856    int32_t  mStart = -1;
3857    int32_t  mLimit = -1;
3858    int32_t  minLimit;
3859    int32_t  maxLimit;
3860
3861
3862
3863    // Outer loop moves over match starting positions in the
3864    //      target CE space.
3865    // Here we see the target as a sequence of collation elements, resulting from the following:
3866    // 1. Target characters were decomposed, and (if appropriate) other compressions and expansions are applied
3867    //    (for example, digraphs such as IJ may be broken into two characters).
3868    // 2. An int64_t CE weight is determined for each resulting unit (high 16 bits are primary strength, next
3869    //    16 bits are secondary, next 16 (the high 16 bits of the low 32-bit half) are tertiary. Any of these
3870    //    fields that are for strengths below that of the collator are set to 0. If this makes the int64_t
3871    //    CE weight 0 (as for a combining diacritic with secondary weight when the collator strentgh is primary),
3872    //    then the CE is deleted, so the following code sees only CEs that are relevant.
3873    // For each CE, the lowIndex and highIndex correspond to where this CE begins and ends in the original text.
3874    // If lowIndex==highIndex, either the CE resulted from an expansion/decomposition of one of the original text
3875    // characters, or the CE marks the limit of the target text (in which case the CE weight is UCOL_PROCESSED_NULLORDER).
3876    //
3877    for(targetIx=0; ; targetIx++)
3878    {
3879        found = TRUE;
3880        //  Inner loop checks for a match beginning at each
3881        //  position from the outer loop.
3882        int32_t targetIxOffset = 0;
3883        int64_t patCE = 0;
3884        // For targetIx > 0, this ceb.get gets a CE that is as far back in the ring buffer
3885        // (compared to the last CE fetched for the previous targetIx value) as we need to go
3886        // for this targetIx value, so if it is non-NULL then other ceb.get calls should be OK.
3887        const CEI *firstCEI = ceb.get(targetIx);
3888        if (firstCEI == NULL) {
3889            *status = U_INTERNAL_PROGRAM_ERROR;
3890            found = FALSE;
3891            break;
3892        }
3893
3894        for (patIx=0; patIx<strsrch->pattern.PCELength; patIx++) {
3895            patCE = strsrch->pattern.PCE[patIx];
3896            targetCEI = ceb.get(targetIx+patIx+targetIxOffset);
3897            //  Compare CE from target string with CE from the pattern.
3898            //    Note that the target CE will be UCOL_PROCESSED_NULLORDER if we reach the end of input,
3899            //    which will fail the compare, below.
3900            UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
3901            if ( ceMatch == U_CE_NO_MATCH ) {
3902                found = FALSE;
3903                break;
3904            } else if ( ceMatch > U_CE_NO_MATCH ) {
3905                if ( ceMatch == U_CE_SKIP_TARG ) {
3906                    // redo with same patCE, next targCE
3907                    patIx--;
3908                    targetIxOffset++;
3909                } else { // ceMatch == U_CE_SKIP_PATN
3910                    // redo with same targCE, next patCE
3911                    targetIxOffset--;
3912                }
3913            }
3914        }
3915        targetIxOffset += strsrch->pattern.PCELength; // this is now the offset in target CE space to end of the match so far
3916
3917        if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
3918            // No match at this targetIx.  Try again at the next.
3919            continue;
3920        }
3921
3922        if (!found) {
3923            // No match at all, we have run off the end of the target text.
3924            break;
3925        }
3926
3927
3928        // We have found a match in CE space.
3929        // Now determine the bounds in string index space.
3930        //  There still is a chance of match failure if the CE range not correspond to
3931        //     an acceptable character range.
3932        //
3933        const CEI *lastCEI  = ceb.get(targetIx + targetIxOffset - 1);
3934
3935        mStart   = firstCEI->lowIndex;
3936        minLimit = lastCEI->lowIndex;
3937
3938        // Look at the CE following the match.  If it is UCOL_NULLORDER the match
3939        //   extended to the end of input, and the match is good.
3940
3941        // Look at the high and low indices of the CE following the match. If
3942        // they are the same it means one of two things:
3943        //    1. The match extended to the last CE from the target text, which is OK, or
3944        //    2. The last CE that was part of the match is in an expansion that extends
3945        //       to the first CE after the match. In this case, we reject the match.
3946        const CEI *nextCEI = 0;
3947        if (strsrch->search->elementComparisonType == 0) {
3948            nextCEI  = ceb.get(targetIx + targetIxOffset);
3949            maxLimit = nextCEI->lowIndex;
3950            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
3951                found = FALSE;
3952            }
3953        } else {
3954            for ( ; ; ++targetIxOffset ) {
3955                nextCEI = ceb.get(targetIx + targetIxOffset);
3956                maxLimit = nextCEI->lowIndex;
3957                // If we are at the end of the target too, match succeeds
3958                if (  nextCEI->ce == UCOL_PROCESSED_NULLORDER ) {
3959                    break;
3960                }
3961                // As long as the next CE has primary weight of 0,
3962                // it is part of the last target element matched by the pattern;
3963                // make sure it can be part of a match with the last patCE
3964                if ( (((nextCEI->ce) >> 32) & 0xFFFF0000UL) == 0 ) {
3965                    UCompareCEsResult ceMatch = compareCE64s(nextCEI->ce, patCE, strsrch->search->elementComparisonType);
3966                    if ( ceMatch == U_CE_NO_MATCH || ceMatch == U_CE_SKIP_PATN ) {
3967                        found = FALSE;
3968                        break;
3969                    }
3970                // If lowIndex == highIndex, this target CE is part of an expansion of the last matched
3971                // target element, but it has non-zero primary weight => match fails
3972                } else if ( nextCEI->lowIndex == nextCEI->highIndex ) {
3973                    found = false;
3974                    break;
3975                // Else the target CE is not part of an expansion of the last matched element, match succeeds
3976                } else {
3977                    break;
3978                }
3979            }
3980        }
3981
3982
3983        // Check for the start of the match being within a combining sequence.
3984        //   This can happen if the pattern itself begins with a combining char, and
3985        //   the match found combining marks in the target text that were attached
3986        //    to something else.
3987        //   This type of match should be rejected for not completely consuming a
3988        //   combining sequence.
3989        if (!isBreakBoundary(strsrch, mStart)) {
3990            found = FALSE;
3991        }
3992
3993        // Check for the start of the match being within an Collation Element Expansion,
3994        //   meaning that the first char of the match is only partially matched.
3995        //   With exapnsions, the first CE will report the index of the source
3996        //   character, and all subsequent (expansions) CEs will report the source index of the
3997        //    _following_ character.
3998        int32_t secondIx = firstCEI->highIndex;
3999        if (mStart == secondIx) {
4000            found = FALSE;
4001        }
4002
4003        //  Advance the match end position to the first acceptable match boundary.
4004        //    This advances the index over any combining charcters.
4005        mLimit = maxLimit;
4006        if (minLimit < maxLimit) {
4007            // When the last CE's low index is same with its high index, the CE is likely
4008            // a part of expansion. In this case, the index is located just after the
4009            // character corresponding to the CEs compared above. If the index is right
4010            // at the break boundary, move the position to the next boundary will result
4011            // incorrect match length when there are ignorable characters exist between
4012            // the position and the next character produces CE(s). See ticket#8482.
4013            if (minLimit == lastCEI->highIndex && isBreakBoundary(strsrch, minLimit)) {
4014                mLimit = minLimit;
4015            } else {
4016                int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4017                if (nba >= lastCEI->highIndex) {
4018                    mLimit = nba;
4019                }
4020            }
4021        }
4022
4023    #ifdef USEARCH_DEBUG
4024        if (getenv("USEARCH_DEBUG") != NULL) {
4025            printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4026        }
4027    #endif
4028
4029        // If advancing to the end of a combining sequence in character indexing space
4030        //   advanced us beyond the end of the match in CE space, reject this match.
4031        if (mLimit > maxLimit) {
4032            found = FALSE;
4033        }
4034
4035        if (!isBreakBoundary(strsrch, mLimit)) {
4036            found = FALSE;
4037        }
4038
4039        if (! checkIdentical(strsrch, mStart, mLimit)) {
4040            found = FALSE;
4041        }
4042
4043        if (found) {
4044            break;
4045        }
4046    }
4047
4048    #ifdef USEARCH_DEBUG
4049    if (getenv("USEARCH_DEBUG") != NULL) {
4050        printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4051        int32_t  lastToPrint = ceb.limitIx+2;
4052        for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4053            printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4054        }
4055        printf("\n%s\n", found? "match found" : "no match");
4056    }
4057    #endif
4058
4059    // All Done.  Store back the match bounds to the caller.
4060    //
4061    if (found==FALSE) {
4062        mLimit = -1;
4063        mStart = -1;
4064    }
4065
4066    if (matchStart != NULL) {
4067        *matchStart= mStart;
4068    }
4069
4070    if (matchLimit != NULL) {
4071        *matchLimit = mLimit;
4072    }
4073
4074    return found;
4075}
4076
4077U_CAPI UBool U_EXPORT2 usearch_searchBackwards(UStringSearch  *strsrch,
4078                                                int32_t        startIdx,
4079                                                int32_t        *matchStart,
4080                                                int32_t        *matchLimit,
4081                                                UErrorCode     *status)
4082{
4083    if (U_FAILURE(*status)) {
4084        return FALSE;
4085    }
4086
4087    // TODO:  reject search patterns beginning with a combining char.
4088
4089#ifdef USEARCH_DEBUG
4090    if (getenv("USEARCH_DEBUG") != NULL) {
4091        printf("Pattern CEs\n");
4092        for (int ii=0; ii<strsrch->pattern.CELength; ii++) {
4093            printf(" %8x", strsrch->pattern.CE[ii]);
4094        }
4095        printf("\n");
4096    }
4097
4098#endif
4099    // Input parameter sanity check.
4100    //  TODO:  should input indicies clip to the text length
4101    //         in the same way that UText does.
4102    if(strsrch->pattern.CELength == 0         ||
4103       startIdx < 0                           ||
4104       startIdx > strsrch->search->textLength ||
4105       strsrch->pattern.CE == NULL) {
4106           *status = U_ILLEGAL_ARGUMENT_ERROR;
4107           return FALSE;
4108    }
4109
4110    if (strsrch->pattern.PCE == NULL) {
4111        initializePatternPCETable(strsrch, status);
4112    }
4113
4114    CEBuffer ceb(strsrch, status);
4115    int32_t    targetIx = 0;
4116
4117    /*
4118     * Pre-load the buffer with the CE's for the grapheme
4119     * after our starting position so that we're sure that
4120     * we can look at the CE following the match when we
4121     * check the match boundaries.
4122     *
4123     * This will also pre-fetch the first CE that we'll
4124     * consider for the match.
4125     */
4126    if (startIdx < strsrch->search->textLength) {
4127        UBreakIterator *bi = strsrch->search->internalBreakIter;
4128        int32_t next = ubrk_following(bi, startIdx);
4129
4130        ucol_setOffset(strsrch->textIter, next, status);
4131
4132        for (targetIx = 0; ; targetIx += 1) {
4133            if (ceb.getPrevious(targetIx)->lowIndex < startIdx) {
4134                break;
4135            }
4136        }
4137    } else {
4138        ucol_setOffset(strsrch->textIter, startIdx, status);
4139    }
4140
4141
4142    const CEI *targetCEI = NULL;
4143    int32_t    patIx;
4144    UBool      found;
4145
4146    int32_t  limitIx = targetIx;
4147    int32_t  mStart = -1;
4148    int32_t  mLimit = -1;
4149    int32_t  minLimit;
4150    int32_t  maxLimit;
4151
4152
4153
4154    // Outer loop moves over match starting positions in the
4155    //      target CE space.
4156    // Here, targetIx values increase toward the beginning of the base text (i.e. we get the text CEs in reverse order).
4157    // But  patIx is 0 at the beginning of the pattern and increases toward the end.
4158    // So this loop performs a comparison starting with the end of pattern, and prcessd toward the beginning of the pattern
4159    // and the beginning of the base text.
4160    for(targetIx = limitIx; ; targetIx += 1)
4161    {
4162        found = TRUE;
4163        // For targetIx > limitIx, this ceb.getPrevious gets a CE that is as far back in the ring buffer
4164        // (compared to the last CE fetched for the previous targetIx value) as we need to go
4165        // for this targetIx value, so if it is non-NULL then other ceb.getPrevious calls should be OK.
4166        const CEI *lastCEI  = ceb.getPrevious(targetIx);
4167        if (lastCEI == NULL) {
4168            *status = U_INTERNAL_PROGRAM_ERROR;
4169            found = FALSE;
4170             break;
4171        }
4172        //  Inner loop checks for a match beginning at each
4173        //  position from the outer loop.
4174        int32_t targetIxOffset = 0;
4175        for (patIx = strsrch->pattern.PCELength - 1; patIx >= 0; patIx -= 1) {
4176            int64_t patCE = strsrch->pattern.PCE[patIx];
4177
4178            targetCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 - patIx + targetIxOffset);
4179            //  Compare CE from target string with CE from the pattern.
4180            //    Note that the target CE will be UCOL_NULLORDER if we reach the end of input,
4181            //    which will fail the compare, below.
4182            UCompareCEsResult ceMatch = compareCE64s(targetCEI->ce, patCE, strsrch->search->elementComparisonType);
4183            if ( ceMatch == U_CE_NO_MATCH ) {
4184                found = FALSE;
4185                break;
4186            } else if ( ceMatch > U_CE_NO_MATCH ) {
4187                if ( ceMatch == U_CE_SKIP_TARG ) {
4188                    // redo with same patCE, next targCE
4189                    patIx++;
4190                    targetIxOffset++;
4191                } else { // ceMatch == U_CE_SKIP_PATN
4192                    // redo with same targCE, next patCE
4193                    targetIxOffset--;
4194                }
4195            }
4196        }
4197
4198        if (!found && ((targetCEI == NULL) || (targetCEI->ce != UCOL_PROCESSED_NULLORDER))) {
4199            // No match at this targetIx.  Try again at the next.
4200            continue;
4201        }
4202
4203        if (!found) {
4204            // No match at all, we have run off the end of the target text.
4205            break;
4206        }
4207
4208
4209        // We have found a match in CE space.
4210        // Now determine the bounds in string index space.
4211        //  There still is a chance of match failure if the CE range not correspond to
4212        //     an acceptable character range.
4213        //
4214        const CEI *firstCEI = ceb.getPrevious(targetIx + strsrch->pattern.PCELength - 1 + targetIxOffset);
4215        mStart   = firstCEI->lowIndex;
4216
4217        // Check for the start of the match being within a combining sequence.
4218        //   This can happen if the pattern itself begins with a combining char, and
4219        //   the match found combining marks in the target text that were attached
4220        //    to something else.
4221        //   This type of match should be rejected for not completely consuming a
4222        //   combining sequence.
4223        if (!isBreakBoundary(strsrch, mStart)) {
4224            found = FALSE;
4225        }
4226
4227        // Look at the high index of the first CE in the match. If it's the same as the
4228        // low index, the first CE in the match is in the middle of an expansion.
4229        if (mStart == firstCEI->highIndex) {
4230            found = FALSE;
4231        }
4232
4233
4234        minLimit = lastCEI->lowIndex;
4235
4236        if (targetIx > 0) {
4237            // Look at the CE following the match.  If it is UCOL_NULLORDER the match
4238            //   extended to the end of input, and the match is good.
4239
4240            // Look at the high and low indices of the CE following the match. If
4241            // they are the same it means one of two things:
4242            //    1. The match extended to the last CE from the target text, which is OK, or
4243            //    2. The last CE that was part of the match is in an expansion that extends
4244            //       to the first CE after the match. In this case, we reject the match.
4245            const CEI *nextCEI  = ceb.getPrevious(targetIx - 1);
4246
4247            if (nextCEI->lowIndex == nextCEI->highIndex && nextCEI->ce != UCOL_PROCESSED_NULLORDER) {
4248                found = FALSE;
4249            }
4250
4251            mLimit = maxLimit = nextCEI->lowIndex;
4252
4253            //  Advance the match end position to the first acceptable match boundary.
4254            //    This advances the index over any combining charcters.
4255            if (minLimit < maxLimit) {
4256                int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4257
4258                if (nba >= lastCEI->highIndex) {
4259                    mLimit = nba;
4260                }
4261            }
4262
4263            // If advancing to the end of a combining sequence in character indexing space
4264            //   advanced us beyond the end of the match in CE space, reject this match.
4265            if (mLimit > maxLimit) {
4266                found = FALSE;
4267            }
4268
4269            // Make sure the end of the match is on a break boundary
4270            if (!isBreakBoundary(strsrch, mLimit)) {
4271                found = FALSE;
4272            }
4273
4274        } else {
4275            // No non-ignorable CEs after this point.
4276            // The maximum position is detected by boundary after
4277            // the last non-ignorable CE. Combining sequence
4278            // across the start index will be truncated.
4279            int32_t nba = nextBoundaryAfter(strsrch, minLimit);
4280            mLimit = maxLimit = (nba > 0) && (startIdx > nba) ? nba : startIdx;
4281        }
4282
4283    #ifdef USEARCH_DEBUG
4284        if (getenv("USEARCH_DEBUG") != NULL) {
4285            printf("minLimit, maxLimit, mLimit = %d, %d, %d\n", minLimit, maxLimit, mLimit);
4286        }
4287    #endif
4288
4289
4290        if (! checkIdentical(strsrch, mStart, mLimit)) {
4291            found = FALSE;
4292        }
4293
4294        if (found) {
4295            break;
4296        }
4297    }
4298
4299    #ifdef USEARCH_DEBUG
4300    if (getenv("USEARCH_DEBUG") != NULL) {
4301        printf("Target CEs [%d .. %d]\n", ceb.firstIx, ceb.limitIx);
4302        int32_t  lastToPrint = ceb.limitIx+2;
4303        for (int ii=ceb.firstIx; ii<lastToPrint; ii++) {
4304            printf("%8x@%d ", ceb.get(ii)->ce, ceb.get(ii)->srcIndex);
4305        }
4306        printf("\n%s\n", found? "match found" : "no match");
4307    }
4308    #endif
4309
4310    // All Done.  Store back the match bounds to the caller.
4311    //
4312    if (found==FALSE) {
4313        mLimit = -1;
4314        mStart = -1;
4315    }
4316
4317    if (matchStart != NULL) {
4318        *matchStart= mStart;
4319    }
4320
4321    if (matchLimit != NULL) {
4322        *matchLimit = mLimit;
4323    }
4324
4325    return found;
4326}
4327
4328// internal use methods declared in usrchimp.h -----------------------------
4329
4330UBool usearch_handleNextExact(UStringSearch *strsrch, UErrorCode *status)
4331{
4332    if (U_FAILURE(*status)) {
4333        setMatchNotFound(strsrch);
4334        return FALSE;
4335    }
4336
4337#if BOYER_MOORE
4338    UCollationElements *coleiter        = strsrch->textIter;
4339    int32_t             textlength      = strsrch->search->textLength;
4340    int32_t            *patternce       = strsrch->pattern.CE;
4341    int32_t             patterncelength = strsrch->pattern.CELength;
4342    int32_t             textoffset      = ucol_getOffset(coleiter);
4343
4344    // status used in setting coleiter offset, since offset is checked in
4345    // shiftForward before setting the coleiter offset, status never
4346    // a failure
4347    textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4348                              patterncelength);
4349    while (textoffset <= textlength)
4350    {
4351        uint32_t    patternceindex = patterncelength - 1;
4352        int32_t     targetce;
4353        UBool       found          = FALSE;
4354        int32_t    lastce          = UCOL_NULLORDER;
4355
4356        setColEIterOffset(coleiter, textoffset);
4357
4358        for (;;) {
4359            // finding the last pattern ce match, imagine composite characters
4360            // for example: search for pattern A in text \u00C0
4361            // we'll have to skip \u0300 the grave first before we get to A
4362            targetce = ucol_previous(coleiter, status);
4363            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4364                found = FALSE;
4365                break;
4366            }
4367            targetce = getCE(strsrch, targetce);
4368            if (targetce == UCOL_IGNORABLE && inNormBuf(coleiter)) {
4369                // this is for the text \u0315\u0300 that requires
4370                // normalization and pattern \u0300, where \u0315 is ignorable
4371                continue;
4372            }
4373            if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4374                lastce = targetce;
4375            }
4376            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4377            if (targetce == patternce[patternceindex]) {
4378                // the first ce can be a contraction
4379                found = TRUE;
4380                break;
4381            }
4382            if (!hasExpansion(coleiter)) {
4383                found = FALSE;
4384                break;
4385            }
4386        }
4387
4388        //targetce = lastce;
4389
4390        while (found && patternceindex > 0) {
4391            lastce = targetce;
4392            targetce    = ucol_previous(coleiter, status);
4393            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4394                found = FALSE;
4395                break;
4396            }
4397            targetce    = getCE(strsrch, targetce);
4398            if (targetce == UCOL_IGNORABLE) {
4399                continue;
4400            }
4401
4402            patternceindex --;
4403            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4404            found = found && targetce == patternce[patternceindex];
4405        }
4406
4407        targetce = lastce;
4408
4409        if (!found) {
4410            if (U_FAILURE(*status)) {
4411                break;
4412            }
4413            textoffset = shiftForward(strsrch, textoffset, lastce,
4414                                      patternceindex);
4415            // status checked at loop.
4416            patternceindex = patterncelength;
4417            continue;
4418        }
4419
4420        if (checkNextExactMatch(strsrch, &textoffset, status)) {
4421            // status checked in ucol_setOffset
4422            setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4423            return TRUE;
4424        }
4425    }
4426    setMatchNotFound(strsrch);
4427    return FALSE;
4428#else
4429    int32_t textOffset = ucol_getOffset(strsrch->textIter);
4430    int32_t start = -1;
4431    int32_t end = -1;
4432
4433    if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4434        strsrch->search->matchedIndex  = start;
4435        strsrch->search->matchedLength = end - start;
4436        return TRUE;
4437    } else {
4438        setMatchNotFound(strsrch);
4439        return FALSE;
4440    }
4441#endif
4442}
4443
4444UBool usearch_handleNextCanonical(UStringSearch *strsrch, UErrorCode *status)
4445{
4446    if (U_FAILURE(*status)) {
4447        setMatchNotFound(strsrch);
4448        return FALSE;
4449    }
4450
4451#if BOYER_MOORE
4452    UCollationElements *coleiter        = strsrch->textIter;
4453    int32_t             textlength      = strsrch->search->textLength;
4454    int32_t            *patternce       = strsrch->pattern.CE;
4455    int32_t             patterncelength = strsrch->pattern.CELength;
4456    int32_t             textoffset      = ucol_getOffset(coleiter);
4457    UBool               hasPatternAccents =
4458       strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4459
4460    textoffset = shiftForward(strsrch, textoffset, UCOL_NULLORDER,
4461                              patterncelength);
4462    strsrch->canonicalPrefixAccents[0] = 0;
4463    strsrch->canonicalSuffixAccents[0] = 0;
4464
4465    while (textoffset <= textlength)
4466    {
4467        int32_t     patternceindex = patterncelength - 1;
4468        int32_t     targetce;
4469        UBool       found          = FALSE;
4470        int32_t     lastce         = UCOL_NULLORDER;
4471
4472        setColEIterOffset(coleiter, textoffset);
4473
4474        for (;;) {
4475            // finding the last pattern ce match, imagine composite characters
4476            // for example: search for pattern A in text \u00C0
4477            // we'll have to skip \u0300 the grave first before we get to A
4478            targetce = ucol_previous(coleiter, status);
4479            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4480                found = FALSE;
4481                break;
4482            }
4483            targetce = getCE(strsrch, targetce);
4484            if (lastce == UCOL_NULLORDER || lastce == UCOL_IGNORABLE) {
4485                lastce = targetce;
4486            }
4487            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4488            if (targetce == patternce[patternceindex]) {
4489                // the first ce can be a contraction
4490                found = TRUE;
4491                break;
4492            }
4493            if (!hasExpansion(coleiter)) {
4494                found = FALSE;
4495                break;
4496            }
4497        }
4498
4499        while (found && patternceindex > 0) {
4500            targetce    = ucol_previous(coleiter, status);
4501            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4502                found = FALSE;
4503                break;
4504            }
4505            targetce    = getCE(strsrch, targetce);
4506            if (targetce == UCOL_IGNORABLE) {
4507                continue;
4508            }
4509
4510            patternceindex --;
4511            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4512            found = found && targetce == patternce[patternceindex];
4513        }
4514
4515        // initializing the rearranged accent array
4516        if (hasPatternAccents && !found) {
4517            strsrch->canonicalPrefixAccents[0] = 0;
4518            strsrch->canonicalSuffixAccents[0] = 0;
4519            if (U_FAILURE(*status)) {
4520                break;
4521            }
4522            found = doNextCanonicalMatch(strsrch, textoffset, status);
4523        }
4524
4525        if (!found) {
4526            if (U_FAILURE(*status)) {
4527                break;
4528            }
4529            textoffset = shiftForward(strsrch, textoffset, lastce,
4530                                      patternceindex);
4531            // status checked at loop
4532            patternceindex = patterncelength;
4533            continue;
4534        }
4535
4536        if (checkNextCanonicalMatch(strsrch, &textoffset, status)) {
4537            setColEIterOffset(coleiter, strsrch->search->matchedIndex);
4538            return TRUE;
4539        }
4540    }
4541    setMatchNotFound(strsrch);
4542    return FALSE;
4543#else
4544    int32_t textOffset = ucol_getOffset(strsrch->textIter);
4545    int32_t start = -1;
4546    int32_t end = -1;
4547
4548    if (usearch_search(strsrch, textOffset, &start, &end, status)) {
4549        strsrch->search->matchedIndex  = start;
4550        strsrch->search->matchedLength = end - start;
4551        return TRUE;
4552    } else {
4553        setMatchNotFound(strsrch);
4554        return FALSE;
4555    }
4556#endif
4557}
4558
4559UBool usearch_handlePreviousExact(UStringSearch *strsrch, UErrorCode *status)
4560{
4561    if (U_FAILURE(*status)) {
4562        setMatchNotFound(strsrch);
4563        return FALSE;
4564    }
4565
4566#if BOYER_MOORE
4567    UCollationElements *coleiter        = strsrch->textIter;
4568    int32_t            *patternce       = strsrch->pattern.CE;
4569    int32_t             patterncelength = strsrch->pattern.CELength;
4570    int32_t             textoffset      = ucol_getOffset(coleiter);
4571
4572    // shifting it check for setting offset
4573    // if setOffset is called previously or there was no previous match, we
4574    // leave the offset as it is.
4575    if (strsrch->search->matchedIndex != USEARCH_DONE) {
4576        textoffset = strsrch->search->matchedIndex;
4577    }
4578
4579    textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4580                              patterncelength);
4581
4582    while (textoffset >= 0)
4583    {
4584        int32_t     patternceindex = 1;
4585        int32_t     targetce;
4586        UBool       found          = FALSE;
4587        int32_t     firstce        = UCOL_NULLORDER;
4588
4589        // if status is a failure, ucol_setOffset does nothing
4590        setColEIterOffset(coleiter, textoffset);
4591
4592        for (;;) {
4593            // finding the first pattern ce match, imagine composite
4594            // characters. for example: search for pattern \u0300 in text
4595            // \u00C0, we'll have to skip A first before we get to
4596            // \u0300 the grave accent
4597            targetce = ucol_next(coleiter, status);
4598            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4599                found = FALSE;
4600                break;
4601            }
4602            targetce = getCE(strsrch, targetce);
4603            if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4604                firstce = targetce;
4605            }
4606            if (targetce == UCOL_IGNORABLE && strsrch->strength != UCOL_PRIMARY) {
4607                continue;
4608            }
4609            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4610            if (targetce == patternce[0]) {
4611                found = TRUE;
4612                break;
4613            }
4614            if (!hasExpansion(coleiter)) {
4615                // checking for accents in composite character
4616                found = FALSE;
4617                break;
4618            }
4619        }
4620
4621        //targetce = firstce;
4622
4623        while (found && (patternceindex < patterncelength)) {
4624            firstce = targetce;
4625            targetce    = ucol_next(coleiter, status);
4626            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4627                found = FALSE;
4628                break;
4629            }
4630            targetce    = getCE(strsrch, targetce);
4631            if (targetce == UCOL_IGNORABLE) {
4632                continue;
4633            }
4634
4635            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4636            found = found && targetce == patternce[patternceindex];
4637            patternceindex ++;
4638        }
4639
4640        targetce = firstce;
4641
4642        if (!found) {
4643            if (U_FAILURE(*status)) {
4644                break;
4645            }
4646
4647            textoffset = reverseShift(strsrch, textoffset, targetce,
4648                                      patternceindex);
4649            patternceindex = 0;
4650            continue;
4651        }
4652
4653        if (checkPreviousExactMatch(strsrch, &textoffset, status)) {
4654            setColEIterOffset(coleiter, textoffset);
4655            return TRUE;
4656        }
4657    }
4658    setMatchNotFound(strsrch);
4659    return FALSE;
4660#else
4661    int32_t textOffset;
4662
4663    if (strsrch->search->isOverlap) {
4664        if (strsrch->search->matchedIndex != USEARCH_DONE) {
4665            textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4666        } else {
4667            // move the start position at the end of possible match
4668            initializePatternPCETable(strsrch, status);
4669            if (!initTextProcessedIter(strsrch, status)) {
4670                setMatchNotFound(strsrch);
4671                return FALSE;
4672            }
4673            for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
4674                int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4675                if (pce == UCOL_PROCESSED_NULLORDER) {
4676                    // at the end of the text
4677                    break;
4678                }
4679            }
4680            if (U_FAILURE(*status)) {
4681                setMatchNotFound(strsrch);
4682                return FALSE;
4683            }
4684            textOffset = ucol_getOffset(strsrch->textIter);
4685        }
4686    } else {
4687        textOffset = ucol_getOffset(strsrch->textIter);
4688    }
4689
4690    int32_t start = -1;
4691    int32_t end = -1;
4692
4693    if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4694        strsrch->search->matchedIndex = start;
4695        strsrch->search->matchedLength = end - start;
4696        return TRUE;
4697    } else {
4698        setMatchNotFound(strsrch);
4699        return FALSE;
4700    }
4701#endif
4702}
4703
4704UBool usearch_handlePreviousCanonical(UStringSearch *strsrch,
4705                                      UErrorCode    *status)
4706{
4707    if (U_FAILURE(*status)) {
4708        setMatchNotFound(strsrch);
4709        return FALSE;
4710    }
4711
4712#if BOYER_MOORE
4713    UCollationElements *coleiter        = strsrch->textIter;
4714    int32_t            *patternce       = strsrch->pattern.CE;
4715    int32_t             patterncelength = strsrch->pattern.CELength;
4716    int32_t             textoffset      = ucol_getOffset(coleiter);
4717    UBool               hasPatternAccents =
4718       strsrch->pattern.hasSuffixAccents || strsrch->pattern.hasPrefixAccents;
4719
4720    // shifting it check for setting offset
4721    // if setOffset is called previously or there was no previous match, we
4722    // leave the offset as it is.
4723    if (strsrch->search->matchedIndex != USEARCH_DONE) {
4724        textoffset = strsrch->search->matchedIndex;
4725    }
4726
4727    textoffset = reverseShift(strsrch, textoffset, UCOL_NULLORDER,
4728                              patterncelength);
4729    strsrch->canonicalPrefixAccents[0] = 0;
4730    strsrch->canonicalSuffixAccents[0] = 0;
4731
4732    while (textoffset >= 0)
4733    {
4734        int32_t     patternceindex = 1;
4735        int32_t     targetce;
4736        UBool       found          = FALSE;
4737        int32_t     firstce        = UCOL_NULLORDER;
4738
4739        setColEIterOffset(coleiter, textoffset);
4740        for (;;) {
4741            // finding the first pattern ce match, imagine composite
4742            // characters. for example: search for pattern \u0300 in text
4743            // \u00C0, we'll have to skip A first before we get to
4744            // \u0300 the grave accent
4745            targetce = ucol_next(coleiter, status);
4746            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4747                found = FALSE;
4748                break;
4749            }
4750            targetce = getCE(strsrch, targetce);
4751            if (firstce == UCOL_NULLORDER || firstce == UCOL_IGNORABLE) {
4752                firstce = targetce;
4753            }
4754
4755            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4756            if (targetce == patternce[0]) {
4757                // the first ce can be a contraction
4758                found = TRUE;
4759                break;
4760            }
4761            if (!hasExpansion(coleiter)) {
4762                // checking for accents in composite character
4763                found = FALSE;
4764                break;
4765            }
4766        }
4767
4768        targetce = firstce;
4769
4770        while (found && patternceindex < patterncelength) {
4771            targetce    = ucol_next(coleiter, status);
4772            if (U_FAILURE(*status) || targetce == UCOL_NULLORDER) {
4773                found = FALSE;
4774                break;
4775            }
4776            targetce = getCE(strsrch, targetce);
4777            if (targetce == UCOL_IGNORABLE) {
4778                continue;
4779            }
4780
4781            // TODO: #if BOYER_MOORE, replace with code using 32-bit version of compareCE64s
4782            found = found && targetce == patternce[patternceindex];
4783            patternceindex ++;
4784        }
4785
4786        // initializing the rearranged accent array
4787        if (hasPatternAccents && !found) {
4788            strsrch->canonicalPrefixAccents[0] = 0;
4789            strsrch->canonicalSuffixAccents[0] = 0;
4790            if (U_FAILURE(*status)) {
4791                break;
4792            }
4793            found = doPreviousCanonicalMatch(strsrch, textoffset, status);
4794        }
4795
4796        if (!found) {
4797            if (U_FAILURE(*status)) {
4798                break;
4799            }
4800            textoffset = reverseShift(strsrch, textoffset, targetce,
4801                                      patternceindex);
4802            patternceindex = 0;
4803            continue;
4804        }
4805
4806        if (checkPreviousCanonicalMatch(strsrch, &textoffset, status)) {
4807            setColEIterOffset(coleiter, textoffset);
4808            return TRUE;
4809        }
4810    }
4811    setMatchNotFound(strsrch);
4812    return FALSE;
4813#else
4814    int32_t textOffset;
4815
4816    if (strsrch->search->isOverlap) {
4817        if (strsrch->search->matchedIndex != USEARCH_DONE) {
4818            textOffset = strsrch->search->matchedIndex + strsrch->search->matchedLength - 1;
4819        } else {
4820            // move the start position at the end of possible match
4821            initializePatternPCETable(strsrch, status);
4822            if (!initTextProcessedIter(strsrch, status)) {
4823                setMatchNotFound(strsrch);
4824                return FALSE;
4825            }
4826            for (int32_t nPCEs = 0; nPCEs < strsrch->pattern.PCELength - 1; nPCEs++) {
4827                int64_t pce = strsrch->textProcessedIter->nextProcessed(NULL, NULL, status);
4828                if (pce == UCOL_PROCESSED_NULLORDER) {
4829                    // at the end of the text
4830                    break;
4831                }
4832            }
4833            if (U_FAILURE(*status)) {
4834                setMatchNotFound(strsrch);
4835                return FALSE;
4836            }
4837            textOffset = ucol_getOffset(strsrch->textIter);
4838        }
4839    } else {
4840        textOffset = ucol_getOffset(strsrch->textIter);
4841    }
4842
4843    int32_t start = -1;
4844    int32_t end = -1;
4845
4846    if (usearch_searchBackwards(strsrch, textOffset, &start, &end, status)) {
4847        strsrch->search->matchedIndex = start;
4848        strsrch->search->matchedLength = end - start;
4849        return TRUE;
4850    } else {
4851        setMatchNotFound(strsrch);
4852        return FALSE;
4853    }
4854#endif
4855}
4856
4857#endif /* #if !UCONFIG_NO_COLLATION */
4858