1/*
2 ******************************************************************************
3 *   Copyright (C) 1996-2010, International Business Machines                 *
4 *   Corporation and others.  All Rights Reserved.                            *
5 ******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION
11
12#include "unicode/unistr.h"
13#include "unicode/putil.h"
14#include "unicode/usearch.h"
15
16#include "cmemory.h"
17#include "unicode/coll.h"
18#include "unicode/tblcoll.h"
19#include "unicode/coleitr.h"
20#include "unicode/ucoleitr.h"
21
22#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
23
24#include "unicode/uniset.h"
25#include "unicode/uset.h"
26#include "unicode/ustring.h"
27#include "hash.h"
28#include "uhash.h"
29#include "ucol_imp.h"
30#include "normalizer2impl.h"
31
32#include "unicode/colldata.h"
33#include "unicode/bmsearch.h"
34
35U_NAMESPACE_BEGIN
36
37#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
38#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
39#define DELETE_ARRAY(array) uprv_free((void *) (array))
40
41
42struct CEI
43{
44    uint32_t order;
45    int32_t  lowOffset;
46    int32_t  highOffset;
47};
48
49class Target : public UMemory
50{
51public:
52    Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status);
53    ~Target();
54
55    void setTargetString(const UnicodeString *target);
56
57    const CEI *nextCE(int32_t offset);
58    const CEI *prevCE(int32_t offset);
59
60    int32_t stringLength();
61    UChar charAt(int32_t offset);
62
63    UBool isBreakBoundary(int32_t offset);
64    int32_t nextBreakBoundary(int32_t offset);
65    int32_t nextSafeBoundary(int32_t offset);
66
67    UBool isIdentical(UnicodeString &pattern, int32_t start, int32_t end);
68
69    void setOffset(int32_t offset);
70    void setLast(int32_t last);
71    int32_t getOffset();
72
73private:
74    CEI *ceb;
75    int32_t bufferSize;
76    int32_t bufferMin;
77    int32_t bufferMax;
78
79    uint32_t strengthMask;
80    UCollationStrength strength;
81    uint32_t variableTop;
82    UBool toShift;
83    UCollator *coll;
84    const Normalizer2 &nfd;
85
86    const UnicodeString *targetString;
87    const UChar *targetBuffer;
88    int32_t targetLength;
89
90    UCollationElements *elements;
91    UBreakIterator *charBreakIterator;
92};
93
94Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
95    : bufferSize(0), bufferMin(0), bufferMax(0),
96      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
97      nfd(*Normalizer2Factory::getNFDInstance(status)),
98      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
99{
100    strength = ucol_getStrength(coll);
101    toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
102    variableTop = ucol_getVariableTop(coll, &status);
103
104    // find the largest expansion
105    uint8_t maxExpansion = 0;
106    for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
107        if (*expansion > maxExpansion) {
108            maxExpansion = *expansion;
109        }
110    }
111
112    // room for an extra character on each end, plus 4 for safety
113    bufferSize = patternLength + (2 * maxExpansion) + 4;
114
115    ceb = NEW_ARRAY(CEI, bufferSize);
116
117    if (ceb == NULL) {
118        status = U_MEMORY_ALLOCATION_ERROR;
119        return;
120    }
121
122    if (target != NULL) {
123        setTargetString(target);
124    }
125
126    switch (strength)
127    {
128    default:
129        strengthMask |= UCOL_TERTIARYORDERMASK;
130        /* fall through */
131
132    case UCOL_SECONDARY:
133        strengthMask |= UCOL_SECONDARYORDERMASK;
134        /* fall through */
135
136    case UCOL_PRIMARY:
137        strengthMask |= UCOL_PRIMARYORDERMASK;
138    }
139}
140
141Target::~Target()
142{
143    ubrk_close(charBreakIterator);
144    ucol_closeElements(elements);
145
146    DELETE_ARRAY(ceb);
147}
148
149void Target::setTargetString(const UnicodeString *target)
150{
151    if (charBreakIterator != NULL) {
152        ubrk_close(charBreakIterator);
153        ucol_closeElements(elements);
154    }
155
156    targetString = target;
157
158    if (targetString != NULL) {
159        UErrorCode status = U_ZERO_ERROR;
160
161        targetBuffer = targetString->getBuffer();
162        targetLength = targetString->length();
163
164        elements = ucol_openElements(coll, target->getBuffer(), target->length(), &status);
165        ucol_forceHanImplicit(elements, &status);
166
167        charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
168                                      targetBuffer, targetLength, &status);
169    } else {
170        targetBuffer = NULL;
171        targetLength = 0;
172    }
173}
174
175const CEI *Target::nextCE(int32_t offset)
176{
177    UErrorCode status = U_ZERO_ERROR;
178    int32_t low = -1, high = -1;
179    uint32_t order;
180    UBool cont = FALSE;
181
182    if (offset >= bufferMin && offset < bufferMax) {
183        return &ceb[offset];
184    }
185
186    if (bufferMax >= bufferSize || offset != bufferMax) {
187        return NULL;
188    }
189
190    do {
191        low   = ucol_getOffset(elements);
192        order = ucol_next(elements, &status);
193        high  = ucol_getOffset(elements);
194
195        if (order == (uint32_t)UCOL_NULLORDER) {
196          //high = low = -1;
197            break;
198        }
199
200        cont = isContinuation(order);
201        order &= strengthMask;
202
203        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
204            if (strength >= UCOL_QUATERNARY) {
205                order &= UCOL_PRIMARYORDERMASK;
206            } else {
207                order = UCOL_IGNORABLE;
208            }
209        }
210    } while (order == UCOL_IGNORABLE);
211
212    if (cont) {
213        order |= UCOL_CONTINUATION_MARKER;
214    }
215
216    ceb[offset].order = order;
217    ceb[offset].lowOffset = low;
218    ceb[offset].highOffset = high;
219
220    bufferMax += 1;
221
222    return &ceb[offset];
223}
224
225const CEI *Target::prevCE(int32_t offset)
226{
227    UErrorCode status = U_ZERO_ERROR;
228    int32_t low = -1, high = -1;
229    uint32_t order;
230    UBool cont = FALSE;
231
232    if (offset >= bufferMin && offset < bufferMax) {
233        return &ceb[offset];
234    }
235
236    if (bufferMax >= bufferSize || offset != bufferMax) {
237        return NULL;
238    }
239
240    do {
241        high  = ucol_getOffset(elements);
242        order = ucol_previous(elements, &status);
243        low   = ucol_getOffset(elements);
244
245        if (order == (uint32_t)UCOL_NULLORDER) {
246            break;
247        }
248
249        cont = isContinuation(order);
250        order &= strengthMask;
251
252        if (toShift && variableTop > order && (order & UCOL_PRIMARYORDERMASK) != 0) {
253            if (strength >= UCOL_QUATERNARY) {
254                order &= UCOL_PRIMARYORDERMASK;
255            } else {
256                order = UCOL_IGNORABLE;
257            }
258        }
259    } while (order == UCOL_IGNORABLE);
260
261    bufferMax += 1;
262
263    if (cont) {
264        order |= UCOL_CONTINUATION_MARKER;
265    }
266
267    ceb[offset].order       = order;
268    ceb[offset].lowOffset   = low;
269    ceb[offset].highOffset = high;
270
271    return &ceb[offset];
272}
273
274int32_t Target::stringLength()
275{
276    if (targetString != NULL) {
277        return targetLength;
278    }
279
280    return 0;
281}
282
283UChar Target::charAt(int32_t offset)
284{
285    if (targetString != NULL) {
286        return targetBuffer[offset];
287    }
288
289    return 0x0000;
290}
291
292void Target::setOffset(int32_t offset)
293{
294    UErrorCode status = U_ZERO_ERROR;
295
296    bufferMin = 0;
297    bufferMax = 0;
298
299    ucol_setOffset(elements, offset, &status);
300}
301
302void Target::setLast(int32_t last)
303{
304    UErrorCode status = U_ZERO_ERROR;
305
306    bufferMin = 0;
307    bufferMax = 1;
308
309    ceb[0].order      = UCOL_NULLORDER;
310    ceb[0].lowOffset  = last;
311    ceb[0].highOffset = last;
312
313    ucol_setOffset(elements, last, &status);
314}
315
316int32_t Target::getOffset()
317{
318    return ucol_getOffset(elements);
319}
320
321UBool Target::isBreakBoundary(int32_t offset)
322{
323    return ubrk_isBoundary(charBreakIterator, offset);
324}
325
326int32_t Target::nextBreakBoundary(int32_t offset)
327{
328    return ubrk_following(charBreakIterator, offset);
329}
330
331int32_t Target::nextSafeBoundary(int32_t offset)
332{
333    while (offset < targetLength) {
334      //UChar ch = charAt(offset);
335        UChar ch = targetBuffer[offset];
336
337        if (U_IS_LEAD(ch) || ! ucol_unsafeCP(ch, coll)) {
338            return offset;
339        }
340
341        offset += 1;
342    }
343
344    return targetLength;
345}
346
347UBool Target::isIdentical(UnicodeString &pattern, int32_t start, int32_t end)
348{
349    if (strength < UCOL_IDENTICAL) {
350        return TRUE;
351    }
352
353    // Note: We could use Normalizer::compare() or similar, but for short strings
354    // which may not be in FCD it might be faster to just NFD them.
355    UErrorCode status = U_ZERO_ERROR;
356    UnicodeString t2, p2;
357    nfd.normalize(UnicodeString(FALSE, targetBuffer + start, end - start), t2, status);
358    nfd.normalize(pattern, p2, status);
359    // return FALSE if NFD failed
360    return U_SUCCESS(status) && t2 == p2;
361}
362
363#define HASH_TABLE_SIZE 257
364
365class BadCharacterTable : public UMemory
366{
367public:
368    BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status);
369    ~BadCharacterTable();
370
371    int32_t operator[](uint32_t ce) const;
372    int32_t getMaxSkip() const;
373    int32_t minLengthInChars(int32_t index);
374
375private:
376    static int32_t hash(uint32_t ce);
377
378    int32_t maxSkip;
379    int32_t badCharacterTable[HASH_TABLE_SIZE];
380
381    int32_t *minLengthCache;
382};
383
384BadCharacterTable::BadCharacterTable(CEList &patternCEs, CollData *data, UErrorCode &status)
385    : minLengthCache(NULL)
386{
387    int32_t plen = patternCEs.size();
388
389    // **** need a better way to deal with this ****
390    if (U_FAILURE(status) || plen == 0) {
391        return;
392    }
393
394    int32_t *history = NEW_ARRAY(int32_t, plen);
395
396    if (history == NULL) {
397        status = U_MEMORY_ALLOCATION_ERROR;
398        return;
399    }
400
401    for (int32_t i = 0; i < plen; i += 1) {
402        history[i] = -1;
403    }
404
405    minLengthCache = NEW_ARRAY(int32_t, plen + 1);
406
407    if (minLengthCache == NULL) {
408        DELETE_ARRAY(history);
409        status = U_MEMORY_ALLOCATION_ERROR;
410        return;
411    }
412
413    maxSkip = minLengthCache[0] = data->minLengthInChars(&patternCEs, 0, history);
414
415    for(int32_t j = 0; j < HASH_TABLE_SIZE; j += 1) {
416        badCharacterTable[j] = maxSkip;
417    }
418
419    for(int32_t p = 1; p < plen; p += 1) {
420        minLengthCache[p] = data->minLengthInChars(&patternCEs, p, history);
421
422        // Make sure this entry is not bigger than the previous one.
423        // Otherwise, we might skip too far in some cases.
424        if (minLengthCache[p] < 0 || minLengthCache[p] > minLengthCache[p - 1]) {
425            minLengthCache[p] = minLengthCache[p - 1];
426        }
427    }
428
429    minLengthCache[plen] = 0;
430
431    for(int32_t p = 0; p < plen - 1; p += 1) {
432        badCharacterTable[hash(patternCEs[p])] = minLengthCache[p + 1];
433    }
434
435    DELETE_ARRAY(history);
436}
437
438BadCharacterTable::~BadCharacterTable()
439{
440    DELETE_ARRAY(minLengthCache);
441}
442
443int32_t BadCharacterTable::operator[](uint32_t ce) const
444{
445    return badCharacterTable[hash(ce)];
446}
447
448int32_t BadCharacterTable::getMaxSkip() const
449{
450    return maxSkip;
451}
452
453int32_t BadCharacterTable::minLengthInChars(int32_t index)
454{
455    return minLengthCache[index];
456}
457
458int32_t BadCharacterTable::hash(uint32_t ce)
459{
460    return UCOL_PRIMARYORDER(ce) % HASH_TABLE_SIZE;
461}
462
463class GoodSuffixTable : public UMemory
464{
465public:
466    GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status);
467    ~GoodSuffixTable();
468
469    int32_t operator[](int32_t offset) const;
470
471private:
472    int32_t *goodSuffixTable;
473};
474
475GoodSuffixTable::GoodSuffixTable(CEList &patternCEs, BadCharacterTable &badCharacterTable, UErrorCode &status)
476    : goodSuffixTable(NULL)
477{
478    int32_t patlen = patternCEs.size();
479
480    // **** need a better way to deal with this ****
481    if (U_FAILURE(status) || patlen <= 0) {
482        return;
483    }
484
485    int32_t *suff  = NEW_ARRAY(int32_t, patlen);
486    int32_t start = patlen - 1, end = - 1;
487    int32_t maxSkip = badCharacterTable.getMaxSkip();
488
489    if (suff == NULL) {
490        status = U_MEMORY_ALLOCATION_ERROR;
491        return;
492    }
493
494    // initialze suff
495    suff[patlen - 1] = patlen;
496
497    for (int32_t i = patlen - 2; i >= 0; i -= 1) {
498        // (i > start) means we're inside the last suffix match we found
499        // ((patlen - 1) - end) is how far the end of that match is from end of pattern
500        // (i - start) is how far we are from start of that match
501        // (i + (patlen - 1) - end) is index of same character at end of pattern
502        // so if any suffix match at that character doesn't extend beyond the last match,
503        // it's the suffix for this character as well
504        if (i > start && suff[i + patlen - 1 - end] < i - start) {
505            suff[i] = suff[i + patlen - 1 - end];
506        } else {
507            start = end = i;
508
509            int32_t s = patlen;
510
511            while (start >= 0 && patternCEs[start] == patternCEs[--s]) {
512                start -= 1;
513            }
514
515            suff[i] = end - start;
516        }
517    }
518
519    // now build goodSuffixTable
520    goodSuffixTable  = NEW_ARRAY(int32_t, patlen);
521
522    if (goodSuffixTable == NULL) {
523        DELETE_ARRAY(suff);
524        status = U_MEMORY_ALLOCATION_ERROR;
525        return;
526    }
527
528
529    // initialize entries to minLengthInChars of the pattern
530    for (int32_t i = 0; i < patlen; i += 1) {
531        goodSuffixTable[i] = maxSkip;
532    }
533
534    int32_t prefix = 0;
535
536    for (int32_t i = patlen - /*1*/ 2; i >= 0; i -= 1) {
537        if (suff[i] == i + 1) {
538            // this matching suffix is a prefix of the pattern
539            int32_t prefixSkip = badCharacterTable.minLengthInChars(i + 1);
540
541            // for any mis-match before this suffix, we should skip
542            // so that the front of the pattern (i.e. the prefix)
543            // lines up with the front of the suffix.
544            // (patlen - 1 - i) is the start of the suffix
545            while (prefix < patlen - 1 - i) {
546                // value of maxSkip means never set...
547                if (goodSuffixTable[prefix] == maxSkip) {
548                    goodSuffixTable[prefix] = prefixSkip;
549                }
550
551                prefix += 1;
552            }
553        }
554    }
555
556    for (int32_t i = 0; i < patlen - 1; i += 1) {
557        goodSuffixTable[patlen - 1 - suff[i]] = badCharacterTable.minLengthInChars(i + 1);
558    }
559
560    DELETE_ARRAY(suff);
561}
562
563GoodSuffixTable::~GoodSuffixTable()
564{
565    DELETE_ARRAY(goodSuffixTable);
566}
567
568int32_t GoodSuffixTable::operator[](int32_t offset) const
569{
570    return goodSuffixTable[offset];
571}
572
573UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BoyerMooreSearch)
574
575
576UBool BoyerMooreSearch::empty()
577{
578    return patCEs->size() <= 0;
579}
580
581CollData *BoyerMooreSearch::getData()
582{
583    return data;
584}
585
586CEList *BoyerMooreSearch::getPatternCEs()
587{
588    return patCEs;
589}
590
591BadCharacterTable *BoyerMooreSearch::getBadCharacterTable()
592{
593    return badCharacterTable;
594}
595
596GoodSuffixTable *BoyerMooreSearch::getGoodSuffixTable()
597{
598    return goodSuffixTable;
599}
600
601BoyerMooreSearch::BoyerMooreSearch(CollData *theData, const UnicodeString &patternString, const UnicodeString *targetString,
602                                   UErrorCode &status)
603    : data(theData), patCEs(NULL), badCharacterTable(NULL), goodSuffixTable(NULL), pattern(patternString), target(NULL)
604{
605
606    if (U_FAILURE(status)) {
607        return;
608    }
609
610    UCollator *collator = data->getCollator();
611
612    patCEs = new CEList(collator, patternString, status);
613
614    if (patCEs == NULL || U_FAILURE(status)) {
615        return;
616    }
617
618    badCharacterTable = new BadCharacterTable(*patCEs, data, status);
619
620    if (badCharacterTable == NULL || U_FAILURE(status)) {
621        return;
622    }
623
624    goodSuffixTable = new GoodSuffixTable(*patCEs, *badCharacterTable, status);
625
626    if (targetString != NULL) {
627        target = new Target(collator, targetString, patCEs->size(), status);
628    }
629}
630
631BoyerMooreSearch::~BoyerMooreSearch()
632{
633    delete target;
634    delete goodSuffixTable;
635    delete badCharacterTable;
636    delete patCEs;
637}
638
639void BoyerMooreSearch::setTargetString(const UnicodeString *targetString, UErrorCode &status)
640{
641    if (U_FAILURE(status)) {
642        return;
643    }
644
645    if (target == NULL) {
646        target = new Target(data->getCollator(), targetString, patCEs->size(), status);
647    } else {
648        target->setTargetString(targetString);
649    }
650}
651
652// **** main flow of this code from Laura Werner's "Unicode Text Searching in Java" paper. ****
653/*
654 * TODO:
655 *  * deal with trailing (and leading?) ignorables.
656 *  * Adding BoyerMooreSearch object slowed it down. How can we speed it up?
657 */
658UBool BoyerMooreSearch::search(int32_t offset, int32_t &start, int32_t &end)
659{
660    /*UCollator *coll =*/ data->getCollator();
661    int32_t plen = patCEs->size();
662    int32_t tlen = target->stringLength();
663    int32_t maxSkip = badCharacterTable->getMaxSkip();
664    int32_t tOffset = offset + maxSkip;
665
666    if (plen <= 0) {
667        // Searching for a zero length pattern always fails.
668        start = end = -1;
669        return FALSE;
670    }
671
672    while (tOffset <= tlen) {
673        int32_t pIndex = plen - 1;
674        int32_t tIndex = 0;
675        int32_t lIndex = 0;
676
677        if (tOffset < tlen) {
678            // **** we really want to skip ahead enough to  ****
679            // **** be sure we get at least 1 non-ignorable ****
680            // **** CE after the end of the pattern.        ****
681            int32_t next = target->nextSafeBoundary(tOffset + 1);
682
683            target->setOffset(next);
684
685            for (lIndex = 0; ; lIndex += 1) {
686                const CEI *cei = target->prevCE(lIndex);
687                int32_t low = cei->lowOffset;
688                int32_t high = cei->highOffset;
689
690                if (high == 0 || (low < high && low <= tOffset)) {
691                    if (low < tOffset) {
692                        while (lIndex >= 0 && target->prevCE(lIndex)->highOffset == high) {
693                            lIndex -= 1;
694                        }
695
696                        if (high > tOffset) {
697                            tOffset = high;
698                        }
699                    }
700
701                    break;
702                }
703            }
704        } else {
705            target->setLast(tOffset);
706            lIndex = 0;
707        }
708
709        tIndex = ++lIndex;
710
711        // Iterate backward until we hit the beginning of the pattern
712        while (pIndex >= 0) {
713            uint32_t pce = (*patCEs)[pIndex];
714            const CEI *tcei = target->prevCE(tIndex++);
715
716
717            if (tcei->order != pce) {
718                // There is a mismatch at this position.  Decide how far
719                // over to shift the pattern, then try again.
720
721                int32_t gsOffset = tOffset + (*goodSuffixTable)[pIndex];
722#ifdef EXTRA_CAUTIOUS
723                int32_t old = tOffset;
724#endif
725
726                tOffset += (*badCharacterTable)[tcei->order] - badCharacterTable->minLengthInChars(pIndex + 1);
727
728                if (gsOffset > tOffset) {
729                    tOffset = gsOffset;
730                }
731
732#ifdef EXTRA_CAUTIOUS
733                // Make sure we don't skip backwards...
734                if (tOffset <= old) {
735                    tOffset = old + 1;
736                }
737#endif
738
739                break;
740            }
741
742            pIndex -= 1;
743        }
744
745        if (pIndex < 0) {
746            // We made it back to the beginning of the pattern,
747            // which means we matched it all.  Return the location.
748            const CEI firstCEI = *target->prevCE(tIndex - 1);
749            const CEI lastCEI  = *target->prevCE(lIndex);
750            int32_t mStart   = firstCEI.lowOffset;
751            int32_t minLimit = lastCEI.lowOffset;
752            int32_t maxLimit = lastCEI.highOffset;
753            int32_t mLimit;
754            UBool found = TRUE;
755
756            target->setOffset(/*tOffset*/maxLimit);
757
758            const CEI nextCEI = *target->nextCE(0);
759
760            if (nextCEI.lowOffset > maxLimit) {
761                maxLimit = nextCEI.lowOffset;
762            }
763
764            if (nextCEI.lowOffset == nextCEI.highOffset && nextCEI.order != (uint32_t)UCOL_NULLORDER) {
765                found = FALSE;
766            }
767
768            if (! target->isBreakBoundary(mStart)) {
769                found = FALSE;
770            }
771
772            if (firstCEI.lowOffset == firstCEI.highOffset) {
773                found = FALSE;
774            }
775
776            mLimit = maxLimit;
777            if (minLimit < maxLimit) {
778                int32_t nbb = target->nextBreakBoundary(minLimit);
779
780                if (nbb >= lastCEI.highOffset) {
781                    mLimit = nbb;
782                }
783            }
784
785            if (mLimit > maxLimit) {
786                found = FALSE;
787            }
788
789            if (! target->isBreakBoundary(mLimit)) {
790                found = FALSE;
791            }
792
793            if (! target->isIdentical(pattern, mStart, mLimit)) {
794                found = FALSE;
795            }
796
797            if (found) {
798                start = mStart;
799                end   = mLimit;
800
801                return TRUE;
802            }
803
804            tOffset += (*goodSuffixTable)[0]; // really? Maybe += 1 or += maxSkip?
805        }
806        // Otherwise, we're here because of a mismatch, so keep going....
807    }
808
809    // no match
810   start = -1;
811   end = -1;
812   return FALSE;
813}
814
815U_NAMESPACE_END
816
817#endif // #if !UCONFIG_NO_COLLATION
818