1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4 **********************************************************************
5 *   Copyright (C) 2005-2016, International Business Machines
6 *   Corporation and others.  All Rights Reserved.
7 **********************************************************************
8 */
9
10#include "unicode/utypes.h"
11
12#if !UCONFIG_NO_COLLATION
13
14#include "cmemory.h"
15#include "cstring.h"
16#include "usrchimp.h"
17
18#include "unicode/coll.h"
19#include "unicode/tblcoll.h"
20#include "unicode/usearch.h"
21#include "unicode/uset.h"
22#include "unicode/ustring.h"
23
24#include "unicode/coleitr.h"
25#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
26
27#include "colldata.h"
28#include "ssearch.h"
29#include "xmlparser.h"
30
31#include <stdio.h>  // for sprintf
32
33char testId[100];
34
35#define TEST_ASSERT(x) {if (!(x)) { \
36    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}
37
38#define TEST_ASSERT_M(x, m) {if (!(x)) { \
39    dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}
40
41#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
42    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
43          __FILE__, __LINE__, testId, u_errorName(errcode));}}
44
45#define NEW_ARRAY(type, count) (type *) uprv_malloc((count) * sizeof(type))
46#define DELETE_ARRAY(array) uprv_free((void *) (array))
47
48//---------------------------------------------------------------------------
49//
50//  Test class boilerplate
51//
52//---------------------------------------------------------------------------
53SSearchTest::SSearchTest()
54{
55}
56
57SSearchTest::~SSearchTest()
58{
59}
60
61void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
62{
63    if (exec) logln("TestSuite SSearchTest: ");
64    switch (index) {
65#if !UCONFIG_NO_BREAK_ITERATION
66       case 0: name = "searchTest";
67            if (exec) searchTest();
68            break;
69
70        case 1: name = "offsetTest";
71            if (exec) offsetTest();
72            break;
73
74        case 2: name = "monkeyTest";
75            if (exec) monkeyTest(params);
76            break;
77
78        case 3: name = "sharpSTest";
79            if (exec) sharpSTest();
80            break;
81
82        case 4: name = "goodSuffixTest";
83            if (exec) goodSuffixTest();
84            break;
85
86        case 5: name = "searchTime";
87            if (exec) searchTime();
88            break;
89#endif
90        default: name = "";
91            break; //needed to end loop
92    }
93}
94
95
96#if !UCONFIG_NO_BREAK_ITERATION
97
98#define PATH_BUFFER_SIZE 2048
99const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
100    UErrorCode status = U_ZERO_ERROR;
101    const char *testDataDirectory = IntlTest::getSourceTestData(status);
102
103    if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
104        errln("ERROR: getPath() failed - %s", u_errorName(status));
105        return NULL;
106    }
107
108    strcpy(buffer, testDataDirectory);
109    strcat(buffer, filename);
110    return buffer;
111}
112
113
114void SSearchTest::searchTest()
115{
116#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
117    UErrorCode status = U_ZERO_ERROR;
118    char path[PATH_BUFFER_SIZE];
119    const char *testFilePath = getPath(path, "ssearch.xml");
120
121    if (testFilePath == NULL) {
122        return; /* Couldn't get path: error message already output. */
123    }
124
125    LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
126    TEST_ASSERT_SUCCESS(status);
127    LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
128    TEST_ASSERT_SUCCESS(status);
129    if (U_FAILURE(status)) {
130        return;
131    }
132
133    const UnicodeString *debugTestCase = root->getAttribute("debug");
134    if (debugTestCase != NULL) {
135//       setenv("USEARCH_DEBUG", "1", 1);
136    }
137
138
139    const UXMLElement *testCase;
140    int32_t tc = 0;
141
142    while((testCase = root->nextChildElement(tc)) != NULL) {
143
144        if (testCase->getTagName().compare("test-case") != 0) {
145            errln("ssearch, unrecognized XML Element in test file");
146            continue;
147        }
148        const UnicodeString *id       = testCase->getAttribute("id");
149        *testId = 0;
150        if (id != NULL) {
151            id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
152        }
153
154        // If debugging test case has been specified and this is not it, skip to next.
155        if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
156            continue;
157        }
158        //
159        //  Get the requested collation strength.
160        //    Default is tertiary if the XML attribute is missing from the test case.
161        //
162        const UnicodeString *strength = testCase->getAttribute("strength");
163        UColAttributeValue collatorStrength = UCOL_PRIMARY;
164        if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
165        else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
166        else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
167        else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
168        else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
169        else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
170        else {
171            // Bogus value supplied for strength.  Shouldn't happen, even from
172            //  typos, if the  XML source has been validated.
173            //  This assert is a little deceiving in that strength can be
174            //   any of the allowed values, not just TERTIARY, but it will
175            //   do the job of getting the error output.
176            TEST_ASSERT(*strength=="TERTIARY")
177        }
178
179        //
180        // Get the collator normalization flag.  Default is UCOL_OFF.
181        //
182        UColAttributeValue normalize = UCOL_OFF;
183        const UnicodeString *norm = testCase->getAttribute("norm");
184        TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
185        if (norm!=NULL && *norm=="ON") {
186            normalize = UCOL_ON;
187        }
188
189        //
190        // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
191        //
192        UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
193        const UnicodeString *alt = testCase->getAttribute("alternate_handling");
194        TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
195        if (alt != NULL && *alt == "SHIFTED") {
196            alternateHandling = UCOL_SHIFTED;
197        }
198
199        const UnicodeString defLocale("en");
200        char  clocale[100];
201        const UnicodeString *locale   = testCase->getAttribute("locale");
202        if (locale == NULL || locale->length()==0) {
203            locale = &defLocale;
204        };
205        locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
206
207
208        UnicodeString  text;
209        UnicodeString  target;
210        UnicodeString  pattern;
211        int32_t        expectedMatchStart = -1;
212        int32_t        expectedMatchLimit = -1;
213        const UXMLElement  *n;
214        int32_t                nodeCount = 0;
215
216        n = testCase->getChildElement("pattern");
217        TEST_ASSERT(n != NULL);
218        if (n==NULL) {
219            continue;
220        }
221        text = n->getText(FALSE);
222        text = text.unescape();
223        pattern.append(text);
224        nodeCount++;
225
226        n = testCase->getChildElement("pre");
227        if (n!=NULL) {
228            text = n->getText(FALSE);
229            text = text.unescape();
230            target.append(text);
231            nodeCount++;
232        }
233
234        n = testCase->getChildElement("m");
235        if (n!=NULL) {
236            expectedMatchStart = target.length();
237            text = n->getText(FALSE);
238            text = text.unescape();
239            target.append(text);
240            expectedMatchLimit = target.length();
241            nodeCount++;
242        }
243
244        n = testCase->getChildElement("post");
245        if (n!=NULL) {
246            text = n->getText(FALSE);
247            text = text.unescape();
248            target.append(text);
249            nodeCount++;
250        }
251
252        //  Check that there weren't extra things in the XML
253        TEST_ASSERT(nodeCount == testCase->countChildren());
254
255        // Open a collator and StringSearch based on the parameters
256        //   obtained from the XML.
257        //
258        status = U_ZERO_ERROR;
259        LocalUCollatorPointer collator(ucol_open(clocale, &status));
260        ucol_setStrength(collator.getAlias(), collatorStrength);
261        ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
262        ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
263        LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
264                                                               target.getBuffer(), target.length(),
265                                                               collator.getAlias(),
266                                                               NULL,     // the break iterator
267                                                               &status));
268
269        TEST_ASSERT_SUCCESS(status);
270        if (U_FAILURE(status)) {
271            continue;
272        }
273
274        int32_t foundStart = 0;
275        int32_t foundLimit = 0;
276        UBool   foundMatch;
277
278        //
279        // Do the search, check the match result against the expected results.
280        //
281        foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
282        TEST_ASSERT_SUCCESS(status);
283        if ((foundMatch && expectedMatchStart<0) ||
284            (foundStart != expectedMatchStart)   ||
285            (foundLimit != expectedMatchLimit)) {
286                TEST_ASSERT(FALSE);   //  ouput generic error position
287                infoln("Found, expected match start = %d, %d \n"
288                       "Found, expected match limit = %d, %d",
289                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
290        }
291
292        // In case there are other matches...
293        // (should we only do this if the test case passed?)
294        while (foundMatch) {
295            expectedMatchStart = foundStart;
296            expectedMatchLimit = foundLimit;
297
298            foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
299        }
300
301        uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
302            target.getBuffer(), target.length(),
303            collator.getAlias(),
304            NULL,
305            &status));
306
307        //
308        // Do the backwards search, check the match result against the expected results.
309        //
310        foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
311        TEST_ASSERT_SUCCESS(status);
312        if ((foundMatch && expectedMatchStart<0) ||
313            (foundStart != expectedMatchStart)   ||
314            (foundLimit != expectedMatchLimit)) {
315                TEST_ASSERT(FALSE);   //  ouput generic error position
316                infoln("Found, expected backwards match start = %d, %d \n"
317                       "Found, expected backwards match limit = %d, %d",
318                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
319        }
320    }
321#endif
322}
323
324struct Order
325{
326    int32_t order;
327    int32_t lowOffset;
328    int32_t highOffset;
329};
330
331class OrderList
332{
333public:
334    OrderList();
335    OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
336    ~OrderList();
337
338    int32_t size(void) const;
339    void add(int32_t order, int32_t low, int32_t high);
340    const Order *get(int32_t index) const;
341    int32_t getLowOffset(int32_t index) const;
342    int32_t getHighOffset(int32_t index) const;
343    int32_t getOrder(int32_t index) const;
344    void reverse(void);
345    UBool compare(const OrderList &other) const;
346    UBool matchesAt(int32_t offset, const OrderList &other) const;
347
348private:
349    Order *list;
350    int32_t listMax;
351    int32_t listSize;
352};
353
354OrderList::OrderList()
355  : list(NULL),  listMax(16), listSize(0)
356{
357    list = new Order[listMax];
358}
359
360OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
361    : list(NULL), listMax(16), listSize(0)
362{
363    UErrorCode status = U_ZERO_ERROR;
364    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
365    uint32_t strengthMask = 0;
366    int32_t order, low, high;
367
368    switch (ucol_getStrength(coll))
369    {
370    default:
371        strengthMask |= UCOL_TERTIARYORDERMASK;
372        U_FALLTHROUGH;
373    case UCOL_SECONDARY:
374        strengthMask |= UCOL_SECONDARYORDERMASK;
375        U_FALLTHROUGH;
376    case UCOL_PRIMARY:
377        strengthMask |= UCOL_PRIMARYORDERMASK;
378    }
379
380    list = new Order[listMax];
381
382    ucol_setOffset(elems, stringOffset, &status);
383
384    do {
385        low   = ucol_getOffset(elems);
386        order = ucol_next(elems, &status);
387        high  = ucol_getOffset(elems);
388
389        if (order != UCOL_NULLORDER) {
390            order &= strengthMask;
391        }
392
393        if (order != UCOL_IGNORABLE) {
394            add(order, low, high);
395        }
396    } while (order != UCOL_NULLORDER);
397
398    ucol_closeElements(elems);
399}
400
401OrderList::~OrderList()
402{
403    delete[] list;
404}
405
406void OrderList::add(int32_t order, int32_t low, int32_t high)
407{
408    if (listSize >= listMax) {
409        listMax *= 2;
410
411        Order *newList = new Order[listMax];
412
413        uprv_memcpy(newList, list, listSize * sizeof(Order));
414        delete[] list;
415        list = newList;
416    }
417
418    list[listSize].order      = order;
419    list[listSize].lowOffset  = low;
420    list[listSize].highOffset = high;
421
422    listSize += 1;
423}
424
425const Order *OrderList::get(int32_t index) const
426{
427    if (index >= listSize) {
428        return NULL;
429    }
430
431    return &list[index];
432}
433
434int32_t OrderList::getLowOffset(int32_t index) const
435{
436    const Order *order = get(index);
437
438    if (order != NULL) {
439        return order->lowOffset;
440    }
441
442    return -1;
443}
444
445int32_t OrderList::getHighOffset(int32_t index) const
446{
447    const Order *order = get(index);
448
449    if (order != NULL) {
450        return order->highOffset;
451    }
452
453    return -1;
454}
455
456int32_t OrderList::getOrder(int32_t index) const
457{
458    const Order *order = get(index);
459
460    if (order != NULL) {
461        return order->order;
462    }
463
464    return UCOL_NULLORDER;
465}
466
467int32_t OrderList::size() const
468{
469    return listSize;
470}
471
472void OrderList::reverse()
473{
474    for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
475        Order swap = list[b];
476
477        list[b] = list[f];
478        list[f] = swap;
479    }
480}
481
482UBool OrderList::compare(const OrderList &other) const
483{
484    if (listSize != other.listSize) {
485        return FALSE;
486    }
487
488    for(int32_t i = 0; i < listSize; i += 1) {
489        if (list[i].order  != other.list[i].order ||
490            list[i].lowOffset != other.list[i].lowOffset ||
491            list[i].highOffset != other.list[i].highOffset) {
492                return FALSE;
493        }
494    }
495
496    return TRUE;
497}
498
499UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
500{
501    // NOTE: sizes include the NULLORDER, which we don't want to compare.
502    int32_t otherSize = other.size() - 1;
503
504    if (listSize - 1 - offset < otherSize) {
505        return FALSE;
506    }
507
508    for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
509        if (getOrder(i) != other.getOrder(j)) {
510            return FALSE;
511        }
512    }
513
514    return TRUE;
515}
516
517static char *printOffsets(char *buffer, OrderList &list)
518{
519    int32_t size = list.size();
520    char *s = buffer;
521
522    for(int32_t i = 0; i < size; i += 1) {
523        const Order *order = list.get(i);
524
525        if (i != 0) {
526            s += sprintf(s, ", ");
527        }
528
529        s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
530    }
531
532    return buffer;
533}
534
535static char *printOrders(char *buffer, OrderList &list)
536{
537    int32_t size = list.size();
538    char *s = buffer;
539
540    for(int32_t i = 0; i < size; i += 1) {
541        const Order *order = list.get(i);
542
543        if (i != 0) {
544            s += sprintf(s, ", ");
545        }
546
547        s += sprintf(s, "%8.8X", order->order);
548    }
549
550    return buffer;
551}
552
553void SSearchTest::offsetTest()
554{
555    const char *test[] = {
556        // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
557        // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
558        "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
559
560        "\\ua191\\u16ef\\u2036\\u017a",
561
562#if 0
563        // This results in a complex interaction between contraction,
564        // expansion and normalization that confuses the backwards offset fixups.
565        "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
566#endif
567
568        "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
569        "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
570
571        "\\u02FE\\u02FF"
572        "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
573        "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
574        "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
575        "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
576        "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
577
578        "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
579        "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
580        "a\\u02FF\\u0316\\u0301",
581        "a\\u0430\\u0301\\u0316",
582        "a\\u0430\\u0316\\u0301",
583        "abc\\u0E41\\u0301\\u0316",
584        "abc\\u0E41\\u0316\\u0301",
585        "\\u0E41\\u0301\\u0316",
586        "\\u0E41\\u0316\\u0301",
587        "a\\u0301\\u0316",
588        "a\\u0316\\u0301",
589        "\\uAC52\\uAC53",
590        "\\u34CA\\u34CB",
591        "\\u11ED\\u11EE",
592        "\\u30C3\\u30D0",
593        "p\\u00E9ch\\u00E9",
594        "a\\u0301\\u0325",
595        "a\\u0300\\u0325",
596        "a\\u0325\\u0300",
597        "A\\u0323\\u0300B",
598        "A\\u0300\\u0323B",
599        "A\\u0301\\u0323B",
600        "A\\u0302\\u0301\\u0323B",
601        "abc",
602        "ab\\u0300c",
603        "ab\\u0300\\u0323c",
604        " \\uD800\\uDC00\\uDC00",
605        "a\\uD800\\uDC00\\uDC00",
606        "A\\u0301\\u0301",
607        "A\\u0301\\u0323",
608        "A\\u0301\\u0323B",
609        "B\\u0301\\u0323C",
610        "A\\u0300\\u0323B",
611        "\\u0301A\\u0301\\u0301",
612        "abcd\\r\\u0301",
613        "p\\u00EAche",
614        "pe\\u0302che",
615    };
616
617    int32_t testCount = UPRV_LENGTHOF(test);
618    UErrorCode status = U_ZERO_ERROR;
619    RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
620    if (U_FAILURE(status)) {
621        errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
622        return;
623    }
624    char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
625                        // We could allocate one that's the right size by (CE_count * 10) + 2
626                        // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
627
628    col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
629
630    for(int32_t i = 0; i < testCount; i += 1) {
631        UnicodeString ts = CharsToUnicodeString(test[i]);
632        CollationElementIterator *iter = col->createCollationElementIterator(ts);
633        OrderList forwardList;
634        OrderList backwardList;
635        int32_t order, low, high;
636
637        do {
638            low   = iter->getOffset();
639            order = iter->next(status);
640            high  = iter->getOffset();
641
642            forwardList.add(order, low, high);
643        } while (order != CollationElementIterator::NULLORDER);
644
645        iter->reset();
646        iter->setOffset(ts.length(), status);
647
648        backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
649
650        do {
651            high  = iter->getOffset();
652            order = iter->previous(status);
653            low   = iter->getOffset();
654
655            if (order == CollationElementIterator::NULLORDER) {
656                break;
657            }
658
659            backwardList.add(order, low, high);
660        } while (TRUE);
661
662        backwardList.reverse();
663
664        if (forwardList.compare(backwardList)) {
665            logln("Works with \"%s\"", test[i]);
666            logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
667//          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
668
669            logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
670//          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
671
672            logln();
673        } else {
674            errln("Fails with \"%s\"", test[i]);
675            infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
676            infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
677
678            infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
679            infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
680
681            infoln();
682        }
683        delete iter;
684    }
685    delete col;
686}
687
688#if 0
689static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
690{
691    for(int32_t i = 0; i < string.length(); i += 1) {
692        UChar32 ch = string.char32At(i);
693
694        if (ch >= 0x0020 && ch <= 0x007F) {
695            if (ch == 0x005C) {
696                buffer.append("\\\\");
697            } else {
698                buffer.append(ch);
699            }
700        } else {
701            char cbuffer[12];
702
703            if (ch <= 0xFFFFL) {
704                sprintf(cbuffer, "\\u%4.4X", ch);
705            } else {
706                sprintf(cbuffer, "\\U%8.8X", ch);
707            }
708
709            buffer.append(cbuffer);
710        }
711
712        if (ch >= 0x10000L) {
713            i += 1;
714        }
715    }
716
717    return buffer;
718}
719#endif
720
721void SSearchTest::sharpSTest()
722{
723    UErrorCode status = U_ZERO_ERROR;
724    UCollator *coll = NULL;
725    UnicodeString lp  = "fuss";
726    UnicodeString sp = "fu\\u00DF";
727    UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
728                                "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
729                                "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
730    int32_t start = -1, end = -1;
731
732    coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
733    TEST_ASSERT_SUCCESS(status);
734
735    UnicodeString lpUnescaped = lp.unescape();
736    UnicodeString spUnescaped = sp.unescape();
737
738    LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
739                                                           lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
740                                                           coll,
741                                                           NULL,     // the break iterator
742                                                           &status));
743
744    LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
745                                                           spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
746                                                           coll,
747                                                           NULL,     // the break iterator
748                                                           &status));
749    TEST_ASSERT_SUCCESS(status);
750
751    for (uint32_t t = 0; t < UPRV_LENGTHOF(targets); t += 1) {
752        UBool bFound;
753        UnicodeString target = targets[t].unescape();
754
755        start = end = -1;
756        usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
757        bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
758        TEST_ASSERT_SUCCESS(status);
759        if (bFound) {
760            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
761        } else {
762            dataerrln("Test %d: did not find long pattern.", t);
763        }
764
765        usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
766        bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
767        TEST_ASSERT_SUCCESS(status);
768        if (bFound) {
769            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
770        } else {
771            dataerrln("Test %d: did not find long pattern.", t);
772        }
773    }
774
775    ucol_close(coll);
776}
777
778void SSearchTest::goodSuffixTest()
779{
780    UErrorCode status = U_ZERO_ERROR;
781    UCollator *coll = NULL;
782    UnicodeString pat = /*"gcagagag"*/ "fxeld";
783    UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
784    int32_t start = -1, end = -1;
785    UBool bFound;
786
787    coll = ucol_open(NULL, &status);
788    TEST_ASSERT_SUCCESS(status);
789
790    LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
791                                                          target.getBuffer(), target.length(),
792                                                          coll,
793                                                          NULL,     // the break iterator
794                                                          &status));
795    TEST_ASSERT_SUCCESS(status);
796
797    bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
798    TEST_ASSERT_SUCCESS(status);
799    if (bFound) {
800        logln("Found pattern at [%d, %d].", start, end);
801    } else {
802        dataerrln("Did not find pattern.");
803    }
804
805    ucol_close(coll);
806}
807
808//
809//  searchTime()    A quick and dirty performance test for string search.
810//                  Probably  doesn't really belong as part of intltest, but it
811//                  does check that the search succeeds, and gets the right result,
812//                  so it serves as a functionality test also.
813//
814//                  To run as a perf test, up the loop count, select by commenting
815//                  and uncommenting in the code the operation to be measured,
816//                  rebuild, and measure the running time of this test alone.
817//
818//                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
819//
820void SSearchTest::searchTime() {
821    static const char *longishText =
822"Whylom, as olde stories tellen us,\n"
823"Ther was a duk that highte Theseus:\n"
824"Of Athenes he was lord and governour,\n"
825"And in his tyme swich a conquerour,\n"
826"That gretter was ther noon under the sonne.\n"
827"Ful many a riche contree hadde he wonne;\n"
828"What with his wisdom and his chivalrye,\n"
829"He conquered al the regne of Femenye,\n"
830"That whylom was y-cleped Scithia;\n"
831"And weddede the quene Ipolita,\n"
832"And broghte hir hoom with him in his contree\n"
833"With muchel glorie and greet solempnitee,\n"
834"And eek hir yonge suster Emelye.\n"
835"And thus with victorie and with melodye\n"
836"Lete I this noble duk to Athenes ryde,\n"
837"And al his hoost, in armes, him bisyde.\n"
838"And certes, if it nere to long to here,\n"
839"I wolde han told yow fully the manere,\n"
840"How wonnen was the regne of Femenye\n"
841"By Theseus, and by his chivalrye;\n"
842"And of the grete bataille for the nones\n"
843"Bitwixen Athen's and Amazones;\n"
844"And how asseged was Ipolita,\n"
845"The faire hardy quene of Scithia;\n"
846"And of the feste that was at hir weddinge,\n"
847"And of the tempest at hir hoom-cominge;\n"
848"But al that thing I moot as now forbere.\n"
849"I have, God woot, a large feeld to ere,\n"
850"And wayke been the oxen in my plough.\n"
851"The remenant of the tale is long y-nough.\n"
852"I wol nat letten eek noon of this route;\n"
853"Lat every felawe telle his tale aboute,\n"
854"And lat see now who shal the soper winne;\n"
855"And ther I lefte, I wol ageyn biginne.\n"
856"This duk, of whom I make mencioun,\n"
857"When he was come almost unto the toun,\n"
858"In al his wele and in his moste pryde,\n"
859"He was war, as he caste his eye asyde,\n"
860"Wher that ther kneled in the hye weye\n"
861"A companye of ladies, tweye and tweye,\n"
862"Ech after other, clad in clothes blake; \n"
863"But swich a cry and swich a wo they make,\n"
864"That in this world nis creature livinge,\n"
865"That herde swich another weymentinge;\n"
866"And of this cry they nolde never stenten,\n"
867"Til they the reynes of his brydel henten.\n"
868"'What folk ben ye, that at myn hoomcominge\n"
869"Perturben so my feste with cryinge'?\n"
870"Quod Theseus, 'have ye so greet envye\n"
871"Of myn honour, that thus compleyne and crye? \n"
872"Or who hath yow misboden, or offended?\n"
873"And telleth me if it may been amended;\n"
874"And why that ye ben clothed thus in blak'?\n"
875"The eldest lady of hem alle spak,\n"
876"When she hadde swowned with a deedly chere,\n"
877"That it was routhe for to seen and here,\n"
878"And seyde: 'Lord, to whom Fortune hath yiven\n"
879"Victorie, and as a conquerour to liven,\n"
880"Noght greveth us your glorie and your honour;\n"
881"But we biseken mercy and socour.\n"
882"Have mercy on our wo and our distresse.\n"
883"Som drope of pitee, thurgh thy gentilesse,\n"
884"Up-on us wrecched wommen lat thou falle.\n"
885"For certes, lord, ther nis noon of us alle,\n"
886"That she nath been a duchesse or a quene;\n"
887"Now be we caitifs, as it is wel sene:\n"
888"Thanked be Fortune, and hir false wheel,\n"
889"That noon estat assureth to be weel.\n"
890"And certes, lord, t'abyden your presence,\n"
891"Here in the temple of the goddesse Clemence\n"
892"We han ben waytinge al this fourtenight;\n"
893"Now help us, lord, sith it is in thy might.\n"
894"I wrecche, which that wepe and waille thus,\n"
895"Was whylom wyf to king Capaneus,\n"
896"That starf at Thebes, cursed be that day!\n"
897"And alle we, that been in this array,\n"
898"And maken al this lamentacioun,\n"
899"We losten alle our housbondes at that toun,\n"
900"Whyl that the sege ther-aboute lay.\n"
901"And yet now th'olde Creon, weylaway!\n"
902"The lord is now of Thebes the citee, \n"
903"Fulfild of ire and of iniquitee,\n"
904"He, for despyt, and for his tirannye,\n"
905"To do the dede bodyes vileinye,\n"
906"Of alle our lordes, whiche that ben slawe,\n"
907"Hath alle the bodyes on an heep y-drawe,\n"
908"And wol nat suffren hem, by noon assent,\n"
909"Neither to been y-buried nor y-brent,\n"
910"But maketh houndes ete hem in despyt. zet'\n";
911
912const char *cPattern = "maketh houndes ete hem";
913//const char *cPattern = "Whylom";
914//const char *cPattern = "zet";
915    const char *testId = "searchTime()";   // for error macros.
916    UnicodeString target = longishText;
917    UErrorCode status = U_ZERO_ERROR;
918
919
920    LocalUCollatorPointer collator(ucol_open("en", &status));
921    //ucol_setStrength(collator.getAlias(), collatorStrength);
922    //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
923    UnicodeString uPattern = cPattern;
924    LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
925                                                           target.getBuffer(), target.length(),
926                                                           collator.getAlias(),
927                                                           NULL,     // the break iterator
928                                                           &status));
929    TEST_ASSERT_SUCCESS(status);
930
931//  int32_t foundStart;
932//  int32_t foundEnd;
933    UBool   found;
934
935    // Find the match position usgin strstr
936    const char *pm = strstr(longishText, cPattern);
937    TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
938    int32_t  refMatchPos = (int32_t)(pm - longishText);
939    int32_t  icuMatchPos;
940    int32_t  icuMatchEnd;
941    usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
942    TEST_ASSERT_SUCCESS(status);
943    TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
944
945    int32_t i;
946    // int32_t j=0;
947
948    // Try loopcounts around 100000 to some millions, depending on the operation,
949    //   to get runtimes of at least several seconds.
950    for (i=0; i<10000; i++) {
951        found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
952        (void)found;   // Suppress set but not used warning.
953        //TEST_ASSERT_SUCCESS(status);
954        //TEST_ASSERT(found);
955
956        // usearch_setOffset(uss.getAlias(), 0, &status);
957        // icuMatchPos = usearch_next(uss.getAlias(), &status);
958
959         // The i+j stuff is to confuse the optimizer and get it to actually leave the
960         //   call to strstr in place.
961         //pm = strstr(longishText+j, cPattern);
962         //j = (j + i)%5;
963    }
964
965    //printf("%ld, %d\n", pm-longishText, j);
966}
967
968//----------------------------------------------------------------------------------------
969//
970//   Random Numbers.  Similar to standard lib rand() and srand()
971//                    Not using library to
972//                      1.  Get same results on all platforms.
973//                      2.  Get access to current seed, to more easily reproduce failures.
974//
975//---------------------------------------------------------------------------------------
976static uint32_t m_seed = 1;
977
978static uint32_t m_rand()
979{
980    m_seed = m_seed * 1103515245 + 12345;
981    return (uint32_t)(m_seed/65536) % 32768;
982}
983
984class Monkey
985{
986public:
987    virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
988
989protected:
990    Monkey();
991    virtual ~Monkey();
992};
993
994Monkey::Monkey()
995{
996    // ook?
997}
998
999Monkey::~Monkey()
1000{
1001    // ook?
1002}
1003
1004class SetMonkey : public Monkey
1005{
1006public:
1007    SetMonkey(const USet *theSet);
1008    ~SetMonkey();
1009
1010    virtual void append(UnicodeString &test, UnicodeString &alternate);
1011
1012private:
1013    const USet *set;
1014};
1015
1016SetMonkey::SetMonkey(const USet *theSet)
1017    : Monkey(), set(theSet)
1018{
1019    // ook?
1020}
1021
1022SetMonkey::~SetMonkey()
1023{
1024    //ook...
1025}
1026
1027void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1028{
1029    int32_t size = uset_size(set);
1030    int32_t index = m_rand() % size;
1031    UChar32 ch = uset_charAt(set, index);
1032    UnicodeString str(ch);
1033
1034    test.append(str);
1035    alternate.append(str); // flip case, or some junk?
1036}
1037
1038class StringSetMonkey : public Monkey
1039{
1040public:
1041    StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1042    ~StringSetMonkey();
1043
1044    void append(UnicodeString &testCase, UnicodeString &alternate);
1045
1046private:
1047    UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1048
1049    const USet *set;
1050    UCollator  *coll;
1051    CollData   *collData;
1052};
1053
1054StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1055: Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1056{
1057    // ook.
1058}
1059
1060StringSetMonkey::~StringSetMonkey()
1061{
1062    // ook?
1063}
1064
1065void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1066{
1067    int32_t itemCount = uset_getItemCount(set), len = 0;
1068    int32_t index = m_rand() % itemCount;
1069    UChar32 rangeStart = 0, rangeEnd = 0;
1070    UChar buffer[16];
1071    UErrorCode err = U_ZERO_ERROR;
1072
1073    len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1074
1075    if (len == 0) {
1076        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1077        UChar32 ch = rangeStart + offset;
1078        UnicodeString str(ch);
1079
1080        testCase.append(str);
1081        generateAlternative(str, alternate);
1082    } else if (len > 0) {
1083        // should check that len < 16...
1084        UnicodeString str(buffer, len);
1085
1086        testCase.append(str);
1087        generateAlternative(str, alternate);
1088    } else {
1089        // shouldn't happen...
1090    }
1091}
1092
1093UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1094{
1095    // find out shortest string for the longest sequence of ces.
1096    // needs to be refined to use dynamic programming, but will be roughly right
1097    UErrorCode status = U_ZERO_ERROR;
1098    CEList ceList(coll, testCase, status);
1099    UnicodeString alt;
1100    int32_t offset = 0;
1101
1102    if (ceList.size() == 0) {
1103        return alternate.append(testCase);
1104    }
1105
1106    while (offset < ceList.size()) {
1107        int32_t ce = ceList.get(offset);
1108        const StringList *strings = collData->getStringList(ce);
1109
1110        if (strings == NULL) {
1111            return alternate.append(testCase);
1112        }
1113
1114        int32_t stringCount = strings->size();
1115        int32_t tries = 0;
1116
1117        // find random string that generates the same CEList
1118        const CEList *ceList2 = NULL;
1119        const UnicodeString *string = NULL;
1120              UBool matches = FALSE;
1121
1122        do {
1123            int32_t s = m_rand() % stringCount;
1124
1125            if (tries++ > stringCount) {
1126                alternate.append(testCase);
1127                return alternate;
1128            }
1129
1130            string = strings->get(s);
1131            ceList2 = collData->getCEList(string);
1132            matches = ceList.matchesAt(offset, ceList2);
1133
1134            if (! matches) {
1135                collData->freeCEList((CEList *) ceList2);
1136            }
1137        } while (! matches);
1138
1139        alt.append(*string);
1140        offset += ceList2->size();
1141        collData->freeCEList(ceList2);
1142    }
1143
1144    const CEList altCEs(coll, alt, status);
1145
1146    if (ceList.matchesAt(0, &altCEs)) {
1147        return alternate.append(alt);
1148    }
1149
1150    return alternate.append(testCase);
1151}
1152
1153static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1154{
1155    int32_t pieces = (m_rand() % 4) + 1;
1156    UErrorCode status = U_ZERO_ERROR;
1157    UBool matches;
1158
1159    do {
1160        testCase.remove();
1161        alternate.remove();
1162        monkeys[0]->append(testCase, alternate);
1163
1164        for(int32_t piece = 0; piece < pieces; piece += 1) {
1165            int32_t monkey = m_rand() % monkeyCount;
1166
1167            monkeys[monkey]->append(testCase, alternate);
1168        }
1169
1170        const CEList ceTest(coll, testCase, status);
1171        const CEList ceAlt(coll, alternate, status);
1172
1173        matches = ceTest.matchesAt(0, &ceAlt);
1174    } while (! matches);
1175}
1176
1177static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1178{
1179    UErrorCode      status = U_ZERO_ERROR;
1180    OrderList       targetOrders(coll, target, offset);
1181    OrderList       patternOrders(coll, pattern);
1182    int32_t         targetSize  = targetOrders.size() - 1;
1183    int32_t         patternSize = patternOrders.size() - 1;
1184    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1185                                                  target.getBuffer(), target.length(), &status);
1186
1187    if (patternSize == 0) {
1188        // Searching for an empty pattern always fails
1189        matchStart = matchEnd = -1;
1190        ubrk_close(charBreakIterator);
1191        return FALSE;
1192    }
1193
1194    matchStart = matchEnd = -1;
1195
1196    for(int32_t i = 0; i < targetSize; i += 1) {
1197        if (targetOrders.matchesAt(i, patternOrders)) {
1198            int32_t start    = targetOrders.getLowOffset(i);
1199            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1200            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1201
1202            // if the low and high offsets of the first CE in
1203            // the match are the same, it means that the match
1204            // starts in the middle of an expansion - all but
1205            // the first CE of the expansion will have the offset
1206            // of the following character.
1207            if (start == targetOrders.getHighOffset(i)) {
1208                continue;
1209            }
1210
1211            // Make sure match starts on a grapheme boundary
1212            if (! ubrk_isBoundary(charBreakIterator, start)) {
1213                continue;
1214            }
1215
1216            // If the low and high offsets of the CE after the match
1217            // are the same, it means that the match ends in the middle
1218            // of an expansion sequence.
1219            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1220                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1221                continue;
1222            }
1223
1224            int32_t mend = maxLimit;
1225
1226            // Find the first grapheme break after the character index
1227            // of the last CE in the match. If it's after character index
1228            // that's after the last CE in the match, use that index
1229            // as the end of the match.
1230            if (minLimit < maxLimit) {
1231                // When the last CE's low index is same with its high index, the CE is likely
1232                // a part of expansion. In this case, the index is located just after the
1233                // character corresponding to the CEs compared above. If the index is right
1234                // at the break boundary, move the position to the next boundary will result
1235                // incorrect match length when there are ignorable characters exist between
1236                // the position and the next character produces CE(s). See ticket#8482.
1237                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1238                    mend = minLimit;
1239                } else {
1240                    int32_t nba = ubrk_following(charBreakIterator, minLimit);
1241
1242                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1243                        mend = nba;
1244                    }
1245                }
1246            }
1247
1248            if (mend > maxLimit) {
1249                continue;
1250            }
1251
1252            if (! ubrk_isBoundary(charBreakIterator, mend)) {
1253                continue;
1254            }
1255
1256            matchStart = start;
1257            matchEnd   = mend;
1258
1259            ubrk_close(charBreakIterator);
1260            return TRUE;
1261        }
1262    }
1263
1264    ubrk_close(charBreakIterator);
1265    return FALSE;
1266}
1267
1268#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1269static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1270    int32_t val = defaultVal;
1271
1272    name.append(" *= *(-?\\d+)");
1273
1274    UErrorCode status = U_ZERO_ERROR;
1275    RegexMatcher m(name, params, 0, status);
1276
1277    if (m.find()) {
1278        // The param exists.  Convert the string to an int.
1279        char valString[100];
1280        int32_t paramLength = m.end(1, status) - m.start(1, status);
1281
1282        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1283            paramLength = (int32_t)(sizeof(valString)-2);
1284        }
1285
1286        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1287        val = uprv_strtol(valString,  NULL, 10);
1288
1289        // Delete this parameter from the params string.
1290        m.reset();
1291        params = m.replaceFirst("", status);
1292    }
1293
1294  //U_ASSERT(U_SUCCESS(status));
1295    if (! U_SUCCESS(status)) {
1296        val = defaultVal;
1297    }
1298
1299    return val;
1300}
1301#endif
1302
1303#if !UCONFIG_NO_COLLATION
1304int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1305                                    const char *name, const char *strength, uint32_t seed)
1306{
1307    UErrorCode status = U_ZERO_ERROR;
1308    int32_t actualStart = -1, actualEnd = -1;
1309  //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1310    int32_t expectedStart = -1, expectedEnd = -1;
1311    int32_t notFoundCount = 0;
1312    LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1313                                                           testCase.getBuffer(), testCase.length(),
1314                                                           coll,
1315                                                           NULL,     // the break iterator
1316                                                           &status));
1317
1318    // **** TODO: find *all* matches, not just first one ****
1319    simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1320
1321    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1322
1323    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1324        errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1325              "    strength=%s seed=%d",
1326              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1327    }
1328
1329    if (expectedStart == -1 && actualStart == -1) {
1330        notFoundCount += 1;
1331    }
1332
1333    // **** TODO: find *all* matches, not just first one ****
1334    simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1335
1336    usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1337
1338    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1339
1340    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1341        errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1342              "    strength=%s seed=%d",
1343              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1344    }
1345
1346    if (expectedStart == -1 && actualStart == -1) {
1347        notFoundCount += 1;
1348    }
1349
1350    return notFoundCount;
1351}
1352#endif
1353
1354void SSearchTest::monkeyTest(char *params)
1355{
1356    // ook!
1357    UErrorCode status = U_ZERO_ERROR;
1358  //UCollator *coll = ucol_open(NULL, &status);
1359    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1360
1361    if (U_FAILURE(status)) {
1362        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1363        return;
1364    }
1365
1366    CollData  *monkeyData = new CollData(coll, status);
1367
1368    USet *expansions   = uset_openEmpty();
1369    USet *contractions = uset_openEmpty();
1370
1371    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1372
1373    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1374    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1375    USet *letters = uset_openPattern(letter_pattern, 39, &status);
1376    SetMonkey letterMonkey(letters);
1377    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1378    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1379    UnicodeString testCase;
1380    UnicodeString alternate;
1381    UnicodeString pattern, altPattern;
1382    UnicodeString prefix, altPrefix;
1383    UnicodeString suffix, altSuffix;
1384
1385    Monkey *monkeys[] = {
1386        &letterMonkey,
1387        &contractionMonkey,
1388        &expansionMonkey,
1389        &contractionMonkey,
1390        &expansionMonkey,
1391        &contractionMonkey,
1392        &expansionMonkey,
1393        &contractionMonkey,
1394        &expansionMonkey};
1395    int32_t monkeyCount = UPRV_LENGTHOF(monkeys);
1396    // int32_t nonMatchCount = 0;
1397
1398    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1399    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1400    int32_t strengthCount = UPRV_LENGTHOF(strengths);
1401    int32_t loopCount = quick? 1000 : 10000;
1402    int32_t firstStrength = 0;
1403    int32_t lastStrength  = strengthCount - 1; //*/ 0;
1404
1405    if (params != NULL) {
1406#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1407        UnicodeString p(params);
1408
1409        loopCount = getIntParam("loop", p, loopCount);
1410        m_seed    = getIntParam("seed", p, m_seed);
1411
1412        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1413        if (m.find()) {
1414            UnicodeString breakType = m.group(1, status);
1415
1416            for (int32_t s = 0; s < strengthCount; s += 1) {
1417                if (breakType == strengthNames[s]) {
1418                    firstStrength = lastStrength = s;
1419                    break;
1420                }
1421            }
1422
1423            m.reset();
1424            p = m.replaceFirst("", status);
1425        }
1426
1427        if (RegexMatcher("\\S", p, 0, status).find()) {
1428            // Each option is stripped out of the option string as it is processed.
1429            // All options have been checked.  The option string should have been completely emptied..
1430            char buf[100];
1431            p.extract(buf, sizeof(buf), NULL, status);
1432            buf[sizeof(buf)-1] = 0;
1433            errln("Unrecognized or extra parameter:  %s\n", buf);
1434            return;
1435        }
1436#else
1437        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1438#endif
1439    }
1440
1441    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1442        int32_t notFoundCount = 0;
1443
1444        logln("Setting strength to %s.", strengthNames[s]);
1445        ucol_setStrength(coll, strengths[s]);
1446
1447        // TODO: try alternate prefix and suffix too?
1448        // TODO: alternates are only equal at primary strength. Is this OK?
1449        for(int32_t t = 0; t < loopCount; t += 1) {
1450            uint32_t seed = m_seed;
1451            // int32_t  nmc = 0;
1452
1453            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1454            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1455            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1456
1457            // pattern
1458            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1459
1460            testCase.remove();
1461            testCase.append(prefix);
1462            testCase.append(/*alt*/pattern);
1463
1464            // prefix + pattern
1465            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1466
1467            testCase.append(suffix);
1468
1469            // prefix + pattern + suffix
1470            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1471
1472            testCase.remove();
1473            testCase.append(pattern);
1474            testCase.append(suffix);
1475
1476            // pattern + suffix
1477            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1478        }
1479
1480       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1481    }
1482
1483    uset_close(contractions);
1484    uset_close(expansions);
1485    uset_close(letters);
1486    delete monkeyData;
1487
1488    ucol_close(coll);
1489}
1490
1491#endif
1492
1493#endif
1494