1/*
2 **********************************************************************
3 *   Copyright (C) 2005-2014, International Business Machines
4 *   Corporation and others.  All Rights Reserved.
5 **********************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_COLLATION
11
12#include "cmemory.h"
13#include "cstring.h"
14#include "usrchimp.h"
15
16#include "unicode/coll.h"
17#include "unicode/tblcoll.h"
18#include "unicode/usearch.h"
19#include "unicode/uset.h"
20#include "unicode/ustring.h"
21
22#include "unicode/coleitr.h"
23#include "unicode/regex.h"        // TODO: make conditional on regexp being built.
24
25#include "colldata.h"
26#include "ssearch.h"
27#include "xmlparser.h"
28
29#include <stdio.h>  // for sprintf
30
// Identifier of the test case currently being run.  TEST_ASSERT and
// TEST_ASSERT_SUCCESS print it so that a failure can be traced back to its test case.
char testId[100];

// Report a failure (file, line, current test ID) through errln() when x is false.
// Execution continues after the report.
// NOTE: the expansion is a brace-enclosed block, not a do/while(0) statement, because
//       existing call sites invoke the macro without a trailing semicolon.
#define TEST_ASSERT(x) {if (!(x)) { \
    errln("Failure in file %s, line %d, test ID = \"%s\"", __FILE__, __LINE__, testId);}}

// Report a failure with the message m through dataerrln() when x is false, then
// return from the enclosing function (which therefore has to return void).
#define TEST_ASSERT_M(x, m) {if (!(x)) { \
    dataerrln("Failure in file %s, line %d.   \"%s\"", __FILE__, __LINE__, m);return;}}

// Report a failed UErrorCode (with its symbolic name) through dataerrln().
// Execution continues; callers that cannot go on must test the status themselves.
#define TEST_ASSERT_SUCCESS(errcode) {if (U_FAILURE(errcode)) { \
    dataerrln("Failure in file %s, line %d, test ID \"%s\", status = \"%s\"", \
          __FILE__, __LINE__, testId, u_errorName(errcode));}}

// Number of elements in a true array (not a pointer).  Fully parenthesised so that the
// result can be used inside a larger expression without precedence surprises.
#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))

// Allocate / release an uninitialised array of `count` objects of `type` with
// uprv_malloc() / uprv_free().  No constructors or destructors are run.
#define NEW_ARRAY(type, count) ((type *) uprv_malloc((count) * sizeof(type)))
#define DELETE_ARRAY(array) (uprv_free((void *) (array)))
46
47//---------------------------------------------------------------------------
48//
49//  Test class boilerplate
50//
51//---------------------------------------------------------------------------
52SSearchTest::SSearchTest()
53{
54}
55
56SSearchTest::~SSearchTest()
57{
58}
59
60void SSearchTest::runIndexedTest( int32_t index, UBool exec, const char* &name, char *params )
61{
62    if (exec) logln("TestSuite SSearchTest: ");
63    switch (index) {
64#if !UCONFIG_NO_BREAK_ITERATION
65       case 0: name = "searchTest";
66            if (exec) searchTest();
67            break;
68
69        case 1: name = "offsetTest";
70            if (exec) offsetTest();
71            break;
72
73        case 2: name = "monkeyTest";
74            if (exec) monkeyTest(params);
75            break;
76
77        case 3: name = "sharpSTest";
78            if (exec) sharpSTest();
79            break;
80
81        case 4: name = "goodSuffixTest";
82            if (exec) goodSuffixTest();
83            break;
84
85        case 5: name = "searchTime";
86            if (exec) searchTime();
87            break;
88#endif
89        default: name = "";
90            break; //needed to end loop
91    }
92}
93
94
95#if !UCONFIG_NO_BREAK_ITERATION
96
97#define PATH_BUFFER_SIZE 2048
98const char *SSearchTest::getPath(char buffer[2048], const char *filename) {
99    UErrorCode status = U_ZERO_ERROR;
100    const char *testDataDirectory = IntlTest::getSourceTestData(status);
101
102    if (U_FAILURE(status) || strlen(testDataDirectory) + strlen(filename) + 1 >= PATH_BUFFER_SIZE) {
103        errln("ERROR: getPath() failed - %s", u_errorName(status));
104        return NULL;
105    }
106
107    strcpy(buffer, testDataDirectory);
108    strcat(buffer, filename);
109    return buffer;
110}
111
112
113void SSearchTest::searchTest()
114{
115#if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FILE_IO
116    UErrorCode status = U_ZERO_ERROR;
117    char path[PATH_BUFFER_SIZE];
118    const char *testFilePath = getPath(path, "ssearch.xml");
119
120    if (testFilePath == NULL) {
121        return; /* Couldn't get path: error message already output. */
122    }
123
124    LocalPointer<UXMLParser> parser(UXMLParser::createParser(status));
125    TEST_ASSERT_SUCCESS(status);
126    LocalPointer<UXMLElement> root(parser->parseFile(testFilePath, status));
127    TEST_ASSERT_SUCCESS(status);
128    if (U_FAILURE(status)) {
129        return;
130    }
131
132    const UnicodeString *debugTestCase = root->getAttribute("debug");
133    if (debugTestCase != NULL) {
134//       setenv("USEARCH_DEBUG", "1", 1);
135    }
136
137
138    const UXMLElement *testCase;
139    int32_t tc = 0;
140
141    while((testCase = root->nextChildElement(tc)) != NULL) {
142
143        if (testCase->getTagName().compare("test-case") != 0) {
144            errln("ssearch, unrecognized XML Element in test file");
145            continue;
146        }
147        const UnicodeString *id       = testCase->getAttribute("id");
148        *testId = 0;
149        if (id != NULL) {
150            id->extract(0, id->length(), testId,  sizeof(testId), US_INV);
151        }
152
153        // If debugging test case has been specified and this is not it, skip to next.
154        if (id!=NULL && debugTestCase!=NULL && *id != *debugTestCase) {
155            continue;
156        }
157        //
158        //  Get the requested collation strength.
159        //    Default is tertiary if the XML attribute is missing from the test case.
160        //
161        const UnicodeString *strength = testCase->getAttribute("strength");
162        UColAttributeValue collatorStrength = UCOL_PRIMARY;
163        if      (strength==NULL)          { collatorStrength = UCOL_TERTIARY;}
164        else if (*strength=="PRIMARY")    { collatorStrength = UCOL_PRIMARY;}
165        else if (*strength=="SECONDARY")  { collatorStrength = UCOL_SECONDARY;}
166        else if (*strength=="TERTIARY")   { collatorStrength = UCOL_TERTIARY;}
167        else if (*strength=="QUATERNARY") { collatorStrength = UCOL_QUATERNARY;}
168        else if (*strength=="IDENTICAL")  { collatorStrength = UCOL_IDENTICAL;}
169        else {
170            // Bogus value supplied for strength.  Shouldn't happen, even from
171            //  typos, if the  XML source has been validated.
172            //  This assert is a little deceiving in that strength can be
173            //   any of the allowed values, not just TERTIARY, but it will
174            //   do the job of getting the error output.
175            TEST_ASSERT(*strength=="TERTIARY")
176        }
177
178        //
179        // Get the collator normalization flag.  Default is UCOL_OFF.
180        //
181        UColAttributeValue normalize = UCOL_OFF;
182        const UnicodeString *norm = testCase->getAttribute("norm");
183        TEST_ASSERT (norm==NULL || *norm=="ON" || *norm=="OFF");
184        if (norm!=NULL && *norm=="ON") {
185            normalize = UCOL_ON;
186        }
187
188        //
189        // Get the alternate_handling flag. Default is UCOL_NON_IGNORABLE.
190        //
191        UColAttributeValue alternateHandling = UCOL_NON_IGNORABLE;
192        const UnicodeString *alt = testCase->getAttribute("alternate_handling");
193        TEST_ASSERT (alt == NULL || *alt == "SHIFTED" || *alt == "NON_IGNORABLE");
194        if (alt != NULL && *alt == "SHIFTED") {
195            alternateHandling = UCOL_SHIFTED;
196        }
197
198        const UnicodeString defLocale("en");
199        char  clocale[100];
200        const UnicodeString *locale   = testCase->getAttribute("locale");
201        if (locale == NULL || locale->length()==0) {
202            locale = &defLocale;
203        };
204        locale->extract(0, locale->length(), clocale, sizeof(clocale), NULL);
205
206
207        UnicodeString  text;
208        UnicodeString  target;
209        UnicodeString  pattern;
210        int32_t        expectedMatchStart = -1;
211        int32_t        expectedMatchLimit = -1;
212        const UXMLElement  *n;
213        int32_t                nodeCount = 0;
214
215        n = testCase->getChildElement("pattern");
216        TEST_ASSERT(n != NULL);
217        if (n==NULL) {
218            continue;
219        }
220        text = n->getText(FALSE);
221        text = text.unescape();
222        pattern.append(text);
223        nodeCount++;
224
225        n = testCase->getChildElement("pre");
226        if (n!=NULL) {
227            text = n->getText(FALSE);
228            text = text.unescape();
229            target.append(text);
230            nodeCount++;
231        }
232
233        n = testCase->getChildElement("m");
234        if (n!=NULL) {
235            expectedMatchStart = target.length();
236            text = n->getText(FALSE);
237            text = text.unescape();
238            target.append(text);
239            expectedMatchLimit = target.length();
240            nodeCount++;
241        }
242
243        n = testCase->getChildElement("post");
244        if (n!=NULL) {
245            text = n->getText(FALSE);
246            text = text.unescape();
247            target.append(text);
248            nodeCount++;
249        }
250
251        //  Check that there weren't extra things in the XML
252        TEST_ASSERT(nodeCount == testCase->countChildren());
253
254        // Open a collator and StringSearch based on the parameters
255        //   obtained from the XML.
256        //
257        status = U_ZERO_ERROR;
258        LocalUCollatorPointer collator(ucol_open(clocale, &status));
259        ucol_setStrength(collator.getAlias(), collatorStrength);
260        ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
261        ucol_setAttribute(collator.getAlias(), UCOL_ALTERNATE_HANDLING, alternateHandling, &status);
262        LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
263                                                               target.getBuffer(), target.length(),
264                                                               collator.getAlias(),
265                                                               NULL,     // the break iterator
266                                                               &status));
267
268        TEST_ASSERT_SUCCESS(status);
269        if (U_FAILURE(status)) {
270            continue;
271        }
272
273        int32_t foundStart = 0;
274        int32_t foundLimit = 0;
275        UBool   foundMatch;
276
277        //
278        // Do the search, check the match result against the expected results.
279        //
280        foundMatch= usearch_search(uss.getAlias(), 0, &foundStart, &foundLimit, &status);
281        TEST_ASSERT_SUCCESS(status);
282        if ((foundMatch && expectedMatchStart<0) ||
283            (foundStart != expectedMatchStart)   ||
284            (foundLimit != expectedMatchLimit)) {
285                TEST_ASSERT(FALSE);   //  ouput generic error position
286                infoln("Found, expected match start = %d, %d \n"
287                       "Found, expected match limit = %d, %d",
288                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
289        }
290
291        // In case there are other matches...
292        // (should we only do this if the test case passed?)
293        while (foundMatch) {
294            expectedMatchStart = foundStart;
295            expectedMatchLimit = foundLimit;
296
297            foundMatch = usearch_search(uss.getAlias(), foundLimit, &foundStart, &foundLimit, &status);
298        }
299
300        uss.adoptInstead(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
301            target.getBuffer(), target.length(),
302            collator.getAlias(),
303            NULL,
304            &status));
305
306        //
307        // Do the backwards search, check the match result against the expected results.
308        //
309        foundMatch= usearch_searchBackwards(uss.getAlias(), target.length(), &foundStart, &foundLimit, &status);
310        TEST_ASSERT_SUCCESS(status);
311        if ((foundMatch && expectedMatchStart<0) ||
312            (foundStart != expectedMatchStart)   ||
313            (foundLimit != expectedMatchLimit)) {
314                TEST_ASSERT(FALSE);   //  ouput generic error position
315                infoln("Found, expected backwards match start = %d, %d \n"
316                       "Found, expected backwards match limit = %d, %d",
317                foundStart, expectedMatchStart, foundLimit, expectedMatchLimit);
318        }
319    }
320#endif
321}
322
// One collation element together with the iterator offsets recorded around it.
struct Order
{
    int32_t order;       // the collation element value
    int32_t lowOffset;   // iterator offset read before the element was fetched
    int32_t highOffset;  // iterator offset read after the element was fetched
};
329
330class OrderList
331{
332public:
333    OrderList();
334    OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset = 0);
335    ~OrderList();
336
337    int32_t size(void) const;
338    void add(int32_t order, int32_t low, int32_t high);
339    const Order *get(int32_t index) const;
340    int32_t getLowOffset(int32_t index) const;
341    int32_t getHighOffset(int32_t index) const;
342    int32_t getOrder(int32_t index) const;
343    void reverse(void);
344    UBool compare(const OrderList &other) const;
345    UBool matchesAt(int32_t offset, const OrderList &other) const;
346
347private:
348    Order *list;
349    int32_t listMax;
350    int32_t listSize;
351};
352
353OrderList::OrderList()
354  : list(NULL),  listMax(16), listSize(0)
355{
356    list = new Order[listMax];
357}
358
359OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
360    : list(NULL), listMax(16), listSize(0)
361{
362    UErrorCode status = U_ZERO_ERROR;
363    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
364    uint32_t strengthMask = 0;
365    int32_t order, low, high;
366
367    switch (ucol_getStrength(coll))
368    {
369    default:
370        strengthMask |= UCOL_TERTIARYORDERMASK;
371        /* fall through */
372
373    case UCOL_SECONDARY:
374        strengthMask |= UCOL_SECONDARYORDERMASK;
375        /* fall through */
376
377    case UCOL_PRIMARY:
378        strengthMask |= UCOL_PRIMARYORDERMASK;
379    }
380
381    list = new Order[listMax];
382
383    ucol_setOffset(elems, stringOffset, &status);
384
385    do {
386        low   = ucol_getOffset(elems);
387        order = ucol_next(elems, &status);
388        high  = ucol_getOffset(elems);
389
390        if (order != UCOL_NULLORDER) {
391            order &= strengthMask;
392        }
393
394        if (order != UCOL_IGNORABLE) {
395            add(order, low, high);
396        }
397    } while (order != UCOL_NULLORDER);
398
399    ucol_closeElements(elems);
400}
401
402OrderList::~OrderList()
403{
404    delete[] list;
405}
406
407void OrderList::add(int32_t order, int32_t low, int32_t high)
408{
409    if (listSize >= listMax) {
410        listMax *= 2;
411
412        Order *newList = new Order[listMax];
413
414        uprv_memcpy(newList, list, listSize * sizeof(Order));
415        delete[] list;
416        list = newList;
417    }
418
419    list[listSize].order      = order;
420    list[listSize].lowOffset  = low;
421    list[listSize].highOffset = high;
422
423    listSize += 1;
424}
425
426const Order *OrderList::get(int32_t index) const
427{
428    if (index >= listSize) {
429        return NULL;
430    }
431
432    return &list[index];
433}
434
435int32_t OrderList::getLowOffset(int32_t index) const
436{
437    const Order *order = get(index);
438
439    if (order != NULL) {
440        return order->lowOffset;
441    }
442
443    return -1;
444}
445
446int32_t OrderList::getHighOffset(int32_t index) const
447{
448    const Order *order = get(index);
449
450    if (order != NULL) {
451        return order->highOffset;
452    }
453
454    return -1;
455}
456
457int32_t OrderList::getOrder(int32_t index) const
458{
459    const Order *order = get(index);
460
461    if (order != NULL) {
462        return order->order;
463    }
464
465    return UCOL_NULLORDER;
466}
467
468int32_t OrderList::size() const
469{
470    return listSize;
471}
472
473void OrderList::reverse()
474{
475    for(int32_t f = 0, b = listSize - 1; f < b; f += 1, b -= 1) {
476        Order swap = list[b];
477
478        list[b] = list[f];
479        list[f] = swap;
480    }
481}
482
483UBool OrderList::compare(const OrderList &other) const
484{
485    if (listSize != other.listSize) {
486        return FALSE;
487    }
488
489    for(int32_t i = 0; i < listSize; i += 1) {
490        if (list[i].order  != other.list[i].order ||
491            list[i].lowOffset != other.list[i].lowOffset ||
492            list[i].highOffset != other.list[i].highOffset) {
493                return FALSE;
494        }
495    }
496
497    return TRUE;
498}
499
500UBool OrderList::matchesAt(int32_t offset, const OrderList &other) const
501{
502    // NOTE: sizes include the NULLORDER, which we don't want to compare.
503    int32_t otherSize = other.size() - 1;
504
505    if (listSize - 1 - offset < otherSize) {
506        return FALSE;
507    }
508
509    for (int32_t i = offset, j = 0; j < otherSize; i += 1, j += 1) {
510        if (getOrder(i) != other.getOrder(j)) {
511            return FALSE;
512        }
513    }
514
515    return TRUE;
516}
517
518static char *printOffsets(char *buffer, OrderList &list)
519{
520    int32_t size = list.size();
521    char *s = buffer;
522
523    for(int32_t i = 0; i < size; i += 1) {
524        const Order *order = list.get(i);
525
526        if (i != 0) {
527            s += sprintf(s, ", ");
528        }
529
530        s += sprintf(s, "(%d, %d)", order->lowOffset, order->highOffset);
531    }
532
533    return buffer;
534}
535
536static char *printOrders(char *buffer, OrderList &list)
537{
538    int32_t size = list.size();
539    char *s = buffer;
540
541    for(int32_t i = 0; i < size; i += 1) {
542        const Order *order = list.get(i);
543
544        if (i != 0) {
545            s += sprintf(s, ", ");
546        }
547
548        s += sprintf(s, "%8.8X", order->order);
549    }
550
551    return buffer;
552}
553
554void SSearchTest::offsetTest()
555{
556    const char *test[] = {
557        // The sequence \u0FB3\u0F71\u0F71\u0F80 contains a discontiguous
558        // contraction (\u0FB3\u0F71\u0F80) logically followed by \u0F71.
559        "\\u1E33\\u0FB3\\u0F71\\u0F71\\u0F80\\uD835\\uDF6C\\u01B0",
560
561        "\\ua191\\u16ef\\u2036\\u017a",
562
563#if 0
564        // This results in a complex interaction between contraction,
565        // expansion and normalization that confuses the backwards offset fixups.
566        "\\u0F7F\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
567#endif
568
569        "\\u0F80\\u0F81\\u0F82\\u0F83\\u0F84\\u0F85",
570        "\\u07E9\\u07EA\\u07F1\\u07F2\\u07F3",
571
572        "\\u02FE\\u02FF"
573        "\\u0300\\u0301\\u0302\\u0303\\u0304\\u0305\\u0306\\u0307\\u0308\\u0309\\u030A\\u030B\\u030C\\u030D\\u030E\\u030F"
574        "\\u0310\\u0311\\u0312\\u0313\\u0314\\u0315\\u0316\\u0317\\u0318\\u0319\\u031A\\u031B\\u031C\\u031D\\u031E\\u031F"
575        "\\u0320\\u0321\\u0322\\u0323\\u0324\\u0325\\u0326\\u0327\\u0328\\u0329\\u032A\\u032B\\u032C\\u032D\\u032E\\u032F"
576        "\\u0330\\u0331\\u0332\\u0333\\u0334\\u0335\\u0336\\u0337\\u0338\\u0339\\u033A\\u033B\\u033C\\u033D\\u033E\\u033F"
577        "\\u0340\\u0341\\u0342\\u0343\\u0344\\u0345\\u0346\\u0347\\u0348\\u0349\\u034A\\u034B\\u034C\\u034D\\u034E", // currently not working, see #8081
578
579        "\\u02FE\\u02FF\\u0300\\u0301\\u0302\\u0303\\u0316\\u0317\\u0318", // currently not working, see #8081
580        "a\\u02FF\\u0301\\u0316", // currently not working, see #8081
581        "a\\u02FF\\u0316\\u0301",
582        "a\\u0430\\u0301\\u0316",
583        "a\\u0430\\u0316\\u0301",
584        "abc\\u0E41\\u0301\\u0316",
585        "abc\\u0E41\\u0316\\u0301",
586        "\\u0E41\\u0301\\u0316",
587        "\\u0E41\\u0316\\u0301",
588        "a\\u0301\\u0316",
589        "a\\u0316\\u0301",
590        "\\uAC52\\uAC53",
591        "\\u34CA\\u34CB",
592        "\\u11ED\\u11EE",
593        "\\u30C3\\u30D0",
594        "p\\u00E9ch\\u00E9",
595        "a\\u0301\\u0325",
596        "a\\u0300\\u0325",
597        "a\\u0325\\u0300",
598        "A\\u0323\\u0300B",
599        "A\\u0300\\u0323B",
600        "A\\u0301\\u0323B",
601        "A\\u0302\\u0301\\u0323B",
602        "abc",
603        "ab\\u0300c",
604        "ab\\u0300\\u0323c",
605        " \\uD800\\uDC00\\uDC00",
606        "a\\uD800\\uDC00\\uDC00",
607        "A\\u0301\\u0301",
608        "A\\u0301\\u0323",
609        "A\\u0301\\u0323B",
610        "B\\u0301\\u0323C",
611        "A\\u0300\\u0323B",
612        "\\u0301A\\u0301\\u0301",
613        "abcd\\r\\u0301",
614        "p\\u00EAche",
615        "pe\\u0302che",
616    };
617
618    int32_t testCount = ARRAY_SIZE(test);
619    UErrorCode status = U_ZERO_ERROR;
620    RuleBasedCollator *col = (RuleBasedCollator *) Collator::createInstance(Locale::getEnglish(), status);
621    if (U_FAILURE(status)) {
622        errcheckln(status, "Failed to create collator in offsetTest! - %s", u_errorName(status));
623        return;
624    }
625    char buffer[4096];  // A bit of a hack... just happens to be long enough for all the test cases...
626                        // We could allocate one that's the right size by (CE_count * 10) + 2
627                        // 10 chars is enough room for 8 hex digits plus ", ". 2 extra chars for "[" and "]"
628
629    col->setAttribute(UCOL_NORMALIZATION_MODE, UCOL_ON, status);
630
631    for(int32_t i = 0; i < testCount; i += 1) {
632        UnicodeString ts = CharsToUnicodeString(test[i]);
633        CollationElementIterator *iter = col->createCollationElementIterator(ts);
634        OrderList forwardList;
635        OrderList backwardList;
636        int32_t order, low, high;
637
638        do {
639            low   = iter->getOffset();
640            order = iter->next(status);
641            high  = iter->getOffset();
642
643            forwardList.add(order, low, high);
644        } while (order != CollationElementIterator::NULLORDER);
645
646        iter->reset();
647        iter->setOffset(ts.length(), status);
648
649        backwardList.add(CollationElementIterator::NULLORDER, iter->getOffset(), iter->getOffset());
650
651        do {
652            high  = iter->getOffset();
653            order = iter->previous(status);
654            low   = iter->getOffset();
655
656            if (order == CollationElementIterator::NULLORDER) {
657                break;
658            }
659
660            backwardList.add(order, low, high);
661        } while (TRUE);
662
663        backwardList.reverse();
664
665        if (forwardList.compare(backwardList)) {
666            logln("Works with \"%s\"", test[i]);
667            logln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
668//          logln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
669
670            logln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
671//          logln("Backward CEs: [%s]", printOrders(buffer, backwardList));
672
673            logln();
674        } else {
675            errln("Fails with \"%s\"", test[i]);
676            infoln("Forward offsets:  [%s]", printOffsets(buffer, forwardList));
677            infoln("Backward offsets: [%s]", printOffsets(buffer, backwardList));
678
679            infoln("Forward CEs:  [%s]", printOrders(buffer, forwardList));
680            infoln("Backward CEs: [%s]", printOrders(buffer, backwardList));
681
682            infoln();
683        }
684        delete iter;
685    }
686    delete col;
687}
688
#if 0
// Disabled helper: appends a printable form of `string` to `buffer` and returns
// `buffer`.  Characters from U+0020 to U+007F are copied, with the backslash doubled;
// everything else is written as a \uXXXX or \UXXXXXXXX escape.
static UnicodeString &escape(const UnicodeString &string, UnicodeString &buffer)
{
    int32_t i = 0;

    while (i < string.length()) {
        const UChar32 ch = string.char32At(i);

        if (ch < 0x0020 || ch > 0x007F) {
            char cbuffer[12];

            if (ch > 0xFFFFL) {
                sprintf(cbuffer, "\\U%8.8X", ch);
            } else {
                sprintf(cbuffer, "\\u%4.4X", ch);
            }

            buffer.append(cbuffer);
        } else if (ch == 0x005C) {
            buffer.append("\\\\");
        } else {
            buffer.append(ch);
        }

        // A supplementary code point takes up two code units of the string.
        i += (ch >= 0x10000L) ? 2 : 1;
    }

    return buffer;
}
#endif
721
722void SSearchTest::sharpSTest()
723{
724    UErrorCode status = U_ZERO_ERROR;
725    UCollator *coll = NULL;
726    UnicodeString lp  = "fuss";
727    UnicodeString sp = "fu\\u00DF";
728    UnicodeString targets[]  = {"fu\\u00DF", "fu\\u00DFball", "1fu\\u00DFball", "12fu\\u00DFball", "123fu\\u00DFball", "1234fu\\u00DFball",
729                                "ffu\\u00DF", "fufu\\u00DF", "fusfu\\u00DF",
730                                "fuss", "ffuss", "fufuss", "fusfuss", "1fuss", "12fuss", "123fuss", "1234fuss", "fu\\u00DF", "1fu\\u00DF", "12fu\\u00DF", "123fu\\u00DF", "1234fu\\u00DF"};
731    int32_t start = -1, end = -1;
732
733    coll = ucol_openFromShortString("LEN_S1", FALSE, NULL, &status);
734    TEST_ASSERT_SUCCESS(status);
735
736    UnicodeString lpUnescaped = lp.unescape();
737    UnicodeString spUnescaped = sp.unescape();
738
739    LocalUStringSearchPointer ussLong(usearch_openFromCollator(lpUnescaped.getBuffer(), lpUnescaped.length(),
740                                                           lpUnescaped.getBuffer(), lpUnescaped.length(),   // actual test data will be set later
741                                                           coll,
742                                                           NULL,     // the break iterator
743                                                           &status));
744
745    LocalUStringSearchPointer ussShort(usearch_openFromCollator(spUnescaped.getBuffer(), spUnescaped.length(),
746                                                           spUnescaped.getBuffer(), spUnescaped.length(),   // actual test data will be set later
747                                                           coll,
748                                                           NULL,     // the break iterator
749                                                           &status));
750    TEST_ASSERT_SUCCESS(status);
751
752    for (uint32_t t = 0; t < (sizeof(targets)/sizeof(targets[0])); t += 1) {
753        UBool bFound;
754        UnicodeString target = targets[t].unescape();
755
756        start = end = -1;
757        usearch_setText(ussLong.getAlias(), target.getBuffer(), target.length(), &status);
758        bFound = usearch_search(ussLong.getAlias(), 0, &start, &end, &status);
759        TEST_ASSERT_SUCCESS(status);
760        if (bFound) {
761            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
762        } else {
763            dataerrln("Test %d: did not find long pattern.", t);
764        }
765
766        usearch_setText(ussShort.getAlias(), target.getBuffer(), target.length(), &status);
767        bFound = usearch_search(ussShort.getAlias(), 0, &start, &end, &status);
768        TEST_ASSERT_SUCCESS(status);
769        if (bFound) {
770            logln("Test %d: found long pattern at [%d, %d].", t, start, end);
771        } else {
772            dataerrln("Test %d: did not find long pattern.", t);
773        }
774    }
775
776    ucol_close(coll);
777}
778
779void SSearchTest::goodSuffixTest()
780{
781    UErrorCode status = U_ZERO_ERROR;
782    UCollator *coll = NULL;
783    UnicodeString pat = /*"gcagagag"*/ "fxeld";
784    UnicodeString target = /*"gcatcgcagagagtatacagtacg"*/ "cloveldfxeld";
785    int32_t start = -1, end = -1;
786    UBool bFound;
787
788    coll = ucol_open(NULL, &status);
789    TEST_ASSERT_SUCCESS(status);
790
791    LocalUStringSearchPointer ss(usearch_openFromCollator(pat.getBuffer(), pat.length(),
792                                                          target.getBuffer(), target.length(),
793                                                          coll,
794                                                          NULL,     // the break iterator
795                                                          &status));
796    TEST_ASSERT_SUCCESS(status);
797
798    bFound = usearch_search(ss.getAlias(), 0, &start, &end, &status);
799    TEST_ASSERT_SUCCESS(status);
800    if (bFound) {
801        logln("Found pattern at [%d, %d].", start, end);
802    } else {
803        dataerrln("Did not find pattern.");
804    }
805
806    ucol_close(coll);
807}
808
809//
810//  searchTime()    A quick and dirty performance test for string search.
811//                  Probably  doesn't really belong as part of intltest, but it
812//                  does check that the search succeeds, and gets the right result,
813//                  so it serves as a functionality test also.
814//
815//                  To run as a perf test, up the loop count, select by commenting
816//                  and uncommenting in the code the operation to be measured,
817//                  rebuild, and measure the running time of this test alone.
818//
819//                     time LD_LIBRARY_PATH=whatever  ./intltest  collate/SSearchTest/searchTime
820//
821void SSearchTest::searchTime() {
822    static const char *longishText =
823"Whylom, as olde stories tellen us,\n"
824"Ther was a duk that highte Theseus:\n"
825"Of Athenes he was lord and governour,\n"
826"And in his tyme swich a conquerour,\n"
827"That gretter was ther noon under the sonne.\n"
828"Ful many a riche contree hadde he wonne;\n"
829"What with his wisdom and his chivalrye,\n"
830"He conquered al the regne of Femenye,\n"
831"That whylom was y-cleped Scithia;\n"
832"And weddede the quene Ipolita,\n"
833"And broghte hir hoom with him in his contree\n"
834"With muchel glorie and greet solempnitee,\n"
835"And eek hir yonge suster Emelye.\n"
836"And thus with victorie and with melodye\n"
837"Lete I this noble duk to Athenes ryde,\n"
838"And al his hoost, in armes, him bisyde.\n"
839"And certes, if it nere to long to here,\n"
840"I wolde han told yow fully the manere,\n"
841"How wonnen was the regne of Femenye\n"
842"By Theseus, and by his chivalrye;\n"
843"And of the grete bataille for the nones\n"
844"Bitwixen Athen's and Amazones;\n"
845"And how asseged was Ipolita,\n"
846"The faire hardy quene of Scithia;\n"
847"And of the feste that was at hir weddinge,\n"
848"And of the tempest at hir hoom-cominge;\n"
849"But al that thing I moot as now forbere.\n"
850"I have, God woot, a large feeld to ere,\n"
851"And wayke been the oxen in my plough.\n"
852"The remenant of the tale is long y-nough.\n"
853"I wol nat letten eek noon of this route;\n"
854"Lat every felawe telle his tale aboute,\n"
855"And lat see now who shal the soper winne;\n"
856"And ther I lefte, I wol ageyn biginne.\n"
857"This duk, of whom I make mencioun,\n"
858"When he was come almost unto the toun,\n"
859"In al his wele and in his moste pryde,\n"
860"He was war, as he caste his eye asyde,\n"
861"Wher that ther kneled in the hye weye\n"
862"A companye of ladies, tweye and tweye,\n"
863"Ech after other, clad in clothes blake; \n"
864"But swich a cry and swich a wo they make,\n"
865"That in this world nis creature livinge,\n"
866"That herde swich another weymentinge;\n"
867"And of this cry they nolde never stenten,\n"
868"Til they the reynes of his brydel henten.\n"
869"'What folk ben ye, that at myn hoomcominge\n"
870"Perturben so my feste with cryinge'?\n"
871"Quod Theseus, 'have ye so greet envye\n"
872"Of myn honour, that thus compleyne and crye? \n"
873"Or who hath yow misboden, or offended?\n"
874"And telleth me if it may been amended;\n"
875"And why that ye ben clothed thus in blak'?\n"
876"The eldest lady of hem alle spak,\n"
877"When she hadde swowned with a deedly chere,\n"
878"That it was routhe for to seen and here,\n"
879"And seyde: 'Lord, to whom Fortune hath yiven\n"
880"Victorie, and as a conquerour to liven,\n"
881"Noght greveth us your glorie and your honour;\n"
882"But we biseken mercy and socour.\n"
883"Have mercy on our wo and our distresse.\n"
884"Som drope of pitee, thurgh thy gentilesse,\n"
885"Up-on us wrecched wommen lat thou falle.\n"
886"For certes, lord, ther nis noon of us alle,\n"
887"That she nath been a duchesse or a quene;\n"
888"Now be we caitifs, as it is wel sene:\n"
889"Thanked be Fortune, and hir false wheel,\n"
890"That noon estat assureth to be weel.\n"
891"And certes, lord, t'abyden your presence,\n"
892"Here in the temple of the goddesse Clemence\n"
893"We han ben waytinge al this fourtenight;\n"
894"Now help us, lord, sith it is in thy might.\n"
895"I wrecche, which that wepe and waille thus,\n"
896"Was whylom wyf to king Capaneus,\n"
897"That starf at Thebes, cursed be that day!\n"
898"And alle we, that been in this array,\n"
899"And maken al this lamentacioun,\n"
900"We losten alle our housbondes at that toun,\n"
901"Whyl that the sege ther-aboute lay.\n"
902"And yet now th'olde Creon, weylaway!\n"
903"The lord is now of Thebes the citee, \n"
904"Fulfild of ire and of iniquitee,\n"
905"He, for despyt, and for his tirannye,\n"
906"To do the dede bodyes vileinye,\n"
907"Of alle our lordes, whiche that ben slawe,\n"
908"Hath alle the bodyes on an heep y-drawe,\n"
909"And wol nat suffren hem, by noon assent,\n"
910"Neither to been y-buried nor y-brent,\n"
911"But maketh houndes ete hem in despyt. zet'\n";
912
913const char *cPattern = "maketh houndes ete hem";
914//const char *cPattern = "Whylom";
915//const char *cPattern = "zet";
916    const char *testId = "searchTime()";   // for error macros.
917    UnicodeString target = longishText;
918    UErrorCode status = U_ZERO_ERROR;
919
920
921    LocalUCollatorPointer collator(ucol_open("en", &status));
922    //ucol_setStrength(collator.getAlias(), collatorStrength);
923    //ucol_setAttribute(collator.getAlias(), UCOL_NORMALIZATION_MODE, normalize, &status);
924    UnicodeString uPattern = cPattern;
925    LocalUStringSearchPointer uss(usearch_openFromCollator(uPattern.getBuffer(), uPattern.length(),
926                                                           target.getBuffer(), target.length(),
927                                                           collator.getAlias(),
928                                                           NULL,     // the break iterator
929                                                           &status));
930    TEST_ASSERT_SUCCESS(status);
931
932//  int32_t foundStart;
933//  int32_t foundEnd;
934    UBool   found;
935
    // Find the match position using strstr
937    const char *pm = strstr(longishText, cPattern);
938    TEST_ASSERT_M(pm!=NULL, "No pattern match with strstr");
939    int32_t  refMatchPos = (int32_t)(pm - longishText);
940    int32_t  icuMatchPos;
941    int32_t  icuMatchEnd;
942    usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
943    TEST_ASSERT_SUCCESS(status);
944    TEST_ASSERT_M(refMatchPos == icuMatchPos, "strstr and icu give different match positions.");
945
946    int32_t i;
947    // int32_t j=0;
948
949    // Try loopcounts around 100000 to some millions, depending on the operation,
950    //   to get runtimes of at least several seconds.
951    for (i=0; i<10000; i++) {
952        found = usearch_search(uss.getAlias(), 0, &icuMatchPos, &icuMatchEnd, &status);
953        (void)found;   // Suppress set but not used warning.
954        //TEST_ASSERT_SUCCESS(status);
955        //TEST_ASSERT(found);
956
957        // usearch_setOffset(uss.getAlias(), 0, &status);
958        // icuMatchPos = usearch_next(uss.getAlias(), &status);
959
960         // The i+j stuff is to confuse the optimizer and get it to actually leave the
961         //   call to strstr in place.
962         //pm = strstr(longishText+j, cPattern);
963         //j = (j + i)%5;
964    }
965
966    //printf("%ld, %d\n", pm-longishText, j);
967}
968
969//----------------------------------------------------------------------------------------
970//
971//   Random Numbers.  Similar to standard lib rand() and srand()
972//                    Not using library to
973//                      1.  Get same results on all platforms.
974//                      2.  Get access to current seed, to more easily reproduce failures.
975//
976//---------------------------------------------------------------------------------------
// Generator state.  Kept at file scope so a test can record the value that
// produced a failing case and later restore it to replay that case.
static uint32_t m_seed = 1;

// Advances the linear congruential generator by one step and returns a
// pseudo-random value in the range [0, 32767].  The arithmetic is done in
// uint32_t, so the multiplication wraps modulo 2^32 on every platform.
static uint32_t m_rand()
{
    const uint32_t multiplier = 1103515245;
    const uint32_t increment  = 12345;

    m_seed = multiplier * m_seed + increment;

    // Drop the low 16 bits of the state and keep the next 15 bits.
    return (m_seed >> 16) & 0x7FFF;
}
984
985class Monkey
986{
987public:
988    virtual void append(UnicodeString &test, UnicodeString &alternate) = 0;
989
990protected:
991    Monkey();
992    virtual ~Monkey();
993};
994
995Monkey::Monkey()
996{
997    // ook?
998}
999
1000Monkey::~Monkey()
1001{
1002    // ook?
1003}
1004
1005class SetMonkey : public Monkey
1006{
1007public:
1008    SetMonkey(const USet *theSet);
1009    ~SetMonkey();
1010
1011    virtual void append(UnicodeString &test, UnicodeString &alternate);
1012
1013private:
1014    const USet *set;
1015};
1016
1017SetMonkey::SetMonkey(const USet *theSet)
1018    : Monkey(), set(theSet)
1019{
1020    // ook?
1021}
1022
1023SetMonkey::~SetMonkey()
1024{
1025    //ook...
1026}
1027
1028void SetMonkey::append(UnicodeString &test, UnicodeString &alternate)
1029{
1030    int32_t size = uset_size(set);
1031    int32_t index = m_rand() % size;
1032    UChar32 ch = uset_charAt(set, index);
1033    UnicodeString str(ch);
1034
1035    test.append(str);
1036    alternate.append(str); // flip case, or some junk?
1037}
1038
1039class StringSetMonkey : public Monkey
1040{
1041public:
1042    StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData);
1043    ~StringSetMonkey();
1044
1045    void append(UnicodeString &testCase, UnicodeString &alternate);
1046
1047private:
1048    UnicodeString &generateAlternative(const UnicodeString &testCase, UnicodeString &alternate);
1049
1050    const USet *set;
1051    UCollator  *coll;
1052    CollData   *collData;
1053};
1054
1055StringSetMonkey::StringSetMonkey(const USet *theSet, UCollator *theCollator, CollData *theCollData)
1056: Monkey(), set(theSet), coll(theCollator), collData(theCollData)
1057{
1058    // ook.
1059}
1060
1061StringSetMonkey::~StringSetMonkey()
1062{
1063    // ook?
1064}
1065
1066void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
1067{
1068    int32_t itemCount = uset_getItemCount(set), len = 0;
1069    int32_t index = m_rand() % itemCount;
1070    UChar32 rangeStart = 0, rangeEnd = 0;
1071    UChar buffer[16];
1072    UErrorCode err = U_ZERO_ERROR;
1073
1074    len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);
1075
1076    if (len == 0) {
1077        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
1078        UChar32 ch = rangeStart + offset;
1079        UnicodeString str(ch);
1080
1081        testCase.append(str);
1082        generateAlternative(str, alternate);
1083    } else if (len > 0) {
1084        // should check that len < 16...
1085        UnicodeString str(buffer, len);
1086
1087        testCase.append(str);
1088        generateAlternative(str, alternate);
1089    } else {
1090        // shouldn't happen...
1091    }
1092}
1093
1094UnicodeString &StringSetMonkey::generateAlternative(const UnicodeString &testCase, UnicodeString &alternate)
1095{
1096    // find out shortest string for the longest sequence of ces.
1097    // needs to be refined to use dynamic programming, but will be roughly right
1098    UErrorCode status = U_ZERO_ERROR;
1099    CEList ceList(coll, testCase, status);
1100    UnicodeString alt;
1101    int32_t offset = 0;
1102
1103    if (ceList.size() == 0) {
1104        return alternate.append(testCase);
1105    }
1106
1107    while (offset < ceList.size()) {
1108        int32_t ce = ceList.get(offset);
1109        const StringList *strings = collData->getStringList(ce);
1110
1111        if (strings == NULL) {
1112            return alternate.append(testCase);
1113        }
1114
1115        int32_t stringCount = strings->size();
1116        int32_t tries = 0;
1117
1118        // find random string that generates the same CEList
1119        const CEList *ceList2 = NULL;
1120        const UnicodeString *string = NULL;
1121              UBool matches = FALSE;
1122
1123        do {
1124            int32_t s = m_rand() % stringCount;
1125
1126            if (tries++ > stringCount) {
1127                alternate.append(testCase);
1128                return alternate;
1129            }
1130
1131            string = strings->get(s);
1132            ceList2 = collData->getCEList(string);
1133            matches = ceList.matchesAt(offset, ceList2);
1134
1135            if (! matches) {
1136                collData->freeCEList((CEList *) ceList2);
1137            }
1138        } while (! matches);
1139
1140        alt.append(*string);
1141        offset += ceList2->size();
1142        collData->freeCEList(ceList2);
1143    }
1144
1145    const CEList altCEs(coll, alt, status);
1146
1147    if (ceList.matchesAt(0, &altCEs)) {
1148        return alternate.append(alt);
1149    }
1150
1151    return alternate.append(testCase);
1152}
1153
1154static void generateTestCase(UCollator *coll, Monkey *monkeys[], int32_t monkeyCount, UnicodeString &testCase, UnicodeString &alternate)
1155{
1156    int32_t pieces = (m_rand() % 4) + 1;
1157    UErrorCode status = U_ZERO_ERROR;
1158    UBool matches;
1159
1160    do {
1161        testCase.remove();
1162        alternate.remove();
1163        monkeys[0]->append(testCase, alternate);
1164
1165        for(int32_t piece = 0; piece < pieces; piece += 1) {
1166            int32_t monkey = m_rand() % monkeyCount;
1167
1168            monkeys[monkey]->append(testCase, alternate);
1169        }
1170
1171        const CEList ceTest(coll, testCase, status);
1172        const CEList ceAlt(coll, alternate, status);
1173
1174        matches = ceTest.matchesAt(0, &ceAlt);
1175    } while (! matches);
1176}
1177
1178static UBool simpleSearch(UCollator *coll, const UnicodeString &target, int32_t offset, const UnicodeString &pattern, int32_t &matchStart, int32_t &matchEnd)
1179{
1180    UErrorCode      status = U_ZERO_ERROR;
1181    OrderList       targetOrders(coll, target, offset);
1182    OrderList       patternOrders(coll, pattern);
1183    int32_t         targetSize  = targetOrders.size() - 1;
1184    int32_t         patternSize = patternOrders.size() - 1;
1185    UBreakIterator *charBreakIterator = ubrk_open(UBRK_CHARACTER, ucol_getLocaleByType(coll, ULOC_VALID_LOCALE, &status),
1186                                                  target.getBuffer(), target.length(), &status);
1187
1188    if (patternSize == 0) {
1189        // Searching for an empty pattern always fails
1190        matchStart = matchEnd = -1;
1191        ubrk_close(charBreakIterator);
1192        return FALSE;
1193    }
1194
1195    matchStart = matchEnd = -1;
1196
1197    for(int32_t i = 0; i < targetSize; i += 1) {
1198        if (targetOrders.matchesAt(i, patternOrders)) {
1199            int32_t start    = targetOrders.getLowOffset(i);
1200            int32_t maxLimit = targetOrders.getLowOffset(i + patternSize);
1201            int32_t minLimit = targetOrders.getLowOffset(i + patternSize - 1);
1202
1203            // if the low and high offsets of the first CE in
1204            // the match are the same, it means that the match
1205            // starts in the middle of an expansion - all but
1206            // the first CE of the expansion will have the offset
1207            // of the following character.
1208            if (start == targetOrders.getHighOffset(i)) {
1209                continue;
1210            }
1211
1212            // Make sure match starts on a grapheme boundary
1213            if (! ubrk_isBoundary(charBreakIterator, start)) {
1214                continue;
1215            }
1216
1217            // If the low and high offsets of the CE after the match
1218            // are the same, it means that the match ends in the middle
1219            // of an expansion sequence.
1220            if (maxLimit == targetOrders.getHighOffset(i + patternSize) &&
1221                targetOrders.getOrder(i + patternSize) != UCOL_NULLORDER) {
1222                continue;
1223            }
1224
1225            int32_t mend = maxLimit;
1226
1227            // Find the first grapheme break after the character index
1228            // of the last CE in the match. If it's after character index
1229            // that's after the last CE in the match, use that index
1230            // as the end of the match.
1231            if (minLimit < maxLimit) {
1232                // When the last CE's low index is same with its high index, the CE is likely
1233                // a part of expansion. In this case, the index is located just after the
1234                // character corresponding to the CEs compared above. If the index is right
1235                // at the break boundary, move the position to the next boundary will result
1236                // incorrect match length when there are ignorable characters exist between
1237                // the position and the next character produces CE(s). See ticket#8482.
1238                if (minLimit == targetOrders.getHighOffset(i + patternSize - 1) && ubrk_isBoundary(charBreakIterator, minLimit)) {
1239                    mend = minLimit;
1240                } else {
1241                    int32_t nba = ubrk_following(charBreakIterator, minLimit);
1242
1243                    if (nba >= targetOrders.getHighOffset(i + patternSize - 1)) {
1244                        mend = nba;
1245                    }
1246                }
1247            }
1248
1249            if (mend > maxLimit) {
1250                continue;
1251            }
1252
1253            if (! ubrk_isBoundary(charBreakIterator, mend)) {
1254                continue;
1255            }
1256
1257            matchStart = start;
1258            matchEnd   = mend;
1259
1260            ubrk_close(charBreakIterator);
1261            return TRUE;
1262        }
1263    }
1264
1265    ubrk_close(charBreakIterator);
1266    return FALSE;
1267}
1268
1269#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1270static int32_t  getIntParam(UnicodeString name, UnicodeString &params, int32_t defaultVal) {
1271    int32_t val = defaultVal;
1272
1273    name.append(" *= *(-?\\d+)");
1274
1275    UErrorCode status = U_ZERO_ERROR;
1276    RegexMatcher m(name, params, 0, status);
1277
1278    if (m.find()) {
1279        // The param exists.  Convert the string to an int.
1280        char valString[100];
1281        int32_t paramLength = m.end(1, status) - m.start(1, status);
1282
1283        if (paramLength >= (int32_t)(sizeof(valString)-1)) {
1284            paramLength = (int32_t)(sizeof(valString)-2);
1285        }
1286
1287        params.extract(m.start(1, status), paramLength, valString, sizeof(valString));
1288        val = uprv_strtol(valString,  NULL, 10);
1289
1290        // Delete this parameter from the params string.
1291        m.reset();
1292        params = m.replaceFirst("", status);
1293    }
1294
1295  //U_ASSERT(U_SUCCESS(status));
1296    if (! U_SUCCESS(status)) {
1297        val = defaultVal;
1298    }
1299
1300    return val;
1301}
1302#endif
1303
1304#if !UCONFIG_NO_COLLATION
1305int32_t SSearchTest::monkeyTestCase(UCollator *coll, const UnicodeString &testCase, const UnicodeString &pattern, const UnicodeString &altPattern,
1306                                    const char *name, const char *strength, uint32_t seed)
1307{
1308    UErrorCode status = U_ZERO_ERROR;
1309    int32_t actualStart = -1, actualEnd = -1;
1310  //int32_t expectedStart = prefix.length(), expectedEnd = prefix.length() + altPattern.length();
1311    int32_t expectedStart = -1, expectedEnd = -1;
1312    int32_t notFoundCount = 0;
1313    LocalUStringSearchPointer uss(usearch_openFromCollator(pattern.getBuffer(), pattern.length(),
1314                                                           testCase.getBuffer(), testCase.length(),
1315                                                           coll,
1316                                                           NULL,     // the break iterator
1317                                                           &status));
1318
1319    // **** TODO: find *all* matches, not just first one ****
1320    simpleSearch(coll, testCase, 0, pattern, expectedStart, expectedEnd);
1321
1322    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1323
1324    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1325        errln("Search for <pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1326              "    strength=%s seed=%d",
1327              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1328    }
1329
1330    if (expectedStart == -1 && actualStart == -1) {
1331        notFoundCount += 1;
1332    }
1333
1334    // **** TODO: find *all* matches, not just first one ****
1335    simpleSearch(coll, testCase, 0, altPattern, expectedStart, expectedEnd);
1336
1337    usearch_setPattern(uss.getAlias(), altPattern.getBuffer(), altPattern.length(), &status);
1338
1339    usearch_search(uss.getAlias(), 0, &actualStart, &actualEnd, &status);
1340
1341    if (expectedStart >= 0 && (actualStart != expectedStart || actualEnd != expectedEnd)) {
1342        errln("Search for <alt_pattern> in <%s> failed: expected [%d, %d], got [%d, %d]\n"
1343              "    strength=%s seed=%d",
1344              name, expectedStart, expectedEnd, actualStart, actualEnd, strength, seed);
1345    }
1346
1347    if (expectedStart == -1 && actualStart == -1) {
1348        notFoundCount += 1;
1349    }
1350
1351    return notFoundCount;
1352}
1353#endif
1354
1355void SSearchTest::monkeyTest(char *params)
1356{
1357    // ook!
1358    UErrorCode status = U_ZERO_ERROR;
1359  //UCollator *coll = ucol_open(NULL, &status);
1360    UCollator *coll = ucol_openFromShortString("S1", FALSE, NULL, &status);
1361
1362    if (U_FAILURE(status)) {
1363        errcheckln(status, "Failed to create collator in MonkeyTest! - %s", u_errorName(status));
1364        return;
1365    }
1366
1367    CollData  *monkeyData = new CollData(coll, status);
1368
1369    USet *expansions   = uset_openEmpty();
1370    USet *contractions = uset_openEmpty();
1371
1372    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);
1373
1374    U_STRING_DECL(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1375    U_STRING_INIT(letter_pattern, "[[:letter:]-[:ideographic:]-[:hangul:]]", 39);
1376    USet *letters = uset_openPattern(letter_pattern, 39, &status);
1377    SetMonkey letterMonkey(letters);
1378    StringSetMonkey contractionMonkey(contractions, coll, monkeyData);
1379    StringSetMonkey expansionMonkey(expansions, coll, monkeyData);
1380    UnicodeString testCase;
1381    UnicodeString alternate;
1382    UnicodeString pattern, altPattern;
1383    UnicodeString prefix, altPrefix;
1384    UnicodeString suffix, altSuffix;
1385
1386    Monkey *monkeys[] = {
1387        &letterMonkey,
1388        &contractionMonkey,
1389        &expansionMonkey,
1390        &contractionMonkey,
1391        &expansionMonkey,
1392        &contractionMonkey,
1393        &expansionMonkey,
1394        &contractionMonkey,
1395        &expansionMonkey};
1396    int32_t monkeyCount = sizeof(monkeys) / sizeof(monkeys[0]);
1397    // int32_t nonMatchCount = 0;
1398
1399    UCollationStrength strengths[] = {UCOL_PRIMARY, UCOL_SECONDARY, UCOL_TERTIARY};
1400    const char *strengthNames[] = {"primary", "secondary", "tertiary"};
1401    int32_t strengthCount = sizeof(strengths) / sizeof(strengths[0]);
1402    int32_t loopCount = quick? 1000 : 10000;
1403    int32_t firstStrength = 0;
1404    int32_t lastStrength  = strengthCount - 1; //*/ 0;
1405
1406    if (params != NULL) {
1407#if !UCONFIG_NO_REGULAR_EXPRESSIONS
1408        UnicodeString p(params);
1409
1410        loopCount = getIntParam("loop", p, loopCount);
1411        m_seed    = getIntParam("seed", p, m_seed);
1412
1413        RegexMatcher m(" *strength *= *(primary|secondary|tertiary) *", p, 0, status);
1414        if (m.find()) {
1415            UnicodeString breakType = m.group(1, status);
1416
1417            for (int32_t s = 0; s < strengthCount; s += 1) {
1418                if (breakType == strengthNames[s]) {
1419                    firstStrength = lastStrength = s;
1420                    break;
1421                }
1422            }
1423
1424            m.reset();
1425            p = m.replaceFirst("", status);
1426        }
1427
1428        if (RegexMatcher("\\S", p, 0, status).find()) {
1429            // Each option is stripped out of the option string as it is processed.
1430            // All options have been checked.  The option string should have been completely emptied..
1431            char buf[100];
1432            p.extract(buf, sizeof(buf), NULL, status);
1433            buf[sizeof(buf)-1] = 0;
1434            errln("Unrecognized or extra parameter:  %s\n", buf);
1435            return;
1436        }
1437#else
1438        infoln("SSearchTest built with UCONFIG_NO_REGULAR_EXPRESSIONS: ignoring parameters.");
1439#endif
1440    }
1441
1442    for(int32_t s = firstStrength; s <= lastStrength; s += 1) {
1443        int32_t notFoundCount = 0;
1444
1445        logln("Setting strength to %s.", strengthNames[s]);
1446        ucol_setStrength(coll, strengths[s]);
1447
1448        // TODO: try alternate prefix and suffix too?
1449        // TODO: alternates are only equal at primary strength. Is this OK?
1450        for(int32_t t = 0; t < loopCount; t += 1) {
1451            uint32_t seed = m_seed;
1452            // int32_t  nmc = 0;
1453
1454            generateTestCase(coll, monkeys, monkeyCount, pattern, altPattern);
1455            generateTestCase(coll, monkeys, monkeyCount, prefix,  altPrefix);
1456            generateTestCase(coll, monkeys, monkeyCount, suffix,  altSuffix);
1457
1458            // pattern
1459            notFoundCount += monkeyTestCase(coll, pattern, pattern, altPattern, "pattern", strengthNames[s], seed);
1460
1461            testCase.remove();
1462            testCase.append(prefix);
1463            testCase.append(/*alt*/pattern);
1464
1465            // prefix + pattern
1466            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern", strengthNames[s], seed);
1467
1468            testCase.append(suffix);
1469
1470            // prefix + pattern + suffix
1471            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "prefix + pattern + suffix", strengthNames[s], seed);
1472
1473            testCase.remove();
1474            testCase.append(pattern);
1475            testCase.append(suffix);
1476
1477            // pattern + suffix
1478            notFoundCount += monkeyTestCase(coll, testCase, pattern, altPattern, "pattern + suffix", strengthNames[s], seed);
1479        }
1480
1481       logln("For strength %s the not found count is %d.", strengthNames[s], notFoundCount);
1482    }
1483
1484    uset_close(contractions);
1485    uset_close(expansions);
1486    uset_close(letters);
1487    delete monkeyData;
1488
1489    ucol_close(coll);
1490}
1491
1492#endif
1493
1494#endif
1495