164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others.
264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html
38de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert/********************************************************************
48de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert * Copyright (c) 2016, International Business Machines Corporation and
58de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert * others. All Rights Reserved.
68de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ********************************************************************/
78de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
88de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
98de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/utypes.h"
108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "rbbimonkeytest.h"
148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/utypes.h"
158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/brkiter.h"
168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/utf16.h"
178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/uniset.h"
188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "unicode/unistr.h"
198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "charstr.h"
218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "cmemory.h"
228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "cstr.h"
238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "uelement.h"
248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#include "uhash.h"
258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2664339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#include <iostream>
2764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#include <stdio.h>
2864339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#include <stdlib.h>
2964339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert#include <string>
308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertusing namespace icu;
328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fParams = params;            // Work around TESTCASE_AUTO not being able to pass params to test function.
368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    TESTCASE_AUTO_BEGIN;
388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    TESTCASE_AUTO(testMonkey);
398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    TESTCASE_AUTO_END;
408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//   class BreakRule implementation.
458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
488de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRule::BreakRule()      // :  all field default initialized.
498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert{
508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
528de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRule::~BreakRule() {}
538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//   class BreakRules implementation.
588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
608de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status)  :
618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) {
628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString,
638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                         uhash_compareUnicodeString,
648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                         NULL,      // value comparator.
658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                         &status));
668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject);
718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBreakRules.setDeleter(uprv_deleteUObject);
728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fCharClassList.adoptInstead(new UVector(status));
748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert             "(?!(?:\\{|=|\\[:)[ \\t]{0,4})"              // Negative lookbehind for '{' or '=' or '[:'
778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                                          //   (the identifier is a unicode property name or value)
788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert             "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"),     // The char class name
798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        0, status));
808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString(
838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(^|(?<=;))"                    // Start either at start of line, or just after a ';' (look-behind for ';')
848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*+"                      //   Match white space.
858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(#.*)?+"                       //   Optional # plus whatever follows
868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "\\R$"                          //   new-line at end of line.
878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            ), 0, status));
888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Match (initial parse) of a character class defintion line.
908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*"                                // leading white space
928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"   // The char class name
938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*=[ \\t]*"                        //   =
948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(?<ClassDef>.*?)"                       // The char class UnicodeSet expression
958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*;$"),                     // ; <end of line>
968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            0, status));
978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Match (initial parse) of a break rule line.
998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString(
1008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*"                                // leading white space
1018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)"    // The rule name
1028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*:[ \\t]*"                        //   :
1038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "(?<RuleDef>.*?)"                        // The rule definition
1048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                "[ \\t]*;$"),                            // ; <end of line>
1058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            0, status));
1068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
1088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1108de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRules::~BreakRules() {}
1118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1138de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertCharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
1148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Create the expanded definition for this char class,
1168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // replacing any set references with the corresponding definition.
1178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeString expandedDef;
1198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeString emptyString;
1208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fSetRefsMatcher->reset(definition);
1218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
1228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeString name =
1238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
1248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
1258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
1268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status);
1288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        expandedDef.append(expansionForName);
1298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fSetRefsMatcher->appendTail(expandedDef);
1318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Verify that the expanded set defintion is valid.
1338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (fMonkeyImpl->fDumpExpansions) {
1358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        printf("epandedDef: %s\n", CStr(expandedDef)());
1368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status);
1398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
1408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__,
1418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                               u_errorName(status), CStr(name)());
1428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return NULL;
1438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    CharClass *cclass = new CharClass(name, definition, expandedDef, s);
1458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(),
1468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                                        new UnicodeString(name),   // Key, owned by hash table.
1478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                                        cclass,                    // Value, owned by hash table.
1488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                                        &status));
1498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (previousClass != NULL) {
1518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Duplicate class def.
1528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // These are legitimate, they are adustments of an existing class.
1538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // TODO: will need to keep the old around when we handle tailorings.
1548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)());
1558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        delete previousClass;
1568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return cclass;
1588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
1598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) {
1628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    LocalPointer<BreakRule> thisRule(new BreakRule);
1638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    thisRule->fName = name;
1648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    thisRule->fRule = definition;
1658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes,
1678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // This gives a numeric sort order that matches Unicode UAX rule numbering conventions.
1688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeString emptyString;
1698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Expand the char class definitions within the rule.
1718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fSetRefsMatcher->reset(definition);
1728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    while (fSetRefsMatcher->find() && U_SUCCESS(status)) {
1738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeString name =
1748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status);
1758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name));
1768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (!nameClass) {
1778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"",
1788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                __FILE__, __LINE__, CStr(name)(), CStr(definition)());
1798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
1808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name;
1818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status);
1838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        thisRule->fExpandedRule.append(expansionForName);
1848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fSetRefsMatcher->appendTail(thisRule->fExpandedRule);
1868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Replace the divide sign (\u00f7) with a regular expression named capture.
1888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // When running the rules, a match that includes this group means we found a break position.
1898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7);
1918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (dividePos >= 0) {
1928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)"));
1938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) {
1958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        status = U_ILLEGAL_ARGUMENT_ERROR;   // TODO: produce a good error message.
1968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
1978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
1988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // UAX break rule set definitions can be empty, just [].
1998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which
2008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // also matches nothing.
2018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0};
2038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t where = 0;
2048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) {
2058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]"));
2068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (fMonkeyImpl->fDumpExpansions) {
2088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)());
2098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Compile a regular expression for this rule.
2128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status));
2138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
2148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->errln("%s:%d Error creating regular expression for %s",
2158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                __FILE__, __LINE__, CStr(thisRule->fExpandedRule)());
2168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
2178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Put this new rule into the vector of all Rules.
2208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBreakRules.addElement(thisRule.orphan(), status);
2218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
2228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertbool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) {
2258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (keyword == UnicodeString("locale")) {
2268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharString localeName;
2278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        localeName.append(CStr(value)(), -1, status);
2288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fLocale = Locale::createFromName(localeName.data());
2298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return true;
2308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (keyword == UnicodeString("type")) {
2328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (value == UnicodeString("grapheme")) {
2338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fType = UBRK_CHARACTER;
2348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else if (value == UnicodeString("word")) {
2358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fType = UBRK_WORD;
2368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else if (value == UnicodeString("line")) {
2378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fType = UBRK_LINE;
2388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else if (value == UnicodeString("sentence")) {
2398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fType = UBRK_SENTENCE;
2408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else {
2418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d Unrecognized break type %s", __FILE__, __LINE__,  CStr(value)());
2428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
2438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return true;
2448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // TODO: add tailoring base setting here.
2468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return false;
2478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
2488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2498de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) {
2508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
2518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return NULL;
2528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    RuleBasedBreakIterator *bi = NULL;
2548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    switch(fType) {
2558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        case UBRK_CHARACTER:
2568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status));
2578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
2588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        case UBRK_WORD:
2598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status));
2608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
2618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        case UBRK_LINE:
2628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status));
2638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
2648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        case UBRK_SENTENCE:
2658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status));
2668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
2678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        default:
2688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType);
2698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            status = U_ILLEGAL_ARGUMENT_ERROR;
2708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return bi;
2728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
2738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) {
2768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
2778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
2788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
2798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeString emptyString;
2818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t lineNumber=0; ;lineNumber++) {    // Loop once per input line.
2828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (U_FAILURE(status)) {
2838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
2848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
2858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t lineLength = 0;
2868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status);
2878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (lineBuf == NULL) {
2888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
2898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
2908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UnicodeString line(lineBuf, lineLength);
2918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Strip comment lines.
2938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fCommentsMatcher->reset(line);
2948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        line = fCommentsMatcher->replaceFirst(emptyString, status);
2958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (line.isEmpty()) {
2968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
2978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
2988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
2998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Recognize character class definition and keyword lines
3008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fClassDefMatcher->reset(line);
3018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (fClassDefMatcher->matches(status)) {
3028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status);
3038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            UnicodeString classDef  = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status);
3048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fMonkeyImpl->fDumpExpansions) {
3058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)());
3068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
3078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (setKeywordParameter(className, classDef, status)) {
3088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // The scanned item was "type = ..." or "locale = ...", etc.
3098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                //   which are not actual character classes.
3108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                continue;
3118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
3128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            addCharClass(className, classDef, status);
3138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
3148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
3158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Recognize rule lines.
3178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fRuleDefMatcher->reset(line);
3188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (fRuleDefMatcher->matches(status)) {
3198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status);
3208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            UnicodeString ruleDef  = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status);
3218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fMonkeyImpl->fDumpExpansions) {
3228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)());
3238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
3248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            addRule(ruleName, ruleDef, status);
3258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
3268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
3278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n",
3298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)());
3308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
3318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Build the vector of char classes, omitting the dictionary class if there is one.
3338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // This will be used when constructing the random text to be tested.
3348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Also compute the "other" set, consisting of any characters not included in
3368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // one or more of the user defined sets.
3378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeSet otherSet((UChar32)0, 0x10ffff);
3398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t pos = UHASH_FIRST;
3408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    const UHashElement *el = NULL;
3418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) {
3428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer);
3438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharClass *cclass = static_cast<CharClass *>(el->value.pointer);
3448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // printf("    Adding %s\n", CStr(*ccName)());
3458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (*ccName != cclass->fName) {
3468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n",
3478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)());
3488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
3498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const UnicodeSet *set = cclass->fSet.getAlias();
3508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        otherSet.removeAll(*set);
3518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (*ccName == UnicodeString("dictionary")) {
3528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fDictionarySet = *set;
3538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else {
3548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fCharClassList->addElement(cclass, status);
3558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
3568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
3578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (!otherSet.isEmpty()) {
3598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // fprintf(stderr, "have an other set.\n");
3608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UnicodeString pattern;
3618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status);
3628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fCharClassList->addElement(cclass, status);
3638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
3648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
3658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertconst CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const {
3688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert   int32_t localIter = 0;
3698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert   int32_t &it = iter? *iter : localIter;
3708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert   while (it < fCharClassList->size()) {
3728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert       const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it));
3738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert       ++it;
3748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert       if (cc->fSet->contains(c)) {
3758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert           return cc;
3768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert       }
3778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
3788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return NULL;
3798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
3808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
3828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
3838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//   class MonkeyTestData implementation.
3848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
3858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
3868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) {
3888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    const int32_t dataLength = 1000;
3898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Fill the test string with random characters.
3918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // First randomly pick a char class, then randomly pick a character from that class.
3928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Exclude any characters from the dictionary set.
3938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
3948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // std::cout << "Populating Test Data" << std::endl;
3958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
3968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                          // allowing recreation of failing data.
3978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBkRules = rules;
3988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fString.remove();
3998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t n=0; n<dataLength;) {
4008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int charClassIndex = rand() % rules->fCharClassList->size();
4018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex));
4028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (cclass->fSet->size() == 0) {
4038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Some rules or tailorings do end up with empty char classes.
4048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
4058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t charIndex = rand() % cclass->fSet->size();
4078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UChar32 c = cclass->fSet->charAt(charIndex);
4088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) {
4098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
4108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Don't let random unpaired surrogates combine in the test data because they might
4118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // produce an unwanted dictionary character.
4128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
4138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (!rules->fDictionarySet.contains(c)) {
4168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fString.append(c);
4178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            ++n;
4188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
4208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Reset each rule matcher regex with this new string.
4228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    //    (Although we are always using the same string object, ICU regular expressions
4238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    //    don't like the underlying string data changing without doing a reset).
4248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
4268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
4278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            rule->fRuleMatcher->reset(fString);
4288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
4298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays).
4318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Expected and Actual breaks are one longer than the input string; a non-zero value
4328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // will indicate a boundary preceding that position.
4338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    clearActualBreaks();
4358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fExpectedBreaks  = fActualBreaks;
4368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleForPosition = fActualBreaks;
4378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    f2ndRuleForPos   = fActualBreaks;
4388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Apply reference rules to find the expected breaks.
4408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fExpectedBreaks.setCharAt(0, (UChar)1);  // Force an expected break before the start of the text.
4428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                             // ICU always reports a break there.
4438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                                             // The reference rules do not have a means to do so.
4448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t strIdx = 0;
4458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    while (strIdx < fString.length()) {
4468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        BreakRule *matchingRule = NULL;
4478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UBool      hasBreak = FALSE;
4488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t ruleNum = 0;
4498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t matchStart = 0;
4508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t matchEnd = 0;
4518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t breakGroup = 0;
4528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) {
4538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum));
4548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            rule->fRuleMatcher->reset();
4558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (rule->fRuleMatcher->lookingAt(strIdx, status)) {
4568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // A candidate rule match, check further to see if we take it or continue to check other rules.
4578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Matches of zero or one codepoint count only if they also specify a break.
4588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                matchStart = rule->fRuleMatcher->start(status);
4598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                matchEnd = rule->fRuleMatcher->end(status);
4608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status);
4618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                hasBreak = U_SUCCESS(status);
4628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) {
4638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    status = U_ZERO_ERROR;
4648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                }
4658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) {
4668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    matchingRule = rule;
4678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    break;
4688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                }
4698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
4708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (matchingRule == NULL) {
4728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // No reference rule matched. This is an error in the rules that should never happen.
4738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. ",
4748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                 __FILE__, __LINE__, strIdx);
4758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            dump(strIdx);
4768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            status = U_INVALID_FORMAT_ERROR;
4778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
4788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (matchingRule->fRuleMatcher->group(status).length() == 0) {
4808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Zero length rule match. This is also an error in the rule expressions.
4818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d Zero length rule match.",
4828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                __FILE__, __LINE__);
4838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            status =  U_INVALID_FORMAT_ERROR;
4848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
4858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Record which rule matched over the length of the match.
4888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        for (int i = matchStart; i < matchEnd; i++) {
4898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fRuleForPosition.charAt(i) == 0) {
4908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                fRuleForPosition.setCharAt(i, (UChar)ruleNum);
4918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            } else {
4928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                f2ndRuleForPos.setCharAt(i, (UChar)ruleNum);
4938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
4948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
4958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
4968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Break positions appear in rules as a matching named capture of zero length at the break position,
4978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        //   the adjusted pattern contains (?<BreakPosition>)
4988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (hasBreak) {
4998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status);
5008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (U_FAILURE(status) || breakPos < 0) {
5018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Rule specified a break, but that break wasn't part of the match, even
5028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // though the rule as a whole matched.
5038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Can't happen with regular expressions derived from (equivalent to) ICU break rules.
5048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Shouldn't get here.
5058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__);
5068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                status =  U_INVALID_FORMAT_ERROR;
5078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                break;
5088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
5098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fExpectedBreaks.setCharAt(breakPos, (UChar)1);
5108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // printf("recording break at %d\n", breakPos);
5118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // For the next iteration, pick up applying rules immediately after the break,
5128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // which may differ from end of the match. The matching rule may have included
5138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // context following the boundary that needs to be looked at again.
5148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            strIdx = matchingRule->fRuleMatcher->end(breakGroup, status);
5158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else {
5168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Original rule didn't specify a break.
5178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Continue applying rules starting on the last code point of this match.
5188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            strIdx = fString.moveIndex32(matchEnd, -1);
5198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (strIdx == matchStart) {
5208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Match was only one code point, no progress if we continue.
5218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                // Shouldn't get here, case is filtered out at top of loop.
5228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                CharString ruleName;
5238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                ruleName.appendInvariantChars(matchingRule->fName, status);
5248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                IntlTest::gTest->errln("%s:%d Rule %s internal error",
5258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                        __FILE__, __LINE__, ruleName.data());
5268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                status = U_INVALID_FORMAT_ERROR;
5278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                break;
5288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
5298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
5308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (U_FAILURE(status)) {
5318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.",
5328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                __FILE__, __LINE__, u_errorName(status));
5338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
5348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
5358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
5368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
5378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::clearActualBreaks() {
5398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fActualBreaks.remove();
5408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Actual Breaks length is one longer than the data string length, allowing
5418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    //    for breaks before the first and after the last character in the data.
5428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t i=0; i<=fString.length(); i++) {
5438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fActualBreaks.append((UChar)0);
5448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
5458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
5468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::dump(int32_t around) const {
5488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    printf("\n"
5498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert           "         char                        break  Rule                     Character\n"
5508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert           "   pos   code   class                 R I   name                     name\n"
5518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert           "---------------------------------------------------------------------------------------------\n");
5528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t start;
5548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t end;
5558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (around == -1) {
5578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        start = 0;
5588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        end = fString.length();
5598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    } else {
5608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Display context around a failure.
5618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        start = fString.moveIndex32(around, -30);
5628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        end = fString.moveIndex32(around, +30);
5638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
5648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) {
5668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UErrorCode status = U_ZERO_ERROR;
5678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        UChar32 c = fString.char32At(charIdx);
5688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const CharClass *cc = fBkRules->getClassForChar(c);
5698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharString ccName;
5708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        ccName.appendInvariantChars(cc->fName, status);
5718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharString ruleName, secondRuleName;
5728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx)));
5738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        ruleName.appendInvariantChars(rule->fName, status);
5748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (f2ndRuleForPos.charAt(charIdx) > 0) {
5758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx)));
5768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            secondRuleName.appendInvariantChars(secondRule->fName, status);
5778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
5788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        char cName[200];
5798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), &status);
5808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        printf("  %4.1d %6.4x   %-20s  %c %c   %-10s %-10s    %s\n",
5828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            charIdx, c, ccName.data(),
5838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fExpectedBreaks.charAt(charIdx) ? '*' : '.',
5848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fActualBreaks.charAt(charIdx) ? '*' : '.',
5858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            ruleName.data(), secondRuleName.data(), cName
5868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        );
5878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
5888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
5898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
5928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
5938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//   class RBBIMonkeyImpl
5948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
5958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
5968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
5978de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) {
5988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    (void)status;    // suppress unused parameter compiler warning.
5998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// RBBIMonkeyImpl setup       does all of the setup for a single rule set - compiling the
6038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//                            reference rules and creating the icu breakiterator to test,
6048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//                            with its type and locale coming from the reference rules.
6058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) {
6078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleFileName = ruleFile;
6088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    openBreakRules(ruleFile, status);
6098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
6108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
6118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
6128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
6138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleSet.adoptInstead(new BreakRules(this, status));
6148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status);
6158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
6168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile);
6178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
6188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
6198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI.adoptInstead(fRuleSet->createICUBreakIterator(status));
6208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData.adoptInstead(new MonkeyTestData());
6218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6248de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyImpl::~RBBIMonkeyImpl() {
6258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) {
6298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    CharString path;
6308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    path.append(IntlTest::getSourceTestData(status), status);
6318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    path.append("break_rules" U_FILE_SEP_STRING, status);
6328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    path.appendPathPart(fileName, status);
6338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    const char *codePage = "UTF-8";
6348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status));
6358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::startTest() {
6398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fThread.start();   // invokes runTest() in a separate thread.
6408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::join() {
6438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fThread.join();
6448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
// Report a monkey test failure.  For use inside RBBIMonkeyImpl member functions that have
// a local or parameter named "status".
//   msg:    C string describing the failure.
//   index:  the string index associated with the failure.
// Logs the failure along with the parameters needed to reproduce it, calls
// fTestData->dump(index) when fVerbose is set, and sets status to U_INVALID_STATE_ERROR.
//
// The body is wrapped in do { } while (0) so that "MONKEY_ERROR(...);" expands to exactly
// one statement and stays correct inside an if/else that has no braces.
#define MONKEY_ERROR(msg, index) do { \
    IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \
                    __FILE__, __LINE__, (msg), (index), fRuleFileName, fTestData->fRandomSeed); \
    if (fVerbose) { fTestData->dump(index); } \
    status = U_INVALID_STATE_ERROR;  \
} while (0)
6538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::runTest() {
6558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UErrorCode status = U_ZERO_ERROR;
6568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t errorCount = 0;
6578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
6588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        status = U_ZERO_ERROR;
6598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status);
6608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (fBI.isNull()) {
6618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            IntlTest::gTest->dataerrln("Unable to run test because fBI is null.");
6628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
6638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
6648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // fTestData->dump();
6658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        testForwards(status);
6668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        testPrevious(status);
6678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        testFollowing(status);
6688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        testPreceding(status);
6698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        testIsBoundary(status);
6708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (fLoopCount < 0 && loopCount % 100 == 0) {
6728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fprintf(stderr, ".");
6738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
6748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (U_FAILURE(status)) {
6758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (++errorCount > 10) {
6768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                return;
6778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
6788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
6798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
6808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
6818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
6828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testForwards(UErrorCode &status) {
6838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
6848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
6858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
6868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData->clearActualBreaks();
6878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI->setText(fTestData->fString);
6888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t previousBreak = -2;
6898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) {
6908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk <= previousBreak) {
6918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            MONKEY_ERROR("Break Iterator Stall", bk);
6928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
6938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
6948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk < 0 || bk > fTestData->fString.length()) {
6958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            MONKEY_ERROR("Boundary out of bounds", bk);
6968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
6978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
6988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fTestData->fActualBreaks.setCharAt(bk, 1);
6998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    checkResults("testForwards", FORWARD, status);
7018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
7028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testFollowing(UErrorCode &status) {
7048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
7058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
7068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData->clearActualBreaks();
7088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI->setText(fTestData->fString);
7098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t nextBreak = -1;
7108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) {
7118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t bk = fBI->following(i);
7128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk == BreakIterator::DONE && i == fTestData->fString.length()) {
7138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk == nextBreak && bk > i) {
7168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // i is in the gap between two breaks.
7178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (i == nextBreak && bk > nextBreak) {
7208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fTestData->fActualBreaks.setCharAt(bk, 1);
7218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            nextBreak = bk;
7228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        MONKEY_ERROR("following(i)", i);
7258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
7268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    checkResults("testFollowing", FORWARD, status);
7288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
7298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testPrevious(UErrorCode &status) {
7338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {return;}
7348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData->clearActualBreaks();
7368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI->setText(fTestData->fString);
7378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t previousBreak = INT32_MAX;
7388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) {
7398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert         if (bk >= previousBreak) {
7408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            MONKEY_ERROR("Break Iterator Stall", bk);
7418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
7428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk < 0 || bk > fTestData->fString.length()) {
7448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            MONKEY_ERROR("Boundary out of bounds", bk);
7458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            return;
7468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        fTestData->fActualBreaks.setCharAt(bk, 1);
7488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    checkResults("testPrevius", REVERSE, status);
7508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
7518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testPreceding(UErrorCode &status) {
7548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
7558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
7568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData->clearActualBreaks();
7588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI->setText(fTestData->fString);
7598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t nextBreak = fTestData->fString.length()+1;
7608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) {
7618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        int32_t bk = fBI->preceding(i);
7628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // printf("i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
7638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk == BreakIterator::DONE && i == 0) {
7648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (bk == nextBreak && bk < i) {
7678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // i is in the gap between two breaks.
7688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) {
7718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // i indexes to a trailing surrogate.
7728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // Break Iterators treat an index to either half as referring to the supplemental code point,
7738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // with preceding going to some preceding code point.
7748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) {
7758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                MONKEY_ERROR("preceding of trailing surrogate error", i);
7768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
7778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (i == nextBreak && bk < nextBreak) {
7808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fTestData->fActualBreaks.setCharAt(bk, 1);
7818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            nextBreak = bk;
7828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            continue;
7838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
7848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        MONKEY_ERROR("preceding(i)", i);
7858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
7868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    checkResults("testPreceding", REVERSE, status);
7888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
7898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
7918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) {
7928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
7938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
7948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
7958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fTestData->clearActualBreaks();
7968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    fBI->setText(fTestData->fString);
7978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (int i=fTestData->fString.length(); i>=0; --i) {
7988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (fBI->isBoundary(i)) {
7998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            fTestData->fActualBreaks.setCharAt(i, 1);
8008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
8018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
8028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    checkResults("testForwards", FORWARD, status);
8038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
8048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) {
8068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
8078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
8088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
8098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (direction == FORWARD) {
8108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        for (int i=0; i<=fTestData->fString.length(); ++i) {
8118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
8128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
8138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                        __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
8148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                if (fVerbose) {
8158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    fTestData->dump(i);
8168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                }
8178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                status = U_INVALID_STATE_ERROR;   // Prevent the test from continuing, which would likely
8188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                break;                            // produce many redundant errors.
8198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
8208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
8218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    } else {
8228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        for (int i=fTestData->fString.length(); i>=0; i--) {
8238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) {
8248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                IntlTest::gTest->errln("%s:%d %s failure at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ",
8258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                        __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed);
8268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                if (fVerbose) {
8278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                    fTestData->dump(i);
8288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                }
8298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                status = U_INVALID_STATE_ERROR;
8308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                break;
8318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            }
8328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
8338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
8348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
8358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
8398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
8408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//   class RBBIMonkeyTest implementation.
8418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
8428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//---------------------------------------------------------------------------------------
8438de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyTest::RBBIMonkeyTest() {
8448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
8458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8468de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyTest::~RBBIMonkeyTest() {
8478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
8488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//     params, taken from this->fParams.
8518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//       rules=file_name   Name of file containing the reference rules.
8528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//       seed=nnnnn        Random number starting seed.
8538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//                         Setting the seed allows errors to be reproduced.
8548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//       loop=nnn          Looping count.  Controls running time.
8558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//                         -1:  run forever.
8568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//                          0 or greater:  run length.
8578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//       expansions        debug option, show expansions of rules and sets.
8588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//       verbose           Display details of the failure.
8598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
8608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//     Parameters on the intltest command line follow the test name, and are preceded by '@'.
8618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//     For example,
8628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//           intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1
8638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//
8648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyTest::testMonkey() {
8658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // printf("Test parameters: %s\n", fParams);
8668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UnicodeString params(fParams);
8678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UErrorCode status = U_ZERO_ERROR;
8688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
8708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                           "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt",
8718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert                           NULL };
8728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    CharString testNameFromParams;
8738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (getStringParam("rules", params, testNameFromParams, status)) {
8748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        tests[0] = testNameFromParams.data();
8758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        tests[1] = NULL;
8768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
8778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int64_t loopCount = quick? 100 : 5000;
8798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    getIntParam("loop", params, loopCount, status);
8808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UBool dumpExpansions = FALSE;
8828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    getBoolParam("expansions", params, dumpExpansions, status);
8838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UBool verbose = FALSE;
8858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    getBoolParam("verbose", params, verbose, status);
8868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int64_t seed = 0;
8888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    getIntParam("seed", params, seed, status);
8898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (params.length() != 0) {
8918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Options processing did not consume all of the parameters. Something unrecognized was present.
8928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharString unrecognizedParameters;
8938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        unrecognizedParameters.append(CStr(params)(), -1, status);
8948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data());
8958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
8968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
8978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
8988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    UVector startedTests(status);
8998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (U_FAILURE(status)) {
9008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status));
9018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return;
9028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Monkey testing is multi-threaded.
9058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Each set of break rules to be tested is run in a separate thread.
9068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
9078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    int32_t i;
9088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (i=0; tests[i] != NULL; ++i) {
9098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        logln("beginning testing of %s", tests[i]);
9108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status);
91164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert        if (U_FAILURE(status)) {
91264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert            errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
91364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert            break;
91464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert        }
9158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->fDumpExpansions = dumpExpansions;
9168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->fVerbose = verbose;
9178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->fRandomGenerator.seed((uint32_t)seed);
9188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->fLoopCount = loopCount;
9198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->setup(tests[i], status);
92064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert        if (U_FAILURE(status)) {
92164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert            errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
92264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert            break;
92364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert        }
9248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->startTest();
9258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        startedTests.addElement(test, status);
9268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (U_FAILURE(status)) {
92764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert            errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]);
9288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            break;
9298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
9308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    for (i=0; i<startedTests.size(); ++i) {
9338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i));
9348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        test->join();
9358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        delete test;
9368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
9388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9408de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool  RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status) {
9418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    name.append(" *= *(-?\\d+) *,? *");
9428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    RegexMatcher m(name, params, 0, status);
9438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (m.find()) {
9448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // The param exists.  Convert the string to an int.
9458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        CharString str;
9468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        str.append(CStr(m.group(1, status))(), -1, status);
9478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        val = strtol(str.data(),  NULL, 10);
9488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Delete this parameter from the params string.
9508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        m.reset();
9518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        params = m.replaceFirst(UnicodeString(), status);
9528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return TRUE;
9538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return FALSE;
9558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
9568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9578de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status) {
9588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    name.append(" *= *([^ ,]*) *,? *");
9598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    RegexMatcher m(name, params, 0, status);
9608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (m.find()) {
9618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // The param exists.
9628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        dest.append(CStr(m.group(1, status))(), -1, status);
9638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Delete this parameter from the params string.
9658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        m.reset();
9668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        params = m.replaceFirst(UnicodeString(), status);
9678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return TRUE;
9688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return FALSE;
9708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
9718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9728de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status) {
9738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    name.append("(?: *= *(true|false))? *,? *");
9748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status);
9758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    if (m.find()) {
9768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        if (m.start(1, status) > 0) {
9778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // user option included a value.
9788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0;
9798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        } else {
9808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            // No explicit user value, implies true.
9818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert            dest = TRUE;
9828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        }
9838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        // Delete this parameter from the params string.
9858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        m.reset();
9868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        params = m.replaceFirst(UnicodeString(), status);
9878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert        return TRUE;
9888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    }
9898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert    return FALSE;
9908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert}
9918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert
9928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
993