// Copyright (C) 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
/********************************************************************
 * Copyright (c) 2016, International Business Machines Corporation and
 * others. All Rights Reserved.
 ********************************************************************/


#include "unicode/utypes.h"

#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING

#include "rbbimonkeytest.h"
#include "unicode/utypes.h"
#include "unicode/brkiter.h"
#include "unicode/utf16.h"
#include "unicode/uniset.h"
#include "unicode/unistr.h"

#include "charstr.h"
#include "cmemory.h"
#include "cstr.h"
#include "uelement.h"
#include "uhash.h"

#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>

using namespace icu;


// Test dispatch entry point for RBBIMonkeyTest.
//
// Saves the raw parameter string in fParams so that testMonkey() can read it, then hands
// index / exec / name to the TESTCASE_AUTO macros, which register testMonkey as the only test case.
//   index   which test case is being queried or run.
//   exec    presumably: run the test when true, only report its name when false
//           (the macro definitions are not visible here; confirm against the test framework).
//   name    out-parameter filled in by the TESTCASE_AUTO macros.
//   params  extra command line parameters for the test; stored, not parsed, here.
void RBBIMonkeyTest::runIndexedTest(int32_t index, UBool exec, const char* &name, char* params) {
    fParams = params;     // Work around TESTCASE_AUTO not being able to pass params to test function.

    TESTCASE_AUTO_BEGIN;
    TESTCASE_AUTO(testMonkey);
    TESTCASE_AUTO_END;
}

//---------------------------------------------------------------------------------------
//
//   class BreakRule implementation.
458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 488de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRule::BreakRule() // : all field default initialized. 498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert{ 508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 528de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRule::~BreakRule() {} 538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// class BreakRules implementation. 588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 608de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRules::BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status) : 618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fMonkeyImpl(monkeyImpl), fBreakRules(status), fType(UBRK_COUNT) { 628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCharClasses.adoptInstead(uhash_open(uhash_hashUnicodeString, 638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert uhash_compareUnicodeString, 648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert NULL, // value comparator. 
658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert &status)); 668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert uhash_setKeyDeleter(fCharClasses.getAlias(), uprv_deleteUObject); 708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert uhash_setValueDeleter(fCharClasses.getAlias(), uprv_deleteUObject); 718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBreakRules.setDeleter(uprv_deleteUObject); 728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCharClassList.adoptInstead(new UVector(status)); 748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher.adoptInstead(new RegexMatcher(UnicodeString( 768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?!(?:\\{|=|\\[:)[ \\t]{0,4})" // Negative lookbehind for '{' or '=' or '[:' 778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // (the identifier is a unicode property name or value) 788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)"), // The char class name 798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 0, status)); 808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCommentsMatcher.adoptInstead(new RegexMatcher(UnicodeString( 838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(^|(?<=;))" // Start either at start of line, or just after a ';' (look-behind for ';') 848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*+" // Match white space. 
858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(#.*)?+" // Optional # plus whatever follows 868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "\\R$" // new-line at end of line. 878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ), 0, status)); 888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Match (initial parse) of a character class defintion line. 908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fClassDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( 918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*" // leading white space 928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?<ClassName>[A-Za-z_][A-Za-z0-9_]*)" // The char class name 938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*=[ \\t]*" // = 948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?<ClassDef>.*?)" // The char class UnicodeSet expression 958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*;$"), // ; <end of line> 968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 0, status)); 978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Match (initial parse) of a break rule line. 
998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleDefMatcher.adoptInstead(new RegexMatcher(UnicodeString( 1008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*" // leading white space 1018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?<RuleName>[A-Za-z_][A-Za-z0-9_.]*)" // The rule name 1028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*:[ \\t]*" // : 1038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "(?<RuleDef>.*?)" // The rule definition 1048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "[ \\t]*;$"), // ; <end of line> 1058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 0, status)); 1068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 1088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1108de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertBreakRules::~BreakRules() {} 1118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1138de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertCharClass *BreakRules::addCharClass(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { 1148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Create the expanded definition for this char class, 1168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // replacing any set references with the corresponding definition. 
1178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString expandedDef; 1198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString emptyString; 1208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->reset(definition); 1218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while (fSetRefsMatcher->find() && U_SUCCESS(status)) { 1228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeString name = 1238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); 1248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); 1258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; 1268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->appendReplacement(expandedDef, emptyString, status); 1288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert expandedDef.append(expansionForName); 1298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->appendTail(expandedDef); 1318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Verify that the expanded set defintion is valid. 
1338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fMonkeyImpl->fDumpExpansions) { 1358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf("epandedDef: %s\n", CStr(expandedDef)()); 1368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeSet *s = new UnicodeSet(expandedDef, USET_IGNORE_SPACE, NULL, status); 1398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 1408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d: error %s creating UnicodeSet %s", __FILE__, __LINE__, 1418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert u_errorName(status), CStr(name)()); 1428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return NULL; 1438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *cclass = new CharClass(name, definition, expandedDef, s); 1458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *previousClass = static_cast<CharClass *>(uhash_put(fCharClasses.getAlias(), 1468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert new UnicodeString(name), // Key, owned by hash table. 1478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert cclass, // Value, owned by hash table. 1488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert &status)); 1498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (previousClass != NULL) { 1518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Duplicate class def. 1528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // These are legitimate, they are adustments of an existing class. 
1538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // TODO: will need to keep the old around when we handle tailorings. 1548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->logln("Redefinition of character class %s\n", CStr(cclass->fName)()); 1558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert delete previousClass; 1568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return cclass; 1588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 1598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid BreakRules::addRule(const UnicodeString &name, const UnicodeString &definition, UErrorCode &status) { 1628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert LocalPointer<BreakRule> thisRule(new BreakRule); 1638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fName = name; 1648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fRule = definition; 1658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // If the rule name contains embedded digits, pad the first numeric field to a fixed length with leading zeroes, 1678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // This gives a numeric sort order that matches Unicode UAX rule numbering conventions. 1688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString emptyString; 1698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Expand the char class definitions within the rule. 
1718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->reset(definition); 1728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while (fSetRefsMatcher->find() && U_SUCCESS(status)) { 1738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeString name = 1748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->group(fSetRefsMatcher->pattern().groupNumberFromName("ClassName", status), status); 1758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *nameClass = static_cast<CharClass *>(uhash_get(fCharClasses.getAlias(), &name)); 1768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (!nameClass) { 1778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d char class \"%s\" unrecognized in rule \"%s\"", 1788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, CStr(name)(), CStr(definition)()); 1798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeString &expansionForName = nameClass ? nameClass->fExpandedDef : name; 1818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->appendReplacement(thisRule->fExpandedRule, emptyString, status); 1838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fExpandedRule.append(expansionForName); 1848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fSetRefsMatcher->appendTail(thisRule->fExpandedRule); 1868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Replace the divide sign (\u00f7) with a regular expression named capture. 1888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // When running the rules, a match that includes this group means we found a break position. 
1898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t dividePos = thisRule->fExpandedRule.indexOf((UChar)0x00f7); 1918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (dividePos >= 0) { 1928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fExpandedRule.replace(dividePos, 1, UnicodeString("(?<BreakPosition>)")); 1938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (thisRule->fExpandedRule.indexOf((UChar)0x00f7) != -1) { 1958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_ILLEGAL_ARGUMENT_ERROR; // TODO: produce a good error message. 1968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 1978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 1988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // UAX break rule set definitions can be empty, just []. 1998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Regular expression set expressions don't accept this. Substitute with [^\u0000-\U0010ffff], which 2008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // also matches nothing. 
2018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert static const UChar emptySet[] = {(UChar)0x5b, (UChar)0x5d, 0}; 2038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t where = 0; 2048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while ((where = thisRule->fExpandedRule.indexOf(emptySet, 2, 0)) >= 0) { 2058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fExpandedRule.replace(where, 2, UnicodeString("[^\\u0000-\\U0010ffff]")); 2068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fMonkeyImpl->fDumpExpansions) { 2088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf("fExpandedRule: %s\n", CStr(thisRule->fExpandedRule)()); 2098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Compile a regular expression for this rule. 2128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert thisRule->fRuleMatcher.adoptInstead(new RegexMatcher(thisRule->fExpandedRule, UREGEX_COMMENTS | UREGEX_DOTALL, status)); 2138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 2148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Error creating regular expression for %s", 2158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, CStr(thisRule->fExpandedRule)()); 2168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 2178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Put this new rule into the vector of all Rules. 
2208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBreakRules.addElement(thisRule.orphan(), status); 2218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 2228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertbool BreakRules::setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status) { 2258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (keyword == UnicodeString("locale")) { 2268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString localeName; 2278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert localeName.append(CStr(value)(), -1, status); 2288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fLocale = Locale::createFromName(localeName.data()); 2298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return true; 2308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (keyword == UnicodeString("type")) { 2328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (value == UnicodeString("grapheme")) { 2338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fType = UBRK_CHARACTER; 2348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else if (value == UnicodeString("word")) { 2358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fType = UBRK_WORD; 2368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else if (value == UnicodeString("line")) { 2378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fType = UBRK_LINE; 2388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else if (value == UnicodeString("sentence")) { 2398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fType = UBRK_SENTENCE; 2408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 2418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d 
Unrecognized break type %s", __FILE__, __LINE__, CStr(value)()); 2428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return true; 2448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // TODO: add tailoring base setting here. 2468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return false; 2478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 2488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2498de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRuleBasedBreakIterator *BreakRules::createICUBreakIterator(UErrorCode &status) { 2508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 2518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return NULL; 2528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RuleBasedBreakIterator *bi = NULL; 2548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert switch(fType) { 2558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert case UBRK_CHARACTER: 2568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createCharacterInstance(fLocale, status)); 2578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 2588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert case UBRK_WORD: 2598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createWordInstance(fLocale, status)); 2608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 2618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert case UBRK_LINE: 2628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createLineInstance(fLocale, status)); 2638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 
2648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert case UBRK_SENTENCE: 2658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert bi = dynamic_cast<RuleBasedBreakIterator *>(BreakIterator::createSentenceInstance(fLocale, status)); 2668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 2678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert default: 2688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Bad break iterator type of %d", __FILE__, __LINE__, fType); 2698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_ILLEGAL_ARGUMENT_ERROR; 2708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return bi; 2728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 2738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid BreakRules::compileRules(UCHARBUF *rules, UErrorCode &status) { 2768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 2778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 2788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString emptyString; 2818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t lineNumber=0; ;lineNumber++) { // Loop once per input line. 
2828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 2838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 2848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t lineLength = 0; 2868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UChar *lineBuf = ucbuf_readline(rules, &lineLength, &status); 2878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (lineBuf == NULL) { 2888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 2898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString line(lineBuf, lineLength); 2918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Strip comment lines. 2938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCommentsMatcher->reset(line); 2948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert line = fCommentsMatcher->replaceFirst(emptyString, status); 2958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (line.isEmpty()) { 2968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 2978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 2988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 2998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Recognize character class definition and keyword lines 3008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fClassDefMatcher->reset(line); 3018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fClassDefMatcher->matches(status)) { 3028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString className = fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassName", status), status); 3038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString classDef = 
fClassDefMatcher->group(fClassDefMatcher->pattern().groupNumberFromName("ClassDef", status), status); 3048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fMonkeyImpl->fDumpExpansions) { 3058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf("scanned class: %s = %s\n", CStr(className)(), CStr(classDef)()); 3068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (setKeywordParameter(className, classDef, status)) { 3088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // The scanned item was "type = ..." or "locale = ...", etc. 3098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // which are not actual character classes. 3108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 3118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert addCharClass(className, classDef, status); 3138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 3148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Recognize rule lines. 
3178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleDefMatcher->reset(line); 3188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fRuleDefMatcher->matches(status)) { 3198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString ruleName = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleName", status), status); 3208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString ruleDef = fRuleDefMatcher->group(fRuleDefMatcher->pattern().groupNumberFromName("RuleDef", status), status); 3218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fMonkeyImpl->fDumpExpansions) { 3228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf("scanned rule: %s : %s\n", CStr(ruleName)(), CStr(ruleDef)()); 3238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert addRule(ruleName, ruleDef, status); 3258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 3268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d: Unrecognized line in rule file %s: \"%s\"\n", 3298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, fMonkeyImpl->fRuleFileName, CStr(line)()); 3308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Build the vector of char classes, omitting the dictionary class if there is one. 3338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // This will be used when constructing the random text to be tested. 
3348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Also compute the "other" set, consisting of any characters not included in 3368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // one or more of the user defined sets. 3378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeSet otherSet((UChar32)0, 0x10ffff); 3398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t pos = UHASH_FIRST; 3408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UHashElement *el = NULL; 3418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while ((el = uhash_nextElement(fCharClasses.getAlias(), &pos)) != NULL) { 3428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeString *ccName = static_cast<const UnicodeString *>(el->key.pointer); 3438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *cclass = static_cast<CharClass *>(el->value.pointer); 3448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // printf(" Adding %s\n", CStr(*ccName)()); 3458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (*ccName != cclass->fName) { 3468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d: internal error, set names (%s, %s) inconsistent.\n", 3478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, CStr(*ccName)(), CStr(cclass->fName)()); 3488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const UnicodeSet *set = cclass->fSet.getAlias(); 3508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert otherSet.removeAll(*set); 3518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (*ccName == UnicodeString("dictionary")) { 3528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fDictionarySet = *set; 3538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 
3548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCharClassList->addElement(cclass, status); 3558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (!otherSet.isEmpty()) { 3598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // fprintf(stderr, "have an other set.\n"); 3608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString pattern; 3618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharClass *cclass = addCharClass(UnicodeString("__Others"), otherSet.toPattern(pattern), status); 3628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fCharClassList->addElement(cclass, status); 3638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 3658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertconst CharClass *BreakRules::getClassForChar(UChar32 c, int32_t *iter) const { 3688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t localIter = 0; 3698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t &it = iter? 
*iter : localIter; 3708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while (it < fCharClassList->size()) { 3728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const CharClass *cc = static_cast<const CharClass *>(fCharClassList->elementAt(it)); 3738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ++it; 3748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (cc->fSet->contains(c)) { 3758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return cc; 3768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 3788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return NULL; 3798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 3808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 3828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 3838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// class MonkeyTestData implementation. 3848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 3858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 3868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status) { 3888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const int32_t dataLength = 1000; 3898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Fill the test string with random characters. 3918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // First randomly pick a char class, then randomly pick a character from that class. 
3928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Exclude any characters from the dictionary set. 3938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 3948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // std::cout << "Populating Test Data" << std::endl; 3958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 3968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // allowing recreation of failing data. 3978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBkRules = rules; 3988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fString.remove(); 3998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t n=0; n<dataLength;) { 4008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int charClassIndex = rand() % rules->fCharClassList->size(); 4018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const CharClass *cclass = static_cast<CharClass *>(rules->fCharClassList->elementAt(charClassIndex)); 4028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (cclass->fSet->size() == 0) { 4038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Some rules or tailorings do end up with empty char classes. 4048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 4058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t charIndex = rand() % cclass->fSet->size(); 4078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UChar32 c = cclass->fSet->charAt(charIndex); 4088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U16_IS_TRAIL(c) && fString.length() > 0 && U16_IS_LEAD(fString.charAt(fString.length()-1))) { 4098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 
4108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Don't let random unpaired surrogates combine in the test data because they might 4118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // produce an unwanted dictionary character. 4128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 4138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (!rules->fDictionarySet.contains(c)) { 4168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fString.append(c); 4178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ++n; 4188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Reset each rule matcher regex with this new string. 4228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // (Although we are always using the same string object, ICU regular expressions 4238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // don't like the underlying string data changing without doing a reset). 4248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { 4268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); 4278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert rule->fRuleMatcher->reset(fString); 4288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Init the expectedBreaks, actualBreaks and ruleForPosition strings (used as arrays). 
4318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Expected and Actual breaks are one longer than the input string; a non-zero value 4328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // will indicate a boundary preceding that position. 4338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert clearActualBreaks(); 4358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fExpectedBreaks = fActualBreaks; 4368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleForPosition = fActualBreaks; 4378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert f2ndRuleForPos = fActualBreaks; 4388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Apply reference rules to find the expected breaks. 4408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fExpectedBreaks.setCharAt(0, (UChar)1); // Force an expected break before the start of the text. 4428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // ICU always reports a break there. 4438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // The reference rules do not have a means to do so. 
4448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t strIdx = 0; 4458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert while (strIdx < fString.length()) { 4468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert BreakRule *matchingRule = NULL; 4478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UBool hasBreak = FALSE; 4488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t ruleNum = 0; 4498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t matchStart = 0; 4508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t matchEnd = 0; 4518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t breakGroup = 0; 4528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (ruleNum=0; ruleNum<rules->fBreakRules.size(); ruleNum++) { 4538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert BreakRule *rule = static_cast<BreakRule *>(rules->fBreakRules.elementAt(ruleNum)); 4548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert rule->fRuleMatcher->reset(); 4558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (rule->fRuleMatcher->lookingAt(strIdx, status)) { 4568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // A candidate rule match, check further to see if we take it or continue to check other rules. 4578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Matches of zero or one codepoint count only if they also specify a break. 
4588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert matchStart = rule->fRuleMatcher->start(status); 4598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert matchEnd = rule->fRuleMatcher->end(status); 4608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert breakGroup = rule->fRuleMatcher->pattern().groupNumberFromName("BreakPosition", status); 4618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert hasBreak = U_SUCCESS(status); 4628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (status == U_REGEX_INVALID_CAPTURE_GROUP_NAME) { 4638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_ZERO_ERROR; 4648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (hasBreak || fString.moveIndex32(matchStart, 1) < matchEnd) { 4668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert matchingRule = rule; 4678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 4688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (matchingRule == NULL) { 4728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // No reference rule matched. This is an error in the rules that should never happen. 4738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Trouble with monkey test reference rules at position %d. 
", 4748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, strIdx); 4758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert dump(strIdx); 4768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_FORMAT_ERROR; 4778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 4788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (matchingRule->fRuleMatcher->group(status).length() == 0) { 4808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Zero length rule match. This is also an error in the rule expressions. 4818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Zero length rule match.", 4828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__); 4838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_FORMAT_ERROR; 4848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 4858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Record which rule matched over the length of the match. 
4888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int i = matchStart; i < matchEnd; i++) { 4898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fRuleForPosition.charAt(i) == 0) { 4908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleForPosition.setCharAt(i, (UChar)ruleNum); 4918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 4928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert f2ndRuleForPos.setCharAt(i, (UChar)ruleNum); 4938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 4958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 4968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Break positions appear in rules as a matching named capture of zero length at the break position, 4978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // the adjusted pattern contains (?<BreakPosition>) 4988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (hasBreak) { 4998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t breakPos = matchingRule->fRuleMatcher->start(breakGroup, status); 5008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status) || breakPos < 0) { 5018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Rule specified a break, but that break wasn't part of the match, even 5028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // though the rule as a whole matched. 5038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Can't happen with regular expressions derived from (equivalent to) ICU break rules. 5048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Shouldn't get here. 
5058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Internal Rule Error.", __FILE__, __LINE__); 5068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_FORMAT_ERROR; 5078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 5088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fExpectedBreaks.setCharAt(breakPos, (UChar)1); 5108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // printf("recording break at %d\n", breakPos); 5118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // For the next iteration, pick up applying rules immediately after the break, 5128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // which may differ from end of the match. The matching rule may have included 5138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // context following the boundary that needs to be looked at again. 5148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert strIdx = matchingRule->fRuleMatcher->end(breakGroup, status); 5158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 5168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Original rule didn't specify a break. 5178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Continue applying rules starting on the last code point of this match. 5188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert strIdx = fString.moveIndex32(matchEnd, -1); 5198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (strIdx == matchStart) { 5208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Match was only one code point, no progress if we continue. 5218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Shouldn't get here, case is filtered out at top of loop. 
5228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString ruleName; 5238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ruleName.appendInvariantChars(matchingRule->fName, status); 5248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Rule %s internal error", 5258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, ruleName.data()); 5268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_FORMAT_ERROR; 5278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 5288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 5318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d status = %s. Unexpected failure, perhaps problem internal to test.", 5328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, u_errorName(status)); 5338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 5348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 5378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::clearActualBreaks() { 5398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fActualBreaks.remove(); 5408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Actual Breaks length is one longer than the data string length, allowing 5418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // for breaks before the first and after the last character in the data. 
5428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t i=0; i<=fString.length(); i++) { 5438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fActualBreaks.append((UChar)0); 5448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 5468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid MonkeyTestData::dump(int32_t around) const { 5488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf("\n" 5498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert " char break Rule Character\n" 5508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert " pos code class R I name name\n" 5518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "---------------------------------------------------------------------------------------------\n"); 5528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t start; 5548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t end; 5558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (around == -1) { 5578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert start = 0; 5588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert end = fString.length(); 5598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 5608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Display context around a failure. 
5618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert start = fString.moveIndex32(around, -30); 5628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert end = fString.moveIndex32(around, +30); 5638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int charIdx = start; charIdx < end; charIdx=fString.moveIndex32(charIdx, 1)) { 5668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UErrorCode status = U_ZERO_ERROR; 5678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UChar32 c = fString.char32At(charIdx); 5688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const CharClass *cc = fBkRules->getClassForChar(c); 5698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString ccName; 5708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ccName.appendInvariantChars(cc->fName, status); 5718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString ruleName, secondRuleName; 5728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const BreakRule *rule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(fRuleForPosition.charAt(charIdx))); 5738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ruleName.appendInvariantChars(rule->fName, status); 5748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (f2ndRuleForPos.charAt(charIdx) > 0) { 5758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const BreakRule *secondRule = static_cast<BreakRule *>(fBkRules->fBreakRules.elementAt(f2ndRuleForPos.charAt(charIdx))); 5768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert secondRuleName.appendInvariantChars(secondRule->fName, status); 5778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert char cName[200]; 5798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert u_charName(c, U_EXTENDED_CHAR_NAME, cName, sizeof(cName), 
&status); 5808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert printf(" %4.1d %6.4x %-20s %c %c %-10s %-10s %s\n", 5828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert charIdx, c, ccName.data(), 5838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fExpectedBreaks.charAt(charIdx) ? '*' : '.', 5848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fActualBreaks.charAt(charIdx) ? '*' : '.', 5858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ruleName.data(), secondRuleName.data(), cName 5868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert ); 5878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 5888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 5898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 5928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 5938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// class RBBIMonkeyImpl 5948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 5958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 5968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 5978de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyImpl::RBBIMonkeyImpl(UErrorCode &status) : fDumpExpansions(FALSE), fThread(this) { 5988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert (void)status; // suppress unused parameter compiler warning. 
5998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// RBBIMonkeyImpl setup does all of the setup for a single rule set - compiling the 6038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// reference rules and creating the icu breakiterator to test, 6048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// with its type and locale coming from the reference rules. 6058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::setup(const char *ruleFile, UErrorCode &status) { 6078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleFileName = ruleFile; 6088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert openBreakRules(ruleFile, status); 6098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 6108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Error %s opening file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); 6118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleSet.adoptInstead(new BreakRules(this, status)); 6148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleSet->compileRules(fRuleCharBuffer.getAlias(), status); 6158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 6168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d Error %s processing file %s.", __FILE__, __LINE__, u_errorName(status), ruleFile); 6178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 
fBI.adoptInstead(fRuleSet->createICUBreakIterator(status)); 6208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData.adoptInstead(new MonkeyTestData()); 6218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6248de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyImpl::~RBBIMonkeyImpl() { 6258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::openBreakRules(const char *fileName, UErrorCode &status) { 6298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString path; 6308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert path.append(IntlTest::getSourceTestData(status), status); 6318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert path.append("break_rules" U_FILE_SEP_STRING, status); 6328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert path.appendPathPart(fileName, status); 6338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const char *codePage = "UTF-8"; 6348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fRuleCharBuffer.adoptInstead(ucbuf_open(path.data(), &codePage, TRUE, FALSE, &status)); 6358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::startTest() { 6398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fThread.start(); // invokes runTest() in a separate thread. 
6408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::join() { 6438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fThread.join(); 6448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#define MONKEY_ERROR(msg, index) { \ 6488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d %s at index %d. Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", \ 6498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, msg, index, fRuleFileName, fTestData->fRandomSeed); \ 6508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fVerbose) { fTestData->dump(index); } \ 6518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_STATE_ERROR; \ 6528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::runTest() { 6558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UErrorCode status = U_ZERO_ERROR; 6568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t errorCount = 0; 6578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int64_t loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 6588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_ZERO_ERROR; 6598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->set(fRuleSet.getAlias(), fRandomGenerator, status); 6608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fBI.isNull()) { 6618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->dataerrln("Unable to run test because fBI is null."); 
6628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // fTestData->dump(); 6658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert testForwards(status); 6668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert testPrevious(status); 6678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert testFollowing(status); 6688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert testPreceding(status); 6698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert testIsBoundary(status); 6708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fLoopCount < 0 && loopCount % 100 == 0) { 6728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fprintf(stderr, "."); 6738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 6758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (++errorCount > 10) { 6768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 6818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 6828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testForwards(UErrorCode &status) { 6838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 6848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->clearActualBreaks(); 6878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBI->setText(fTestData->fString); 
6888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t previousBreak = -2; 6898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t bk=fBI->first(); bk != BreakIterator::DONE; bk=fBI->next()) { 6908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk <= previousBreak) { 6918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("Break Iterator Stall", bk); 6928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk < 0 || bk > fTestData->fString.length()) { 6958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("Boundary out of bounds", bk); 6968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 6978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 6988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->fActualBreaks.setCharAt(bk, 1); 6998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert checkResults("testForwards", FORWARD, status); 7018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 7028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testFollowing(UErrorCode &status) { 7048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 7058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->clearActualBreaks(); 7088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBI->setText(fTestData->fString); 7098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t nextBreak = -1; 7108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t i=-1 ; i<fTestData->fString.length(); ++i) { 
7118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t bk = fBI->following(i); 7128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk == BreakIterator::DONE && i == fTestData->fString.length()) { 7138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk == nextBreak && bk > i) { 7168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // i is in the gap between two breaks. 7178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (i == nextBreak && bk > nextBreak) { 7208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->fActualBreaks.setCharAt(bk, 1); 7218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert nextBreak = bk; 7228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("following(i)", i); 7258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert checkResults("testFollowing", FORWARD, status); 7288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 7298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testPrevious(UErrorCode &status) { 7338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) {return;} 7348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->clearActualBreaks(); 
7368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBI->setText(fTestData->fString); 7378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t previousBreak = INT32_MAX; 7388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t bk=fBI->last(); bk != BreakIterator::DONE; bk=fBI->previous()) { 7398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk >= previousBreak) { 7408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("Break Iterator Stall", bk); 7418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk < 0 || bk > fTestData->fString.length()) { 7448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("Boundary out of bounds", bk); 7458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->fActualBreaks.setCharAt(bk, 1); 7488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert checkResults("testPrevius", REVERSE, status); 7508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 7518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testPreceding(UErrorCode &status) { 7548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 7558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->clearActualBreaks(); 7588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBI->setText(fTestData->fString); 7598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 
int32_t nextBreak = fTestData->fString.length()+1; 7608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int32_t i=fTestData->fString.length()+1 ; i>=0; --i) { 7618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t bk = fBI->preceding(i); 7628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // printf("i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 7638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk == BreakIterator::DONE && i == 0) { 7648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (bk == nextBreak && bk < i) { 7678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // i is in the gap between two breaks. 7688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (i<fTestData->fString.length() && fTestData->fString.getChar32Start(i) < i) { 7718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // i indexes to a trailing surrogate. 7728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Break Iterators treat an index to either half as referring to the supplemental code point, 7738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // with preceding going to some preceding code point. 
7748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fBI->preceding(i) != fBI->preceding(fTestData->fString.getChar32Start(i))) { 7758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("preceding of trailing surrogate error", i); 7768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (i == nextBreak && bk < nextBreak) { 7808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->fActualBreaks.setCharAt(bk, 1); 7818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert nextBreak = bk; 7828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert continue; 7838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert MONKEY_ERROR("preceding(i)", i); 7858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert checkResults("testPreceding", REVERSE, status); 7888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 7898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 7918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::testIsBoundary(UErrorCode &status) { 7928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 7938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 7948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 7958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->clearActualBreaks(); 7968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fBI->setText(fTestData->fString); 7978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int i=fTestData->fString.length(); i>=0; --i) { 
7988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fBI->isBoundary(i)) { 7998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->fActualBreaks.setCharAt(i, 1); 8008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert checkResults("testForwards", FORWARD, status); 8038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 8048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyImpl::checkResults(const char *msg, CheckDirection direction, UErrorCode &status) { 8068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 8078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 8088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (direction == FORWARD) { 8108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int i=0; i<=fTestData->fString.length(); ++i) { 8118de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { 8128de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d %s failure at index %d. 
Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", 8138de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); 8148de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fVerbose) { 8158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->dump(i); 8168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_STATE_ERROR; // Prevent the test from continuing, which would likely 8188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; // produce many redundant errors. 8198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8208de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8218de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 8228de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (int i=fTestData->fString.length(); i>=0; i--) { 8238de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fTestData->fExpectedBreaks.charAt(i) != fTestData->fActualBreaks.charAt(i)) { 8248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert IntlTest::gTest->errln("%s:%d %s failure at index %d. 
Parameters to reproduce: @rules=%s,seed=%u,loop=1,verbose ", 8258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert __FILE__, __LINE__, msg, i, fRuleFileName, fTestData->fRandomSeed); 8268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (fVerbose) { 8278de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert fTestData->dump(i); 8288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert status = U_INVALID_STATE_ERROR; 8308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 8318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 8358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 8398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 8408de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// class RBBIMonkeyTest implementation. 
8418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 8428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert//--------------------------------------------------------------------------------------- 8438de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyTest::RBBIMonkeyTest() { 8448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 8458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8468de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertRBBIMonkeyTest::~RBBIMonkeyTest() { 8478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 8488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// params, taken from this->fParams. 8518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// rules=file_name Name of file containing the reference rules. 8528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// seed=nnnnn Random number starting seed. 8538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// Setting the seed allows errors to be reproduced. 8548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// loop=nnn Looping count. Controls running time. 8558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// -1: run forever. 8568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 0 or greater: run length. 8578de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// expansions debug option, show expansions of rules and sets. 8588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// verbose Display details of the failure. 8598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 8608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// Parameters on the intltest command line follow the test name, and are preceded by '@'. 
8618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// For example, 8628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// intltest rbbi/RBBIMonkeyTest/testMonkey@rules=line.txt,loop=-1 8638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert// 8648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubertvoid RBBIMonkeyTest::testMonkey() { 8658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // printf("Test parameters: %s\n", fParams); 8668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UnicodeString params(fParams); 8678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UErrorCode status = U_ZERO_ERROR; 8688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert const char *tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt", 8708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt", 8718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert NULL }; 8728de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString testNameFromParams; 8738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (getStringParam("rules", params, testNameFromParams, status)) { 8748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert tests[0] = testNameFromParams.data(); 8758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert tests[1] = NULL; 8768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int64_t loopCount = quick? 
100 : 5000; 8798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert getIntParam("loop", params, loopCount, status); 8808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UBool dumpExpansions = FALSE; 8828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert getBoolParam("expansions", params, dumpExpansions, status); 8838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UBool verbose = FALSE; 8858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert getBoolParam("verbose", params, verbose, status); 8868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int64_t seed = 0; 8888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert getIntParam("seed", params, seed, status); 8898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (params.length() != 0) { 8918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Options processing did not consume all of the parameters. Something unrecognized was present. 
8928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString unrecognizedParameters; 8938de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert unrecognizedParameters.append(CStr(params)(), -1, status); 8948de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert errln("%s:%d unrecognized test parameter(s) \"%s\"", __FILE__, __LINE__, unrecognizedParameters.data()); 8958de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 8968de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 8978de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 8988de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert UVector startedTests(status); 8998de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 9008de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert errln("%s:%d: error %s while setting up test.", __FILE__, __LINE__, u_errorName(status)); 9018de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return; 9028de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9038de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9048de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Monkey testing is multi-threaded. 9058de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Each set of break rules to be tested is run in a separate thread. 9068de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 
9078de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert int32_t i; 9088de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (i=0; tests[i] != NULL; ++i) { 9098de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert logln("beginning testing of %s", tests[i]); 9108de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RBBIMonkeyImpl *test = new RBBIMonkeyImpl(status); 91164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (U_FAILURE(status)) { 91264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); 91364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert break; 91464339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 9158de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->fDumpExpansions = dumpExpansions; 9168de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->fVerbose = verbose; 9178de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->fRandomGenerator.seed((uint32_t)seed); 9188de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->fLoopCount = loopCount; 9198de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->setup(tests[i], status); 92064339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert if (U_FAILURE(status)) { 92164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), tests[i]); 92264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert break; 92364339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert } 9248de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->startTest(); 9258de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert startedTests.addElement(test, status); 9268de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (U_FAILURE(status)) { 92764339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert errln("%s:%d: error %s while starting test %s.", __FILE__, __LINE__, u_errorName(status), 
tests[i]); 9288de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert break; 9298de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9308de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9318de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9328de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert for (i=0; i<startedTests.size(); ++i) { 9338de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RBBIMonkeyImpl *test = static_cast<RBBIMonkeyImpl *>(startedTests.elementAt(i)); 9348de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert test->join(); 9358de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert delete test; 9368de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9378de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 9388de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9398de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9408de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool RBBIMonkeyTest::getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status) { 9418de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert name.append(" *= *(-?\\d+) *,? *"); 9428de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RegexMatcher m(name, params, 0, status); 9438de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (m.find()) { 9448de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // The param exists. Convert the string to an int. 9458de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert CharString str; 9468de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert str.append(CStr(m.group(1, status))(), -1, status); 9478de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert val = strtol(str.data(), NULL, 10); 9488de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9498de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Delete this parameter from the params string. 
9508de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert m.reset(); 9518de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert params = m.replaceFirst(UnicodeString(), status); 9528de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return TRUE; 9538de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9548de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return FALSE; 9558de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 9568de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9578de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool RBBIMonkeyTest::getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status) { 9588de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert name.append(" *= *([^ ,]*) *,? *"); 9598de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RegexMatcher m(name, params, 0, status); 9608de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (m.find()) { 9618de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // The param exists. 9628de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert dest.append(CStr(m.group(1, status))(), -1, status); 9638de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9648de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Delete this parameter from the params string. 
9658de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert m.reset(); 9668de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert params = m.replaceFirst(UnicodeString(), status); 9678de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return TRUE; 9688de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9698de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return FALSE; 9708de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 9718de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9728de051c3d18a56cc126f0f44e368495a52f9148cFredrik RoubertUBool RBBIMonkeyTest::getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status) { 9738de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert name.append("(?: *= *(true|false))? *,? *"); 9748de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert RegexMatcher m(name, params, UREGEX_CASE_INSENSITIVE, status); 9758de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (m.find()) { 9768de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert if (m.start(1, status) > 0) { 9778de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // user option included a value. 9788de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert dest = m.group(1, status).caseCompare(UnicodeString("true"), U_FOLD_CASE_DEFAULT) == 0; 9798de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } else { 9808de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // No explicit user value, implies true. 9818de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert dest = TRUE; 9828de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9838de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9848de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert // Delete this parameter from the params string. 
9858de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert m.reset(); 9868de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert params = m.replaceFirst(UnicodeString(), status); 9878de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return TRUE; 9888de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert } 9898de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert return FALSE; 9908de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert} 9918de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert 9928de051c3d18a56cc126f0f44e368495a52f9148cFredrik Roubert#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ 993