1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/************************************************************************* 4 * Copyright (c) 2016, International Business Machines 5 * Corporation and others. All Rights Reserved. 6 ************************************************************************* 7*/ 8#ifndef RBBIMONKEYTEST_H 9#define RBBIMONKEYTEST_H 10 11#include "unicode/utypes.h" 12 13#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING 14 15#include "intltest.h" 16 17#include "unicode/rbbi.h" 18#include "unicode/regex.h" 19#include "unicode/uniset.h" 20#include "unicode/unistr.h" 21#include "unicode/uobject.h" 22 23#include "simplethread.h" 24#include "ucbuf.h" 25#include "uhash.h" 26#include "uvector.h" 27 28// 29// TODO: 30// Develop a tailoring format. 31// Hook to old tests that use monkey impl to get expected data. 32// Remove old tests. 33 34class BreakRules; // Forward declaration 35class RBBIMonkeyImpl; 36 37/** 38 * Test the RuleBasedBreakIterator class giving different rules 39 */ 40class RBBIMonkeyTest: public IntlTest { 41 public: 42 RBBIMonkeyTest(); 43 virtual ~RBBIMonkeyTest(); 44 45 void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL ); 46 void testMonkey(); 47 48 49 private: 50 const char *fParams; // Copy of user parameters passed in from IntlTest. 51 52 53 void testRules(const char *ruleFile); 54 static UBool getIntParam(UnicodeString name, UnicodeString ¶ms, int64_t &val, UErrorCode &status); 55 static UBool getStringParam(UnicodeString name, UnicodeString ¶ms, CharString &dest, UErrorCode &status); 56 static UBool getBoolParam(UnicodeString name, UnicodeString ¶ms, UBool &dest, UErrorCode &status); 57 58}; 59 60// The following classes are internal to the RBBI Monkey Test implementation. 61 62 63 64// class CharClass Represents a single character class from the source break rules. 65// Inherits from UObject because instances are adopted by UHashtable, which ultimately 66// deletes them using hash's object deleter function. 67 68class CharClass: public UObject { 69 public: 70 UnicodeString fName; 71 UnicodeString fOriginalDef; // set definition as it appeared in user supplied rules. 72 UnicodeString fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 73 LocalPointer<const UnicodeSet> fSet; 74 CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) : 75 fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {} 76}; 77 78 79// class BreakRule represents a single rule from a set of break rules. 80// Each rule has the set definitions expanded, and 81// is compiled to a regular expression. 82 83class BreakRule: public UObject { 84 public: 85 BreakRule(); 86 ~BreakRule(); 87 UnicodeString fName; // Name of the rule. 88 UnicodeString fRule; // Rule expression, excluding the name, as written in user source. 89 UnicodeString fExpandedRule; // Rule expression after expanding the set definitions. 90 LocalPointer<RegexMatcher> fRuleMatcher; // Regular expression that matches the rule. 91}; 92 93 94// class BreakRules represents a complete set of break rules, possibly tailored, 95// compiled from testdata break rules. 96 97class BreakRules: public UObject { 98 public: 99 BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status); 100 ~BreakRules(); 101 102 void compileRules(UCHARBUF *rules, UErrorCode &status); 103 104 const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const; 105 106 107 RBBIMonkeyImpl *fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 108 icu::UVector fBreakRules; // Contents are of type (BreakRule *). 109 110 LocalUHashtablePointer fCharClasses; // Key is set name (UnicodeString). 111 // Value is (CharClass *) 112 LocalPointer<UVector> fCharClassList; // Char Classes, same contents as fCharClasses values, 113 // but in a vector so they can be accessed by index. 114 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 115 Locale fLocale; 116 UBreakIteratorType fType; 117 118 CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 119 void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status); 120 bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status); 121 RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status); 122 123 LocalPointer<RegexMatcher> fSetRefsMatcher; 124 LocalPointer<RegexMatcher> fCommentsMatcher; 125 LocalPointer<RegexMatcher> fClassDefMatcher; 126 LocalPointer<RegexMatcher> fRuleDefMatcher; 127}; 128 129 130// class MonkeyTestData represents a randomly synthesized test data string together 131// with the expected break positions obtained by applying 132// the test break rules. 133 134class MonkeyTestData: public UObject { 135 public: 136 MonkeyTestData() {}; 137 ~MonkeyTestData() {}; 138 void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status); 139 void clearActualBreaks(); 140 void dump(int32_t around = -1) const; 141 142 uint32_t fRandomSeed; // The initial seed value from the random number genererator. 143 const BreakRules *fBkRules; // The break rules used to generate this data. 144 UnicodeString fString; // The text. 145 UnicodeString fExpectedBreaks; // Breaks as found by the reference rules. 146 // Parallel to fString. Non-zero if break preceding. 147 UnicodeString fActualBreaks; // Breaks as found by ICU break iterator. 148 UnicodeString fRuleForPosition; // Index into BreakRules.fBreakRules of rule that applied at each position. 149 // Also parallel to fString. 150 UnicodeString f2ndRuleForPos; // As above. A 2nd rule applies when the preceding rule 151 // didn't cause a break, and a subsequent rule match starts 152 // on the last code point of the preceding match. 153 154}; 155 156 157 158 159// class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 160// test for one set of break rules. 161// 162// When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence 163// between instances of RBBIMonkeyImpl and threads. 164// 165class RBBIMonkeyImpl: public UObject { 166 public: 167 RBBIMonkeyImpl(UErrorCode &status); 168 ~RBBIMonkeyImpl(); 169 170 void setup(const char *ruleFileName, UErrorCode &status); 171 172 void startTest(); 173 void runTest(); 174 void join(); 175 176 LocalUCHARBUFPointer fRuleCharBuffer; // source file contents of the reference rules. 177 LocalPointer<BreakRules> fRuleSet; 178 LocalPointer<RuleBasedBreakIterator> fBI; 179 LocalPointer<MonkeyTestData> fTestData; 180 IntlTest::icu_rand fRandomGenerator; 181 const char *fRuleFileName; 182 UBool fVerbose; // True to do long dump of failing data. 183 int32_t fLoopCount; 184 185 UBool fDumpExpansions; // Debug flag to output epananded form of rules and sets. 186 187 enum CheckDirection { 188 FORWARD = 1, 189 REVERSE = 2 190 }; 191 void clearActualBreaks(); 192 void testForwards(UErrorCode &status); 193 void testPrevious(UErrorCode &status); 194 void testFollowing(UErrorCode &status); 195 void testPreceding(UErrorCode &status); 196 void testIsBoundary(UErrorCode &status); 197 void testIsBoundaryRandom(UErrorCode &status); 198 void checkResults(const char *msg, CheckDirection dir, UErrorCode &status); 199 200 class RBBIMonkeyThread: public SimpleThread { 201 private: 202 RBBIMonkeyImpl *fMonkeyImpl; 203 public: 204 RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {}; 205 void run() U_OVERRIDE { fMonkeyImpl->runTest(); }; 206 }; 207 private: 208 void openBreakRules(const char *fileName, UErrorCode &status); 209 RBBIMonkeyThread fThread; 210 211}; 212 213#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */ 214 215#endif // RBBIMONKEYTEST_H 216