1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*************************************************************************
4 * Copyright (c) 2016, International Business Machines
5 * Corporation and others. All Rights Reserved.
6 *************************************************************************
7*/
8#ifndef RBBIMONKEYTEST_H
9#define RBBIMONKEYTEST_H
10
11#include "unicode/utypes.h"
12
13#if !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING
14
15#include "intltest.h"
16
17#include "unicode/rbbi.h"
18#include "unicode/regex.h"
19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21#include "unicode/uobject.h"
22
23#include "simplethread.h"
24#include "ucbuf.h"
25#include "uhash.h"
26#include "uvector.h"
27
28//
29//  TODO:
30//     Develop a tailoring format.
31//     Hook to old tests that use monkey impl to get expected data.
32//     Remove old tests.
33
34class BreakRules;       // Forward declaration
35class RBBIMonkeyImpl;
36
37/**
38 * Test the RuleBasedBreakIterator class giving different rules
39 */
40class RBBIMonkeyTest: public IntlTest {
41  public:
42    RBBIMonkeyTest();
43    virtual ~RBBIMonkeyTest();
44
45    void runIndexedTest( int32_t index, UBool exec, const char* &name, char* par = NULL );
46    void testMonkey();
47
48
49  private:
50    const char *fParams;                  // Copy of user parameters passed in from IntlTest.
51
52
53    void testRules(const char *ruleFile);
54    static UBool getIntParam(UnicodeString name, UnicodeString &params, int64_t &val, UErrorCode &status);
55    static UBool getStringParam(UnicodeString name, UnicodeString &params, CharString &dest, UErrorCode &status);
56    static UBool getBoolParam(UnicodeString name, UnicodeString &params, UBool &dest, UErrorCode &status);
57
58};
59
60// The following classes are internal to the RBBI Monkey Test implementation.
61
62
63
64//  class CharClass    Represents a single character class from the source break rules.
65//                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
66//                     deletes them using hash's object deleter function.
67
68class CharClass: public UObject {
69  public:
70    UnicodeString                fName;
71    UnicodeString                fOriginalDef;    // set definition as it appeared in user supplied rules.
72    UnicodeString                fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
73    LocalPointer<const UnicodeSet>     fSet;
74    CharClass(const UnicodeString &name, const UnicodeString &originalDef, const UnicodeString &expandedDef, const UnicodeSet *set) :
75            fName(name), fOriginalDef(originalDef), fExpandedDef(expandedDef), fSet(set) {}
76};
77
78
79// class BreakRule    represents a single rule from a set of break rules.
80//                    Each rule has the set definitions expanded, and
81//                    is compiled to a regular expression.
82
83class BreakRule: public UObject {
84  public:
85    BreakRule();
86    ~BreakRule();
87    UnicodeString    fName;                            // Name of the rule.
88    UnicodeString    fRule;                            // Rule expression, excluding the name, as written in user source.
89    UnicodeString    fExpandedRule;                    // Rule expression after expanding the set definitions.
90    LocalPointer<RegexMatcher>  fRuleMatcher;          // Regular expression that matches the rule.
91};
92
93
94// class BreakRules    represents a complete set of break rules, possibly tailored,
95//                     compiled from testdata break rules.
96
97class BreakRules: public UObject {
98  public:
99    BreakRules(RBBIMonkeyImpl *monkeyImpl, UErrorCode &status);
100    ~BreakRules();
101
102    void compileRules(UCHARBUF *rules, UErrorCode &status);
103
104    const CharClass *getClassForChar(UChar32 c, int32_t *iter=NULL) const;
105
106
107    RBBIMonkeyImpl    *fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
108    icu::UVector       fBreakRules;        // Contents are of type (BreakRule *).
109
110    LocalUHashtablePointer fCharClasses;   // Key is set name (UnicodeString).
111                                           // Value is (CharClass *)
112    LocalPointer<UVector>  fCharClassList; // Char Classes, same contents as fCharClasses values,
113                                           //   but in a vector so they can be accessed by index.
114    UnicodeSet         fDictionarySet;     // Dictionary set, empty if none is defined.
115    Locale             fLocale;
116    UBreakIteratorType fType;
117
118    CharClass *addCharClass(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
119    void addRule(const UnicodeString &name, const UnicodeString &def, UErrorCode &status);
120    bool setKeywordParameter(const UnicodeString &keyword, const UnicodeString &value, UErrorCode &status);
121    RuleBasedBreakIterator *createICUBreakIterator(UErrorCode &status);
122
123    LocalPointer<RegexMatcher> fSetRefsMatcher;
124    LocalPointer<RegexMatcher> fCommentsMatcher;
125    LocalPointer<RegexMatcher> fClassDefMatcher;
126    LocalPointer<RegexMatcher> fRuleDefMatcher;
127};
128
129
130// class MonkeyTestData    represents a randomly synthesized test data string together
131//                         with the expected break positions obtained by applying
132//                         the test break rules.
133
134class MonkeyTestData: public UObject {
135  public:
136    MonkeyTestData() {};
137    ~MonkeyTestData() {};
138    void set(BreakRules *rules, IntlTest::icu_rand &rand, UErrorCode &status);
139    void clearActualBreaks();
140    void dump(int32_t around = -1) const;
141
142    uint32_t               fRandomSeed;        // The initial seed value from the random number genererator.
143    const BreakRules      *fBkRules;           // The break rules used to generate this data.
144    UnicodeString          fString;            // The text.
145    UnicodeString          fExpectedBreaks;    // Breaks as found by the reference rules.
146                                               //     Parallel to fString. Non-zero if break preceding.
147    UnicodeString          fActualBreaks;      // Breaks as found by ICU break iterator.
148    UnicodeString          fRuleForPosition;   // Index into BreakRules.fBreakRules of rule that applied at each position.
149                                               // Also parallel to fString.
150    UnicodeString          f2ndRuleForPos;     // As above. A 2nd rule applies when the preceding rule
151                                               //   didn't cause a break, and a subsequent rule match starts
152                                               //   on the last code point of the preceding match.
153
154};
155
156
157
158
159// class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
160//                          test for one set of break rules.
161//
162//                          When running RBBIMonkeyTest with multiple threads, there is a 1:1 correspondence
163//                          between instances of RBBIMonkeyImpl and threads.
164//
165class RBBIMonkeyImpl: public UObject {
166  public:
167    RBBIMonkeyImpl(UErrorCode &status);
168    ~RBBIMonkeyImpl();
169
170    void setup(const char *ruleFileName, UErrorCode &status);
171
172    void startTest();
173    void runTest();
174    void join();
175
176    LocalUCHARBUFPointer                 fRuleCharBuffer;         // source file contents of the reference rules.
177    LocalPointer<BreakRules>             fRuleSet;
178    LocalPointer<RuleBasedBreakIterator> fBI;
179    LocalPointer<MonkeyTestData>         fTestData;
180    IntlTest::icu_rand                   fRandomGenerator;
181    const char                          *fRuleFileName;
182    UBool                                fVerbose;                 // True to do long dump of failing data.
183    int32_t                              fLoopCount;
184
185    UBool                                fDumpExpansions;          // Debug flag to output epananded form of rules and sets.
186
187    enum CheckDirection {
188        FORWARD = 1,
189        REVERSE = 2
190    };
191    void clearActualBreaks();
192    void testForwards(UErrorCode &status);
193    void testPrevious(UErrorCode &status);
194    void testFollowing(UErrorCode &status);
195    void testPreceding(UErrorCode &status);
196    void testIsBoundary(UErrorCode &status);
197    void testIsBoundaryRandom(UErrorCode &status);
198    void checkResults(const char *msg, CheckDirection dir, UErrorCode &status);
199
200    class RBBIMonkeyThread: public SimpleThread {
201      private:
202        RBBIMonkeyImpl *fMonkeyImpl;
203      public:
204        RBBIMonkeyThread(RBBIMonkeyImpl *impl) : fMonkeyImpl(impl) {};
205        void run() U_OVERRIDE { fMonkeyImpl->runTest(); };
206    };
207  private:
208    void openBreakRules(const char *fileName, UErrorCode &status);
209    RBBIMonkeyThread fThread;
210
211};
212
213#endif /* !UCONFIG_NO_BREAK_ITERATION && !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_FORMATTING */
214
215#endif  //  RBBIMONKEYTEST_H
216