1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3
4package com.ibm.icu.dev.test.rbbi;
5
6import java.io.IOException;
7import java.io.InputStream;
8import java.io.InputStreamReader;
9import java.util.ArrayList;
10import java.util.Arrays;
11import java.util.HashMap;
12import java.util.List;
13import java.util.Map;
14import java.util.regex.Matcher;
15import java.util.regex.Pattern;
16import java.util.regex.PatternSyntaxException;
17
18import org.junit.Test;
19import org.junit.runner.RunWith;
20import org.junit.runners.JUnit4;
21
22import com.ibm.icu.dev.test.TestFmwk;
23import com.ibm.icu.impl.UCharacterName;
24import com.ibm.icu.impl.UCharacterNameChoice;
25import com.ibm.icu.text.BreakIterator;
26import com.ibm.icu.text.RuleBasedBreakIterator;
27import com.ibm.icu.text.UnicodeSet;
28import com.ibm.icu.util.ULocale;
29
30/**
31 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
32 * This is the newer, data driven monkey test. It is completely separate from the
33 * older class RBBITestMonkey.
34 */
35
36@RunWith(JUnit4.class)
37public class RBBIMonkeyTest extends TestFmwk {
38
39
40    //  class CharClass    Represents a single character class from the source break rules.
41    //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
42    //                     deletes them using hash's object deleter function.
43
44    static class CharClass  {
45        String         fName;
46        String         fOriginalDef;    // set definition as it appeared in user supplied rules.
47        String         fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
48        UnicodeSet     fSet;
49        CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
50            fName = name;
51            fOriginalDef = originalDef;
52            fExpandedDef = expandedDef;
53            fSet = set;
54        };
55    }
56
57
58    // class BreakRule    Struct-like class represents a single rule from a set of break rules.
59    //                    Each rule has the set definitions expanded, and
60    //                    is compiled to a regular expression.
61
62    static class BreakRule {
63        String    fName;                   // Name of the rule.
64        String    fRule;                   // Rule expression, excluding the name, as written in user source.
65        String    fExpandedRule;           // Rule expression after expanding the set definitions.
66        Matcher   fRuleMatcher;            // Regular expression that matches the rule.
67    };
68
69
70    // class BreakRules    represents a complete set of break rules, possibly tailored,
71    //                     compiled from testdata break rules.
72
73    static class BreakRules {
74        BreakRules(RBBIMonkeyImpl monkeyImpl) {
75            fMonkeyImpl = monkeyImpl;
76            fBreakRules = new ArrayList<BreakRule>();
77            fType = BreakIterator.KIND_TITLE;
78            fCharClasses = new HashMap<String, CharClass>();
79            fCharClassList = new ArrayList<CharClass>();
80            fDictionarySet = new UnicodeSet();
81
82            // Match an alpha-numeric identifier in a rule. Will be a set name.
83            // Use negative look-behind to exclude non-identifiers, mostly property names or values.
84            fSetRefsMatcher = Pattern.compile(
85                    "(?<!\\{[ \\t]{0,4})" +
86                    "(?<!=[ \\t]{0,4})" +
87                    "(?<!\\[:[ \\t]{0,4})" +
88                    "(?<!\\\\)" +
89                    "(?<![A-Za-z0-9_])" +
90                    "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
91                    matcher("");
92
93            // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
94            fCommentsMatcher = Pattern.compile("" +
95                    "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
96                    "[ \\t]*+"     +                //   Match white space.
97                    "(#.*)?+"      +                //   Optional # plus whatever follows
98                    "$").                           //   new-line at end of line.
99                    matcher("");
100
101            // Match (initial parse) of a character class definition line.
102            fClassDefMatcher = Pattern.compile("" +
103                    "[ \\t]*"           +                    // leading white space
104                    "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
105                    "[ \\t]*=[ \\t]*"   +                    //   =
106                    "(.*?)"  +                               // The char class UnicodeSet expression
107                    "[ \\t]*;$").                            // ; <end of line>
108                    matcher("");
109
110            // Match (initial parse) of a break rule line.
111            fRuleDefMatcher = Pattern.compile("" +
112                    "[ \\t]*"           +                     // leading white space
113                    "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
114                    "[ \\t]*:[ \\t]*"   +                     //   :
115                    "(.*?)"   +                               // The rule definition
116                    "[ \\t]*;$").                             // ; <end of line>
117                    matcher("");
118
119            // Match a property expression, either [:xxx:] or \p{...}
120            fPropertyMatcher = Pattern.compile("" +
121                    "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
122                    matcher("");
123
124
125        }
126
127        /**
128         * Create the expanded definition for this char class,
129         * replacing any set references with the corresponding definition.
130         */
131        CharClass  addCharClass(String name, String definition) {
132            StringBuffer expandedDef = new StringBuffer();
133            fSetRefsMatcher.reset(definition);
134            while (fSetRefsMatcher.find()) {
135                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
136                CharClass snameClass = fCharClasses.get(sname);
137                String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname;
138
139                fSetRefsMatcher.appendReplacement(expandedDef, "");
140                expandedDef.append(expansionForName);
141            }
142            fSetRefsMatcher.appendTail(expandedDef);
143            String expandedDefString = expandedDef.toString();
144
145            if (fMonkeyImpl.fDumpExpansions) {
146                System.out.printf("addCharClass(\"%s\"\n", name);
147                System.out.printf("             %s\n", definition);
148                System.out.printf("expandedDef: %s\n", expandedDefString);
149            }
150
151            // Verify that the expanded set definition is valid.
152
153            UnicodeSet s;
154            try {
155                s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE);
156            } catch (java.lang.IllegalArgumentException e) {
157                System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
158                throw e;
159            }
160
161            // Get an expanded equivalent pattern from the UnicodeSet.
162            // This removes set difference operators, which would fail if passed through to Java regex.
163
164            StringBuffer expandedPattern = new StringBuffer();
165            s._generatePattern(expandedPattern, true);
166            expandedDefString = expandedPattern.toString();
167            if (fMonkeyImpl.fDumpExpansions) {
168                System.out.printf("expandedDef2: %s\n", expandedDefString);
169            }
170
171            CharClass cclass = new CharClass(name, definition, expandedDefString, s);
172            CharClass previousClass = fCharClasses.put(name, cclass);
173
174            if (previousClass != null) {
175                // TODO: decide whether or not to allow redefinitions.
176                //       Can be convenient in some cases.
177                // String msg = String.format("%s: Redefinition of character class %s\n",
178                //         fMonkeyImpl.fRuleFileName, cclass.fName);
179                // System.err.println(msg);
180                // throw new IllegalArgumentException(msg);
181            }
182            return cclass;
183
184        };
185
186
187        void addRule(String  name, String  definition) {
188            BreakRule  thisRule = new BreakRule();
189            StringBuffer expandedDefsRule = new StringBuffer();
190            thisRule.fName = name;
191            thisRule.fRule = definition;
192
193            // Expand the char class definitions within the rule.
194            fSetRefsMatcher.reset(definition);
195            while (fSetRefsMatcher.find()) {
196                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
197                CharClass nameClass = fCharClasses.get(sname);
198                if (nameClass == null) {
199                    System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
200                }
201                String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
202                fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
203                expandedDefsRule.append(expansionForName);
204            }
205            fSetRefsMatcher.appendTail(expandedDefsRule);
206
207            // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
208            // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
209            // does not recognize all properties, and because Java's definitions are likely
210            // older than ICU's.
211
212            StringBuffer expandedRule = new StringBuffer();
213            fPropertyMatcher.reset(expandedDefsRule);
214            while (fPropertyMatcher.find()) {
215                String prop = fPropertyMatcher.group();
216                UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
217                StringBuffer propExpansion = new StringBuffer();
218                propSet._generatePattern(propExpansion, true);
219                fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString());
220            }
221            fPropertyMatcher.appendTail(expandedRule);
222
223            //   Replace any [^negated sets] with equivalent flattened sets generated by
224            //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
225            //   to any nested classes. Variable substitution in rules produces
226            //   nested sets that [^negation] needs to apply to.
227
228            StringBuffer ruleWithFlattenedSets = new StringBuffer();
229            int idx = 0;
230            while (idx<expandedRule.length()) {
231                int setOpenPos = expandedRule.indexOf("[^", idx);
232                if (setOpenPos < 0) {
233                    break;
234                }
235                if (setOpenPos > idx) {
236                    // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
237                    ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
238                }
239                int nestingLevel = 1;
240                boolean haveNesting = false;
241                int setClosePos;
242                for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
243                    char c = expandedRule.charAt(setClosePos);
244                    if (c == '\\') {
245                        ++setClosePos;
246                    } else if (c == '[') {
247                        ++nestingLevel;
248                        haveNesting = true;
249                    } else if (c == ']') {
250                        --nestingLevel;
251                    }
252                }
253                if (haveNesting && nestingLevel == 0) {
254                    // Found one, a negated set that includes interior nested sets.
255                    // Create an ICU UnicodeSet from the source pattern, and obtain an
256                    // equivalent flattened pattern from that.
257                    UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
258                    uset._generatePattern(ruleWithFlattenedSets, true);
259                } else {
260                    // The [^ set definition did not include any nested sets.
261                    // Copy the original definition without change.
262                    // Java regular expressions will handle it without needing to recast it.
263                    if (nestingLevel > 0) {
264                        // Error case of an unclosed character class expression.
265                        // Java regex will also eventually flag the error.
266                        System.err.printf("No closing ] found in rule %s\n", name);
267                    }
268                    ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
269                }
270                idx = setClosePos;
271            }
272
273            if (idx < expandedRule.length()) {
274                ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
275            }
276
277            thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
278
279            // Replace the divide sign (\u00f7) with a regular expression named capture.
280            // When running the rules, a match that includes this group means we found a break position.
281
282            // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)");
283            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()");
284            if (thisRule.fExpandedRule.indexOf("÷") != -1) {
285                String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
286                System.err.println(msg);
287                throw new IllegalArgumentException(msg);
288            }
289
290            // UAX break rule set definitions can be empty, just [].
291            // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
292            // also matches nothing.
293
294            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
295
296            // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer)
297            //    \udddd     => \x{dddd}
298            //    \U00hhhhhh => \x{hhhhhh}
299
300            // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
301            // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");
302
303            // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character
304            // within a regular expression character class. Put them in as unescaped literal chars.
305            StringBuilder sb = new StringBuilder(thisRule.fExpandedRule);
306            while (true) {
307                int where = sb.indexOf("\\U00");
308                if (where < 0) {
309                    break;
310                }
311                String cp = hexToCodePoint(sb.substring(where+2, where+10));
312                sb.replace(where, where+10, cp);
313            }
314            thisRule.fExpandedRule = sb.toString();
315
316            // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
317            // UnicodeSet._generatePattern() inserts un-escaped "#"s
318
319            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
320            if (fMonkeyImpl.fDumpExpansions) {
321                System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
322            }
323
324            // Compile a regular expression for this rule.
325
326            try {
327                thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
328            } catch (PatternSyntaxException e) {
329                System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
330                        fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
331                throw e;
332            }
333
334            // Put this new rule into the vector of all Rules.
335
336            fBreakRules.add(thisRule);
337        };
338
339        private static String hexToCodePoint(String hex) {
340            int cp = Integer.parseInt(hex, 16);
341            return new StringBuilder().appendCodePoint(cp).toString();
342        }
343
344
345        boolean setKeywordParameter(String keyword, String value) {
346            if (keyword.equals("locale")) {
347                fLocale = new ULocale(value);
348                return true;
349            }
350            if (keyword.equals("type")) {
351                if (value.equals("grapheme")) {
352                    fType = BreakIterator.KIND_CHARACTER;
353                } else if (value.equals("word")) {
354                    fType = BreakIterator.KIND_WORD;
355                } else if (value.equals("line")) {
356                    fType = BreakIterator.KIND_LINE;
357                } else if (value.equals("sentence")) {
358                    fType = BreakIterator.KIND_SENTENCE;
359                } else {
360                    String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
361                    System.err.println(msg);
362                    throw new IllegalArgumentException(msg);
363                }
364                return true;
365            }
366            return false;
367        }
368
369
370        RuleBasedBreakIterator createICUBreakIterator() {
371            BreakIterator bi;
372            switch(fType) {
373                case BreakIterator.KIND_CHARACTER:
374                    bi = (BreakIterator.getCharacterInstance(fLocale));
375                    break;
376                case BreakIterator.KIND_WORD:
377                    bi = (BreakIterator.getWordInstance(fLocale));
378                    break;
379                case BreakIterator.KIND_LINE:
380                    bi = (BreakIterator.getLineInstance(fLocale));
381                    break;
382                case BreakIterator.KIND_SENTENCE:
383                    bi = (BreakIterator.getSentenceInstance(fLocale));
384                    break;
385                default:
386                    String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
387                    System.err.println(msg);
388                    throw new IllegalArgumentException(msg);
389            }
390            return (RuleBasedBreakIterator)bi;
391
392        };
393
394
395
396        void compileRules(String rules) {
397            int lineNumber = 0;
398            for (String line: rules.split("\\r?\\n")) {
399                ++lineNumber;
400                // Strip comment lines.
401                fCommentsMatcher.reset(line);
402                line = fCommentsMatcher.replaceFirst("");
403                if (line.isEmpty()) {
404                    continue;
405                }
406
407                // Recognize character class definition and keyword lines
408                fClassDefMatcher.reset(line);
409                if (fClassDefMatcher.matches()) {
410                    String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
411                    String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
412                    if (fMonkeyImpl.fDumpExpansions) {
413                        System.out.printf("scanned class: %s = %s\n", className, classDef);
414                    }
415                    if (setKeywordParameter(className, classDef)) {
416                        // The scanned item was "type = ..." or "locale = ...", etc.
417                        //   which are not actual character classes.
418                        continue;
419                    }
420                    addCharClass(className, classDef);
421                    continue;
422                }
423
424                // Recognize rule lines.
425                fRuleDefMatcher.reset(line);
426                if (fRuleDefMatcher.matches()) {
427                    String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
428                    String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
429                    if (fMonkeyImpl.fDumpExpansions) {
430                        System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
431                    }
432                    addRule(ruleName, ruleDef);
433                    continue;
434                }
435
436                String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
437                        fMonkeyImpl.fRuleFileName, lineNumber, line);
438                System.err.println(msg);
439                throw new IllegalArgumentException(msg);
440            }
441
442            // Build the vector of char classes, omitting the dictionary class if there is one.
443            // This will be used when constructing the random text to be tested.
444
445            // Also compute the "other" set, consisting of any characters not included in
446            // one or more of the user defined sets.
447
448            UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
449
450            for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
451                String ccName = el.getKey();
452                CharClass cclass = el.getValue();
453
454                // System.out.printf("    Adding %s\n", ccName);
455                if (!ccName.equals(cclass.fName)) {
456                    throw new IllegalArgumentException(
457                            String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
458                                    fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
459                }
460                otherSet.removeAll(cclass.fSet);
461                if (ccName.equals("dictionary")) {
462                    fDictionarySet = cclass.fSet;
463                } else {
464                    fCharClassList.add(cclass);
465                }
466            }
467
468            if (!otherSet.isEmpty()) {
469                // System.out.printf("have an other set.\n");
470                CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
471                fCharClassList.add(cclass);
472            }
473
474        };
475
476        CharClass getClassForChar(int c) {
477            for (CharClass cc: fCharClassList) {
478                if (cc.fSet.contains(c)) {
479                    return cc;
480                }
481            }
482            return null;
483        };
484
485
486        RBBIMonkeyImpl          fMonkeyImpl;        // Pointer back to the owning MonkeyImpl instance.
487        List<BreakRule>         fBreakRules;        // Contents are of type (BreakRule *).
488
489        Map<String, CharClass>  fCharClasses;       // Key is the set name.
490        //                                          // Value is the corresponding CharClass
491        List<CharClass>         fCharClassList;     // Char Classes, same contents as fCharClasses values,
492
493        UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
494        ULocale                 fLocale;
495        int                     fType;              // BreakItererator.KIND_WORD, etc.
496
497
498        Matcher fSetRefsMatcher;
499        Matcher fCommentsMatcher;
500        Matcher fClassDefMatcher;
501        Matcher fRuleDefMatcher;
502        Matcher fPropertyMatcher;
503    };
504
505
506
507
508    // class MonkeyTestData    represents a randomly synthesized test data string together
509    //                         with the expected break positions obtained by applying
510    //                         the test break rules.
511
512    static class MonkeyTestData{
513
514        void set(BreakRules rules, ICU_Rand rand) {
515            int dataLength = 1000;   // length of test data to generate, in code points.
516
517            // Fill the test string with random characters.
518            // First randomly pick a char class, then randomly pick a character from that class.
519            // Exclude any characters from the dictionary set.
520
521            // System.out.println("Populating Test Data");
522            fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
523                                                  // allowing recreation of failing data.
524            fBkRules = rules;
525            StringBuilder newString = new StringBuilder();
526            for (int n=0; n<dataLength;) {
527                int charClassIndex = rand.next() % rules.fCharClassList.size();
528                CharClass cclass = rules.fCharClassList.get(charClassIndex);
529                if (cclass.fSet.size() == 0) {
530                    // Some rules or tailorings do end up with empty char classes.
531                    continue;
532                }
533                int charIndex = rand.next() % cclass.fSet.size();
534                int c = cclass.fSet.charAt(charIndex);
535                if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
536                        newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
537                    // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
538                    // Don't let random unpaired surrogates combine in the test data because they might
539                    // produce an unwanted dictionary character.
540                    continue;
541                }
542
543                if (!rules.fDictionarySet.contains(c)) {
544                    newString.appendCodePoint(c);
545                    ++n;
546                }
547            }
548            fString = newString.toString();
549
550            // Init the expectedBreaks, actualBreaks and ruleForPosition.
551            // Expected and Actual breaks are one longer than the input string; a true value
552            // will indicate a boundary preceding that position.
553
554            fActualBreaks    = new boolean[fString.length()+1];
555            fExpectedBreaks  = new boolean[fString.length()+1];
556            fRuleForPosition = new int[fString.length()+1];
557            f2ndRuleForPos   = new int[fString.length()+1];
558
559            // Apply reference rules to find the expected breaks.
560
561            fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
562                                             // ICU always reports a break there.
563                                             // The reference rules do not have a means to do so.
564            int strIdx = 0;
565            while (strIdx < fString.length()) {
566                BreakRule matchingRule = null;
567                boolean hasBreak = false;
568                int ruleNum = 0;
569                int matchStart = 0;
570                int matchEnd = 0;
571                for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
572                    BreakRule rule = rules.fBreakRules.get(ruleNum);
573                    rule.fRuleMatcher.reset(fString.substring(strIdx));
574                    if (rule.fRuleMatcher.lookingAt()) {
575                        // A candidate rule match, check further to see if we take it or continue to check other rules.
576                        // Matches of zero or one code point count only if they also specify a break.
577                        matchStart = strIdx;
578                        matchEnd = strIdx + rule.fRuleMatcher.end();
579                        hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
580                        if (hasBreak ||
581                                (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
582                            matchingRule = rule;
583                            break;
584                        }
585                    }
586                }
587                if (matchingRule == null) {
588                    // No reference rule matched. This is an error in the rules that should never happen.
589                    String msg = String.format("%s: No reference rules matched at position %d. ",
590                            rules.fMonkeyImpl.fRuleFileName, strIdx);
591                    System.err.println(msg);
592                    dump(strIdx);
593                    throw new IllegalArgumentException(msg);
594                }
595                if (matchingRule.fRuleMatcher.group().length() == 0) {
596                    // Zero length rule match. This is also an error in the rule expressions.
597                    String msg = String.format("%s:%s: Zero length rule match at %d.",
598                            rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
599                    System.err.println(msg);
600                    dump(strIdx);
601                    throw new IllegalArgumentException(msg);
602                }
603
604                // Record which rule matched over the length of the match.
605                for (int i = matchStart; i < matchEnd; i++) {
606                    if (fRuleForPosition[i] == 0) {
607                        fRuleForPosition[i] = ruleNum;
608                    } else {
609                        f2ndRuleForPos[i] = ruleNum;
610                    }
611                }
612
613                // Break positions appear in rules as a matching named capture of zero length at the break position,
614                //   the adjusted pattern contains (?<BreakPosition>)
615                if (hasBreak) {
616                    int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
617                    fExpectedBreaks[breakPos] = true;
618                    // System.out.printf("recording break at %d\n", breakPos);
619                    // For the next iteration, pick up applying rules immediately after the break,
620                    // which may differ from end of the match. The matching rule may have included
621                    // context following the boundary that needs to be looked at again.
622                    strIdx = breakPos;
623                } else {
624                    // Original rule didn't specify a break.
625                    // Continue applying rules starting on the last code point of this match.
626                    int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
627                    if (updatedStrIdx == matchStart) {
628                        // Match was only one code point, no progress if we continue.
629                        // Shouldn't get here, case is filtered out at top of loop.
630                        throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
631                                rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
632                    }
633                    strIdx = updatedStrIdx;
634                }
635            }
636        };
637
638        // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
639        // @param m: a Java regex Matcher that has completed a matching operation.
640        // @return m.start("BreakPosition),
641        //         or -1 if there is no such group, or the group did not participate in the match.
642        //
643        // TODO: this becomes m.start("BreakPosition") with Java 8.
644        //       In the mean time, assume that the only zero-length capturing group in
645        //       a reference rule expression is the "BreakPosition" that corresponds to a "÷".
646
647        static int BreakGroupStart(Matcher m) {
648            for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
649                String group = m.group(groupNum);
650                if (group == null) {
651                    continue;
652                }
653                if (group.equals("")) {
654                    // assert(m.end(groupNum) == m.end("BreakPosition"));
655                    return m.start(groupNum);
656                }
657            }
658            return -1;
659        }
660
661        void dump(int around) {
662            System.out.print("\n"
663                    +        "         char                        break  Rule                     Character\n"
664                    +        "   pos   code   class                 R I   name                     name\n"
665                    +        "---------------------------------------------------------------------------------------------\n");
666
667            int start;
668            int end;
669
670            if (around == -1) {
671                start = 0;
672                end = fString.length();
673            } else {
674                // Display context around a failure.
675                try {
676                    start = fString.offsetByCodePoints(around, -30);
677                } catch (Exception e) {
678                    start = 0;
679                }
680                try {
681                    end = fString.offsetByCodePoints(around, +30);
682                } catch (Exception e) {
683                    end = fString.length();
684                }
685            }
686
687            for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
688                int c = fString.codePointAt(charIdx);
689                CharClass cc = fBkRules.getClassForChar(c);
690
691                BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
692                String secondRuleName = "";
693                if (f2ndRuleForPos[charIdx] > 0) {
694                    secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
695                }
696                String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
697
698                System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
699                        charIdx, c, cc.fName,
700                        fExpectedBreaks[charIdx] ? '*' : '.',
701                        fActualBreaks[charIdx] ? '*' : '.',
702                        rule.fName, secondRuleName, cName
703                        );
704                }
705
706        };
707
708        void clearActualBreaks() {
709            Arrays.fill(fActualBreaks, false);
710        }
711
712
713        int               fRandomSeed;        // The initial seed value from the random number generator.
714        BreakRules        fBkRules;           // The break rules used to generate this data.
715        String            fString;            // The text.
716        boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
717                                              //     Parallel to fString. true if break preceding.
718        boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
719        int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
720                                              // Also parallel to fString.
721        int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
722                                              //   didn't cause a break, and a subsequent rule match starts
723                                              //   on the last code point of the preceding match.
724
725    }
726
727
728    // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
729    //                          test for one set of break rules.
730    //
731
732    static class RBBIMonkeyImpl extends Thread {
733
734        void setup(String ruleFile) {
735            fRuleFileName = ruleFile;
736            openBreakRules(ruleFile);
737            fRuleSet = new BreakRules(this);
738            fRuleSet.compileRules(fRuleCharBuffer);
739            fBI = fRuleSet.createICUBreakIterator();
740            fTestData = new MonkeyTestData();
741        };
742
743        void openBreakRules(String fileName) {
744            StringBuilder testFileBuf = new StringBuilder();
745            InputStream is = null;
746            String filePath = "break_rules/" + fileName;
747            try {
748                is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
749                if (is == null) {
750                    errln("Could not open test data file " + fileName);
751                    return;
752                }
753                InputStreamReader isr = new InputStreamReader(is, "UTF-8");
754                try {
755                    int c;
756                    int count = 0;
757                    for (;;) {
758                        c = isr.read();
759                        if (c < 0) {
760                            break;
761                        }
762                        count++;
763                        if (c == 0xFEFF && count == 1) {
764                            // BOM in the test data file. Discard it.
765                            continue;
766                        }
767                       testFileBuf.appendCodePoint(c);
768                    }
769                } finally {
770                    isr.close();
771                }
772                } catch (IOException e) {
773                try {
774                    is.close();
775                } catch (IOException ignored) {
776                }
777                errln(e.toString());
778            }
779            fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
780        }
781
782        class MonkeyException extends RuntimeException  {
783            private static final long serialVersionUID = 1L;
784            public int fPosition;    // Position of the failure in the test data.
785            MonkeyException(String description, int pos) {
786                super(description);
787                fPosition = pos;
788            }
789        }
790
791        @Override
792        public void run() {
793            int errorCount = 0;
794            if (fBI == null) {
795                fErrorMsgs.append("Unable to run test because fBI is null.\n");
796                return;
797            }
798            for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
799                try {
800                    fTestData.set(fRuleSet, fRandomGenerator);
801                    // fTestData.dump(-1);
802                    testForwards();
803                    testPrevious();
804                    testFollowing();
805                    testPreceding();
806                    testIsBoundary();
807                } catch (MonkeyException e) {
808                    String formattedMsg = String.format(
809                            "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
810                            e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
811                    System.err.print(formattedMsg);
812                    if (fVerbose) {
813                        fTestData.dump(e.fPosition);
814                    }
815                    fErrorMsgs.append(formattedMsg);
816                    if (++errorCount > 10) {
817                        return;
818                    }
819                }
820                if (fLoopCount < 0 && loopCount % 100 == 0) {
821                    System.err.print(".");
822                }
823            }
824        }
825
826        enum CheckDirection {
827            FORWARD,
828            REVERSE
829        };
830
831        void testForwards() {
832            fTestData.clearActualBreaks();
833            fBI.setText(fTestData.fString);
834            int previousBreak = -2;
835            for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
836                if (bk <= previousBreak) {
837                    throw new MonkeyException("Break Iterator Stall", bk);
838                }
839                if (bk < 0 || bk > fTestData.fString.length()) {
840                    throw new MonkeyException("Boundary out of bounds", bk);
841                }
842                fTestData.fActualBreaks[bk] = true;
843            }
844            checkResults("testForwards", CheckDirection.FORWARD);
845        };
846
847
848       void testFollowing() {
849           fTestData.clearActualBreaks();
850           fBI.setText(fTestData.fString);
851           int nextBreak = -1;
852           for (int i=-1 ; i<fTestData.fString.length(); ++i) {
853               int bk = fBI.following(i);
854               if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
855                   continue;
856               }
857               if (bk == nextBreak && bk > i) {
858                   // i is in the gap between two breaks.
859                   continue;
860               }
861               if (i == nextBreak && bk > nextBreak) {
862                   fTestData.fActualBreaks[bk] = true;
863                   nextBreak = bk;
864                   continue;
865               }
866               throw new MonkeyException("following(i)", i);
867           }
868           checkResults("testFollowing", CheckDirection.FORWARD);
869        };
870
871
872        void testPrevious() {
873            fTestData.clearActualBreaks();
874            fBI.setText(fTestData.fString);
875            int previousBreak = Integer.MAX_VALUE;
876            for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
877                 if (bk >= previousBreak) {
878                     throw new MonkeyException("Break Iterator Stall", bk);
879                }
880                if (bk < 0 || bk > fTestData.fString.length()) {
881                    throw new MonkeyException("Boundary out of bounds", bk);
882                }
883                fTestData.fActualBreaks[bk] = true;
884            }
885            checkResults("testPrevius", CheckDirection.REVERSE);
886        };
887
888
889        /**
890         * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
891         * adjust it to point to the lead surrogate, which is the start of the code point.
892         * @param s the String.
893         * @param i the initial index
894         * @return the adjusted index
895         */
896        private int getChar32Start(String s, int i) {
897            if (i > 0 && i < s.length() &&
898                    Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
899                --i;
900            }
901            return i;
902        }
903
904
905        void testPreceding() {
906            fTestData.clearActualBreaks();
907            fBI.setText(fTestData.fString);
908            int nextBreak = fTestData.fString.length()+1;
909            for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
910                int bk = fBI.preceding(i);
911                // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
912                if (bk == BreakIterator.DONE && i == 0) {
913                    continue;
914                }
915                if (bk == nextBreak && bk < i) {
916                    // i is in the gap between two breaks.
917                    continue;
918                }
919                if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
920                    // i indexes to a trailing surrogate.
921                    // Break Iterators treat an index to either half as referring to the supplemental code point,
922                    // with preceding going to some preceding code point.
923                    if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
924                        throw new MonkeyException("preceding of trailing surrogate error", i);
925                    }
926                    continue;
927                }
928                if (i == nextBreak && bk < nextBreak) {
929                    fTestData.fActualBreaks[bk] = true;
930                    nextBreak = bk;
931                    continue;
932                }
933                throw new MonkeyException("preceding(i)", i);
934            }
935            checkResults("testPreceding", CheckDirection.REVERSE);
936
937        };
938
939
940        void testIsBoundary() {
941            fTestData.clearActualBreaks();
942            fBI.setText(fTestData.fString);
943            for (int i=fTestData.fString.length(); i>=0; --i) {
944                if (fBI.isBoundary(i)) {
945                    fTestData.fActualBreaks[i] = true;
946                }
947            }
948            checkResults("testForwards", CheckDirection.FORWARD);
949        };
950
951
952        void checkResults(String msg, CheckDirection direction) {
953            if (direction == CheckDirection.FORWARD) {
954                for (int i=0; i<=fTestData.fString.length(); ++i) {
955                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
956                        throw new MonkeyException(msg, i);
957                    }
958                }
959            } else {
960                for (int i=fTestData.fString.length(); i>=0; i--) {
961                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
962                        throw new MonkeyException(msg, i);
963                    }
964                }
965            }
966
967        };
968
969        String                 fRuleCharBuffer;         // source file contents of the reference rules.
970        BreakRules             fRuleSet;
971        RuleBasedBreakIterator fBI;
972        MonkeyTestData         fTestData;
973        ICU_Rand               fRandomGenerator;
974        String                 fRuleFileName;
975        boolean                fVerbose;                 // True to do long dump of failing data.
976        int                    fLoopCount;
977        int                    fErrorCount;
978
979        boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
980        StringBuilder          fErrorMsgs = new StringBuilder();
981
982    }
983
984    //  Test parameters, specified via Java properties.
985    //
986    //  rules=file_name   Name of file containing the reference rules.
987    //  seed=nnnnn        Random number starting seed.
988    //                    Setting the seed allows errors to be reproduced.
989    //  loop=nnn          Looping count.  Controls running time.
990    //                    -1:  run forever.
991    //                     0 or greater:  run length.
992    //  expansions        debug option, show expansions of rules and sets.
993    //  verbose           Display details of the failure.
994    //
995    // Parameters are passed to the JVM on the command line, or
996    // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
997    // For example,
998    //      -ea -Drules=line.txt -Dloop=-1
999    //
1000    @Test
1001    public void TestMonkey() {
1002        String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
1003                "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
1004        };
1005
1006        String testNameFromParams = getProperty("rules");
1007
1008        if (testNameFromParams != null) {
1009            tests = new String[] {testNameFromParams};
1010        }
1011
1012        int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
1013        boolean dumpExpansions =  getBooleanProperty("expansions", false);
1014        boolean verbose = getBooleanProperty("verbose", false);
1015        int seed = getIntProperty("seed", 1);
1016
1017        List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();
1018
1019        // Monkey testing is multi-threaded.
1020        // Each set of break rules to be tested is run in a separate thread.
1021        // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
1022
1023        for (String testName: tests) {
1024            logln(String.format("beginning testing of %s", testName));
1025
1026            RBBIMonkeyImpl test = new RBBIMonkeyImpl();
1027
1028            test.fDumpExpansions = dumpExpansions;
1029            test.fVerbose = verbose;
1030            test.fRandomGenerator = new ICU_Rand(seed);
1031            test.fLoopCount = loopCount;
1032            test.setup(testName);
1033
1034            test.start();
1035            startedTests.add(test);
1036        }
1037
1038        StringBuilder errors = new StringBuilder();
1039        for (RBBIMonkeyImpl test: startedTests) {
1040            try {
1041                test.join();
1042                errors.append(test.fErrorMsgs);
1043            } catch (InterruptedException e) {
1044                errors.append(e + "\n");
1045            }
1046        }
1047        String errorMsgs = errors.toString();
1048        assertEquals(errorMsgs, "", errorMsgs);
1049
1050    }
1051
1052
1053}
1054