105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert/* GENERATED SOURCE. DO NOT MODIFY. */
205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert// © 2016 and later: Unicode, Inc. and others.
305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License
405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertpackage android.icu.dev.test.rbbi;
605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.IOException;
805fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.InputStream;
905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.InputStreamReader;
1005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.ArrayList;
1105fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.Arrays;
1205fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.HashMap;
1305fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.List;
1405fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.Map;
1505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.Matcher;
1605fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.Pattern;
1705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.PatternSyntaxException;
1805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
1905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.Test;
2005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.runner.RunWith;
2105fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.runners.JUnit4;
2205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
2305fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.dev.test.TestFmwk;
2405fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.impl.UCharacterName;
2505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.impl.UCharacterNameChoice;
2605fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.BreakIterator;
2705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.RuleBasedBreakIterator;
2805fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.UnicodeSet;
2905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.util.ULocale;
3005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.testsharding.MainTestShard;
3105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
3205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert/**
3305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
3405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * This is the newer, data driven monkey test. It is completely separate from the
3505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * older class RBBITestMonkey.
3605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert */
3705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
3805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert@MainTestShard
3905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert@RunWith(JUnit4.class)
4005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertpublic class RBBIMonkeyTest extends TestFmwk {
4105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
4205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
4305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  class CharClass    Represents a single character class from the source break rules.
4405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                     Inherits from UObject because instances are adopted by UHashtable, which ultimately
4505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                     deletes them using hash's object deleter function.
4605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
4705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    static class CharClass  {
4805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String         fName;
4905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String         fOriginalDef;    // set definition as it appeared in user supplied rules.
5005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String         fExpandedDef;    // set definition with any embedded named sets replaced by their defs, recursively.
5105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        UnicodeSet     fSet;
5205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
5305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fName = name;
5405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fOriginalDef = originalDef;
5505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fExpandedDef = expandedDef;
5605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSet = set;
5705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
5805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    }
5905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
6005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
6105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // class BreakRule    Struct-like class represents a single rule from a set of break rules.
6205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                    Each rule has the set definitions expanded, and
6305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                    is compiled to a regular expression.
6405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
6505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    static class BreakRule {
6605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String    fName;                   // Name of the rule.
6705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String    fRule;                   // Rule expression, excluding the name, as written in user source.
6805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String    fExpandedRule;           // Rule expression after expanding the set definitions.
6905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher   fRuleMatcher;            // Regular expression that matches the rule.
7005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    };
7105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
7205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
7305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // class BreakRules    represents a complete set of break rules, possibly tailored,
7405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                     compiled from testdata break rules.
7505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
7605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    static class BreakRules {
7705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        BreakRules(RBBIMonkeyImpl monkeyImpl) {
7805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fMonkeyImpl = monkeyImpl;
7905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBreakRules = new ArrayList<BreakRule>();
8005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fType = BreakIterator.KIND_TITLE;
8105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fCharClasses = new HashMap<String, CharClass>();
8205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fCharClassList = new ArrayList<CharClass>();
8305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fDictionarySet = new UnicodeSet();
8405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
8505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Match an alpha-numeric identifier in a rule. Will be a set name.
8605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Use negative look-behind to exclude non-identifiers, mostly property names or values.
8705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSetRefsMatcher = Pattern.compile(
8805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(?<!\\{[ \\t]{0,4})" +
8905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(?<!=[ \\t]{0,4})" +
9005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(?<!\\[:[ \\t]{0,4})" +
9105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(?<!\\\\)" +
9205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(?<![A-Za-z0-9_])" +
9305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
9405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    matcher("");
9505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
9605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
9705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fCommentsMatcher = Pattern.compile("" +
9805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
9905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*+"     +                //   Match white space.
10005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(#.*)?+"      +                //   Optional # plus whatever follows
10105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "$").                           //   new-line at end of line.
10205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    matcher("");
10305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
10405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Match (initial parse) of a character class definition line.
10505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fClassDefMatcher = Pattern.compile("" +
10605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*"           +                    // leading white space
10705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
10805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*=[ \\t]*"   +                    //   =
10905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(.*?)"  +                               // The char class UnicodeSet expression
11005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*;$").                            // ; <end of line>
11105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    matcher("");
11205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
11305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Match (initial parse) of a break rule line.
11405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleDefMatcher = Pattern.compile("" +
11505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*"           +                     // leading white space
11605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
11705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*:[ \\t]*"   +                     //   :
11805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "(.*?)"   +                               // The rule definition
11905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "[ \\t]*;$").                             // ; <end of line>
12005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    matcher("");
12105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
12205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Match a property expression, either [:xxx:] or \p{...}
12305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fPropertyMatcher = Pattern.compile("" +
12405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
12505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    matcher("");
12605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
12705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
12805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
12905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
13005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        /**
13105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * Create the expanded definition for this char class,
13205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * replacing any set references with the corresponding definition.
13305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         */
13405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        CharClass  addCharClass(String name, String definition) {
13505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuffer expandedDef = new StringBuffer();
13605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSetRefsMatcher.reset(definition);
13705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (fSetRefsMatcher.find()) {
13805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
13905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass snameClass = fCharClasses.get(sname);
14005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname;
14105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
14205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fSetRefsMatcher.appendReplacement(expandedDef, "");
14305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                expandedDef.append(expansionForName);
14405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
14505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSetRefsMatcher.appendTail(expandedDef);
14605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            String expandedDefString = expandedDef.toString();
14705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
14805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (fMonkeyImpl.fDumpExpansions) {
14905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("addCharClass(\"%s\"\n", name);
15005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("             %s\n", definition);
15105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("expandedDef: %s\n", expandedDefString);
15205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
15305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
15405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Verify that the expanded set definition is valid.
15505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
15605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            UnicodeSet s;
15705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            try {
15805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE);
15905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            } catch (java.lang.IllegalArgumentException e) {
16005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
16105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                throw e;
16205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
16305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
16405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Get an expanded equivalent pattern from the UnicodeSet.
16505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // This removes set difference operators, which would fail if passed through to Java regex.
16605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
16705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuffer expandedPattern = new StringBuffer();
16805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            s._generatePattern(expandedPattern, true);
16905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            expandedDefString = expandedPattern.toString();
17005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (fMonkeyImpl.fDumpExpansions) {
17105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("expandedDef2: %s\n", expandedDefString);
17205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
17305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
17405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            CharClass cclass = new CharClass(name, definition, expandedDefString, s);
17505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            CharClass previousClass = fCharClasses.put(name, cclass);
17605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
17705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (previousClass != null) {
17805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // TODO: decide whether or not to allow redefinitions.
17905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                //       Can be convenient in some cases.
18005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // String msg = String.format("%s: Redefinition of character class %s\n",
18105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                //         fMonkeyImpl.fRuleFileName, cclass.fName);
18205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // System.err.println(msg);
18305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // throw new IllegalArgumentException(msg);
18405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
18505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return cclass;
18605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
18705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
18805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
18905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
19005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void addRule(String  name, String  definition) {
19105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            BreakRule  thisRule = new BreakRule();
19205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuffer expandedDefsRule = new StringBuffer();
19305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fName = name;
19405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fRule = definition;
19505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
19605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Expand the char class definitions within the rule.
19705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSetRefsMatcher.reset(definition);
19805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (fSetRefsMatcher.find()) {
19905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
20005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass nameClass = fCharClasses.get(sname);
20105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (nameClass == null) {
20205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
20305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
20405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
20505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
20605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                expandedDefsRule.append(expansionForName);
20705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
20805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fSetRefsMatcher.appendTail(expandedDefsRule);
20905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
21005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
21105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
21205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // does not recognize all properties, and because Java's definitions are likely
21305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // older than ICU's.
21405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
21505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuffer expandedRule = new StringBuffer();
21605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fPropertyMatcher.reset(expandedDefsRule);
21705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (fPropertyMatcher.find()) {
21805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String prop = fPropertyMatcher.group();
21905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
22005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                StringBuffer propExpansion = new StringBuffer();
22105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                propSet._generatePattern(propExpansion, true);
22205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString());
22305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
22405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fPropertyMatcher.appendTail(expandedRule);
22505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
22605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //   Replace any [^negated sets] with equivalent flattened sets generated by
22705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
22805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //   to any nested classes. Variable substitution in rules produces
22905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //   nested sets that [^negation] needs to apply to.
23005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
23105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuffer ruleWithFlattenedSets = new StringBuffer();
23205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int idx = 0;
23305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (idx<expandedRule.length()) {
23405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int setOpenPos = expandedRule.indexOf("[^", idx);
23505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (setOpenPos < 0) {
23605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
23705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
23805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (setOpenPos > idx) {
23905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
24005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
24105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
24205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int nestingLevel = 1;
24305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                boolean haveNesting = false;
24405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int setClosePos;
24505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) {
24605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    char c = expandedRule.charAt(setClosePos);
24705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (c == '\\') {
24805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        ++setClosePos;
24905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    } else if (c == '[') {
25005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        ++nestingLevel;
25105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        haveNesting = true;
25205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    } else if (c == ']') {
25305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        --nestingLevel;
25405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
25505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
25605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (haveNesting && nestingLevel == 0) {
25705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Found one, a negated set that includes interior nested sets.
25805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Create an ICU UnicodeSet from the source pattern, and obtain an
25905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // equivalent flattened pattern from that.
26005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
26105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    uset._generatePattern(ruleWithFlattenedSets, true);
26205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else {
26305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // The [^ set definition did not include any nested sets.
26405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Copy the original definition without change.
26505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Java regular expressions will handle it without needing to recast it.
26605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (nestingLevel > 0) {
26705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // Error case of an unclosed character class expression.
26805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // Java regex will also eventually flag the error.
26905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        System.err.printf("No closing ] found in rule %s\n", name);
27005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
27105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
27205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
27305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                idx = setClosePos;
27405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
27505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
27605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (idx < expandedRule.length()) {
27705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length()));
27805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
27905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
28005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fExpandedRule = ruleWithFlattenedSets.toString();
28105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
28205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Replace the divide sign (\u00f7) with a regular expression named capture.
28305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // When running the rules, a match that includes this group means we found a break position.
28405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
28505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)");
28605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()");
28705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (thisRule.fExpandedRule.indexOf("÷") != -1) {
28805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
28905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.err.println(msg);
29005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                throw new IllegalArgumentException(msg);
29105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
29205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
29305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // UAX break rule set definitions can be empty, just [].
29405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
29505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // also matches nothing.
29605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
29705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]");
29805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
29905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer)
30005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //    \udddd     => \x{dddd}
30105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            //    \U00hhhhhh => \x{hhhhhh}
30205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
30305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}");
30405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}");
30505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
30605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character
30705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // within a regular expression character class. Put them in as unescaped literal chars.
30805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuilder sb = new StringBuilder(thisRule.fExpandedRule);
30905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (true) {
31005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int where = sb.indexOf("\\U00");
31105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (where < 0) {
31205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
31305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
31405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String cp = hexToCodePoint(sb.substring(where+2, where+10));
31505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                sb.replace(where, where+10, cp);
31605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
31705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fExpandedRule = sb.toString();
31805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
31905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment.
32005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // UnicodeSet._generatePattern() inserts un-escaped "#"s
32105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
32205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#");
32305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (fMonkeyImpl.fDumpExpansions) {
32405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
32505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
32605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
32705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Compile a regular expression for this rule.
32805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
32905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            try {
33005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
33105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            } catch (PatternSyntaxException e) {
33205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
33305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
33405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                throw e;
33505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
33605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
33705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Put this new rule into the vector of all Rules.
33805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
33905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBreakRules.add(thisRule);
34005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
34105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
34205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        private static String hexToCodePoint(String hex) {
34305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int cp = Integer.parseInt(hex, 16);
34405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return new StringBuilder().appendCodePoint(cp).toString();
34505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
34605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
34705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
34805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean setKeywordParameter(String keyword, String value) {
34905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (keyword.equals("locale")) {
35005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fLocale = new ULocale(value);
35105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                return true;
35205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
35305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (keyword.equals("type")) {
35405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (value.equals("grapheme")) {
35505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fType = BreakIterator.KIND_CHARACTER;
35605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else if (value.equals("word")) {
35705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fType = BreakIterator.KIND_WORD;
35805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else if (value.equals("line")) {
35905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fType = BreakIterator.KIND_LINE;
36005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else if (value.equals("sentence")) {
36105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fType = BreakIterator.KIND_SENTENCE;
36205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else {
36305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
36405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.println(msg);
36505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new IllegalArgumentException(msg);
36605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
36705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                return true;
36805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
36905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return false;
37005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
37105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
37205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
37305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        RuleBasedBreakIterator createICUBreakIterator() {
37405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            BreakIterator bi;
37505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            switch(fType) {
37605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                case BreakIterator.KIND_CHARACTER:
37705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    bi = (BreakIterator.getCharacterInstance(fLocale));
37805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
37905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                case BreakIterator.KIND_WORD:
38005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    bi = (BreakIterator.getWordInstance(fLocale));
38105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
38205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                case BreakIterator.KIND_LINE:
38305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    bi = (BreakIterator.getLineInstance(fLocale));
38405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
38505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                case BreakIterator.KIND_SENTENCE:
38605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    bi = (BreakIterator.getSentenceInstance(fLocale));
38705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    break;
38805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                default:
38905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
39005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.println(msg);
39105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new IllegalArgumentException(msg);
39205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
39305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return (RuleBasedBreakIterator)bi;
39405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
39505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
39605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
39705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
39805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
39905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void compileRules(String rules) {
40005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int lineNumber = 0;
40105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (String line: rules.split("\\r?\\n")) {
40205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                ++lineNumber;
40305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Strip comment lines.
40405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fCommentsMatcher.reset(line);
40505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                line = fCommentsMatcher.replaceFirst("");
40605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (line.isEmpty()) {
40705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
40805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
40905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
41005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Recognize character class definition and keyword lines
41105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fClassDefMatcher.reset(line);
41205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (fClassDefMatcher.matches()) {
41305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
41405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
41505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fMonkeyImpl.fDumpExpansions) {
41605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        System.out.printf("scanned class: %s = %s\n", className, classDef);
41705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
41805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (setKeywordParameter(className, classDef)) {
41905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // The scanned item was "type = ..." or "locale = ...", etc.
42005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        //   which are not actual character classes.
42105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        continue;
42205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
42305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    addCharClass(className, classDef);
42405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
42505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
42605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
42705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Recognize rule lines.
42805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fRuleDefMatcher.reset(line);
42905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (fRuleDefMatcher.matches()) {
43005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
43105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
43205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fMonkeyImpl.fDumpExpansions) {
43305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
43405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
43505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    addRule(ruleName, ruleDef);
43605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
43705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
43805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
43905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
44005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fMonkeyImpl.fRuleFileName, lineNumber, line);
44105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.err.println(msg);
44205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                throw new IllegalArgumentException(msg);
44305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
44405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
44505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Build the vector of char classes, omitting the dictionary class if there is one.
44605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // This will be used when constructing the random text to be tested.
44705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
44805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Also compute the "other" set, consisting of any characters not included in
44905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // one or more of the user defined sets.
45005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
45105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff);
45205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
45305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) {
45405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String ccName = el.getKey();
45505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass cclass = el.getValue();
45605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
45705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // System.out.printf("    Adding %s\n", ccName);
45805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (!ccName.equals(cclass.fName)) {
45905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new IllegalArgumentException(
46005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
46105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                    fMonkeyImpl.fRuleFileName, ccName, cclass.fName));
46205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
46305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                otherSet.removeAll(cclass.fSet);
46405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (ccName.equals("dictionary")) {
46505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fDictionarySet = cclass.fSet;
46605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else {
46705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fCharClassList.add(cclass);
46805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
46905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
47005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
47105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (!otherSet.isEmpty()) {
47205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // System.out.printf("have an other set.\n");
47305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass cclass = addCharClass("__Others", otherSet.toPattern(true));
47405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fCharClassList.add(cclass);
47505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
47605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
47705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
47805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
47905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        CharClass getClassForChar(int c) {
48005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (CharClass cc: fCharClassList) {
48105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (cc.fSet.contains(c)) {
48205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    return cc;
48305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
48405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
48505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return null;
48605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
48705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
48805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
48905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        RBBIMonkeyImpl          fMonkeyImpl;        // Reference back to the owning MonkeyImpl instance.
49005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        List<BreakRule>         fBreakRules;        // The break rules; addRule() appends each new rule here.
49105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
49205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Map<String, CharClass>  fCharClasses;       // Key is the set name.
49305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        //                                          // Value is the corresponding CharClass
49405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        List<CharClass>         fCharClassList;     // Classes from fCharClasses except "dictionary", plus "__Others" when non-empty; filled by compileRules().
49505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
49605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined; compileRules() takes it from the "dictionary" class.
49705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        ULocale                 fLocale;            // Set by the "locale" keyword; passed to the BreakIterator factory methods.
49805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int                     fType;              // BreakIterator.KIND_WORD, etc.; set by the "type" keyword.
49905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
50005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
50105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher fSetRefsMatcher;
50205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher fCommentsMatcher;    // compileRules() deletes its first match from each line to strip comments.
50305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher fClassDefMatcher;    // Recognizes character class definition and keyword lines; group 1 = name, group 2 = definition.
50405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher fRuleDefMatcher;     // Recognizes rule lines; group 1 = rule name, group 2 = rule definition.
50505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        Matcher fPropertyMatcher;
50605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    };
50705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
50805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
50905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
51005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
51105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // class MonkeyTestData    represents a randomly synthesized test data string together
51205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                         with the expected break positions obtained by applying
51305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                         the test break rules.
51405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
51505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    static class MonkeyTestData{
51605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
51705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void set(BreakRules rules, ICU_Rand rand) {
51805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int dataLength = 1000;   // length of test data to generate, in code points.
51905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
52005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Fill the test string with random characters.
52105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // First randomly pick a char class, then randomly pick a character from that class.
52205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Exclude any characters from the dictionary set.
52305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
52405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // System.out.println("Populating Test Data");
52505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
52605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                                  // allowing recreation of failing data.
52705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBkRules = rules;
52805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuilder newString = new StringBuilder();
52905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int n=0; n<dataLength;) {
53005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int charClassIndex = rand.next() % rules.fCharClassList.size();
53105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass cclass = rules.fCharClassList.get(charClassIndex);
53205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (cclass.fSet.size() == 0) {
53305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Some rules or tailorings do end up with empty char classes.
53405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
53505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
53605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int charIndex = rand.next() % cclass.fSet.size();
53705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int c = cclass.fSet.charAt(charIndex);
53805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
53905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
54005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
54105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Don't let random unpaired surrogates combine in the test data because they might
54205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // produce an unwanted dictionary character.
54305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
54405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
54505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
54605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (!rules.fDictionarySet.contains(c)) {
54705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    newString.appendCodePoint(c);
54805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    ++n;
54905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
55005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
55105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fString = newString.toString();
55205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
55305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Init the expectedBreaks, actualBreaks and ruleForPosition.
55405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Expected and Actual breaks are one longer than the input string; a true value
55505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // will indicate a boundary preceding that position.
55605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
55705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fActualBreaks    = new boolean[fString.length()+1];
55805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fExpectedBreaks  = new boolean[fString.length()+1];
55905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleForPosition = new int[fString.length()+1];
56005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            f2ndRuleForPos   = new int[fString.length()+1];
56105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
56205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            // Apply reference rules to find the expected breaks.
56305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
56405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
56505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                             // ICU always reports a break there.
56605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                             // The reference rules do not have a means to do so.
56705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int strIdx = 0;
56805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            while (strIdx < fString.length()) {
56905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                BreakRule matchingRule = null;
57005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                boolean hasBreak = false;
57105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int ruleNum = 0;
57205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int matchStart = 0;
57305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int matchEnd = 0;
57405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
57505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    BreakRule rule = rules.fBreakRules.get(ruleNum);
57605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    rule.fRuleMatcher.reset(fString.substring(strIdx));
57705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (rule.fRuleMatcher.lookingAt()) {
57805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // A candidate rule match, check further to see if we take it or continue to check other rules.
57905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // Matches of zero or one code point count only if they also specify a break.
58005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        matchStart = strIdx;
58105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        matchEnd = strIdx + rule.fRuleMatcher.end();
58205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
58305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        if (hasBreak ||
58405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
58505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            matchingRule = rule;
58605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            break;
58705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        }
58805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
58905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
59005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (matchingRule == null) {
59105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // No reference rule matched. This is an error in the rules that should never happen.
59205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String msg = String.format("%s: No reference rules matched at position %d. ",
59305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            rules.fMonkeyImpl.fRuleFileName, strIdx);
59405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.println(msg);
59505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    dump(strIdx);
59605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new IllegalArgumentException(msg);
59705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
59805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (matchingRule.fRuleMatcher.group().length() == 0) {
59905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Zero length rule match. This is also an error in the rule expressions.
60005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String msg = String.format("%s:%s: Zero length rule match at %d.",
60105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
60205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.println(msg);
60305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    dump(strIdx);
60405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new IllegalArgumentException(msg);
60505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
60605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
60705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Record which rule matched over the length of the match.
60805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                for (int i = matchStart; i < matchEnd; i++) {
60905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fRuleForPosition[i] == 0) {
61005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fRuleForPosition[i] = ruleNum;
61105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    } else {
61205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        f2ndRuleForPos[i] = ruleNum;
61305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
61405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
61505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
61605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Break positions appear in rules as a matching named capture of zero length at the break position,
61705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                //   the adjusted pattern contains (?<BreakPosition>)
61805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (hasBreak) {
61905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
62005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fExpectedBreaks[breakPos] = true;
62105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // System.out.printf("recording break at %d\n", breakPos);
62205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // For the next iteration, pick up applying rules immediately after the break,
62305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // which may differ from end of the match. The matching rule may have included
62405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // context following the boundary that needs to be looked at again.
62505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    strIdx = breakPos;
62605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } else {
62705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Original rule didn't specify a break.
62805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Continue applying rules starting on the last code point of this match.
62905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
63005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (updatedStrIdx == matchStart) {
63105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // Match was only one code point, no progress if we continue.
63205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        // Shouldn't get here, case is filtered out at top of loop.
63305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
63405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
63505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
63605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    strIdx = updatedStrIdx;
63705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
63805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
63905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
64005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
64105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // Helper function to find the starting index of a match of the "BreakPosition" named capture group.
64205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // @param m: a Java regex Matcher that has completed a matching operation.
64305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // @return m.start("BreakPosition),
64405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        //         or -1 if there is no such group, or the group did not participate in the match.
64505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        //
64605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // TODO: this becomes m.start("BreakPosition") with Java 8.
64705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        //       In the mean time, assume that the only zero-length capturing group in
64805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        //       a reference rule expression is the "BreakPosition" that corresponds to a "÷".
64905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
65005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        static int BreakGroupStart(Matcher m) {
65105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) {
65205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String group = m.group(groupNum);
65305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (group == null) {
65405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
65505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
65605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (group.equals("")) {
65705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // assert(m.end(groupNum) == m.end("BreakPosition"));
65805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    return m.start(groupNum);
65905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
66005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
66105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return -1;
66205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
66305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
66405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void dump(int around) {
66505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            System.out.print("\n"
66605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    +        "         char                        break  Rule                     Character\n"
66705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    +        "   pos   code   class                 R I   name                     name\n"
66805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    +        "---------------------------------------------------------------------------------------------\n");
66905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
67005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int start;
67105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int end;
67205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
67305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (around == -1) {
67405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                start = 0;
67505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                end = fString.length();
67605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            } else {
67705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // Display context around a failure.
67805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                try {
67905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    start = fString.offsetByCodePoints(around, -30);
68005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } catch (Exception e) {
68105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    start = 0;
68205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
68305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                try {
68405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    end = fString.offsetByCodePoints(around, +30);
68505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } catch (Exception e) {
68605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    end = fString.length();
68705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
68805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
68905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
69005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) {
69105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int c = fString.codePointAt(charIdx);
69205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                CharClass cc = fBkRules.getClassForChar(c);
69305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
69405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
69505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String secondRuleName = "";
69605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (f2ndRuleForPos[charIdx] > 0) {
69705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName;
69805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
69905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);
70005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
70105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
70205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        charIdx, c, cc.fName,
70305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fExpectedBreaks[charIdx] ? '*' : '.',
70405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fActualBreaks[charIdx] ? '*' : '.',
70505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        rule.fName, secondRuleName, cName
70605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        );
70705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
70805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
70905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
71005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
71105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void clearActualBreaks() {
71205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            Arrays.fill(fActualBreaks, false);
71305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
71405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
71505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
71605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int               fRandomSeed;        // The initial seed value from the random number generator.
71705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        BreakRules        fBkRules;           // The break rules used to generate this data.
71805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String            fString;            // The text.
71905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
72005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                              //     Parallel to fString. true if break preceding.
72105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
72205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
72305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                              // Also parallel to fString.
72405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
72505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                              //   didn't cause a break, and a subsequent rule match starts
72605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                                              //   on the last code point of the preceding match.
72705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
72805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    }
72905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
73005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
73105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
73205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                          test for one set of break rules.
73305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //
73405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
73505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    static class RBBIMonkeyImpl extends Thread {
73605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
73705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void setup(String ruleFile) {
73805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleFileName = ruleFile;
73905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            openBreakRules(ruleFile);
74005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleSet = new BreakRules(this);
74105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleSet.compileRules(fRuleCharBuffer);
74205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBI = fRuleSet.createICUBreakIterator();
74305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fTestData = new MonkeyTestData();
74405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
74505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
74605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void openBreakRules(String fileName) {
74705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            StringBuilder testFileBuf = new StringBuilder();
74805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            InputStream is = null;
74905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            String filePath = "break_rules/" + fileName;
75005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            try {
75105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                is = RBBIMonkeyImpl.class.getResourceAsStream(filePath);
75205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (is == null) {
75305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    errln("Could not open test data file " + fileName);
75405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    return;
75505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
75605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                InputStreamReader isr = new InputStreamReader(is, "UTF-8");
75705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                try {
75805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    int c;
75905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    int count = 0;
76005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    for (;;) {
76105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        c = isr.read();
76205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        if (c < 0) {
76305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            break;
76405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        }
76505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        count++;
76605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        if (c == 0xFEFF && count == 1) {
76705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            // BOM in the test data file. Discard it.
76805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            continue;
76905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        }
77005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                       testFileBuf.appendCodePoint(c);
77105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
77205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } finally {
77305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    isr.close();
77405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
77505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } catch (IOException e) {
77605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                try {
77705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    is.close();
77805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } catch (IOException ignored) {
77905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
78005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                errln(e.toString());
78105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
78205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fRuleCharBuffer =  testFileBuf.toString();  /* the file as a String */
78305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
78405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
/**
 * Unchecked exception raised by the monkey test checks when the break iterator misbehaves.
 * Carries the position in the test data that the failure refers to.
 *
 * NOTE(review): this is declared as an inner (non-static) class although its body does not
 * use the enclosing instance; consider whether it should be a static nested class.
 */
class MonkeyException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    /** Position of the failure in the test data. */
    public int fPosition;

    /**
     * @param message  description of the failure; becomes the exception message.
     * @param position position of the failure in the test data.
     */
    MonkeyException(String message, int position) {
        super(message);
        this.fPosition = position;
    }
}
79305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
79405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        @Override
79505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        public void run() {
79605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int errorCount = 0;
79705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (fBI == null) {
79805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fErrorMsgs.append("Unable to run test because fBI is null.\n");
79905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                return;
80005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
80105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
80205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                try {
80305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fTestData.set(fRuleSet, fRandomGenerator);
80405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // fTestData.dump(-1);
80505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    testForwards();
80605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    testPrevious();
80705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    testFollowing();
80805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    testPreceding();
80905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    testIsBoundary();
81005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                } catch (MonkeyException e) {
81105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    String formattedMsg = String.format(
81205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
81305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                            e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
81405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.print(formattedMsg);
81505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fVerbose) {
81605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        fTestData.dump(e.fPosition);
81705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
81805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fErrorMsgs.append(formattedMsg);
81905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (++errorCount > 10) {
82005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        return;
82105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
82205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
82305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (fLoopCount < 0 && loopCount % 100 == 0) {
82405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    System.err.print(".");
82505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
82605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
82705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
82805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
/**
 * Direction tag handed to checkResults. The tests in this class pass FORWARD from
 * testForwards, testFollowing and testIsBoundary, and REVERSE from testPrevious and
 * testPreceding.
 */
enum CheckDirection { FORWARD, REVERSE }
83305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
83405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void testForwards() {
83505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fTestData.clearActualBreaks();
83605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBI.setText(fTestData.fString);
83705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int previousBreak = -2;
83805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
83905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (bk <= previousBreak) {
84005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new MonkeyException("Break Iterator Stall", bk);
84105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
84205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (bk < 0 || bk > fTestData.fString.length()) {
84305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new MonkeyException("Boundary out of bounds", bk);
84405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
84505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fTestData.fActualBreaks[bk] = true;
84605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
84705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            checkResults("testForwards", CheckDirection.FORWARD);
84805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
84905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
85005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
85105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert       void testFollowing() {
85205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           fTestData.clearActualBreaks();
85305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           fBI.setText(fTestData.fString);
85405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           int nextBreak = -1;
85505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           for (int i=-1 ; i<fTestData.fString.length(); ++i) {
85605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               int bk = fBI.following(i);
85705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               if (bk == BreakIterator.DONE && i == fTestData.fString.length()) {
85805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   continue;
85905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               }
86005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               if (bk == nextBreak && bk > i) {
86105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   // i is in the gap between two breaks.
86205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   continue;
86305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               }
86405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               if (i == nextBreak && bk > nextBreak) {
86505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   fTestData.fActualBreaks[bk] = true;
86605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   nextBreak = bk;
86705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                   continue;
86805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               }
86905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert               throw new MonkeyException("following(i)", i);
87005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           }
87105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert           checkResults("testFollowing", CheckDirection.FORWARD);
87205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
87305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
87405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
87505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void testPrevious() {
87605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fTestData.clearActualBreaks();
87705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBI.setText(fTestData.fString);
87805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int previousBreak = Integer.MAX_VALUE;
87905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
88005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                 if (bk >= previousBreak) {
88105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                     throw new MonkeyException("Break Iterator Stall", bk);
88205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
88305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (bk < 0 || bk > fTestData.fString.length()) {
88405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    throw new MonkeyException("Boundary out of bounds", bk);
88505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
88605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                fTestData.fActualBreaks[bk] = true;
88705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
88805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            checkResults("testPrevius", CheckDirection.REVERSE);
88905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
89005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
89105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
89205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        /**
89305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * Given an index into a string, if it refers to the trail surrogate of a surrogate pair,
89405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * adjust it to point to the lead surrogate, which is the start of the code point.
89505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * @param s the String.
89605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * @param i the initial index
89705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         * @return the adjusted index
89805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert         */
89905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        private int getChar32Start(String s, int i) {
90005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (i > 0 && i < s.length() &&
90105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) {
90205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                --i;
90305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
90405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            return i;
90505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
90605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
90705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
90805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void testPreceding() {
90905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fTestData.clearActualBreaks();
91005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBI.setText(fTestData.fString);
91105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            int nextBreak = fTestData.fString.length()+1;
91205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int i=fTestData.fString.length()+1 ; i>=0; --i) {
91305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                int bk = fBI.preceding(i);
91405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, nextBreak);
91505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (bk == BreakIterator.DONE && i == 0) {
91605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
91705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
91805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (bk == nextBreak && bk < i) {
91905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // i is in the gap between two breaks.
92005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
92105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
92205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) {
92305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // i indexes to a trailing surrogate.
92405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // Break Iterators treat an index to either half as referring to the supplemental code point,
92505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    // with preceding going to some preceding code point.
92605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) {
92705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        throw new MonkeyException("preceding of trailing surrogate error", i);
92805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
92905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
93005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
93105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (i == nextBreak && bk < nextBreak) {
93205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fTestData.fActualBreaks[bk] = true;
93305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    nextBreak = bk;
93405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    continue;
93505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
93605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                throw new MonkeyException("preceding(i)", i);
93705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
93805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            checkResults("testPreceding", CheckDirection.REVERSE);
93905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
94005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
94105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
94205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
94305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void testIsBoundary() {
94405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fTestData.clearActualBreaks();
94505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            fBI.setText(fTestData.fString);
94605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            for (int i=fTestData.fString.length(); i>=0; --i) {
94705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                if (fBI.isBoundary(i)) {
94805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    fTestData.fActualBreaks[i] = true;
94905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
95005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
95105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            checkResults("testForwards", CheckDirection.FORWARD);
95205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
95305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
95405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
95505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        void checkResults(String msg, CheckDirection direction) {
95605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            if (direction == CheckDirection.FORWARD) {
95705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                for (int i=0; i<=fTestData.fString.length(); ++i) {
95805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
95905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        throw new MonkeyException(msg, i);
96005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
96105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
96205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            } else {
96305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                for (int i=fTestData.fString.length(); i>=0; i--) {
96405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
96505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                        throw new MonkeyException(msg, i);
96605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                    }
96705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                }
96805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
96905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
97005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
97105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
97205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String                 fRuleCharBuffer;         // source file contents of the reference rules.
97305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        BreakRules             fRuleSet;
97405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        RuleBasedBreakIterator fBI;
97505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        MonkeyTestData         fTestData;
97605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        ICU_Rand               fRandomGenerator;
97705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String                 fRuleFileName;
97805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean                fVerbose;                 // True to do long dump of failing data.
97905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int                    fLoopCount;
98005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int                    fErrorCount;
98105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
98205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
98305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        StringBuilder          fErrorMsgs = new StringBuilder();
98405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
98505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    }
98605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
98705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  Test parameters, specified via Java properties.
98805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //
98905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  rules=file_name   Name of file containing the reference rules.
99005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  seed=nnnnn        Random number starting seed.
99105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                    Setting the seed allows errors to be reproduced.
99205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  loop=nnn          Looping count.  Controls running time.
99305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                    -1:  run forever.
99405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //                     0 or greater:  run length.
99505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  expansions        debug option, show expansions of rules and sets.
99605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //  verbose           Display details of the failure.
99705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //
99805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // Parameters are passed to the JVM on the command line, or
99905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
100005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    // For example,
100105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //      -ea -Drules=line.txt -Dloop=-1
100205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    //
100305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    @Test
100405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    public void TestMonkey() {
100505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
100605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
100705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        };
100805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
100905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String testNameFromParams = getProperty("rules");
101005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
101105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        if (testNameFromParams != null) {
101205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            tests = new String[] {testNameFromParams};
101305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
101405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
101505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
101605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean dumpExpansions =  getBooleanProperty("expansions", false);
101705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        boolean verbose = getBooleanProperty("verbose", false);
101805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        int seed = getIntProperty("seed", 1);
101905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
102005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();
102105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
102205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // Monkey testing is multi-threaded.
102305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // Each set of break rules to be tested is run in a separate thread.
102405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        // Each thread/set of rules gets a separate RBBIMonkeyImpl object.
102505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
102605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        for (String testName: tests) {
102705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            logln(String.format("beginning testing of %s", testName));
102805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
102905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            RBBIMonkeyImpl test = new RBBIMonkeyImpl();
103005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
103105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.fDumpExpansions = dumpExpansions;
103205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.fVerbose = verbose;
103305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.fRandomGenerator = new ICU_Rand(seed);
103405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.fLoopCount = loopCount;
103505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.setup(testName);
103605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
103705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            test.start();
103805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            startedTests.add(test);
103905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
104005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
104105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        StringBuilder errors = new StringBuilder();
104205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        for (RBBIMonkeyImpl test: startedTests) {
104305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            try {
104405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                test.join();
104505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                errors.append(test.fErrorMsgs);
104605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            } catch (InterruptedException e) {
104705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert                errors.append(e + "\n");
104805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert            }
104905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        }
105005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        String errorMsgs = errors.toString();
105105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert        assertEquals(errorMsgs, "", errorMsgs);
105205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
105305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert    }
105405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
105505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert
105605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert}
1057