test/rbbi/RBBIMonkeyTest.java

// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License

package com.ibm.icu.dev.test.rbbi;

import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;

import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.impl.UCharacterName;
import com.ibm.icu.impl.UCharacterNameChoice;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UnicodeSet;
import com.ibm.icu.util.ULocale;

/**
 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp.
 * This is the newer, data driven monkey test. It is completely separate from the
 * older class RBBITestMonkey.
 */

@RunWith(JUnit4.class)
public class RBBIMonkeyTest extends TestFmwk {


    //  class CharClass    Represents a single character class from the source break rules.
    //                     (In the ICU4C original this class inherits from UObject so that UHashtable can
    //                     adopt and delete instances; the Java port needs no such base class.)

    /**
     * Plain data holder describing one named character class from the break rule source.
     */
    static class CharClass {
        /** Name under which the class is referenced from rules and other class definitions. */
        String fName;

        /** Set definition exactly as it appeared in the user supplied rules. */
        String fOriginalDef;

        /** Set definition with any embedded named sets replaced by their definitions, recursively. */
        String fExpandedDef;

        /** The characters belonging to this class. */
        UnicodeSet fSet;

        CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) {
            this.fName = name;
            this.fOriginalDef = originalDef;
            this.fExpandedDef = expandedDef;
            this.fSet = set;
        }
    }


    // class BreakRule    Struct-like class represents a single rule from a set of break rules.
    //                    Each rule has the set definitions expanded, and
    //                    is compiled to a regular expression.

    // All fields are filled in by BreakRules.addRule(); there is no constructor.
    static class BreakRule {
        String    fName;                   // Name of the rule, used in diagnostics and in dump() output.
        String    fRule;                   // Rule expression, excluding the name, as written in user source.
        String    fExpandedRule;           // Rule expression after expanding the set definitions and
                                           //   rewriting it into a form accepted by java.util.regex.
        Matcher   fRuleMatcher;            // Matcher for the regular expression compiled from fExpandedRule.
                                           //   It is reset onto new input each time the rule is applied.
    };


    // class BreakRules    represents a complete set of break rules, possibly tailored,
    //                     compiled from testdata break rules.

    static class BreakRules {
        BreakRules(RBBIMonkeyImpl monkeyImpl) {
            fMonkeyImpl = monkeyImpl;
            fBreakRules = new ArrayList<BreakRule>();
            fType = BreakIterator.KIND_TITLE;
            fCharClasses = new HashMap<String, CharClass>();
            fCharClassList = new ArrayList<CharClass>();
            fDictionarySet = new UnicodeSet();

            // Match an alpha-numeric identifier in a rule. Will be a set name.
            // Use negative look-behind to exclude non-identifiers, mostly property names or values.
            fSetRefsMatcher = Pattern.compile(
                    "(?<!\\{[ \\t]{0,4})" +
                    "(?<!=[ \\t]{0,4})" +
                    "(?<!\\[:[ \\t]{0,4})" +
                    "(?<!\\\\)" +
                    "(?<![A-Za-z0-9_])" +
                    "([A-Za-z_][A-Za-z0-9_]*)").     // The char class name
                    matcher("");

            // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules.
            fCommentsMatcher = Pattern.compile("" +
                    "(^|(?<=;))"   +                // Start either at start of line, or just after a ';' (look-behind for ';')
                    "[ \\t]*+"     +                //   Match white space.
                    "(#.*)?+"      +                //   Optional # plus whatever follows
                    "$").                           //   new-line at end of line.
                    matcher("");

            // Match (initial parse) of a character class definition line.
            fClassDefMatcher = Pattern.compile("" +
                    "[ \\t]*"           +                    // leading white space
                    "([A-Za-z_][A-Za-z0-9_]*)" +             // The char class name
                    "[ \\t]*=[ \\t]*"   +                    //   =
                    "(.*?)"  +                               // The char class UnicodeSet expression
                    "[ \\t]*;$").                            // ; <end of line>
                    matcher("");

            // Match (initial parse) of a break rule line.
            fRuleDefMatcher = Pattern.compile("" +
                    "[ \\t]*"           +                     // leading white space
                    "([A-Za-z_][A-Za-z0-9_.]*)" +             // The rule name
                    "[ \\t]*:[ \\t]*"   +                     //   :
                    "(.*?)"   +                               // The rule definition
                    "[ \\t]*;$").                             // ; <end of line>
                    matcher("");

            // Match a property expression, either [:xxx:] or \p{...}
            fPropertyMatcher = Pattern.compile("" +
                    "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}").
                    matcher("");


        }

        /**
         * Defines (or redefines) a named character class.
         *
         * Every reference to an already defined set inside the definition is replaced by that set's
         * expanded definition; unknown identifiers are left as written. The result is validated by
         * building a UnicodeSet from it, and the stored expanded definition is the pattern that the
         * UnicodeSet generates.
         *
         * @param name        the name of the character class.
         * @param definition  the set expression as written in the rule source.
         * @return the newly registered CharClass.
         * @throws IllegalArgumentException if the expanded definition is not a valid UnicodeSet pattern.
         */
        CharClass  addCharClass(String name, String definition) {
            // Splice the expansion of each referenced set into the definition text.
            StringBuffer withRefsExpanded = new StringBuffer();
            Matcher refs = fSetRefsMatcher.reset(definition);
            while (refs.find()) {
                String refName = refs.group(/*"ClassName"*/ 1);
                CharClass referenced = fCharClasses.get(refName);

                // Drop the matched name, then append literally so that the expansion is never
                // interpreted as a regex replacement string.
                refs.appendReplacement(withRefsExpanded, "");
                if (referenced == null) {
                    withRefsExpanded.append(refName);
                } else {
                    withRefsExpanded.append(referenced.fExpandedDef);
                }
            }
            refs.appendTail(withRefsExpanded);
            String rawExpansion = withRefsExpanded.toString();

            final boolean dumping = fMonkeyImpl.fDumpExpansions;
            if (dumping) {
                System.out.printf("addCharClass(\"%s\"\n", name);
                System.out.printf("             %s\n", definition);
                System.out.printf("expandedDef: %s\n", rawExpansion);
            }

            // Building the set doubles as validation of the expanded definition.
            UnicodeSet set;
            try {
                set = new UnicodeSet(rawExpansion, UnicodeSet.IGNORE_SPACE);
            } catch (IllegalArgumentException e) {
                System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name);
                throw e;
            }

            // Ask the UnicodeSet for an equivalent pattern. This removes set difference operators,
            // which would fail if passed through to Java regex.
            StringBuffer generated = new StringBuffer();
            set._generatePattern(generated, true);
            String normalizedExpansion = generated.toString();
            if (dumping) {
                System.out.printf("expandedDef2: %s\n", normalizedExpansion);
            }

            CharClass result = new CharClass(name, definition, normalizedExpansion, set);

            // TODO: decide whether or not to allow redefinitions. Can be convenient in some cases.
            //       For now a redefinition silently replaces the earlier class of the same name.
            fCharClasses.put(name, result);
            return result;
        }


        /**
         * Compiles one break rule and appends it to fBreakRules.
         *
         * The rule expression is rewritten, step by step, into a Java regular expression:
         * set names are replaced by their expanded definitions, property expressions and negated
         * nested sets are replaced by flat equivalents generated by UnicodeSet, the break marker
         * (÷) becomes an empty capturing group, empty sets and supplementary-character escapes are
         * put into a form java.util.regex accepts, and literal '#' characters are escaped.
         *
         * @param name        the rule name, used in diagnostics.
         * @param definition  the rule expression, excluding the name, as written in the rule source.
         * @throws IllegalArgumentException if the rule contains more than one ÷ sign, or if a set
         *                                  expression in it is rejected by UnicodeSet.
         * @throws PatternSyntaxException   if the rewritten rule is not a valid regular expression.
         */
        void addRule(String  name, String  definition) {
            BreakRule  thisRule = new BreakRule();
            thisRule.fName = name;
            thisRule.fRule = definition;

            // Expand the char class definitions within the rule.
            // Unknown names are reported but left in place, as written.
            StringBuffer expandedDefsRule = new StringBuffer();
            fSetRefsMatcher.reset(definition);
            while (fSetRefsMatcher.find()) {
                String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1);
                CharClass nameClass = fCharClasses.get(sname);
                if (nameClass == null) {
                    System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition);
                }
                String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname;
                fSetRefsMatcher.appendReplacement(expandedDefsRule, "");
                expandedDefsRule.append(expansionForName);
            }
            fSetRefsMatcher.appendTail(expandedDefsRule);

            // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion,
            // obtained from ICU UnicodeSet. Need to do this substitution because Java regex
            // does not recognize all properties, and because Java's definitions are likely
            // older than ICU's.

            StringBuffer expandedRule = new StringBuffer();
            fPropertyMatcher.reset(expandedDefsRule);
            while (fPropertyMatcher.find()) {
                String prop = fPropertyMatcher.group();
                UnicodeSet propSet = new UnicodeSet("[" + prop + "]");
                StringBuffer propExpansion = new StringBuffer();
                propSet._generatePattern(propExpansion, true);
                // The generated pattern contains backslash escapes. Append it literally rather
                // than handing it to appendReplacement(), which would treat '\' and '$' in a
                // replacement string as special and silently strip the backslashes.
                fPropertyMatcher.appendReplacement(expandedRule, "");
                expandedRule.append(propExpansion.toString());
            }
            fPropertyMatcher.appendTail(expandedRule);

            //   Replace any [^negated sets] with equivalent flattened sets generated by
            //   ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply
            //   to any nested classes. Variable substitution in rules produces
            //   nested sets that [^negation] needs to apply to.

            StringBuffer ruleWithFlattenedSets = new StringBuffer();
            final int ruleLength = expandedRule.length();
            int idx = 0;
            while (idx < ruleLength) {
                int setOpenPos = expandedRule.indexOf("[^", idx);
                if (setOpenPos < 0) {
                    break;
                }
                if (setOpenPos > idx) {
                    // Move anything from the source rule preceding the [^ into the processed rule, unchanged.
                    ruleWithFlattenedSets.append(expandedRule.substring(idx,  setOpenPos));
                }

                // Scan for the ']' that closes this set, tracking nested '[' ... ']' pairs.
                int nestingLevel = 1;
                boolean haveNesting = false;
                int setClosePos;
                for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos < ruleLength; ++setClosePos) {
                    char c = expandedRule.charAt(setClosePos);
                    if (c == '\\') {
                        ++setClosePos;      // Skip over the escaped character.
                    } else if (c == '[') {
                        ++nestingLevel;
                        haveNesting = true;
                    } else if (c == ']') {
                        --nestingLevel;
                    }
                }
                if (setClosePos > ruleLength) {
                    // A backslash was the last character of the rule, and skipping its (missing)
                    // escaped character stepped past the end. Clamp, so that the malformed rule
                    // gets reported below instead of failing in substring().
                    setClosePos = ruleLength;
                }

                if (haveNesting && nestingLevel == 0) {
                    // Found one, a negated set that includes interior nested sets.
                    // Create an ICU UnicodeSet from the source pattern, and obtain an
                    // equivalent flattened pattern from that.
                    UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true);
                    uset._generatePattern(ruleWithFlattenedSets, true);
                } else {
                    // The [^ set definition did not include any nested sets.
                    // Copy the original definition without change.
                    // Java regular expressions will handle it without needing to recast it.
                    if (nestingLevel > 0) {
                        // Error case of an unclosed character class expression.
                        // Java regex will also eventually flag the error.
                        System.err.printf("No closing ] found in rule %s\n", name);
                    }
                    ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos));
                }
                idx = setClosePos;
            }

            if (idx < ruleLength) {
                ruleWithFlattenedSets.append(expandedRule.substring(idx, ruleLength));
            }

            String rule = ruleWithFlattenedSets.toString();

            // A rule may contain at most one divide sign (\u00f7). This has to be checked before
            // the substitution below: String.replace() replaces every occurrence, so none would
            // remain to be detected afterwards.

            int firstDivide = rule.indexOf("÷");
            if (firstDivide >= 0 && rule.indexOf("÷", firstDivide + 1) >= 0) {
                String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name);
                System.err.println(msg);
                throw new IllegalArgumentException(msg);
            }

            // Replace the divide sign with an empty capturing group.
            // When running the rules, a match that includes this group means we found a break position.
            // (With named capture support this would be "(?<BreakPosition>)"; see BreakGroupStart().)

            rule = rule.replace("÷", "()");

            // UAX break rule set definitions can be empty, just [].
            // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which
            // also matches nothing.

            rule = rule.replace("[]", "[a&&[^a]]");

            // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character
            // within a regular expression character class. Put them in as unescaped literal chars.
            // (From Java 7 on, \U00hhhhhh could instead be rewritten as \x{hhhhhh}.)
            // Each escape is 10 chars long: "\U" followed by 8 hex digits.
            StringBuilder sb = new StringBuilder(rule);
            while (true) {
                int where = sb.indexOf("\\U00");
                if (where < 0) {
                    break;
                }
                String cp = hexToCodePoint(sb.substring(where+2, where+10));
                sb.replace(where, where+10, cp);
            }
            rule = sb.toString();

            // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment,
            // because the expression is compiled with Pattern.COMMENTS.
            // UnicodeSet._generatePattern() inserts un-escaped "#"s

            rule = rule.replace("#", "\\#");
            thisRule.fExpandedRule = rule;
            if (fMonkeyImpl.fDumpExpansions) {
                System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule);
            }

            // Compile a regular expression for this rule.

            try {
                thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher("");
            } catch (PatternSyntaxException e) {
                System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"",
                        fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule);
                throw e;
            }

            // Put this new rule into the list of all Rules.

            fBreakRules.add(thisRule);
        }

        /**
         * Converts the hexadecimal spelling of a code point into the string holding that character.
         *
         * @param hex  hex digits of a Unicode code point, e.g. "0001F600".
         * @return a String of one code point (two chars for a supplementary character).
         * @throws NumberFormatException    if hex is not a parsable hexadecimal int.
         * @throws IllegalArgumentException if the value is not a valid Unicode code point.
         */
        private static String hexToCodePoint(String hex) {
            final int codePoint = Integer.parseInt(hex, 16);
            final char[] utf16 = Character.toChars(codePoint);
            return new String(utf16);
        }


        /**
         * Handles a "keyword = value" line that sets a parameter rather than defining a char class.
         *
         * @param keyword  the identifier to the left of the '='.
         * @param value    the text to the right of the '='.
         * @return true if keyword was "locale" or "type" and the setting was applied;
         *         false if it is not a keyword, in which case nothing is changed.
         * @throws IllegalArgumentException if keyword is "type" and value is not one of
         *                                  grapheme, word, line or sentence.
         */
        boolean setKeywordParameter(String keyword, String value) {
            if (keyword.equals("locale")) {
                fLocale = new ULocale(value);
                return true;
            }
            if (!keyword.equals("type")) {
                return false;
            }

            final int breakKind;
            if (value.equals("grapheme")) {
                breakKind = BreakIterator.KIND_CHARACTER;
            } else if (value.equals("word")) {
                breakKind = BreakIterator.KIND_WORD;
            } else if (value.equals("line")) {
                breakKind = BreakIterator.KIND_LINE;
            } else if (value.equals("sentence")) {
                breakKind = BreakIterator.KIND_SENTENCE;
            } else {
                String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value);
                System.err.println(msg);
                throw new IllegalArgumentException(msg);
            }
            fType = breakKind;
            return true;
        }


        /**
         * Creates the ICU break iterator that corresponds to the type and locale of these rules.
         *
         * @return a break iterator of kind fType for fLocale.
         * @throws IllegalArgumentException if fType is not one of the character, word, line or
         *                                  sentence kinds.
         */
        RuleBasedBreakIterator createICUBreakIterator() {
            final BreakIterator iterator;
            if (fType == BreakIterator.KIND_CHARACTER) {
                iterator = BreakIterator.getCharacterInstance(fLocale);
            } else if (fType == BreakIterator.KIND_WORD) {
                iterator = BreakIterator.getWordInstance(fLocale);
            } else if (fType == BreakIterator.KIND_LINE) {
                iterator = BreakIterator.getLineInstance(fLocale);
            } else if (fType == BreakIterator.KIND_SENTENCE) {
                iterator = BreakIterator.getSentenceInstance(fLocale);
            } else {
                String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType);
                System.err.println(msg);
                throw new IllegalArgumentException(msg);
            }
            return (RuleBasedBreakIterator) iterator;
        }


        /**
         * Parses the text of a rule file, line by line, and populates this rule set from it.
         *
         * Each non-blank, non-comment line must be a keyword setting, a character class definition
         * or a break rule. After all lines are processed, the list of character classes used for
         * generating test data is built, including an "__Others" class for any code points that
         * none of the defined classes contain.
         *
         * @param rules  the complete contents of a rule file.
         * @throws IllegalArgumentException if a line is not recognized, or if processing of a
         *                                  definition or rule fails.
         */
        void compileRules(String rules) {
            final String[] sourceLines = rules.split("\\r?\\n");
            for (int i = 0; i < sourceLines.length; i++) {
                final int lineNumber = i + 1;

                // Strip comments; skip lines with nothing else on them.
                final String line = fCommentsMatcher.reset(sourceLines[i]).replaceFirst("");
                if (line.isEmpty()) {
                    continue;
                }

                if (fClassDefMatcher.reset(line).matches()) {
                    // A character class definition or a keyword line; they share the same syntax.
                    final String className = fClassDefMatcher.group(/*"ClassName"*/ 1);
                    final String classDef  = fClassDefMatcher.group(/*"ClassDef"*/ 2);
                    if (fMonkeyImpl.fDumpExpansions) {
                        System.out.printf("scanned class: %s = %s\n", className, classDef);
                    }
                    // "type = ..." or "locale = ...", etc. are consumed by setKeywordParameter()
                    // and are not actual character classes.
                    if (!setKeywordParameter(className, classDef)) {
                        addCharClass(className, classDef);
                    }
                } else if (fRuleDefMatcher.reset(line).matches()) {
                    // A break rule.
                    final String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1);
                    final String ruleDef  = fRuleDefMatcher.group(/*"RuleDef"*/ 2);
                    if (fMonkeyImpl.fDumpExpansions) {
                        System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef);
                    }
                    addRule(ruleName, ruleDef);
                } else {
                    String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"",
                            fMonkeyImpl.fRuleFileName, lineNumber, line);
                    System.err.println(msg);
                    throw new IllegalArgumentException(msg);
                }
            }

            // Build the list of char classes, omitting the dictionary class if there is one.
            // This will be used when constructing the random text to be tested.
            //
            // Along the way, whittle the full Unicode range down to the "other" set: the
            // characters not included in any of the user defined sets.

            final UnicodeSet unclaimed = new UnicodeSet(0, 0x10ffff);
            for (Map.Entry<String, CharClass> entry : fCharClasses.entrySet()) {
                final String key = entry.getKey();
                final CharClass charClass = entry.getValue();

                if (!key.equals(charClass.fName)) {
                    throw new IllegalArgumentException(
                            String.format("%s: internal error, set names (%s, %s) inconsistent.\n",
                                    fMonkeyImpl.fRuleFileName, key, charClass.fName));
                }
                unclaimed.removeAll(charClass.fSet);
                if (key.equals("dictionary")) {
                    fDictionarySet = charClass.fSet;
                } else {
                    fCharClassList.add(charClass);
                }
            }

            if (!unclaimed.isEmpty()) {
                fCharClassList.add(addCharClass("__Others", unclaimed.toPattern(true)));
            }
        }

        /**
         * Finds the character class that a code point belongs to.
         *
         * @param c  a code point.
         * @return the first class in fCharClassList whose set contains c, or null if there is none.
         */
        CharClass getClassForChar(int c) {
            final int classCount = fCharClassList.size();
            for (int i = 0; i < classCount; i++) {
                final CharClass candidate = fCharClassList.get(i);
                if (candidate.fSet.contains(c)) {
                    return candidate;
                }
            }
            return null;
        }


        RBBIMonkeyImpl          fMonkeyImpl;        // Reference back to the owning MonkeyImpl instance.
        List<BreakRule>         fBreakRules;        // The compiled rules, in the order they were added.

        Map<String, CharClass>  fCharClasses;       // Key is the set name.
        //                                          // Value is the corresponding CharClass
        List<CharClass>         fCharClassList;     // Char classes used for generating test data. Filled by
        //                                          // compileRules() with the fCharClasses values, omitting the
        //                                          // "dictionary" class and adding "__Others" when it exists.

        UnicodeSet              fDictionarySet;     // Dictionary set, empty if none is defined.
        ULocale                 fLocale;            // Set by the "locale" keyword; not assigned otherwise.
        int                     fType;              // BreakIterator.KIND_WORD, etc. Set by the "type" keyword.


        // Reusable matchers for scanning rule source; created in the constructor.
        Matcher fSetRefsMatcher;                    // References to set names within an expression.
        Matcher fCommentsMatcher;                   // Comments and blank lines.
        Matcher fClassDefMatcher;                   // A character class definition (or keyword) line.
        Matcher fRuleDefMatcher;                    // A break rule line.
        Matcher fPropertyMatcher;                   // A property expression, [:xxx:] or \p{...}.
    };


    // class MonkeyTestData    represents a randomly synthesized test data string together
    //                         with the expected break positions obtained by applying
    //                         the test break rules.

    static class MonkeyTestData{

        /**
         * Fills this object with a fresh random test string and computes the expected break
         * positions for it by applying the reference rules.
         *
         * Also (re)allocates fActualBreaks, fRuleForPosition and f2ndRuleForPos to match the new
         * string, and records the random seed so that failing data can be recreated.
         *
         * @param rules  the reference break rules; supplies the character classes to draw from
         *               and the compiled rule expressions.
         * @param rand   source of random numbers.
         * @throws IllegalArgumentException if no rule matches at some position, if the matching
         *                                  rule matched zero characters, or if a rule without a
         *                                  break fails to advance the position.
         */
        void set(BreakRules rules, ICU_Rand rand) {
            int dataLength = 1000;   // length of test data to generate, in code points.

            // Fill the test string with random characters.
            // First randomly pick a char class, then randomly pick a character from that class.
            // Exclude any characters from the dictionary set.

            // System.out.println("Populating Test Data");
            fRandomSeed = rand.getSeed();         // Save initial seed for use in error messages,
                                                  // allowing recreation of failing data.
            fBkRules = rules;
            StringBuilder newString = new StringBuilder();
            // n counts accepted code points only; rejected picks are simply retried.
            for (int n=0; n<dataLength;) {
                // NOTE(review): the % below assumes ICU_Rand.next() never returns a negative value - confirm.
                int charClassIndex = rand.next() % rules.fCharClassList.size();
                CharClass cclass = rules.fCharClassList.get(charClassIndex);
                if (cclass.fSet.size() == 0) {
                    // Some rules or tailorings do end up with empty char classes.
                    continue;
                }
                int charIndex = rand.next() % cclass.fSet.size();
                int c = cclass.fSet.charAt(charIndex);
                if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) &&
                        newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) {
                    // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control.
                    // Don't let random unpaired surrogates combine in the test data because they might
                    // produce an unwanted dictionary character.
                    continue;
                }

                if (!rules.fDictionarySet.contains(c)) {
                    newString.appendCodePoint(c);
                    ++n;
                }
            }
            fString = newString.toString();

            // Init the expectedBreaks, actualBreaks and ruleForPosition.
            // Expected and Actual breaks are one longer than the input string; a true value
            // will indicate a boundary preceding that position.
            // All of these arrays are indexed by char (UTF-16 code unit) index into fString.

            fActualBreaks    = new boolean[fString.length()+1];
            fExpectedBreaks  = new boolean[fString.length()+1];
            fRuleForPosition = new int[fString.length()+1];
            f2ndRuleForPos   = new int[fString.length()+1];

            // Apply reference rules to find the expected breaks.

            fExpectedBreaks[0] = true;       // Force an expected break before the start of the text.
                                             // ICU always reports a break there.
                                             // The reference rules do not have a means to do so.
            int strIdx = 0;
            while (strIdx < fString.length()) {
                BreakRule matchingRule = null;
                boolean hasBreak = false;
                int ruleNum = 0;
                int matchStart = 0;
                int matchEnd = 0;
                // Try the rules in order; the first acceptable match wins. When the loop exits via
                // break, ruleNum, matchStart, matchEnd and hasBreak all describe the matching rule.
                for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) {
                    BreakRule rule = rules.fBreakRules.get(ruleNum);
                    // The rule is applied to the remainder of the text only, so offsets reported by
                    // the matcher are relative to strIdx.
                    rule.fRuleMatcher.reset(fString.substring(strIdx));
                    if (rule.fRuleMatcher.lookingAt()) {
                        // A candidate rule match, check further to see if we take it or continue to check other rules.
                        // Matches of zero or one code point count only if they also specify a break.
                        matchStart = strIdx;
                        matchEnd = strIdx + rule.fRuleMatcher.end();
                        hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0;
                        if (hasBreak ||
                                (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) {
                            matchingRule = rule;
                            break;
                        }
                    }
                }
                if (matchingRule == null) {
                    // No reference rule matched. This is an error in the rules that should never happen.
                    String msg = String.format("%s: No reference rules matched at position %d. ",
                            rules.fMonkeyImpl.fRuleFileName, strIdx);
                    System.err.println(msg);
                    dump(strIdx);
                    throw new IllegalArgumentException(msg);
                }
                if (matchingRule.fRuleMatcher.group().length() == 0) {
                    // Zero length rule match. This is also an error in the rule expressions.
                    String msg = String.format("%s:%s: Zero length rule match at %d.",
                            rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx);
                    System.err.println(msg);
                    dump(strIdx);
                    throw new IllegalArgumentException(msg);
                }

                // Record which rule matched over the length of the match.
                // A value of 0 doubles as "nothing recorded yet", so an earlier match by rule 0 is
                // overwritten here rather than being kept with the new rule going to f2ndRuleForPos.
                for (int i = matchStart; i < matchEnd; i++) {
                    if (fRuleForPosition[i] == 0) {
                        fRuleForPosition[i] = ruleNum;
                    } else {
                        f2ndRuleForPos[i] = ruleNum;
                    }
                }

                // Break positions appear in rules as a zero length capture group at the break position;
                //   addRule() replaces the ÷ of the rule source with "()", which BreakGroupStart() locates.
                if (hasBreak) {
                    int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher);
                    fExpectedBreaks[breakPos] = true;
                    // System.out.printf("recording break at %d\n", breakPos);
                    // For the next iteration, pick up applying rules immediately after the break,
                    // which may differ from end of the match. The matching rule may have included
                    // context following the boundary that needs to be looked at again.
                    strIdx = breakPos;
                } else {
                    // Original rule didn't specify a break.
                    // Continue applying rules starting on the last code point of this match.
                    int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1);
                    if (updatedStrIdx == matchStart) {
                        // Match was only one code point, no progress if we continue.
                        // Shouldn't get here, case is filtered out at top of loop.
                        throw new IllegalArgumentException(String.format("%s: Rule %s internal error.",
                                rules.fMonkeyImpl.fRuleFileName, matchingRule.fName));
                    }
                    strIdx = updatedStrIdx;
                }
            }
        };

        // Helper function to find where the "BreakPosition" capture group matched.
        // @param m: a Java regex Matcher that has completed a matching operation.
        // @return the start index of the first capture group that matched the empty string,
        //         or -1 if there is no such group, or no such group participated in the match.
        //
        // TODO: this becomes m.start("BreakPosition") once named capture groups can be used.
        //       In the mean time, assume that the only zero-length capturing group in
        //       a reference rule expression is the "BreakPosition" that corresponds to a "÷".

        static int BreakGroupStart(Matcher m) {
            final int lastGroup = m.groupCount();
            int groupNum = 1;
            while (groupNum <= lastGroup) {
                final String captured = m.group(groupNum);
                // A null capture means that the group did not participate in the match.
                if (captured != null && captured.length() == 0) {
                    return m.start(groupNum);
                }
                groupNum++;
            }
            return -1;
        }

        /**
         * Prints a table of the test data to System.out, one line per code point, showing the
         * character class, the expected (R) and actual (I) breaks, and the rule(s) recorded for
         * each position.
         *
         * @param around  a char index into the text; up to 30 code points of context on either
         *                side of it are shown. Pass -1 to show the entire text.
         */
        void dump(int around) {
            System.out.print("\n"
                    +        "         char                        break  Rule                     Character\n"
                    +        "   pos   code   class                 R I   name                     name\n"
                    +        "---------------------------------------------------------------------------------------------\n");

            // Default to the whole text; narrow to the context around a failure when requested.
            int start = 0;
            int end = fString.length();
            if (around != -1) {
                // offsetByCodePoints() throws when fewer than 30 code points are available on the
                // side in question; fall back to the corresponding end of the text.
                try {
                    start = fString.offsetByCodePoints(around, -30);
                } catch (Exception e) {
                    start = 0;
                }
                try {
                    end = fString.offsetByCodePoints(around, +30);
                } catch (Exception e) {
                    end = fString.length();
                }
            }

            int charIdx = start;
            while (charIdx < end) {
                final int c = fString.codePointAt(charIdx);
                final CharClass charClass = fBkRules.getClassForChar(c);

                final BreakRule firstRule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]);
                final int secondRuleNum = f2ndRuleForPos[charIdx];
                final String secondRuleName =
                        secondRuleNum > 0 ? fBkRules.fBreakRules.get(secondRuleNum).fName : "";
                final String charName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME);

                final char expectedMark = fExpectedBreaks[charIdx] ? '*' : '.';
                final char actualMark = fActualBreaks[charIdx] ? '*' : '.';

                System.out.printf("  %4d %6x   %-20s  %c %c   %-10s %-10s    %s\n",
                        charIdx, c, charClass.fName,
                        expectedMark, actualMark,
                        firstRule.fName, secondRuleName, charName);

                charIdx = fString.offsetByCodePoints(charIdx, 1);
            }
        }

        /** Reset every entry of fActualBreaks to false, ready for a fresh pass over the text. */
        void clearActualBreaks() {
            Arrays.fill(fActualBreaks, 0, fActualBreaks.length, false);
        }


        // Data members. The arrays are indexed by position (UTF-16 index) in fString.

        int               fRandomSeed;        // The initial seed value from the random number generator.
                                              //     Reported with each failure so that it can be reproduced.
        BreakRules        fBkRules;           // The break rules used to generate this data.
        String            fString;            // The text.
        boolean           fExpectedBreaks[];  // Breaks as found by the reference rules.
                                              //     Parallel to fString. true if break preceding.
        boolean           fActualBreaks[];    // Breaks as found by ICU break iterator.
                                              //     Cleared by clearActualBreaks() before each test pass.
        int               fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position.
                                              // Also parallel to fString.
        int               f2ndRuleForPos[];   // As above. A 2nd rule applies when the preceding rule
                                              //   didn't cause a break, and a subsequent rule match starts
                                              //   on the last code point of the preceding match.
                                              //   dump() displays a second rule only for values greater than zero.

    }


    // class RBBIMonkeyImpl     holds (some indirectly) everything associated with running a monkey
    //                          test for one set of break rules.
    //

    static class RBBIMonkeyImpl extends Thread {

        /**
         * Prepare this object for testing one set of break rules: load the reference
         * rule file, compile it, create the ICU break iterator to be tested, and
         * allocate the test data holder.
         *
         * If the rule file cannot be opened, openBreakRules() reports the problem and
         * leaves fRuleCharBuffer null. In that case setup stops early and fBI stays
         * null, which run() detects and reports, instead of handing a null rule
         * source to the rule compiler.
         *
         * @param ruleFile name of the reference rule file, relative to the break_rules resource directory.
         */
        void setup(String ruleFile) {
            fRuleFileName = ruleFile;
            openBreakRules(ruleFile);
            if (fRuleCharBuffer == null) {
                // The failure has already been reported by openBreakRules().
                return;
            }
            fRuleSet = new BreakRules(this);
            fRuleSet.compileRules(fRuleCharBuffer);
            fBI = fRuleSet.createICUBreakIterator();
            fTestData = new MonkeyTestData();
        }

        /**
         * Read a reference rule file into fRuleCharBuffer.
         * The file is loaded as a class resource from the break_rules directory and
         * decoded as UTF-8. A BOM at the very start of the file is discarded.
         *
         * If the resource cannot be found, an error is reported and fRuleCharBuffer is
         * left untouched. If an I/O error occurs while reading, the error is reported
         * and fRuleCharBuffer receives whatever was read before the failure.
         *
         * @param fileName name of the rule file within the break_rules resource directory.
         */
        void openBreakRules(String fileName) {
            final StringBuilder contents = new StringBuilder();
            final InputStream stream = RBBIMonkeyImpl.class.getResourceAsStream("break_rules/" + fileName);
            if (stream == null) {
                errln("Could not open test data file " + fileName);
                return;
            }
            try {
                final InputStreamReader reader = new InputStreamReader(stream, "UTF-8");
                try {
                    boolean atStart = true;
                    for (int ch = reader.read(); ch >= 0; ch = reader.read()) {
                        // Only a BOM in the very first position is dropped.
                        final boolean leadingBom = atStart && ch == 0xFEFF;
                        atStart = false;
                        if (!leadingBom) {
                            contents.appendCodePoint(ch);
                        }
                    }
                } finally {
                    reader.close();
                }
            } catch (IOException e) {
                // Make sure the underlying stream is released even if the reader was never created.
                try {
                    stream.close();
                } catch (IOException ignored) {
                }
                errln(e.toString());
            }
            fRuleCharBuffer = contents.toString();  /* the file as a String */
        }

        class MonkeyException extends RuntimeException  {
            private static final long serialVersionUID = 1L;
            public int fPosition;    // Position of the failure in the test data.
            MonkeyException(String description, int pos) {
                super(description);
                fPosition = pos;
            }
        }

        /**
         * Thread body. Repeatedly generates random test data from the reference rules
         * and checks the ICU break iterator against it, using each of the iteration
         * functions in turn.
         *
         * Runs fLoopCount iterations, or forever if fLoopCount is negative.
         * Failures are written to System.err and accumulated in fErrorMsgs. The run is
         * abandoned once more than 10 failures have been recorded.
         *
         * Any unexpected RuntimeException is also recorded in fErrorMsgs before the run
         * stops. Without this, such an exception would end the thread with fErrorMsgs
         * still empty, and the failure would not be visible to code that checks
         * fErrorMsgs after the thread finishes.
         */
        @Override
        public void run() {
            int errorCount = 0;
            if (fBI == null) {
                fErrorMsgs.append("Unable to run test because fBI is null.\n");
                return;
            }
            for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) {
                try {
                    fTestData.set(fRuleSet, fRandomGenerator);
                    // fTestData.dump(-1);
                    testForwards();
                    testPrevious();
                    testFollowing();
                    testPreceding();
                    testIsBoundary();
                } catch (MonkeyException e) {
                    String formattedMsg = String.format(
                            "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n",
                            e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed);
                    System.err.print(formattedMsg);
                    if (fVerbose) {
                        fTestData.dump(e.fPosition);
                    }
                    fErrorMsgs.append(formattedMsg);
                    if (++errorCount > 10) {
                        return;
                    }
                } catch (RuntimeException e) {
                    // Not a break iterator mismatch but an unexpected failure, for example
                    // an out-of-range index or an internal error in the reference rules.
                    // Record it so that the test fails, and keep the stack trace visible.
                    String formattedMsg = String.format(
                            "Unexpected exception %s. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1\n",
                            e, fRuleFileName, fTestData.fRandomSeed);
                    System.err.print(formattedMsg);
                    e.printStackTrace();
                    fErrorMsgs.append(formattedMsg);
                    return;
                }
                if (fLoopCount < 0 && loopCount % 100 == 0) {
                    // Progress indicator when running without a loop limit.
                    System.err.print(".");
                }
            }
        }

        enum CheckDirection {
            FORWARD,
            REVERSE
        };

        /**
         * Check forward iteration with first() / next().
         * Records every boundary returned in fTestData.fActualBreaks, then compares
         * the result with the expected breaks.
         *
         * @throws MonkeyException if the iterator fails to advance, returns a boundary
         *         outside of the text, or produces breaks that differ from the expected ones.
         */
        void testForwards() {
            fTestData.clearActualBreaks();
            fBI.setText(fTestData.fString);
            int previousBreak = -2;
            for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) {
                if (bk <= previousBreak) {
                    throw new MonkeyException("Break Iterator Stall", bk);
                }
                if (bk < 0 || bk > fTestData.fString.length()) {
                    throw new MonkeyException("Boundary out of bounds", bk);
                }
                fTestData.fActualBreaks[bk] = true;
                // Remember this boundary; the stall check above compares the next one against it.
                previousBreak = bk;
            }
            checkResults("testForwards", CheckDirection.FORWARD);
        }


       /**
        * Check following(i) for every index i from -1 up to, but not including, the
        * length of the text. Each newly discovered boundary is recorded in
        * fTestData.fActualBreaks, and the result is compared with the expected breaks.
        *
        * @throws MonkeyException if following(i) returns an inconsistent result, or
        *         the breaks found differ from the expected ones.
        */
       void testFollowing() {
           fTestData.clearActualBreaks();
           fBI.setText(fTestData.fString);
           final int textLength = fTestData.fString.length();
           int pendingBreak = -1;
           for (int i = -1; i < textLength; ++i) {
               final int bk = fBI.following(i);

               final boolean doneAtEnd = (bk == BreakIterator.DONE) && (i == textLength);
               // i is in the gap between two breaks.
               final boolean insideGap = (bk == pendingBreak) && (bk > i);
               if (doneAtEnd || insideGap) {
                   continue;
               }

               // i sits on the most recently found boundary, so bk must be a new boundary beyond it.
               final boolean foundNextBreak = (i == pendingBreak) && (bk > pendingBreak);
               if (!foundNextBreak) {
                   throw new MonkeyException("following(i)", i);
               }
               fTestData.fActualBreaks[bk] = true;
               pendingBreak = bk;
           }
           checkResults("testFollowing", CheckDirection.FORWARD);
       }


        /**
         * Check reverse iteration with last() / previous().
         * Records every boundary returned in fTestData.fActualBreaks, then compares
         * the result with the expected breaks.
         *
         * @throws MonkeyException if the iterator fails to move backwards, returns a boundary
         *         outside of the text, or produces breaks that differ from the expected ones.
         */
        void testPrevious() {
            fTestData.clearActualBreaks();
            fBI.setText(fTestData.fString);
            int previousBreak = Integer.MAX_VALUE;
            for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) {
                if (bk >= previousBreak) {
                    throw new MonkeyException("Break Iterator Stall", bk);
                }
                if (bk < 0 || bk > fTestData.fString.length()) {
                    throw new MonkeyException("Boundary out of bounds", bk);
                }
                fTestData.fActualBreaks[bk] = true;
                // Remember this boundary; the stall check above compares the next one against it.
                previousBreak = bk;
            }
            checkResults("testPrevious", CheckDirection.REVERSE);
        }


        /**
         * Map a string index to the start of the code point that contains it.
         * If the index refers to the trail (low) surrogate of a surrogate pair, the
         * index of the lead (high) surrogate is returned. Any other index, including
         * one that is out of range, is returned unchanged.
         *
         * @param s the String.
         * @param i the initial index
         * @return the adjusted index
         */
        private int getChar32Start(String s, int i) {
            final boolean hasNeighbors = i > 0 && i < s.length();
            final boolean onTrailSurrogate =
                    hasNeighbors && Character.isSurrogatePair(s.charAt(i - 1), s.charAt(i));
            return onTrailSurrogate ? i - 1 : i;
        }


        /**
         * Check preceding(i) for every index i from one past the end of the text down to 0.
         * Each newly discovered boundary is recorded in fTestData.fActualBreaks, and
         * the result is compared with the expected breaks.
         *
         * @throws MonkeyException if preceding(i) returns an inconsistent result, or
         *         the breaks found differ from the expected ones.
         */
        void testPreceding() {
            fTestData.clearActualBreaks();
            final String text = fTestData.fString;
            fBI.setText(text);
            final int textLength = text.length();
            int pendingBreak = textLength + 1;
            for (int i = textLength + 1; i >= 0; --i) {
                final int bk = fBI.preceding(i);
                // System.err.printf("testPreceding() i:%d  bk:%d  nextBreak:%d\n", i, bk, pendingBreak);

                final boolean doneAtStart = (bk == BreakIterator.DONE) && (i == 0);
                // i is in the gap between two breaks.
                final boolean insideGap = (bk == pendingBreak) && (bk < i);
                if (doneAtStart || insideGap) {
                    continue;
                }

                if (i < textLength) {
                    final int codePointStart = getChar32Start(text, i);
                    if (codePointStart < i) {
                        // i indexes to a trailing surrogate.
                        // Break Iterators treat an index to either half as referring to the
                        // supplemental code point, with preceding going to some preceding code point.
                        if (fBI.preceding(i) != fBI.preceding(codePointStart)) {
                            throw new MonkeyException("preceding of trailing surrogate error", i);
                        }
                        continue;
                    }
                }

                // i sits on the most recently found boundary, so bk must be a new boundary before it.
                final boolean foundPrecedingBreak = (i == pendingBreak) && (bk < pendingBreak);
                if (!foundPrecedingBreak) {
                    throw new MonkeyException("preceding(i)", i);
                }
                fTestData.fActualBreaks[bk] = true;
                pendingBreak = bk;
            }
            checkResults("testPreceding", CheckDirection.REVERSE);
        }


        /**
         * Check isBoundary(i) for every index from the end of the text down to 0.
         * Every index reported as a boundary is recorded in fTestData.fActualBreaks,
         * and the result is compared with the expected breaks.
         *
         * @throws MonkeyException if the breaks found differ from the expected ones.
         */
        void testIsBoundary() {
            fTestData.clearActualBreaks();
            fBI.setText(fTestData.fString);
            for (int i=fTestData.fString.length(); i>=0; --i) {
                if (fBI.isBoundary(i)) {
                    fTestData.fActualBreaks[i] = true;
                }
            }
            // Report failures under this function's own name so they can be told apart
            // from failures of testForwards().
            checkResults("testIsBoundary", CheckDirection.FORWARD);
        }


        /**
         * Compare the actual breaks with the expected breaks at every position from 0
         * through the length of the text, inclusive.
         *
         * @param msg       message for the exception thrown on a mismatch.
         * @param direction FORWARD to scan from index 0 upwards, anything else to scan
         *                  from the end downwards. This decides which mismatch is
         *                  reported when there is more than one.
         * @throws MonkeyException at the first mismatch encountered, carrying its index.
         */
        void checkResults(String msg, CheckDirection direction) {
            final int lastIndex = fTestData.fString.length();
            final boolean ascending = (direction == CheckDirection.FORWARD);
            for (int step = 0; step <= lastIndex; ++step) {
                final int i = ascending ? step : lastIndex - step;
                if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) {
                    throw new MonkeyException(msg, i);
                }
            }
        }

        String                 fRuleCharBuffer;         // source file contents of the reference rules.
                                                        //   Set by openBreakRules().
        BreakRules             fRuleSet;                // Reference rules compiled from fRuleCharBuffer by setup().
        RuleBasedBreakIterator fBI;                     // The ICU break iterator being tested, created by setup().
                                                        //   run() reports an error and does nothing if this is null.
        MonkeyTestData         fTestData;               // Test text and break data, regenerated on each loop of run().
        ICU_Rand               fRandomGenerator;        // Random number source passed to fTestData.set().
        String                 fRuleFileName;           // Name of the rule file, as passed to setup().
        boolean                fVerbose;                 // True to do long dump of failing data.
        int                    fLoopCount;              // Number of loops for run(). Negative means run forever.
        int                    fErrorCount;             // NOTE(review): not referenced in the visible code;
                                                        //   run() counts errors in a local variable.

        boolean                fDumpExpansions;          // Debug flag to output expanded form of rules and sets.
        StringBuilder          fErrorMsgs = new StringBuilder();  // Failure messages accumulated by run().


    }

    //  Test parameters, specified via Java properties.
    //
    //  rules=file_name   Name of file containing the reference rules.
    //  seed=nnnnn        Random number starting seed.
    //                    Setting the seed allows errors to be reproduced.
    //  loop=nnn          Looping count.  Controls running time.
    //                    -1:  run forever.
    //                     0 or greater:  run length.
    //  expansions        debug option, show expansions of rules and sets.
    //  verbose           Display details of the failure.
    //
    // Parameters are passed to the JVM on the command line, or
    // via the Eclipse Run Configuration settings, arguments tab, VM parameters.
    // For example,
    //      -ea -Drules=line.txt -Dloop=-1
    //
    /**
     * Run the monkey test for each set of reference break rules, one thread per rule set.
     * The test fails if any thread recorded an error message, or if waiting for a
     * thread was interrupted. If an interrupt occurred, the interrupt status of the
     * calling thread is restored once all threads have been waited for.
     */
    @Test
    public void TestMonkey() {
        String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt",
                "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt"
        };

        // A rule file named on the command line replaces the default list.
        String testNameFromParams = getProperty("rules");

        if (testNameFromParams != null) {
            tests = new String[] {testNameFromParams};
        }

        int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000);
        boolean dumpExpansions =  getBooleanProperty("expansions", false);
        boolean verbose = getBooleanProperty("verbose", false);
        int seed = getIntProperty("seed", 1);

        List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>();

        // Monkey testing is multi-threaded.
        // Each set of break rules to be tested is run in a separate thread.
        // Each thread/set of rules gets a separate RBBIMonkeyImpl object.

        for (String testName: tests) {
            logln(String.format("beginning testing of %s", testName));

            RBBIMonkeyImpl test = new RBBIMonkeyImpl();

            test.fDumpExpansions = dumpExpansions;
            test.fVerbose = verbose;
            test.fRandomGenerator = new ICU_Rand(seed);
            test.fLoopCount = loopCount;
            test.setup(testName);

            test.start();
            startedTests.add(test);
        }

        StringBuilder errors = new StringBuilder();
        boolean interrupted = false;
        for (RBBIMonkeyImpl test: startedTests) {
            try {
                test.join();
                errors.append(test.fErrorMsgs);
            } catch (InterruptedException e) {
                // Keep waiting for the remaining threads, but remember the interrupt.
                interrupted = true;
                errors.append(e + "\n");
            }
        }
        if (interrupted) {
            // Catching InterruptedException cleared the interrupt status; restore it for the caller.
            Thread.currentThread().interrupt();
        }
        String errorMsgs = errors.toString();
        assertEquals(errorMsgs, "", errorMsgs);

    }


}