105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert/* GENERATED SOURCE. DO NOT MODIFY. */ 205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert// © 2016 and later: Unicode, Inc. and others. 305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html#License 405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertpackage android.icu.dev.test.rbbi; 605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.IOException; 805fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.InputStream; 905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.io.InputStreamReader; 1005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.ArrayList; 1105fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.Arrays; 1205fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.HashMap; 1305fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.List; 1405fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.Map; 1505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.Matcher; 1605fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.Pattern; 1705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport java.util.regex.PatternSyntaxException; 1805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 1905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.Test; 2005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.runner.RunWith; 2105fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport org.junit.runners.JUnit4; 2205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 2305fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.dev.test.TestFmwk; 2405fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.impl.UCharacterName; 2505fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.impl.UCharacterNameChoice; 2605fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.BreakIterator; 2705fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.RuleBasedBreakIterator; 2805fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.text.UnicodeSet; 2905fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.util.ULocale; 3005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertimport android.icu.testsharding.MainTestShard; 3105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 3205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert/** 3305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp. 3405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * This is the newer, data driven monkey test. It is completely separate from the 3505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * older class RBBITestMonkey. 3605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert */ 3705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 3805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert@MainTestShard 3905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert@RunWith(JUnit4.class) 4005fa7802d0874812c234a29745586677ee5837eaFredrik Roubertpublic class RBBIMonkeyTest extends TestFmwk { 4105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 4205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 4305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // class CharClass Represents a single character class from the source break rules. 4405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Inherits from UObject because instances are adopted by UHashtable, which ultimately 4505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // deletes them using hash's object deleter function. 4605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 4705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static class CharClass { 4805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fName; 4905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fOriginalDef; // set definition as it appeared in user supplied rules. 5005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 5105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet fSet; 5205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) { 5305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fName = name; 5405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fOriginalDef = originalDef; 5505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fExpandedDef = expandedDef; 5605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSet = set; 5705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 5805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 5905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 6005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 6105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // class BreakRule Struct-like class represents a single rule from a set of break rules. 6205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Each rule has the set definitions expanded, and 6305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // is compiled to a regular expression. 6405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 6505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static class BreakRule { 6605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fName; // Name of the rule. 6705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fRule; // Rule expression, excluding the name, as written in user source. 6805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fExpandedRule; // Rule expression after expanding the set definitions. 6905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fRuleMatcher; // Regular expression that matches the rule. 7005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 7105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 7205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 7305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // class BreakRules represents a complete set of break rules, possibly tailored, 7405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // compiled from testdata break rules. 7505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 7605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static class BreakRules { 7705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRules(RBBIMonkeyImpl monkeyImpl) { 7805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fMonkeyImpl = monkeyImpl; 7905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBreakRules = new ArrayList<BreakRule>(); 8005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fType = BreakIterator.KIND_TITLE; 8105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCharClasses = new HashMap<String, CharClass>(); 8205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCharClassList = new ArrayList<CharClass>(); 8305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fDictionarySet = new UnicodeSet(); 8405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 8505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match an alpha-numeric identifier in a rule. Will be a set name. 8605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Use negative look-behind to exclude non-identifiers, mostly property names or values. 8705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher = Pattern.compile( 8805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(?<!\\{[ \\t]{0,4})" + 8905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(?<!=[ \\t]{0,4})" + 9005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(?<!\\[:[ \\t]{0,4})" + 9105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(?<!\\\\)" + 9205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(?<![A-Za-z0-9_])" + 9305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "([A-Za-z_][A-Za-z0-9_]*)"). // The char class name 9405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matcher(""); 9505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 9605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 9705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCommentsMatcher = Pattern.compile("" + 9805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';') 9905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*+" + // Match white space. 10005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(#.*)?+" + // Optional # plus whatever follows 10105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "$"). // new-line at end of line. 10205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matcher(""); 10305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 10405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match (initial parse) of a character class definition line. 10505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fClassDefMatcher = Pattern.compile("" + 10605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*" + // leading white space 10705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "([A-Za-z_][A-Za-z0-9_]*)" + // The char class name 10805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*=[ \\t]*" + // = 10905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(.*?)" + // The char class UnicodeSet expression 11005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*;$"). // ; <end of line> 11105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matcher(""); 11205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 11305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match (initial parse) of a break rule line. 11405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleDefMatcher = Pattern.compile("" + 11505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*" + // leading white space 11605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name 11705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*:[ \\t]*" + // : 11805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "(.*?)" + // The rule definition 11905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "[ \\t]*;$"). // ; <end of line> 12005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matcher(""); 12105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 12205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match a property expression, either [:xxx:] or \p{...} 12305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fPropertyMatcher = Pattern.compile("" + 12405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}"). 12505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matcher(""); 12605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 12705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 12805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 12905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 13005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert /** 13105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * Create the expanded definition for this char class, 13205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * replacing any set references with the corresponding definition. 13305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert */ 13405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass addCharClass(String name, String definition) { 13505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer expandedDef = new StringBuffer(); 13605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.reset(definition); 13705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (fSetRefsMatcher.find()) { 13805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 13905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass snameClass = fCharClasses.get(sname); 14005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname; 14105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 14205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.appendReplacement(expandedDef, ""); 14305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert expandedDef.append(expansionForName); 14405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 14505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.appendTail(expandedDef); 14605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String expandedDefString = expandedDef.toString(); 14705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 14805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fMonkeyImpl.fDumpExpansions) { 14905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("addCharClass(\"%s\"\n", name); 15005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf(" %s\n", definition); 15105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("expandedDef: %s\n", expandedDefString); 15205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 15305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 15405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Verify that the expanded set definition is valid. 15505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 15605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet s; 15705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 15805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE); 15905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (java.lang.IllegalArgumentException e) { 16005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name); 16105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw e; 16205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 16305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 16405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Get an expanded equivalent pattern from the UnicodeSet. 16505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // This removes set difference operators, which would fail if passed through to Java regex. 16605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 16705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer expandedPattern = new StringBuffer(); 16805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert s._generatePattern(expandedPattern, true); 16905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert expandedDefString = expandedPattern.toString(); 17005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fMonkeyImpl.fDumpExpansions) { 17105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("expandedDef2: %s\n", expandedDefString); 17205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 17305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 17405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass cclass = new CharClass(name, definition, expandedDefString, s); 17505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass previousClass = fCharClasses.put(name, cclass); 17605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 17705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (previousClass != null) { 17805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // TODO: decide whether or not to allow redefinitions. 17905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Can be convenient in some cases. 18005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // String msg = String.format("%s: Redefinition of character class %s\n", 18105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // fMonkeyImpl.fRuleFileName, cclass.fName); 18205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.err.println(msg); 18305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // throw new IllegalArgumentException(msg); 18405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 18505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return cclass; 18605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 18705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 18805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 18905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 19005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void addRule(String name, String definition) { 19105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRule thisRule = new BreakRule(); 19205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer expandedDefsRule = new StringBuffer(); 19305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fName = name; 19405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fRule = definition; 19505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 19605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Expand the char class definitions within the rule. 19705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.reset(definition); 19805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (fSetRefsMatcher.find()) { 19905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 20005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass nameClass = fCharClasses.get(sname); 20105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (nameClass == null) { 20205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition); 20305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 20405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname; 20505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.appendReplacement(expandedDefsRule, ""); 20605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert expandedDefsRule.append(expansionForName); 20705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 20805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fSetRefsMatcher.appendTail(expandedDefsRule); 20905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 21005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion, 21105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // obtained from ICU UnicodeSet. Need to do this substitution because Java regex 21205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // does not recognize all properties, and because Java's definitions are likely 21305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // older than ICU's. 21405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 21505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer expandedRule = new StringBuffer(); 21605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fPropertyMatcher.reset(expandedDefsRule); 21705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (fPropertyMatcher.find()) { 21805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String prop = fPropertyMatcher.group(); 21905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet propSet = new UnicodeSet("[" + prop + "]"); 22005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer propExpansion = new StringBuffer(); 22105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert propSet._generatePattern(propExpansion, true); 22205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString()); 22305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 22405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fPropertyMatcher.appendTail(expandedRule); 22505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 22605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Replace any [^negated sets] with equivalent flattened sets generated by 22705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply 22805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // to any nested classes. Variable substitution in rules produces 22905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // nested sets that [^negation] needs to apply to. 23005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 23105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuffer ruleWithFlattenedSets = new StringBuffer(); 23205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int idx = 0; 23305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (idx<expandedRule.length()) { 23405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int setOpenPos = expandedRule.indexOf("[^", idx); 23505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (setOpenPos < 0) { 23605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 23705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 23805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (setOpenPos > idx) { 23905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Move anything from the source rule preceding the [^ into the processed rule, unchanged. 24005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos)); 24105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 24205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int nestingLevel = 1; 24305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean haveNesting = false; 24405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int setClosePos; 24505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) { 24605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert char c = expandedRule.charAt(setClosePos); 24705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (c == '\\') { 24805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ++setClosePos; 24905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else if (c == '[') { 25005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ++nestingLevel; 25105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert haveNesting = true; 25205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else if (c == ']') { 25305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert --nestingLevel; 25405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 25505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 25605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (haveNesting && nestingLevel == 0) { 25705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Found one, a negated set that includes interior nested sets. 25805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Create an ICU UnicodeSet from the source pattern, and obtain an 25905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // equivalent flattened pattern from that. 26005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true); 26105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert uset._generatePattern(ruleWithFlattenedSets, true); 26205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 26305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // The [^ set definition did not include any nested sets. 26405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Copy the original definition without change. 26505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Java regular expressions will handle it without needing to recast it. 26605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (nestingLevel > 0) { 26705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Error case of an unclosed character class expression. 26805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Java regex will also eventually flag the error. 26905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.printf("No closing ] found in rule %s\n", name); 27005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 27105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos)); 27205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 27305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert idx = setClosePos; 27405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 27505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 27605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (idx < expandedRule.length()) { 27705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length())); 27805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 27905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 28005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fExpandedRule = ruleWithFlattenedSets.toString(); 28105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 28205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Replace the divide sign (\u00f7) with a regular expression named capture. 28305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // When running the rules, a match that includes this group means we found a break position. 28405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 28505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)"); 28605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()"); 28705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (thisRule.fExpandedRule.indexOf("÷") != -1) { 28805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name); 28905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 29005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 29105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 29205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 29305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // UAX break rule set definitions can be empty, just []. 29405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which 29505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // also matches nothing. 29605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 29705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]"); 29805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 29905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer) 30005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // \udddd => \x{dddd} 30105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // \U00hhhhhh => \x{hhhhhh} 30205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 30305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}"); 30405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}"); 30505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 30605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character 30705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // within a regular expression character class. Put them in as unescaped literal chars. 30805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuilder sb = new StringBuilder(thisRule.fExpandedRule); 30905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (true) { 31005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int where = sb.indexOf("\\U00"); 31105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (where < 0) { 31205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 31305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 31405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String cp = hexToCodePoint(sb.substring(where+2, where+10)); 31505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert sb.replace(where, where+10, cp); 31605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 31705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fExpandedRule = sb.toString(); 31805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 31905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment. 32005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // UnicodeSet._generatePattern() inserts un-escaped "#"s 32105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 32205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#"); 32305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fMonkeyImpl.fDumpExpansions) { 32405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule); 32505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 32605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 32705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Compile a regular expression for this rule. 32805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 32905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 33005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher(""); 33105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (PatternSyntaxException e) { 33205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"", 33305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule); 33405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw e; 33505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 33605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 33705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Put this new rule into the vector of all Rules. 33805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 33905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBreakRules.add(thisRule); 34005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 34105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 34205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert private static String hexToCodePoint(String hex) { 34305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int cp = Integer.parseInt(hex, 16); 34405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return new StringBuilder().appendCodePoint(cp).toString(); 34505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 34605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 34705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 34805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean setKeywordParameter(String keyword, String value) { 34905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (keyword.equals("locale")) { 35005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fLocale = new ULocale(value); 35105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return true; 35205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 35305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (keyword.equals("type")) { 35405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (value.equals("grapheme")) { 35505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fType = BreakIterator.KIND_CHARACTER; 35605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else if (value.equals("word")) { 35705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fType = BreakIterator.KIND_WORD; 35805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else if (value.equals("line")) { 35905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fType = BreakIterator.KIND_LINE; 36005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else if (value.equals("sentence")) { 36105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fType = BreakIterator.KIND_SENTENCE; 36205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 36305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value); 36405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 36505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 36605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 36705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return true; 36805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 36905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return false; 37005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 37105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 37205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 37305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert RuleBasedBreakIterator createICUBreakIterator() { 37405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakIterator bi; 37505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert switch(fType) { 37605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert case BreakIterator.KIND_CHARACTER: 37705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert bi = (BreakIterator.getCharacterInstance(fLocale)); 37805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 37905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert case BreakIterator.KIND_WORD: 38005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert bi = (BreakIterator.getWordInstance(fLocale)); 38105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 38205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert case BreakIterator.KIND_LINE: 38305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert bi = (BreakIterator.getLineInstance(fLocale)); 38405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 38505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert case BreakIterator.KIND_SENTENCE: 38605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert bi = (BreakIterator.getSentenceInstance(fLocale)); 38705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 38805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert default: 38905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType); 39005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 39105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 39205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 39305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return (RuleBasedBreakIterator)bi; 39405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 39505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 39605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 39705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 39805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 39905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void compileRules(String rules) { 40005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int lineNumber = 0; 40105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (String line: rules.split("\\r?\\n")) { 40205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ++lineNumber; 40305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Strip comment lines. 40405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCommentsMatcher.reset(line); 40505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert line = fCommentsMatcher.replaceFirst(""); 40605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (line.isEmpty()) { 40705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 40805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 40905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 41005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Recognize character class definition and keyword lines 41105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fClassDefMatcher.reset(line); 41205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fClassDefMatcher.matches()) { 41305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String className = fClassDefMatcher.group(/*"ClassName"*/ 1); 41405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2); 41505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fMonkeyImpl.fDumpExpansions) { 41605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("scanned class: %s = %s\n", className, classDef); 41705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 41805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (setKeywordParameter(className, classDef)) { 41905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // The scanned item was "type = ..." or "locale = ...", etc. 42005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // which are not actual character classes. 42105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 42205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 42305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert addCharClass(className, classDef); 42405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 42505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 42605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 42705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Recognize rule lines. 42805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleDefMatcher.reset(line); 42905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fRuleDefMatcher.matches()) { 43005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1); 43105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2); 43205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fMonkeyImpl.fDumpExpansions) { 43305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef); 43405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 43505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert addRule(ruleName, ruleDef); 43605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 43705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 43805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 43905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"", 44005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fMonkeyImpl.fRuleFileName, lineNumber, line); 44105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 44205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 44305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 44405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 44505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Build the vector of char classes, omitting the dictionary class if there is one. 44605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // This will be used when constructing the random text to be tested. 44705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 44805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Also compute the "other" set, consisting of any characters not included in 44905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // one or more of the user defined sets. 45005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 45105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff); 45205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 45305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) { 45405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String ccName = el.getKey(); 45505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass cclass = el.getValue(); 45605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 45705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.out.printf(" Adding %s\n", ccName); 45805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (!ccName.equals(cclass.fName)) { 45905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException( 46005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String.format("%s: internal error, set names (%s, %s) inconsistent.\n", 46105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fMonkeyImpl.fRuleFileName, ccName, cclass.fName)); 46205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 46305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert otherSet.removeAll(cclass.fSet); 46405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (ccName.equals("dictionary")) { 46505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fDictionarySet = cclass.fSet; 46605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 46705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCharClassList.add(cclass); 46805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 46905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 47005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 47105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (!otherSet.isEmpty()) { 47205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.out.printf("have an other set.\n"); 47305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass cclass = addCharClass("__Others", otherSet.toPattern(true)); 47405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fCharClassList.add(cclass); 47505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 47605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 47705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 47805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 47905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass getClassForChar(int c) { 48005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (CharClass cc: fCharClassList) { 48105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (cc.fSet.contains(c)) { 48205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return cc; 48305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 48405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 48505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return null; 48605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 48705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 48805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 48905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert RBBIMonkeyImpl fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 49005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert List<BreakRule> fBreakRules; // Contents are of type (BreakRule *). 49105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 49205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Map<String, CharClass> fCharClasses; // Key is the set name. 49305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // // Value is the corresponding CharClass 49405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert List<CharClass> fCharClassList; // Char Classes, same contents as fCharClasses values, 49505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 49605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 49705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ULocale fLocale; 49805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int fType; // BreakItererator.KIND_WORD, etc. 49905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 50005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 50105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fSetRefsMatcher; 50205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fCommentsMatcher; 50305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fClassDefMatcher; 50405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fRuleDefMatcher; 50505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Matcher fPropertyMatcher; 50605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 50705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 50805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 50905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 51005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 51105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // class MonkeyTestData represents a randomly synthesized test data string together 51205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // with the expected break positions obtained by applying 51305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // the test break rules. 51405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 51505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static class MonkeyTestData{ 51605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 51705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void set(BreakRules rules, ICU_Rand rand) { 51805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int dataLength = 1000; // length of test data to generate, in code points. 51905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 52005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Fill the test string with random characters. 52105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // First randomly pick a char class, then randomly pick a character from that class. 52205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Exclude any characters from the dictionary set. 52305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 52405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.out.println("Populating Test Data"); 52505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 52605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // allowing recreation of failing data. 52705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBkRules = rules; 52805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuilder newString = new StringBuilder(); 52905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int n=0; n<dataLength;) { 53005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int charClassIndex = rand.next() % rules.fCharClassList.size(); 53105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass cclass = rules.fCharClassList.get(charClassIndex); 53205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (cclass.fSet.size() == 0) { 53305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Some rules or tailorings do end up with empty char classes. 53405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 53505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 53605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int charIndex = rand.next() % cclass.fSet.size(); 53705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int c = cclass.fSet.charAt(charIndex); 53805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) && 53905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) { 54005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 54105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Don't let random unpaired surrogates combine in the test data because they might 54205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // produce an unwanted dictionary character. 54305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 54405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 54505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 54605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (!rules.fDictionarySet.contains(c)) { 54705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert newString.appendCodePoint(c); 54805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ++n; 54905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 55005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 55105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fString = newString.toString(); 55205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 55305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Init the expectedBreaks, actualBreaks and ruleForPosition. 55405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Expected and Actual breaks are one longer than the input string; a true value 55505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // will indicate a boundary preceding that position. 55605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 55705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fActualBreaks = new boolean[fString.length()+1]; 55805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fExpectedBreaks = new boolean[fString.length()+1]; 55905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleForPosition = new int[fString.length()+1]; 56005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert f2ndRuleForPos = new int[fString.length()+1]; 56105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 56205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Apply reference rules to find the expected breaks. 56305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 56405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fExpectedBreaks[0] = true; // Force an expected break before the start of the text. 56505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // ICU always reports a break there. 56605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // The reference rules do not have a means to do so. 56705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int strIdx = 0; 56805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert while (strIdx < fString.length()) { 56905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRule matchingRule = null; 57005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean hasBreak = false; 57105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int ruleNum = 0; 57205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int matchStart = 0; 57305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int matchEnd = 0; 57405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) { 57505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRule rule = rules.fBreakRules.get(ruleNum); 57605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert rule.fRuleMatcher.reset(fString.substring(strIdx)); 57705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (rule.fRuleMatcher.lookingAt()) { 57805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // A candidate rule match, check further to see if we take it or continue to check other rules. 57905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Matches of zero or one code point count only if they also specify a break. 58005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matchStart = strIdx; 58105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matchEnd = strIdx + rule.fRuleMatcher.end(); 58205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0; 58305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (hasBreak || 58405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) { 58505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert matchingRule = rule; 58605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 58705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 58805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 58905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 59005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (matchingRule == null) { 59105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // No reference rule matched. This is an error in the rules that should never happen. 59205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("%s: No reference rules matched at position %d. ", 59305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert rules.fMonkeyImpl.fRuleFileName, strIdx); 59405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 59505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert dump(strIdx); 59605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 59705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 59805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (matchingRule.fRuleMatcher.group().length() == 0) { 59905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Zero length rule match. This is also an error in the rule expressions. 60005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String msg = String.format("%s:%s: Zero length rule match at %d.", 60105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx); 60205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.println(msg); 60305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert dump(strIdx); 60405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(msg); 60505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 60605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 60705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Record which rule matched over the length of the match. 60805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i = matchStart; i < matchEnd; i++) { 60905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fRuleForPosition[i] == 0) { 61005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleForPosition[i] = ruleNum; 61105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 61205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert f2ndRuleForPos[i] = ruleNum; 61305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 61405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 61505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 61605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Break positions appear in rules as a matching named capture of zero length at the break position, 61705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // the adjusted pattern contains (?<BreakPosition>) 61805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (hasBreak) { 61905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher); 62005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fExpectedBreaks[breakPos] = true; 62105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.out.printf("recording break at %d\n", breakPos); 62205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // For the next iteration, pick up applying rules immediately after the break, 62305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // which may differ from end of the match. The matching rule may have included 62405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // context following the boundary that needs to be looked at again. 62505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert strIdx = breakPos; 62605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 62705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Original rule didn't specify a break. 62805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Continue applying rules starting on the last code point of this match. 62905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1); 63005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (updatedStrIdx == matchStart) { 63105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Match was only one code point, no progress if we continue. 63205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Shouldn't get here, case is filtered out at top of loop. 63305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new IllegalArgumentException(String.format("%s: Rule %s internal error.", 63405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert rules.fMonkeyImpl.fRuleFileName, matchingRule.fName)); 63505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 63605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert strIdx = updatedStrIdx; 63705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 63805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 63905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 64005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 64105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Helper function to find the starting index of a match of the "BreakPosition" named capture group. 64205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // @param m: a Java regex Matcher that has completed a matching operation. 64305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // @return m.start("BreakPosition), 64405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // or -1 if there is no such group, or the group did not participate in the match. 64505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 64605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // TODO: this becomes m.start("BreakPosition") with Java 8. 64705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // In the mean time, assume that the only zero-length capturing group in 64805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // a reference rule expression is the "BreakPosition" that corresponds to a "÷". 64905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 65005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static int BreakGroupStart(Matcher m) { 65105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) { 65205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String group = m.group(groupNum); 65305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (group == null) { 65405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 65505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 65605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (group.equals("")) { 65705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // assert(m.end(groupNum) == m.end("BreakPosition")); 65805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return m.start(groupNum); 65905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 66005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 66105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return -1; 66205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 66305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 66405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void dump(int around) { 66505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.print("\n" 66605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert + " char break Rule Character\n" 66705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert + " pos code class R I name name\n" 66805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert + "---------------------------------------------------------------------------------------------\n"); 66905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 67005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int start; 67105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int end; 67205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 67305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (around == -1) { 67405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert start = 0; 67505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert end = fString.length(); 67605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 67705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Display context around a failure. 67805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 67905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert start = fString.offsetByCodePoints(around, -30); 68005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (Exception e) { 68105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert start = 0; 68205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 68305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 68405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert end = fString.offsetByCodePoints(around, +30); 68505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (Exception e) { 68605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert end = fString.length(); 68705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 68805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 68905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 69005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) { 69105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int c = fString.codePointAt(charIdx); 69205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert CharClass cc = fBkRules.getClassForChar(c); 69305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 69405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]); 69505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String secondRuleName = ""; 69605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (f2ndRuleForPos[charIdx] > 0) { 69705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName; 69805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 69905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME); 70005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 70105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n", 70205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert charIdx, c, cc.fName, 70305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fExpectedBreaks[charIdx] ? '*' : '.', 70405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fActualBreaks[charIdx] ? '*' : '.', 70505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert rule.fName, secondRuleName, cName 70605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ); 70705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 70805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 70905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 71005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 71105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void clearActualBreaks() { 71205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Arrays.fill(fActualBreaks, false); 71305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 71405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 71505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 71605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int fRandomSeed; // The initial seed value from the random number generator. 71705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRules fBkRules; // The break rules used to generate this data. 71805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fString; // The text. 71905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean fExpectedBreaks[]; // Breaks as found by the reference rules. 72005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Parallel to fString. true if break preceding. 72105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean fActualBreaks[]; // Breaks as found by ICU break iterator. 72205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position. 72305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Also parallel to fString. 72405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule 72505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // didn't cause a break, and a subsequent rule match starts 72605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // on the last code point of the preceding match. 72705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 72805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 72905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 73005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 73105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 73205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // test for one set of break rules. 73305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 73405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 73505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert static class RBBIMonkeyImpl extends Thread { 73605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 73705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void setup(String ruleFile) { 73805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleFileName = ruleFile; 73905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert openBreakRules(ruleFile); 74005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleSet = new BreakRules(this); 74105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleSet.compileRules(fRuleCharBuffer); 74205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI = fRuleSet.createICUBreakIterator(); 74305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData = new MonkeyTestData(); 74405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 74505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 74605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void openBreakRules(String fileName) { 74705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuilder testFileBuf = new StringBuilder(); 74805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert InputStream is = null; 74905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String filePath = "break_rules/" + fileName; 75005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 75105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert is = RBBIMonkeyImpl.class.getResourceAsStream(filePath); 75205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (is == null) { 75305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert errln("Could not open test data file " + fileName); 75405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return; 75505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 75605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 75705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 75805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int c; 75905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int count = 0; 76005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (;;) { 76105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert c = isr.read(); 76205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (c < 0) { 76305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert break; 76405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 76505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert count++; 76605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (c == 0xFEFF && count == 1) { 76705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // BOM in the test data file. Discard it. 76805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 76905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 77005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testFileBuf.appendCodePoint(c); 77105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 77205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } finally { 77305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert isr.close(); 77405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 77505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (IOException e) { 77605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 77705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert is.close(); 77805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (IOException ignored) { 77905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 78005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert errln(e.toString()); 78105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 78205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */ 78305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 78405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 78505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert class MonkeyException extends RuntimeException { 78605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert private static final long serialVersionUID = 1L; 78705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert public int fPosition; // Position of the failure in the test data. 78805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert MonkeyException(String description, int pos) { 78905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert super(description); 79005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fPosition = pos; 79105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 79205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 79305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 79405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert @Override 79505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert public void run() { 79605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int errorCount = 0; 79705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fBI == null) { 79805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fErrorMsgs.append("Unable to run test because fBI is null.\n"); 79905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return; 80005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 80105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 80205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 80305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.set(fRuleSet, fRandomGenerator); 80405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // fTestData.dump(-1); 80505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testForwards(); 80605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testPrevious(); 80705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testFollowing(); 80805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testPreceding(); 80905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert testIsBoundary(); 81005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (MonkeyException e) { 81105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String formattedMsg = String.format( 81205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n", 81305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed); 81405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.print(formattedMsg); 81505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fVerbose) { 81605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.dump(e.fPosition); 81705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 81805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fErrorMsgs.append(formattedMsg); 81905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (++errorCount > 10) { 82005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return; 82105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 82205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 82305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fLoopCount < 0 && loopCount % 100 == 0) { 82405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert System.err.print("."); 82505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 82605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 82705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 82805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 82905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert enum CheckDirection { 83005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert FORWARD, 83105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert REVERSE 83205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 83305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 83405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void testForwards() { 83505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.clearActualBreaks(); 83605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI.setText(fTestData.fString); 83705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int previousBreak = -2; 83805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) { 83905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk <= previousBreak) { 84005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("Break Iterator Stall", bk); 84105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 84205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk < 0 || bk > fTestData.fString.length()) { 84305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("Boundary out of bounds", bk); 84405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 84505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.fActualBreaks[bk] = true; 84605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 84705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert checkResults("testForwards", CheckDirection.FORWARD); 84805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 84905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 85005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 85105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void testFollowing() { 85205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.clearActualBreaks(); 85305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI.setText(fTestData.fString); 85405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int nextBreak = -1; 85505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i=-1 ; i<fTestData.fString.length(); ++i) { 85605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int bk = fBI.following(i); 85705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk == BreakIterator.DONE && i == fTestData.fString.length()) { 85805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 85905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 86005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk == nextBreak && bk > i) { 86105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // i is in the gap between two breaks. 86205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 86305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 86405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (i == nextBreak && bk > nextBreak) { 86505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.fActualBreaks[bk] = true; 86605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert nextBreak = bk; 86705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 86805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 86905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("following(i)", i); 87005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 87105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert checkResults("testFollowing", CheckDirection.FORWARD); 87205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 87305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 87405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 87505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void testPrevious() { 87605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.clearActualBreaks(); 87705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI.setText(fTestData.fString); 87805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int previousBreak = Integer.MAX_VALUE; 87905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) { 88005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk >= previousBreak) { 88105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("Break Iterator Stall", bk); 88205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 88305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk < 0 || bk > fTestData.fString.length()) { 88405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("Boundary out of bounds", bk); 88505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 88605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.fActualBreaks[bk] = true; 88705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 88805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert checkResults("testPrevius", CheckDirection.REVERSE); 88905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 89005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 89105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 89205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert /** 89305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * Given an index into a string, if it refers to the trail surrogate of a surrogate pair, 89405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * adjust it to point to the lead surrogate, which is the start of the code point. 89505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * @param s the String. 89605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * @param i the initial index 89705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert * @return the adjusted index 89805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert */ 89905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert private int getChar32Start(String s, int i) { 90005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (i > 0 && i < s.length() && 90105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) { 90205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert --i; 90305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 90405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert return i; 90505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 90605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 90705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 90805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void testPreceding() { 90905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.clearActualBreaks(); 91005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI.setText(fTestData.fString); 91105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int nextBreak = fTestData.fString.length()+1; 91205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i=fTestData.fString.length()+1 ; i>=0; --i) { 91305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int bk = fBI.preceding(i); 91405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 91505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk == BreakIterator.DONE && i == 0) { 91605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 91705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 91805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (bk == nextBreak && bk < i) { 91905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // i is in the gap between two breaks. 92005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 92105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 92205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) { 92305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // i indexes to a trailing surrogate. 92405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Break Iterators treat an index to either half as referring to the supplemental code point, 92505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // with preceding going to some preceding code point. 92605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) { 92705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("preceding of trailing surrogate error", i); 92805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 92905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 93005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 93105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (i == nextBreak && bk < nextBreak) { 93205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.fActualBreaks[bk] = true; 93305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert nextBreak = bk; 93405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert continue; 93505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 93605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException("preceding(i)", i); 93705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 93805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert checkResults("testPreceding", CheckDirection.REVERSE); 93905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 94005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 94105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 94205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 94305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void testIsBoundary() { 94405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.clearActualBreaks(); 94505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fBI.setText(fTestData.fString); 94605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i=fTestData.fString.length(); i>=0; --i) { 94705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fBI.isBoundary(i)) { 94805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert fTestData.fActualBreaks[i] = true; 94905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 95005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 95105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert checkResults("testForwards", CheckDirection.FORWARD); 95205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 95305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 95405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 95505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert void checkResults(String msg, CheckDirection direction) { 95605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (direction == CheckDirection.FORWARD) { 95705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i=0; i<=fTestData.fString.length(); ++i) { 95805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 95905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException(msg, i); 96005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 96105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 96205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } else { 96305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (int i=fTestData.fString.length(); i>=0; i--) { 96405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 96505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert throw new MonkeyException(msg, i); 96605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 96705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 96805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 96905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 97005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 97105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 97205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fRuleCharBuffer; // source file contents of the reference rules. 97305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert BreakRules fRuleSet; 97405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert RuleBasedBreakIterator fBI; 97505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert MonkeyTestData fTestData; 97605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert ICU_Rand fRandomGenerator; 97705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String fRuleFileName; 97805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean fVerbose; // True to do long dump of failing data. 97905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int fLoopCount; 98005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int fErrorCount; 98105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 98205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets. 98305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuilder fErrorMsgs = new StringBuilder(); 98405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 98505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 98605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 98705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Test parameters, specified via Java properties. 98805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 98905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // rules=file_name Name of file containing the reference rules. 99005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // seed=nnnnn Random number starting seed. 99105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Setting the seed allows errors to be reproduced. 99205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // loop=nnn Looping count. Controls running time. 99305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // -1: run forever. 99405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 0 or greater: run length. 99505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // expansions debug option, show expansions of rules and sets. 99605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // verbose Display details of the failure. 99705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 99805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Parameters are passed to the JVM on the command line, or 99905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 100005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // For example, 100105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // -ea -Drules=line.txt -Dloop=-1 100205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // 100305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert @Test 100405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert public void TestMonkey() { 100505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt", 100605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt" 100705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert }; 100805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 100905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String testNameFromParams = getProperty("rules"); 101005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 101105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert if (testNameFromParams != null) { 101205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert tests = new String[] {testNameFromParams}; 101305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 101405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 101505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000); 101605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean dumpExpansions = getBooleanProperty("expansions", false); 101705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert boolean verbose = getBooleanProperty("verbose", false); 101805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert int seed = getIntProperty("seed", 1); 101905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 102005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>(); 102105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 102205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Monkey testing is multi-threaded. 102305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Each set of break rules to be tested is run in a separate thread. 102405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 102505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 102605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (String testName: tests) { 102705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert logln(String.format("beginning testing of %s", testName)); 102805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 102905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert RBBIMonkeyImpl test = new RBBIMonkeyImpl(); 103005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 103105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.fDumpExpansions = dumpExpansions; 103205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.fVerbose = verbose; 103305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.fRandomGenerator = new ICU_Rand(seed); 103405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.fLoopCount = loopCount; 103505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.setup(testName); 103605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 103705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.start(); 103805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert startedTests.add(test); 103905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 104005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 104105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert StringBuilder errors = new StringBuilder(); 104205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert for (RBBIMonkeyImpl test: startedTests) { 104305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert try { 104405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert test.join(); 104505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert errors.append(test.fErrorMsgs); 104605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } catch (InterruptedException e) { 104705fa7802d0874812c234a29745586677ee5837eaFredrik Roubert errors.append(e + "\n"); 104805fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 104905fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 105005fa7802d0874812c234a29745586677ee5837eaFredrik Roubert String errorMsgs = errors.toString(); 105105fa7802d0874812c234a29745586677ee5837eaFredrik Roubert assertEquals(errorMsgs, "", errorMsgs); 105205fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 105305fa7802d0874812c234a29745586677ee5837eaFredrik Roubert } 105405fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 105505fa7802d0874812c234a29745586677ee5837eaFredrik Roubert 105605fa7802d0874812c234a29745586677ee5837eaFredrik Roubert} 1057