1// © 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html#License 3 4package com.ibm.icu.dev.test.rbbi; 5 6import java.io.IOException; 7import java.io.InputStream; 8import java.io.InputStreamReader; 9import java.util.ArrayList; 10import java.util.Arrays; 11import java.util.HashMap; 12import java.util.List; 13import java.util.Map; 14import java.util.regex.Matcher; 15import java.util.regex.Pattern; 16import java.util.regex.PatternSyntaxException; 17 18import org.junit.Test; 19import org.junit.runner.RunWith; 20import org.junit.runners.JUnit4; 21 22import com.ibm.icu.dev.test.TestFmwk; 23import com.ibm.icu.impl.UCharacterName; 24import com.ibm.icu.impl.UCharacterNameChoice; 25import com.ibm.icu.text.BreakIterator; 26import com.ibm.icu.text.RuleBasedBreakIterator; 27import com.ibm.icu.text.UnicodeSet; 28import com.ibm.icu.util.ULocale; 29 30/** 31 * RBBI Monkey Test. Ported from ICU4C test/intltest/rbbimonkeytest.cpp. 32 * This is the newer, data driven monkey test. It is completely separate from the 33 * older class RBBITestMonkey. 34 */ 35 36@RunWith(JUnit4.class) 37public class RBBIMonkeyTest extends TestFmwk { 38 39 40 // class CharClass Represents a single character class from the source break rules. 41 // Inherits from UObject because instances are adopted by UHashtable, which ultimately 42 // deletes them using hash's object deleter function. 43 44 static class CharClass { 45 String fName; 46 String fOriginalDef; // set definition as it appeared in user supplied rules. 47 String fExpandedDef; // set definition with any embedded named sets replaced by their defs, recursively. 48 UnicodeSet fSet; 49 CharClass(String name, String originalDef, String expandedDef, UnicodeSet set) { 50 fName = name; 51 fOriginalDef = originalDef; 52 fExpandedDef = expandedDef; 53 fSet = set; 54 }; 55 } 56 57 58 // class BreakRule Struct-like class represents a single rule from a set of break rules. 59 // Each rule has the set definitions expanded, and 60 // is compiled to a regular expression. 61 62 static class BreakRule { 63 String fName; // Name of the rule. 64 String fRule; // Rule expression, excluding the name, as written in user source. 65 String fExpandedRule; // Rule expression after expanding the set definitions. 66 Matcher fRuleMatcher; // Regular expression that matches the rule. 67 }; 68 69 70 // class BreakRules represents a complete set of break rules, possibly tailored, 71 // compiled from testdata break rules. 72 73 static class BreakRules { 74 BreakRules(RBBIMonkeyImpl monkeyImpl) { 75 fMonkeyImpl = monkeyImpl; 76 fBreakRules = new ArrayList<BreakRule>(); 77 fType = BreakIterator.KIND_TITLE; 78 fCharClasses = new HashMap<String, CharClass>(); 79 fCharClassList = new ArrayList<CharClass>(); 80 fDictionarySet = new UnicodeSet(); 81 82 // Match an alpha-numeric identifier in a rule. Will be a set name. 83 // Use negative look-behind to exclude non-identifiers, mostly property names or values. 84 fSetRefsMatcher = Pattern.compile( 85 "(?<!\\{[ \\t]{0,4})" + 86 "(?<!=[ \\t]{0,4})" + 87 "(?<!\\[:[ \\t]{0,4})" + 88 "(?<!\\\\)" + 89 "(?<![A-Za-z0-9_])" + 90 "([A-Za-z_][A-Za-z0-9_]*)"). // The char class name 91 matcher(""); 92 93 // Match comments and blank lines. Matches will be replaced with "", stripping the comments from the rules. 94 fCommentsMatcher = Pattern.compile("" + 95 "(^|(?<=;))" + // Start either at start of line, or just after a ';' (look-behind for ';') 96 "[ \\t]*+" + // Match white space. 97 "(#.*)?+" + // Optional # plus whatever follows 98 "$"). // new-line at end of line. 99 matcher(""); 100 101 // Match (initial parse) of a character class definition line. 102 fClassDefMatcher = Pattern.compile("" + 103 "[ \\t]*" + // leading white space 104 "([A-Za-z_][A-Za-z0-9_]*)" + // The char class name 105 "[ \\t]*=[ \\t]*" + // = 106 "(.*?)" + // The char class UnicodeSet expression 107 "[ \\t]*;$"). // ; <end of line> 108 matcher(""); 109 110 // Match (initial parse) of a break rule line. 111 fRuleDefMatcher = Pattern.compile("" + 112 "[ \\t]*" + // leading white space 113 "([A-Za-z_][A-Za-z0-9_.]*)" + // The rule name 114 "[ \\t]*:[ \\t]*" + // : 115 "(.*?)" + // The rule definition 116 "[ \\t]*;$"). // ; <end of line> 117 matcher(""); 118 119 // Match a property expression, either [:xxx:] or \p{...} 120 fPropertyMatcher = Pattern.compile("" + 121 "\\[:.*?:]|\\\\(?:p|P)\\{.*?\\}"). 122 matcher(""); 123 124 125 } 126 127 /** 128 * Create the expanded definition for this char class, 129 * replacing any set references with the corresponding definition. 130 */ 131 CharClass addCharClass(String name, String definition) { 132 StringBuffer expandedDef = new StringBuffer(); 133 fSetRefsMatcher.reset(definition); 134 while (fSetRefsMatcher.find()) { 135 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 136 CharClass snameClass = fCharClasses.get(sname); 137 String expansionForName = snameClass != null ? snameClass.fExpandedDef : sname; 138 139 fSetRefsMatcher.appendReplacement(expandedDef, ""); 140 expandedDef.append(expansionForName); 141 } 142 fSetRefsMatcher.appendTail(expandedDef); 143 String expandedDefString = expandedDef.toString(); 144 145 if (fMonkeyImpl.fDumpExpansions) { 146 System.out.printf("addCharClass(\"%s\"\n", name); 147 System.out.printf(" %s\n", definition); 148 System.out.printf("expandedDef: %s\n", expandedDefString); 149 } 150 151 // Verify that the expanded set definition is valid. 152 153 UnicodeSet s; 154 try { 155 s = new UnicodeSet(expandedDefString, UnicodeSet.IGNORE_SPACE); 156 } catch (java.lang.IllegalArgumentException e) { 157 System.err.printf("%s: error %s creating UnicodeSet %s", fMonkeyImpl.fRuleFileName, e.toString(), name); 158 throw e; 159 } 160 161 // Get an expanded equivalent pattern from the UnicodeSet. 162 // This removes set difference operators, which would fail if passed through to Java regex. 163 164 StringBuffer expandedPattern = new StringBuffer(); 165 s._generatePattern(expandedPattern, true); 166 expandedDefString = expandedPattern.toString(); 167 if (fMonkeyImpl.fDumpExpansions) { 168 System.out.printf("expandedDef2: %s\n", expandedDefString); 169 } 170 171 CharClass cclass = new CharClass(name, definition, expandedDefString, s); 172 CharClass previousClass = fCharClasses.put(name, cclass); 173 174 if (previousClass != null) { 175 // TODO: decide whether or not to allow redefinitions. 176 // Can be convenient in some cases. 177 // String msg = String.format("%s: Redefinition of character class %s\n", 178 // fMonkeyImpl.fRuleFileName, cclass.fName); 179 // System.err.println(msg); 180 // throw new IllegalArgumentException(msg); 181 } 182 return cclass; 183 184 }; 185 186 187 void addRule(String name, String definition) { 188 BreakRule thisRule = new BreakRule(); 189 StringBuffer expandedDefsRule = new StringBuffer(); 190 thisRule.fName = name; 191 thisRule.fRule = definition; 192 193 // Expand the char class definitions within the rule. 194 fSetRefsMatcher.reset(definition); 195 while (fSetRefsMatcher.find()) { 196 String sname = fSetRefsMatcher.group(/*"ClassName"*/ 1); 197 CharClass nameClass = fCharClasses.get(sname); 198 if (nameClass == null) { 199 System.err.printf("char class \"%s\" unrecognized in rule \"%s\"\n", sname, definition); 200 } 201 String expansionForName = nameClass != null ? nameClass.fExpandedDef : sname; 202 fSetRefsMatcher.appendReplacement(expandedDefsRule, ""); 203 expandedDefsRule.append(expansionForName); 204 } 205 fSetRefsMatcher.appendTail(expandedDefsRule); 206 207 // Replace any property expressions, \p{...} or [:...:] with an equivalent expansion, 208 // obtained from ICU UnicodeSet. Need to do this substitution because Java regex 209 // does not recognize all properties, and because Java's definitions are likely 210 // older than ICU's. 211 212 StringBuffer expandedRule = new StringBuffer(); 213 fPropertyMatcher.reset(expandedDefsRule); 214 while (fPropertyMatcher.find()) { 215 String prop = fPropertyMatcher.group(); 216 UnicodeSet propSet = new UnicodeSet("[" + prop + "]"); 217 StringBuffer propExpansion = new StringBuffer(); 218 propSet._generatePattern(propExpansion, true); 219 fPropertyMatcher.appendReplacement(expandedRule, propExpansion.toString()); 220 } 221 fPropertyMatcher.appendTail(expandedRule); 222 223 // Replace any [^negated sets] with equivalent flattened sets generated by 224 // ICU UnicodeSet. [^ ...] in Java Regex character classes does not apply 225 // to any nested classes. Variable substitution in rules produces 226 // nested sets that [^negation] needs to apply to. 227 228 StringBuffer ruleWithFlattenedSets = new StringBuffer(); 229 int idx = 0; 230 while (idx<expandedRule.length()) { 231 int setOpenPos = expandedRule.indexOf("[^", idx); 232 if (setOpenPos < 0) { 233 break; 234 } 235 if (setOpenPos > idx) { 236 // Move anything from the source rule preceding the [^ into the processed rule, unchanged. 237 ruleWithFlattenedSets.append(expandedRule.substring(idx, setOpenPos)); 238 } 239 int nestingLevel = 1; 240 boolean haveNesting = false; 241 int setClosePos; 242 for (setClosePos = setOpenPos + 2; nestingLevel > 0 && setClosePos<expandedRule.length(); ++setClosePos) { 243 char c = expandedRule.charAt(setClosePos); 244 if (c == '\\') { 245 ++setClosePos; 246 } else if (c == '[') { 247 ++nestingLevel; 248 haveNesting = true; 249 } else if (c == ']') { 250 --nestingLevel; 251 } 252 } 253 if (haveNesting && nestingLevel == 0) { 254 // Found one, a negated set that includes interior nested sets. 255 // Create an ICU UnicodeSet from the source pattern, and obtain an 256 // equivalent flattened pattern from that. 257 UnicodeSet uset = new UnicodeSet(expandedRule.substring(setOpenPos, setClosePos), true); 258 uset._generatePattern(ruleWithFlattenedSets, true); 259 } else { 260 // The [^ set definition did not include any nested sets. 261 // Copy the original definition without change. 262 // Java regular expressions will handle it without needing to recast it. 263 if (nestingLevel > 0) { 264 // Error case of an unclosed character class expression. 265 // Java regex will also eventually flag the error. 266 System.err.printf("No closing ] found in rule %s\n", name); 267 } 268 ruleWithFlattenedSets.append(expandedRule.substring(setOpenPos, setClosePos)); 269 } 270 idx = setClosePos; 271 } 272 273 if (idx < expandedRule.length()) { 274 ruleWithFlattenedSets.append(expandedRule.substring(idx, expandedRule.length())); 275 } 276 277 thisRule.fExpandedRule = ruleWithFlattenedSets.toString(); 278 279 // Replace the divide sign (\u00f7) with a regular expression named capture. 280 // When running the rules, a match that includes this group means we found a break position. 281 282 // thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "(?<BreakPosition>)"); 283 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("÷", "()"); 284 if (thisRule.fExpandedRule.indexOf("÷") != -1) { 285 String msg = String.format("%s Rule %s contains multiple ÷ signs", fMonkeyImpl.fRuleFileName, name); 286 System.err.println(msg); 287 throw new IllegalArgumentException(msg); 288 } 289 290 // UAX break rule set definitions can be empty, just []. 291 // Regular expression set expressions don't accept this. Substitute with [a&&[^a]], which 292 // also matches nothing. 293 294 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("[]", "[a&&[^a]]"); 295 296 // Change Unicode escape syntax for compatibility with Java regular expressions (Java 7 or newer) 297 // \udddd => \x{dddd} 298 // \U00hhhhhh => \x{hhhhhh} 299 300 // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\u([0-9A-Fa-f]{4})", "\\\\x{$1}"); 301 // thisRule.fExpandedRule = thisRule.fExpandedRule.replaceAll("\\\\U00([0-9A-Fa-f]{6})", "\\\\x{$1}"); 302 303 // Java 6 compatibility troubles - there is no syntax for escaping a supplementary character 304 // within a regular expression character class. Put them in as unescaped literal chars. 305 StringBuilder sb = new StringBuilder(thisRule.fExpandedRule); 306 while (true) { 307 int where = sb.indexOf("\\U00"); 308 if (where < 0) { 309 break; 310 } 311 String cp = hexToCodePoint(sb.substring(where+2, where+10)); 312 sb.replace(where, where+10, cp); 313 } 314 thisRule.fExpandedRule = sb.toString(); 315 316 // Escape any literal '#' in the rule expression. Without escaping, these introduce a comment. 317 // UnicodeSet._generatePattern() inserts un-escaped "#"s 318 319 thisRule.fExpandedRule = thisRule.fExpandedRule.replace("#", "\\#"); 320 if (fMonkeyImpl.fDumpExpansions) { 321 System.out.printf("fExpandedRule: %s\n", thisRule.fExpandedRule); 322 } 323 324 // Compile a regular expression for this rule. 325 326 try { 327 thisRule.fRuleMatcher = Pattern.compile(thisRule.fExpandedRule, Pattern.COMMENTS | Pattern.DOTALL).matcher(""); 328 } catch (PatternSyntaxException e) { 329 System.err.printf("%s: Error creating regular expression for rule %s. Expansion is \n\"%s\"", 330 fMonkeyImpl.fRuleFileName, name, thisRule.fExpandedRule); 331 throw e; 332 } 333 334 // Put this new rule into the vector of all Rules. 335 336 fBreakRules.add(thisRule); 337 }; 338 339 private static String hexToCodePoint(String hex) { 340 int cp = Integer.parseInt(hex, 16); 341 return new StringBuilder().appendCodePoint(cp).toString(); 342 } 343 344 345 boolean setKeywordParameter(String keyword, String value) { 346 if (keyword.equals("locale")) { 347 fLocale = new ULocale(value); 348 return true; 349 } 350 if (keyword.equals("type")) { 351 if (value.equals("grapheme")) { 352 fType = BreakIterator.KIND_CHARACTER; 353 } else if (value.equals("word")) { 354 fType = BreakIterator.KIND_WORD; 355 } else if (value.equals("line")) { 356 fType = BreakIterator.KIND_LINE; 357 } else if (value.equals("sentence")) { 358 fType = BreakIterator.KIND_SENTENCE; 359 } else { 360 String msg = String.format("%s: Unrecognized break type %s", fMonkeyImpl.fRuleFileName, value); 361 System.err.println(msg); 362 throw new IllegalArgumentException(msg); 363 } 364 return true; 365 } 366 return false; 367 } 368 369 370 RuleBasedBreakIterator createICUBreakIterator() { 371 BreakIterator bi; 372 switch(fType) { 373 case BreakIterator.KIND_CHARACTER: 374 bi = (BreakIterator.getCharacterInstance(fLocale)); 375 break; 376 case BreakIterator.KIND_WORD: 377 bi = (BreakIterator.getWordInstance(fLocale)); 378 break; 379 case BreakIterator.KIND_LINE: 380 bi = (BreakIterator.getLineInstance(fLocale)); 381 break; 382 case BreakIterator.KIND_SENTENCE: 383 bi = (BreakIterator.getSentenceInstance(fLocale)); 384 break; 385 default: 386 String msg = String.format("%s: Bad break iterator type of %d", fMonkeyImpl.fRuleFileName, fType); 387 System.err.println(msg); 388 throw new IllegalArgumentException(msg); 389 } 390 return (RuleBasedBreakIterator)bi; 391 392 }; 393 394 395 396 void compileRules(String rules) { 397 int lineNumber = 0; 398 for (String line: rules.split("\\r?\\n")) { 399 ++lineNumber; 400 // Strip comment lines. 401 fCommentsMatcher.reset(line); 402 line = fCommentsMatcher.replaceFirst(""); 403 if (line.isEmpty()) { 404 continue; 405 } 406 407 // Recognize character class definition and keyword lines 408 fClassDefMatcher.reset(line); 409 if (fClassDefMatcher.matches()) { 410 String className = fClassDefMatcher.group(/*"ClassName"*/ 1); 411 String classDef = fClassDefMatcher.group(/*"ClassDef"*/ 2); 412 if (fMonkeyImpl.fDumpExpansions) { 413 System.out.printf("scanned class: %s = %s\n", className, classDef); 414 } 415 if (setKeywordParameter(className, classDef)) { 416 // The scanned item was "type = ..." or "locale = ...", etc. 417 // which are not actual character classes. 418 continue; 419 } 420 addCharClass(className, classDef); 421 continue; 422 } 423 424 // Recognize rule lines. 425 fRuleDefMatcher.reset(line); 426 if (fRuleDefMatcher.matches()) { 427 String ruleName = fRuleDefMatcher.group(/*"RuleName"*/ 1); 428 String ruleDef = fRuleDefMatcher.group(/*"RuleDef"*/ 2); 429 if (fMonkeyImpl.fDumpExpansions) { 430 System.out.printf("scanned rule: %s : %s\n", ruleName, ruleDef); 431 } 432 addRule(ruleName, ruleDef); 433 continue; 434 } 435 436 String msg = String.format("Unrecognized line in rule file %s:%d \"%s\"", 437 fMonkeyImpl.fRuleFileName, lineNumber, line); 438 System.err.println(msg); 439 throw new IllegalArgumentException(msg); 440 } 441 442 // Build the vector of char classes, omitting the dictionary class if there is one. 443 // This will be used when constructing the random text to be tested. 444 445 // Also compute the "other" set, consisting of any characters not included in 446 // one or more of the user defined sets. 447 448 UnicodeSet otherSet = new UnicodeSet(0, 0x10ffff); 449 450 for (Map.Entry<String, CharClass> el: fCharClasses.entrySet()) { 451 String ccName = el.getKey(); 452 CharClass cclass = el.getValue(); 453 454 // System.out.printf(" Adding %s\n", ccName); 455 if (!ccName.equals(cclass.fName)) { 456 throw new IllegalArgumentException( 457 String.format("%s: internal error, set names (%s, %s) inconsistent.\n", 458 fMonkeyImpl.fRuleFileName, ccName, cclass.fName)); 459 } 460 otherSet.removeAll(cclass.fSet); 461 if (ccName.equals("dictionary")) { 462 fDictionarySet = cclass.fSet; 463 } else { 464 fCharClassList.add(cclass); 465 } 466 } 467 468 if (!otherSet.isEmpty()) { 469 // System.out.printf("have an other set.\n"); 470 CharClass cclass = addCharClass("__Others", otherSet.toPattern(true)); 471 fCharClassList.add(cclass); 472 } 473 474 }; 475 476 CharClass getClassForChar(int c) { 477 for (CharClass cc: fCharClassList) { 478 if (cc.fSet.contains(c)) { 479 return cc; 480 } 481 } 482 return null; 483 }; 484 485 486 RBBIMonkeyImpl fMonkeyImpl; // Pointer back to the owning MonkeyImpl instance. 487 List<BreakRule> fBreakRules; // Contents are of type (BreakRule *). 488 489 Map<String, CharClass> fCharClasses; // Key is the set name. 490 // // Value is the corresponding CharClass 491 List<CharClass> fCharClassList; // Char Classes, same contents as fCharClasses values, 492 493 UnicodeSet fDictionarySet; // Dictionary set, empty if none is defined. 494 ULocale fLocale; 495 int fType; // BreakItererator.KIND_WORD, etc. 496 497 498 Matcher fSetRefsMatcher; 499 Matcher fCommentsMatcher; 500 Matcher fClassDefMatcher; 501 Matcher fRuleDefMatcher; 502 Matcher fPropertyMatcher; 503 }; 504 505 506 507 508 // class MonkeyTestData represents a randomly synthesized test data string together 509 // with the expected break positions obtained by applying 510 // the test break rules. 511 512 static class MonkeyTestData{ 513 514 void set(BreakRules rules, ICU_Rand rand) { 515 int dataLength = 1000; // length of test data to generate, in code points. 516 517 // Fill the test string with random characters. 518 // First randomly pick a char class, then randomly pick a character from that class. 519 // Exclude any characters from the dictionary set. 520 521 // System.out.println("Populating Test Data"); 522 fRandomSeed = rand.getSeed(); // Save initial seed for use in error messages, 523 // allowing recreation of failing data. 524 fBkRules = rules; 525 StringBuilder newString = new StringBuilder(); 526 for (int n=0; n<dataLength;) { 527 int charClassIndex = rand.next() % rules.fCharClassList.size(); 528 CharClass cclass = rules.fCharClassList.get(charClassIndex); 529 if (cclass.fSet.size() == 0) { 530 // Some rules or tailorings do end up with empty char classes. 531 continue; 532 } 533 int charIndex = rand.next() % cclass.fSet.size(); 534 int c = cclass.fSet.charAt(charIndex); 535 if (/*Character.isBmpCodePoint(c)*/ c<=0x0ffff && Character.isLowSurrogate((char)c) && 536 newString.length() > 0 && Character.isHighSurrogate(newString.charAt(newString.length()-1))) { 537 // Character classes may contain unpaired surrogates, e.g. Grapheme_Cluster_Break = Control. 538 // Don't let random unpaired surrogates combine in the test data because they might 539 // produce an unwanted dictionary character. 540 continue; 541 } 542 543 if (!rules.fDictionarySet.contains(c)) { 544 newString.appendCodePoint(c); 545 ++n; 546 } 547 } 548 fString = newString.toString(); 549 550 // Init the expectedBreaks, actualBreaks and ruleForPosition. 551 // Expected and Actual breaks are one longer than the input string; a true value 552 // will indicate a boundary preceding that position. 553 554 fActualBreaks = new boolean[fString.length()+1]; 555 fExpectedBreaks = new boolean[fString.length()+1]; 556 fRuleForPosition = new int[fString.length()+1]; 557 f2ndRuleForPos = new int[fString.length()+1]; 558 559 // Apply reference rules to find the expected breaks. 560 561 fExpectedBreaks[0] = true; // Force an expected break before the start of the text. 562 // ICU always reports a break there. 563 // The reference rules do not have a means to do so. 564 int strIdx = 0; 565 while (strIdx < fString.length()) { 566 BreakRule matchingRule = null; 567 boolean hasBreak = false; 568 int ruleNum = 0; 569 int matchStart = 0; 570 int matchEnd = 0; 571 for (ruleNum=0; ruleNum<rules.fBreakRules.size(); ruleNum++) { 572 BreakRule rule = rules.fBreakRules.get(ruleNum); 573 rule.fRuleMatcher.reset(fString.substring(strIdx)); 574 if (rule.fRuleMatcher.lookingAt()) { 575 // A candidate rule match, check further to see if we take it or continue to check other rules. 576 // Matches of zero or one code point count only if they also specify a break. 577 matchStart = strIdx; 578 matchEnd = strIdx + rule.fRuleMatcher.end(); 579 hasBreak = BreakGroupStart(rule.fRuleMatcher) >= 0; 580 if (hasBreak || 581 (matchStart < fString.length() && fString.offsetByCodePoints(matchStart, 1) < matchEnd)) { 582 matchingRule = rule; 583 break; 584 } 585 } 586 } 587 if (matchingRule == null) { 588 // No reference rule matched. This is an error in the rules that should never happen. 589 String msg = String.format("%s: No reference rules matched at position %d. ", 590 rules.fMonkeyImpl.fRuleFileName, strIdx); 591 System.err.println(msg); 592 dump(strIdx); 593 throw new IllegalArgumentException(msg); 594 } 595 if (matchingRule.fRuleMatcher.group().length() == 0) { 596 // Zero length rule match. This is also an error in the rule expressions. 597 String msg = String.format("%s:%s: Zero length rule match at %d.", 598 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName, strIdx); 599 System.err.println(msg); 600 dump(strIdx); 601 throw new IllegalArgumentException(msg); 602 } 603 604 // Record which rule matched over the length of the match. 605 for (int i = matchStart; i < matchEnd; i++) { 606 if (fRuleForPosition[i] == 0) { 607 fRuleForPosition[i] = ruleNum; 608 } else { 609 f2ndRuleForPos[i] = ruleNum; 610 } 611 } 612 613 // Break positions appear in rules as a matching named capture of zero length at the break position, 614 // the adjusted pattern contains (?<BreakPosition>) 615 if (hasBreak) { 616 int breakPos = strIdx + BreakGroupStart(matchingRule.fRuleMatcher); 617 fExpectedBreaks[breakPos] = true; 618 // System.out.printf("recording break at %d\n", breakPos); 619 // For the next iteration, pick up applying rules immediately after the break, 620 // which may differ from end of the match. The matching rule may have included 621 // context following the boundary that needs to be looked at again. 622 strIdx = breakPos; 623 } else { 624 // Original rule didn't specify a break. 625 // Continue applying rules starting on the last code point of this match. 626 int updatedStrIdx = fString.offsetByCodePoints(matchEnd, -1); 627 if (updatedStrIdx == matchStart) { 628 // Match was only one code point, no progress if we continue. 629 // Shouldn't get here, case is filtered out at top of loop. 630 throw new IllegalArgumentException(String.format("%s: Rule %s internal error.", 631 rules.fMonkeyImpl.fRuleFileName, matchingRule.fName)); 632 } 633 strIdx = updatedStrIdx; 634 } 635 } 636 }; 637 638 // Helper function to find the starting index of a match of the "BreakPosition" named capture group. 639 // @param m: a Java regex Matcher that has completed a matching operation. 640 // @return m.start("BreakPosition), 641 // or -1 if there is no such group, or the group did not participate in the match. 642 // 643 // TODO: this becomes m.start("BreakPosition") with Java 8. 644 // In the mean time, assume that the only zero-length capturing group in 645 // a reference rule expression is the "BreakPosition" that corresponds to a "÷". 646 647 static int BreakGroupStart(Matcher m) { 648 for (int groupNum=1; groupNum <= m.groupCount(); ++groupNum) { 649 String group = m.group(groupNum); 650 if (group == null) { 651 continue; 652 } 653 if (group.equals("")) { 654 // assert(m.end(groupNum) == m.end("BreakPosition")); 655 return m.start(groupNum); 656 } 657 } 658 return -1; 659 } 660 661 void dump(int around) { 662 System.out.print("\n" 663 + " char break Rule Character\n" 664 + " pos code class R I name name\n" 665 + "---------------------------------------------------------------------------------------------\n"); 666 667 int start; 668 int end; 669 670 if (around == -1) { 671 start = 0; 672 end = fString.length(); 673 } else { 674 // Display context around a failure. 675 try { 676 start = fString.offsetByCodePoints(around, -30); 677 } catch (Exception e) { 678 start = 0; 679 } 680 try { 681 end = fString.offsetByCodePoints(around, +30); 682 } catch (Exception e) { 683 end = fString.length(); 684 } 685 } 686 687 for (int charIdx = start; charIdx < end; charIdx=fString.offsetByCodePoints(charIdx, 1)) { 688 int c = fString.codePointAt(charIdx); 689 CharClass cc = fBkRules.getClassForChar(c); 690 691 BreakRule rule = fBkRules.fBreakRules.get(fRuleForPosition[charIdx]); 692 String secondRuleName = ""; 693 if (f2ndRuleForPos[charIdx] > 0) { 694 secondRuleName = fBkRules.fBreakRules.get(f2ndRuleForPos[charIdx]).fName; 695 } 696 String cName = UCharacterName.INSTANCE.getName(c, UCharacterNameChoice.EXTENDED_CHAR_NAME); 697 698 System.out.printf(" %4d %6x %-20s %c %c %-10s %-10s %s\n", 699 charIdx, c, cc.fName, 700 fExpectedBreaks[charIdx] ? '*' : '.', 701 fActualBreaks[charIdx] ? '*' : '.', 702 rule.fName, secondRuleName, cName 703 ); 704 } 705 706 }; 707 708 void clearActualBreaks() { 709 Arrays.fill(fActualBreaks, false); 710 } 711 712 713 int fRandomSeed; // The initial seed value from the random number generator. 714 BreakRules fBkRules; // The break rules used to generate this data. 715 String fString; // The text. 716 boolean fExpectedBreaks[]; // Breaks as found by the reference rules. 717 // Parallel to fString. true if break preceding. 718 boolean fActualBreaks[]; // Breaks as found by ICU break iterator. 719 int fRuleForPosition[]; // Index into BreakRules.fBreakRules of rule that applied at each position. 720 // Also parallel to fString. 721 int f2ndRuleForPos[]; // As above. A 2nd rule applies when the preceding rule 722 // didn't cause a break, and a subsequent rule match starts 723 // on the last code point of the preceding match. 724 725 } 726 727 728 // class RBBIMonkeyImpl holds (some indirectly) everything associated with running a monkey 729 // test for one set of break rules. 730 // 731 732 static class RBBIMonkeyImpl extends Thread { 733 734 void setup(String ruleFile) { 735 fRuleFileName = ruleFile; 736 openBreakRules(ruleFile); 737 fRuleSet = new BreakRules(this); 738 fRuleSet.compileRules(fRuleCharBuffer); 739 fBI = fRuleSet.createICUBreakIterator(); 740 fTestData = new MonkeyTestData(); 741 }; 742 743 void openBreakRules(String fileName) { 744 StringBuilder testFileBuf = new StringBuilder(); 745 InputStream is = null; 746 String filePath = "break_rules/" + fileName; 747 try { 748 is = RBBIMonkeyImpl.class.getResourceAsStream(filePath); 749 if (is == null) { 750 errln("Could not open test data file " + fileName); 751 return; 752 } 753 InputStreamReader isr = new InputStreamReader(is, "UTF-8"); 754 try { 755 int c; 756 int count = 0; 757 for (;;) { 758 c = isr.read(); 759 if (c < 0) { 760 break; 761 } 762 count++; 763 if (c == 0xFEFF && count == 1) { 764 // BOM in the test data file. Discard it. 765 continue; 766 } 767 testFileBuf.appendCodePoint(c); 768 } 769 } finally { 770 isr.close(); 771 } 772 } catch (IOException e) { 773 try { 774 is.close(); 775 } catch (IOException ignored) { 776 } 777 errln(e.toString()); 778 } 779 fRuleCharBuffer = testFileBuf.toString(); /* the file as a String */ 780 } 781 782 class MonkeyException extends RuntimeException { 783 private static final long serialVersionUID = 1L; 784 public int fPosition; // Position of the failure in the test data. 785 MonkeyException(String description, int pos) { 786 super(description); 787 fPosition = pos; 788 } 789 } 790 791 @Override 792 public void run() { 793 int errorCount = 0; 794 if (fBI == null) { 795 fErrorMsgs.append("Unable to run test because fBI is null.\n"); 796 return; 797 } 798 for (long loopCount = 0; fLoopCount < 0 || loopCount < fLoopCount; loopCount++) { 799 try { 800 fTestData.set(fRuleSet, fRandomGenerator); 801 // fTestData.dump(-1); 802 testForwards(); 803 testPrevious(); 804 testFollowing(); 805 testPreceding(); 806 testIsBoundary(); 807 } catch (MonkeyException e) { 808 String formattedMsg = String.format( 809 "%s at index %d. VM Arguments to reproduce: -Drules=%s -Dseed=%d -Dloop=1 -Dverbose=1 \"\n", 810 e.getMessage(), e.fPosition, fRuleFileName, fTestData.fRandomSeed); 811 System.err.print(formattedMsg); 812 if (fVerbose) { 813 fTestData.dump(e.fPosition); 814 } 815 fErrorMsgs.append(formattedMsg); 816 if (++errorCount > 10) { 817 return; 818 } 819 } 820 if (fLoopCount < 0 && loopCount % 100 == 0) { 821 System.err.print("."); 822 } 823 } 824 } 825 826 enum CheckDirection { 827 FORWARD, 828 REVERSE 829 }; 830 831 void testForwards() { 832 fTestData.clearActualBreaks(); 833 fBI.setText(fTestData.fString); 834 int previousBreak = -2; 835 for (int bk=fBI.first(); bk != BreakIterator.DONE; bk=fBI.next()) { 836 if (bk <= previousBreak) { 837 throw new MonkeyException("Break Iterator Stall", bk); 838 } 839 if (bk < 0 || bk > fTestData.fString.length()) { 840 throw new MonkeyException("Boundary out of bounds", bk); 841 } 842 fTestData.fActualBreaks[bk] = true; 843 } 844 checkResults("testForwards", CheckDirection.FORWARD); 845 }; 846 847 848 void testFollowing() { 849 fTestData.clearActualBreaks(); 850 fBI.setText(fTestData.fString); 851 int nextBreak = -1; 852 for (int i=-1 ; i<fTestData.fString.length(); ++i) { 853 int bk = fBI.following(i); 854 if (bk == BreakIterator.DONE && i == fTestData.fString.length()) { 855 continue; 856 } 857 if (bk == nextBreak && bk > i) { 858 // i is in the gap between two breaks. 859 continue; 860 } 861 if (i == nextBreak && bk > nextBreak) { 862 fTestData.fActualBreaks[bk] = true; 863 nextBreak = bk; 864 continue; 865 } 866 throw new MonkeyException("following(i)", i); 867 } 868 checkResults("testFollowing", CheckDirection.FORWARD); 869 }; 870 871 872 void testPrevious() { 873 fTestData.clearActualBreaks(); 874 fBI.setText(fTestData.fString); 875 int previousBreak = Integer.MAX_VALUE; 876 for (int bk=fBI.last(); bk != BreakIterator.DONE; bk=fBI.previous()) { 877 if (bk >= previousBreak) { 878 throw new MonkeyException("Break Iterator Stall", bk); 879 } 880 if (bk < 0 || bk > fTestData.fString.length()) { 881 throw new MonkeyException("Boundary out of bounds", bk); 882 } 883 fTestData.fActualBreaks[bk] = true; 884 } 885 checkResults("testPrevius", CheckDirection.REVERSE); 886 }; 887 888 889 /** 890 * Given an index into a string, if it refers to the trail surrogate of a surrogate pair, 891 * adjust it to point to the lead surrogate, which is the start of the code point. 892 * @param s the String. 893 * @param i the initial index 894 * @return the adjusted index 895 */ 896 private int getChar32Start(String s, int i) { 897 if (i > 0 && i < s.length() && 898 Character.isLowSurrogate(s.charAt(i)) && Character.isHighSurrogate(s.charAt(i-1))) { 899 --i; 900 } 901 return i; 902 } 903 904 905 void testPreceding() { 906 fTestData.clearActualBreaks(); 907 fBI.setText(fTestData.fString); 908 int nextBreak = fTestData.fString.length()+1; 909 for (int i=fTestData.fString.length()+1 ; i>=0; --i) { 910 int bk = fBI.preceding(i); 911 // System.err.printf("testPreceding() i:%d bk:%d nextBreak:%d\n", i, bk, nextBreak); 912 if (bk == BreakIterator.DONE && i == 0) { 913 continue; 914 } 915 if (bk == nextBreak && bk < i) { 916 // i is in the gap between two breaks. 917 continue; 918 } 919 if (i<fTestData.fString.length() && getChar32Start(fTestData.fString, i) < i) { 920 // i indexes to a trailing surrogate. 921 // Break Iterators treat an index to either half as referring to the supplemental code point, 922 // with preceding going to some preceding code point. 923 if (fBI.preceding(i) != fBI.preceding(getChar32Start(fTestData.fString, i))) { 924 throw new MonkeyException("preceding of trailing surrogate error", i); 925 } 926 continue; 927 } 928 if (i == nextBreak && bk < nextBreak) { 929 fTestData.fActualBreaks[bk] = true; 930 nextBreak = bk; 931 continue; 932 } 933 throw new MonkeyException("preceding(i)", i); 934 } 935 checkResults("testPreceding", CheckDirection.REVERSE); 936 937 }; 938 939 940 void testIsBoundary() { 941 fTestData.clearActualBreaks(); 942 fBI.setText(fTestData.fString); 943 for (int i=fTestData.fString.length(); i>=0; --i) { 944 if (fBI.isBoundary(i)) { 945 fTestData.fActualBreaks[i] = true; 946 } 947 } 948 checkResults("testForwards", CheckDirection.FORWARD); 949 }; 950 951 952 void checkResults(String msg, CheckDirection direction) { 953 if (direction == CheckDirection.FORWARD) { 954 for (int i=0; i<=fTestData.fString.length(); ++i) { 955 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 956 throw new MonkeyException(msg, i); 957 } 958 } 959 } else { 960 for (int i=fTestData.fString.length(); i>=0; i--) { 961 if (fTestData.fExpectedBreaks[i] != fTestData.fActualBreaks[i]) { 962 throw new MonkeyException(msg, i); 963 } 964 } 965 } 966 967 }; 968 969 String fRuleCharBuffer; // source file contents of the reference rules. 970 BreakRules fRuleSet; 971 RuleBasedBreakIterator fBI; 972 MonkeyTestData fTestData; 973 ICU_Rand fRandomGenerator; 974 String fRuleFileName; 975 boolean fVerbose; // True to do long dump of failing data. 976 int fLoopCount; 977 int fErrorCount; 978 979 boolean fDumpExpansions; // Debug flag to output expanded form of rules and sets. 980 StringBuilder fErrorMsgs = new StringBuilder(); 981 982 } 983 984 // Test parameters, specified via Java properties. 985 // 986 // rules=file_name Name of file containing the reference rules. 987 // seed=nnnnn Random number starting seed. 988 // Setting the seed allows errors to be reproduced. 989 // loop=nnn Looping count. Controls running time. 990 // -1: run forever. 991 // 0 or greater: run length. 992 // expansions debug option, show expansions of rules and sets. 993 // verbose Display details of the failure. 994 // 995 // Parameters are passed to the JVM on the command line, or 996 // via the Eclipse Run Configuration settings, arguments tab, VM parameters. 997 // For example, 998 // -ea -Drules=line.txt -Dloop=-1 999 // 1000 @Test 1001 public void TestMonkey() { 1002 String tests[] = {"grapheme.txt", "word.txt", "line.txt", "sentence.txt", "line_normal.txt", 1003 "line_normal_cj.txt", "line_loose.txt", "line_loose_cj.txt", "word_POSIX.txt" 1004 }; 1005 1006 String testNameFromParams = getProperty("rules"); 1007 1008 if (testNameFromParams != null) { 1009 tests = new String[] {testNameFromParams}; 1010 } 1011 1012 int loopCount = getIntProperty("loop", isQuick() ? 100 : 5000); 1013 boolean dumpExpansions = getBooleanProperty("expansions", false); 1014 boolean verbose = getBooleanProperty("verbose", false); 1015 int seed = getIntProperty("seed", 1); 1016 1017 List<RBBIMonkeyImpl> startedTests = new ArrayList<RBBIMonkeyImpl>(); 1018 1019 // Monkey testing is multi-threaded. 1020 // Each set of break rules to be tested is run in a separate thread. 1021 // Each thread/set of rules gets a separate RBBIMonkeyImpl object. 1022 1023 for (String testName: tests) { 1024 logln(String.format("beginning testing of %s", testName)); 1025 1026 RBBIMonkeyImpl test = new RBBIMonkeyImpl(); 1027 1028 test.fDumpExpansions = dumpExpansions; 1029 test.fVerbose = verbose; 1030 test.fRandomGenerator = new ICU_Rand(seed); 1031 test.fLoopCount = loopCount; 1032 test.setup(testName); 1033 1034 test.start(); 1035 startedTests.add(test); 1036 } 1037 1038 StringBuilder errors = new StringBuilder(); 1039 for (RBBIMonkeyImpl test: startedTests) { 1040 try { 1041 test.join(); 1042 errors.append(test.fErrorMsgs); 1043 } catch (InterruptedException e) { 1044 errors.append(e + "\n"); 1045 } 1046 } 1047 String errorMsgs = errors.toString(); 1048 assertEquals(errorMsgs, "", errorMsgs); 1049 1050 } 1051 1052 1053} 1054