// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2003-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package com.ibm.icu.dev.test.rbbi;


// Monkey testing of RuleBasedBreakIterator
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.junit.Test;

import com.ibm.icu.dev.test.TestFmwk;
import com.ibm.icu.lang.UCharacter;
import com.ibm.icu.lang.UProperty;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.text.RuleBasedBreakIterator;
import com.ibm.icu.text.UTF16;
import com.ibm.icu.text.UnicodeSet;


/**
 * Monkey tests for RBBI.  These tests have independent implementations of
 * the Unicode TR boundary rules, and compare results between these and ICU's
 * implementation, using random data.
 *
 * Tests cover Grapheme Cluster (char), Word and Line breaks
 *
 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
 *
 */
public class RBBITestMonkey extends TestFmwk {
    //
    //     class RBBIMonkeyKind
    //
    //        Monkey Test for Break Iteration
    //        Abstract interface class.   Concrete derived classes independently
    //        implement the break rules for different iterator types.
    //
    //        The Monkey Test itself doesn't know which type of break iterator it is
    //        testing, but works purely in terms of the interface defined here.
    //
    abstract static class RBBIMonkeyKind {

        // Return a List of UnicodeSets, representing the character classes used
        //   for this type of iterator.  The list is declared as a raw List, so
        //   callers are expected to treat each element as a UnicodeSet.
        abstract  List  charClasses();

        // Set the test text on which subsequent calls to next() will operate.
        abstract  void   setText(StringBuffer text);

        // Find the next break position, starting from the specified position.
        // Return -1 after reaching end of string.
        abstract   int   next(int i);

        // A Character Property, one of the constants defined in class UProperty.
        //   The value of this property will be displayed for the characters
        //    near any test failure.
        int   fCharProperty;
    }

    //
    // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
    //
    // The value is a single bracketed set pattern; the monkey classes in this file
    // pass it to the UnicodeSet(String) constructor.  The backslashes are doubled
    // so that the \\u / \\U escapes reach the pattern parser as text rather than
    // being interpreted by the Java compiler.
    //
    static String gExtended_Pict = "[" +
            "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" +
            "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
            "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" +
            "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" +
            "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" +
            "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" +
            "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" +
            "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" +
            "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
            "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" +
            "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" +
            "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" +
            "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" +
            "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" +
            "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" +
            "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" +
            "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" +
            "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" +
            "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" +
            "]";


    /**
     * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
97 * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets 98 */ 99 static class RBBICharMonkey extends RBBIMonkeyKind { 100 List fSets; 101 102 UnicodeSet fCRLFSet; 103 UnicodeSet fControlSet; 104 UnicodeSet fExtendSet; 105 UnicodeSet fRegionalIndicatorSet; 106 UnicodeSet fPrependSet; 107 UnicodeSet fSpacingSet; 108 UnicodeSet fLSet; 109 UnicodeSet fVSet; 110 UnicodeSet fTSet; 111 UnicodeSet fLVSet; 112 UnicodeSet fLVTSet; 113 UnicodeSet fHangulSet; 114 UnicodeSet fEmojiModifierSet; 115 UnicodeSet fEmojiBaseSet; 116 UnicodeSet fZWJSet; 117 UnicodeSet fExtendedPictSet; 118 UnicodeSet fEBGSet; 119 UnicodeSet fEmojiNRKSet; 120 UnicodeSet fAnySet; 121 122 123 StringBuffer fText; 124 125 126 RBBICharMonkey() { 127 fText = null; 128 fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK; 129 fCRLFSet = new UnicodeSet("[\\r\\n]"); 130 fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]"); 131 fExtendSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]"); 132 fZWJSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]"); 133 fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]"); 134 fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]"); 135 fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]"); 136 fLSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]"); 137 fVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]"); 138 fTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]"); 139 fLVSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]"); 140 fLVTSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]"); 141 fHangulSet = new UnicodeSet(); 142 fHangulSet.addAll(fLSet); 143 fHangulSet.addAll(fVSet); 144 fHangulSet.addAll(fTSet); 145 fHangulSet.addAll(fLVSet); 146 fHangulSet.addAll(fLVTSet); 147 148 fEmojiBaseSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = 
EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"); 149 fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]"); 150 fExtendedPictSet = new UnicodeSet(gExtended_Pict); 151 fEBGSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]"); 152 fEmojiNRKSet = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]"); 153 fAnySet = new UnicodeSet("[\\u0000-\\U0010ffff]"); 154 155 156 fSets = new ArrayList(); 157 fSets.add(fCRLFSet); 158 fSets.add(fControlSet); 159 fSets.add(fExtendSet); 160 fSets.add(fRegionalIndicatorSet); 161 if (!fPrependSet.isEmpty()) { 162 fSets.add(fPrependSet); 163 } 164 fSets.add(fSpacingSet); 165 fSets.add(fHangulSet); 166 fSets.add(fAnySet); 167 fSets.add(fEmojiBaseSet); 168 fSets.add(fEmojiModifierSet); 169 fSets.add(fZWJSet); 170 fSets.add(fExtendedPictSet); 171 fSets.add(fEBGSet); 172 fSets.add(fEmojiNRKSet); 173 } 174 175 176 @Override 177 void setText(StringBuffer s) { 178 fText = s; 179 } 180 181 @Override 182 List charClasses() { 183 return fSets; 184 } 185 186 @Override 187 int next(int prevPos) { 188 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 189 // break position being tested. The candidate break 190 // location is before p2. 191 192 int breakPos = -1; 193 194 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 195 int cBase; // for (X Extend*) patterns, the X character. 196 197 // Previous break at end of string. return DONE. 198 if (prevPos >= fText.length()) { 199 return -1; 200 } 201 /* p0 = */ p1 = p2 = p3 = prevPos; 202 c3 = UTF16.charAt(fText, prevPos); 203 c0 = c1 = c2 = cBase = 0; 204 205 // Loop runs once per "significant" character position in the input text. 206 for (;;) { 207 // Move all of the positions forward in the input string. 
208 /* p0 = p1;*/ c0 = c1; 209 p1 = p2; c1 = c2; 210 p2 = p3; c2 = c3; 211 212 // Advance p3 by one codepoint 213 p3 = moveIndex32(fText, p3, 1); 214 c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3); 215 216 if (p1 == p2) { 217 // Still warming up the loop. (won't work with zero length strings, but we don't care) 218 continue; 219 } 220 if (p2 == fText.length()) { 221 // Reached end of string. Always a break position. 222 break; 223 } 224 225 // Rule GB3 CR x LF 226 // No Extend or Format characters may appear between the CR and LF, 227 // which requires the additional check for p2 immediately following p1. 228 // 229 if (c1==0x0D && c2==0x0A && p1==(p2-1)) { 230 continue; 231 } 232 233 // Rule (GB4). ( Control | CR | LF ) <break> 234 if (fControlSet.contains(c1) || 235 c1 == 0x0D || 236 c1 == 0x0A) { 237 break; 238 } 239 240 // Rule (GB5) <break> ( Control | CR | LF ) 241 // 242 if (fControlSet.contains(c2) || 243 c2 == 0x0D || 244 c2 == 0x0A) { 245 break; 246 } 247 248 249 // Rule (GB6) L x ( L | V | LV | LVT ) 250 if (fLSet.contains(c1) && 251 (fLSet.contains(c2) || 252 fVSet.contains(c2) || 253 fLVSet.contains(c2) || 254 fLVTSet.contains(c2))) { 255 continue; 256 } 257 258 // Rule (GB7) ( LV | V ) x ( V | T ) 259 if ((fLVSet.contains(c1) || fVSet.contains(c1)) && 260 (fVSet.contains(c2) || fTSet.contains(c2))) { 261 continue; 262 } 263 264 // Rule (GB8) ( LVT | T) x T 265 if ((fLVTSet.contains(c1) || fTSet.contains(c1)) && 266 fTSet.contains(c2)) { 267 continue; 268 } 269 270 // Rule (GB9) x (Extend | ZWJ) 271 if (fExtendSet.contains(c2) || fZWJSet.contains(c2)) { 272 if (!fExtendSet.contains(c1)) { 273 cBase = c1; 274 } 275 continue; 276 } 277 278 // Rule (GB9a) x SpacingMark 279 if (fSpacingSet.contains(c2)) { 280 continue; 281 } 282 283 // Rule (GB9b) Prepend x 284 if (fPrependSet.contains(c1)) { 285 continue; 286 } 287 // Rule (GB10) (Emoji_Base | EBG) Extend* x Emoji_Modifier 288 if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && 
fEmojiModifierSet.contains(c2)) { 289 continue; 290 } 291 if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) && 292 fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) { 293 continue; 294 } 295 296 // Rule (GB11) (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji) 297 if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) && 298 (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) { 299 continue; 300 } 301 302 // Rule (GB12-13) Regional_Indicator x Regional_Indicator 303 // Note: The first if condition is a little tricky. We only need to force 304 // a break if there are three or more contiguous RIs. If there are 305 // only two, a break following will occur via other rules, and will include 306 // any trailing extend characters, which is needed behavior. 307 if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1) 308 && fRegionalIndicatorSet.contains(c2)) { 309 break; 310 } 311 if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) { 312 continue; 313 } 314 315 // Rule (GB999) Any <break> Any 316 break; 317 } 318 319 breakPos = p2; 320 return breakPos; 321 } 322 } 323 324 325 /** 326 * 327 * Word Monkey Test Class 328 * 329 * 330 * 331 */ 332 static class RBBIWordMonkey extends RBBIMonkeyKind { 333 List fSets; 334 StringBuffer fText; 335 336 UnicodeSet fCRSet; 337 UnicodeSet fLFSet; 338 UnicodeSet fNewlineSet; 339 UnicodeSet fRegionalIndicatorSet; 340 UnicodeSet fKatakanaSet; 341 UnicodeSet fHebrew_LetterSet; 342 UnicodeSet fALetterSet; 343 UnicodeSet fSingle_QuoteSet; 344 UnicodeSet fDouble_QuoteSet; 345 UnicodeSet fMidNumLetSet; 346 UnicodeSet fMidLetterSet; 347 UnicodeSet fMidNumSet; 348 UnicodeSet fNumericSet; 349 UnicodeSet fFormatSet; 350 UnicodeSet fExtendSet; 351 UnicodeSet fExtendNumLetSet; 352 UnicodeSet fOtherSet; 353 UnicodeSet fDictionarySet; 354 UnicodeSet fEBaseSet; 355 UnicodeSet fEBGSet; 356 UnicodeSet fEModifierSet; 357 
UnicodeSet                fZWJSet;
        UnicodeSet                fExtendedPictSet;
        UnicodeSet                fEmojiNRKSet;


        /**
         * Builds one UnicodeSet per Word_Break class used by the reference rules
         * in next(), derives the "other" class (everything not otherwise
         * classified), and collects the classes that test data is generated
         * from into fSets.
         */
        RBBIWordMonkey() {
            fCharProperty    = UProperty.WORD_BREAK;

            fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
            fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
            fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
            fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
            fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
            fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
            fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
            fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
            fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
            fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
            fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
            fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
            fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
            fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
            fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
            fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
            fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
            fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
            fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
            fExtendedPictSet = new UnicodeSet(gExtended_Pict);
            fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");

            fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
            fDictionarySet.addAll(fKatakanaSet);
            fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));

            fALetterSet.removeAll(fDictionarySet);

            // "Other" is every code point that belongs to none of the classes
            // above.  fMidNumLetSet is included in the removal list so that the
            // Other class does not overlap MidNumLet.
            fOtherSet = new UnicodeSet();
            fOtherSet.complement();
            UnicodeSet[] classified = {
                fCRSet, fLFSet, fNewlineSet, fALetterSet, fSingle_QuoteSet, fDouble_QuoteSet,
                fKatakanaSet, fHebrew_LetterSet, fMidLetterSet, fMidNumLetSet, fMidNumSet,
                fNumericSet, fFormatSet, fExtendSet, fExtendNumLetSet, fRegionalIndicatorSet,
                fEBaseSet, fEBGSet, fEModifierSet, fZWJSet, fExtendedPictSet, fEmojiNRKSet
            };
            for (UnicodeSet set : classified) {
                fOtherSet.removeAll(set);
            }

            // Inhibit dictionary characters from being tested at all.
            // remove surrogates so as to not generate higher CJK characters
            fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
            fOtherSet.removeAll(fDictionarySet);

            fSets            = new ArrayList();
            fSets.add(fCRSet);
            fSets.add(fLFSet);
            fSets.add(fNewlineSet);
            fSets.add(fRegionalIndicatorSet);
            fSets.add(fHebrew_LetterSet);
            fSets.add(fALetterSet);
            //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
                                        // from the test data. They are all in the dictionary set,
                                        // which this (old, to be retired) monkey test cannot handle.
            fSets.add(fSingle_QuoteSet);
            fSets.add(fDouble_QuoteSet);
            fSets.add(fMidLetterSet);
            fSets.add(fMidNumLetSet);
            fSets.add(fMidNumSet);
            fSets.add(fNumericSet);
            fSets.add(fFormatSet);
            fSets.add(fExtendSet);
            fSets.add(fExtendNumLetSet);
            // NOTE(review): fRegionalIndicatorSet is added a second time here, so it
            // appears twice in the class list.  Kept as-is because removing it would
            // change the generated test data; confirm whether the extra weight is intended.
            fSets.add(fRegionalIndicatorSet);
            fSets.add(fEBaseSet);
            fSets.add(fEBGSet);
            fSets.add(fEModifierSet);
            fSets.add(fZWJSet);
            fSets.add(fExtendedPictSet);
            fSets.add(fEmojiNRKSet);
            fSets.add(fOtherSet);
        }


        @Override
        List charClasses() {
            return fSets;
        }

        @Override
        void setText(StringBuffer s) {
            fText = s;
        }

        /** True for CR, LF and Newline class characters. */
        private boolean isNewline(int c) {
            return fCRSet.contains(c) || fLFSet.contains(c) || fNewlineSet.contains(c);
        }

        /** True for (ALetter | Hebrew_Letter). */
        private boolean isLetter(int c) {
            return fALetterSet.contains(c) || fHebrew_LetterSet.contains(c);
        }

        /**
         * Reference implementation of the word boundary rules.
         *
         * @param prevPos index of the previous boundary; scanning starts here.
         * @return the index of the next boundary after prevPos, or -1 when
         *         prevPos is already at or past the end of the text.
         */
        @Override
        int next(int prevPos) {
            // Previous break at end of string.  return DONE.
            if (prevPos >= fText.length()) {
                return -1;
            }

            // Indices of the significant code points around the break position
            // being tested.  The candidate break location is before p2.
            int p1, p2, p3;

            // The code points at p1, p2 & p3, plus c0, the significant code point
            // that preceded c1 (its index is not needed).  c3 is -1 at end of text.
            int c0, c1, c2, c3;

            p1 = p2 = p3 = prevPos;
            c3 = UTF16.charAt(fText, prevPos);
            c0 = c1 = c2 = 0;

            // Loop runs once per "significant" character position in the input text.
            for (;;) {
                // Move all of the positions forward in the input string.
                c0 = c1;
                p1 = p2;  c1 = c2;
                p2 = p3;  c2 = c3;

                // Advance p3 by  X(Extend | Format)*   Rule 4
                //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
                do {
                    p3 = moveIndex32(fText, p3, 1);
                    c3 = -1;
                    if (p3 >= fText.length()) {
                        break;
                    }
                    c3 = UTF16.charAt(fText, p3);
                    if (isNewline(c2)) {
                        break;
                    }
                }
                while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));

                if (p1 == p2) {
                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                    continue;
                }
                if (p2 == fText.length()) {
                    // Reached end of string.  Always a break position.
                    break;
                }

                // Rule  (3)   CR x LF
                //     No Extend or Format characters may appear between the CR and LF.
                //     No explicit adjacency check is needed here: the advance loop above
                //     stops after one code point whenever c2 is a newline character, so a
                //     CR in c1 is always immediately followed by c2.
                if (c1 == 0x0D && c2 == 0x0A) {
                    continue;
                }

                // Rule (3a)  Break before and after newlines (including CR and LF)
                if (isNewline(c1)) {
                    break;
                }
                if (isNewline(c2)) {
                    break;
                }

                // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
                //              Not ignoring extend chars, so peek into input text to
                //              get the potential ZWJ, the character immediately preceding c2.
                if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
                    continue;
                }

                // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
                if (isLetter(c1) && isLetter(c2)) {
                    continue;
                }

                // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
                //           c3 may be -1 (end of text), hence setContains() rather than contains().
                if (isLetter(c1) &&
                        (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
                        (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
                    continue;
                }

                // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
                if (isLetter(c0) &&
                        (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
                        isLetter(c2)) {
                    continue;
                }

                // Rule (7a)     Hebrew_Letter x Single_Quote
                if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
                    continue;
                }

                // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
                if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet, c3)) {
                    continue;
                }

                // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
                if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
                    continue;
                }

                // Rule (8)    Numeric x Numeric
                if (fNumericSet.contains(c1) &&
                        fNumericSet.contains(c2)) {
                    continue;
                }

                // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
                if (isLetter(c1) && fNumericSet.contains(c2)) {
                    continue;
                }

                // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
                if (fNumericSet.contains(c1) && isLetter(c2)) {
                    continue;
                }

                // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
                if (fNumericSet.contains(c0) &&
                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
                        fNumericSet.contains(c2)) {
                    continue;
                }

                // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
                if (fNumericSet.contains(c1) &&
                        (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
                        setContains(fNumericSet, c3)) {
                    continue;
                }

                // Rule (13)  Katakana x Katakana
                //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
                //                  all Katakana are handled by the dictionary breaker.
                if (fKatakanaSet.contains(c1) &&
                        fKatakanaSet.contains(c2)) {
                    continue;
                }

                // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
                if ((isLetter(c1) || fNumericSet.contains(c1) ||
                        fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
                        fExtendNumLetSet.contains(c2)) {
                    continue;
                }

                // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
                if (fExtendNumLetSet.contains(c1) &&
                        (isLetter(c2) ||
                         fNumericSet.contains(c2) || fKatakanaSet.contains(c2))) {
                    continue;
                }


                // Rule 14 (E_Base | EBG) x E_Modifier
                if ((fEBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
                    continue;
                }

                // Rule 15 - 17   Group pairs of Regional Indicators
                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
                    break;
                }
                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
                    continue;
                }

                // Rule 999.  Break found here.
                break;
            }

            // The boundary is just before the code point at p2.
            return p2;
        }

    }


    static class RBBILineMonkey extends RBBIMonkeyKind {

        List        fSets;

        // UnicodeSets for each of the Line Breaking character classes.
        // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
        // to verify that they are all accounted for.

        UnicodeSet  fBK;
        UnicodeSet  fCR;
        UnicodeSet  fLF;
        UnicodeSet  fCM;
        UnicodeSet  fNL;
        UnicodeSet  fSG;
        UnicodeSet  fWJ;
        UnicodeSet  fZW;
        UnicodeSet  fGL;
        UnicodeSet  fSP;
        UnicodeSet  fB2;
        UnicodeSet  fBA;
        UnicodeSet  fBB;
        UnicodeSet  fHY;
        UnicodeSet  fCB;
        UnicodeSet  fCL;
        UnicodeSet  fCP;
        UnicodeSet  fEX;
        UnicodeSet  fIN;
        UnicodeSet  fNS;
        UnicodeSet  fOP;
        UnicodeSet  fQU;
        UnicodeSet  fIS;
        UnicodeSet  fNU;
        UnicodeSet  fPO;
        UnicodeSet  fPR;
        UnicodeSet  fSY;
        UnicodeSet  fAI;
        UnicodeSet  fAL;
        UnicodeSet  fCJ;
        UnicodeSet  fH2;
        UnicodeSet  fH3;
        UnicodeSet  fHL;
        UnicodeSet  fID;
        UnicodeSet  fJL;
        UnicodeSet  fJV;
        UnicodeSet  fJT;
        UnicodeSet  fRI;
        UnicodeSet  fXX;
        UnicodeSet  fEB;
        UnicodeSet  fEM;
        UnicodeSet  fZWJ;
        UnicodeSet  fExtendedPict;
        UnicodeSet  fEmojiNRK;

        StringBuffer  fText;
        int           fOrigPositions;



        RBBILineMonkey()
        {
            fCharProperty  = UProperty.LINE_BREAK;
            fSets          = new ArrayList();

            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
            fBA    = new
UnicodeSet("[\\p{Line_break=BA}]"); 727 fBB = new UnicodeSet("[\\p{Line_break=BB}]"); 728 fHY = new UnicodeSet("[\\p{Line_break=HY}]"); 729 fCB = new UnicodeSet("[\\p{Line_break=CB}]"); 730 fCL = new UnicodeSet("[\\p{Line_break=CL}]"); 731 fCP = new UnicodeSet("[\\p{Line_break=CP}]"); 732 fEX = new UnicodeSet("[\\p{Line_break=EX}]"); 733 fIN = new UnicodeSet("[\\p{Line_break=IN}]"); 734 fNS = new UnicodeSet("[\\p{Line_break=NS}]"); 735 fOP = new UnicodeSet("[\\p{Line_break=OP}]"); 736 fQU = new UnicodeSet("[\\p{Line_break=QU}]"); 737 fIS = new UnicodeSet("[\\p{Line_break=IS}]"); 738 fNU = new UnicodeSet("[\\p{Line_break=NU}]"); 739 fPO = new UnicodeSet("[\\p{Line_break=PO}]"); 740 fPR = new UnicodeSet("[\\p{Line_break=PR}]"); 741 fSY = new UnicodeSet("[\\p{Line_break=SY}]"); 742 fAI = new UnicodeSet("[\\p{Line_break=AI}]"); 743 fAL = new UnicodeSet("[\\p{Line_break=AL}]"); 744 fCJ = new UnicodeSet("[\\p{Line_break=CJ}]"); 745 fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); 746 fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); 747 fHL = new UnicodeSet("[\\p{Line_break=HL}]"); 748 fID = new UnicodeSet("[\\p{Line_break=ID}]"); 749 fJL = new UnicodeSet("[\\p{Line_break=JL}]"); 750 fJV = new UnicodeSet("[\\p{Line_break=JV}]"); 751 fJT = new UnicodeSet("[\\p{Line_break=JT}]"); 752 fRI = new UnicodeSet("[\\p{Line_break=RI}]"); 753 fXX = new UnicodeSet("[\\p{Line_break=XX}]"); 754 fEB = new UnicodeSet("[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"); 755 fEM = new UnicodeSet("[\\p{Line_break=EM}]"); 756 fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]"); 757 fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]"); 758 fExtendedPict = new UnicodeSet(gExtended_Pict); 759 760 761 // Remove dictionary characters. 762 // The monkey test reference implementation of line break does not replicate the dictionary behavior, 763 // so dictionary characters are omitted from the monkey test data. 
764 @SuppressWarnings("unused") 765 UnicodeSet dictionarySet = new UnicodeSet( 766 "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); 767 768 fAL.addAll(fXX); // Default behavior for XX is identical to AL 769 fAL.addAll(fAI); // Default behavior for AI is identical to AL 770 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL 771 772 fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. 773 fCM.addAll(fZWJ); // ZWJ behaves as a CM. 774 775 fSets.add(fBK); 776 fSets.add(fCR); 777 fSets.add(fLF); 778 fSets.add(fCM); 779 fSets.add(fNL); 780 fSets.add(fWJ); 781 fSets.add(fZW); 782 fSets.add(fGL); 783 fSets.add(fSP); 784 fSets.add(fB2); 785 fSets.add(fBA); 786 fSets.add(fBB); 787 fSets.add(fHY); 788 fSets.add(fCB); 789 fSets.add(fCL); 790 fSets.add(fCP); 791 fSets.add(fEX); 792 fSets.add(fIN); 793 fSets.add(fJL); 794 fSets.add(fJT); 795 fSets.add(fJV); 796 fSets.add(fNS); 797 fSets.add(fOP); 798 fSets.add(fQU); 799 fSets.add(fIS); 800 fSets.add(fNU); 801 fSets.add(fPO); 802 fSets.add(fPR); 803 fSets.add(fSY); 804 fSets.add(fAI); 805 fSets.add(fAL); 806 fSets.add(fH2); 807 fSets.add(fH3); 808 fSets.add(fHL); 809 fSets.add(fID); 810 fSets.add(fWJ); 811 fSets.add(fRI); 812 fSets.add(fSG); 813 fSets.add(fEB); 814 fSets.add(fEM); 815 fSets.add(fZWJ); 816 fSets.add(fExtendedPict); 817 fSets.add(fEmojiNRK); 818 } 819 820 @Override 821 void setText(StringBuffer s) { 822 fText = s; 823 } 824 825 826 827 828 @Override 829 int next(int startPos) { 830 int pos; // Index of the char following a potential break position 831 int thisChar; // Character at above position "pos" 832 833 int prevPos; // Index of the char preceding a potential break position 834 int prevChar; // Character at above position. Note that prevChar 835 // and thisChar may not be adjacent because combining 836 // characters between them will be ignored. 
837 int prevCharX2; // Character before prevChar, more contex for LB 21a 838 839 int nextPos; // Index of the next character following pos. 840 // Usually skips over combining marks. 841 int tPos; // temp value. 842 int matchVals[] = null; // Number Expression Match Results 843 844 845 if (startPos >= fText.length()) { 846 return -1; 847 } 848 849 850 // Initial values for loop. Loop will run the first time without finding breaks, 851 // while the invalid values shift out and the "this" and 852 // "prev" positions are filled in with good values. 853 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 854 thisChar = prevChar = prevCharX2 = 0; 855 nextPos = startPos; 856 857 858 // Loop runs once per position in the test text, until a break position 859 // is found. In each iteration, we are testing for a possible break 860 // just preceding the character at index "pos". The character preceding 861 // this char is at postion "prevPos"; because of combining sequences, 862 // "prevPos" can be arbitrarily far before "pos". 863 for (;;) { 864 // Advance to the next position to be tested. 865 prevCharX2 = prevChar; 866 prevPos = pos; 867 prevChar = thisChar; 868 pos = nextPos; 869 nextPos = moveIndex32(fText, pos, 1); 870 871 // Rule LB2 - Break at end of text. 872 if (pos >= fText.length()) { 873 break; 874 } 875 876 // Rule LB 9 - adjust for combining sequences. 877 // We do this rule out-of-order because the adjustment does 878 // not effect the way that rules LB 3 through LB 6 match, 879 // and doing it here rather than after LB 6 is substantially 880 // simpler when combining sequences do occur. 881 882 883 // LB 9 Keep combining sequences together. 884 // advance over any CM class chars at "pos", 885 // result is "nextPos" for the following loop iteration. 
886 thisChar = UTF16.charAt(fText, pos); 887 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || 888 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { 889 for (;;) { 890 if (nextPos == fText.length()) { 891 break; 892 } 893 int nextChar = UTF16.charAt(fText, nextPos); 894 if (!fCM.contains(nextChar)) { 895 break; 896 } 897 nextPos = moveIndex32(fText, nextPos, 1); 898 } 899 } 900 901 // LB 9 Treat X CM* as if it were X 902 // No explicit action required. 903 904 // LB 10 Treat any remaining combining mark as AL 905 if (fCM.contains(thisChar)) { 906 thisChar = 'A'; 907 } 908 909 910 // If the loop is still warming up - if we haven't shifted the initial 911 // -1 positions out of prevPos yet - loop back to advance the 912 // position in the input without any further looking for breaks. 913 if (prevPos == -1) { 914 continue; 915 } 916 917 // LB 4 Always break after hard line breaks, 918 if (fBK.contains(prevChar)) { 919 break; 920 } 921 922 // LB 5 Break after CR, LF, NL, but not inside CR LF 923 if (fCR.contains(prevChar) && fLF.contains(thisChar)) { 924 continue; 925 } 926 if (fCR.contains(prevChar) || 927 fLF.contains(prevChar) || 928 fNL.contains(prevChar)) { 929 break; 930 } 931 932 // LB 6 Don't break before hard line breaks 933 if (fBK.contains(thisChar) || fCR.contains(thisChar) || 934 fLF.contains(thisChar) || fNL.contains(thisChar) ) { 935 continue; 936 } 937 938 939 // LB 7 Don't break before spaces or zero-width space. 940 if (fSP.contains(thisChar)) { 941 continue; 942 } 943 944 if (fZW.contains(thisChar)) { 945 continue; 946 } 947 948 // LB 8 Break after zero width space 949 if (fZW.contains(prevChar)) { 950 break; 951 } 952 953 // LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji) 954 // The monkey test's way of ignoring combining characters doesn't work 955 // for this rule. ZWJ is also a CM. Need to get the actual character 956 // preceding "thisChar", not ignoring combining marks, possibly ZWJ. 
957 { 958 int prevC = fText.codePointBefore(pos); 959 if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) { 960 continue; 961 } 962 } 963 964 // LB 9, 10 Already done, at top of loop. 965 // 966 967 968 // LB 11 969 // x WJ 970 // WJ x 971 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { 972 continue; 973 } 974 975 976 // LB 12 977 // GL x 978 if (fGL.contains(prevChar)) { 979 continue; 980 } 981 982 // LB 12a 983 // [^SP BA HY] x GL 984 if (!(fSP.contains(prevChar) || 985 fBA.contains(prevChar) || 986 fHY.contains(prevChar) ) && fGL.contains(thisChar)) { 987 continue; 988 } 989 990 991 992 // LB 13 Don't break before closings. 993 // NU x CL, NU x CP and NU x IS are not matched here so that they will 994 // fall into LB 17 and the more general number regular expression. 995 // 996 if (!fNU.contains(prevChar) && fCL.contains(thisChar) || 997 !fNU.contains(prevChar) && fCP.contains(thisChar) || 998 fEX.contains(thisChar) || 999 !fNU.contains(prevChar) && fIS.contains(thisChar) || 1000 !fNU.contains(prevChar) && fSY.contains(thisChar)) { 1001 continue; 1002 } 1003 1004 // LB 14 Don't break after OP SP* 1005 // Scan backwards, checking for this sequence. 
1006 // The OP char could include combining marks, so we actually check for 1007 // OP CM* SP* x 1008 tPos = prevPos; 1009 if (fSP.contains(prevChar)) { 1010 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1011 tPos=moveIndex32(fText, tPos, -1); 1012 } 1013 } 1014 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1015 tPos=moveIndex32(fText, tPos, -1); 1016 } 1017 if (fOP.contains(UTF16.charAt(fText, tPos))) { 1018 continue; 1019 } 1020 1021 // LB 15 Do not break within "[ 1022 // QU CM* SP* x OP 1023 if (fOP.contains(thisChar)) { 1024 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 1025 tPos = prevPos; 1026 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1027 tPos = moveIndex32(fText, tPos, -1); 1028 } 1029 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1030 tPos = moveIndex32(fText, tPos, -1); 1031 } 1032 if (fQU.contains(UTF16.charAt(fText, tPos))) { 1033 continue; 1034 } 1035 } 1036 1037 // LB 16 (CL | CP) SP* x NS 1038 if (fNS.contains(thisChar)) { 1039 tPos = prevPos; 1040 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1041 tPos = moveIndex32(fText, tPos, -1); 1042 } 1043 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1044 tPos = moveIndex32(fText, tPos, -1); 1045 } 1046 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { 1047 continue; 1048 } 1049 } 1050 1051 1052 // LB 17 B2 SP* x B2 1053 if (fB2.contains(thisChar)) { 1054 tPos = prevPos; 1055 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1056 tPos = moveIndex32(fText, tPos, -1); 1057 } 1058 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1059 tPos = moveIndex32(fText, tPos, -1); 1060 } 1061 if (fB2.contains(UTF16.charAt(fText, tPos))) { 1062 continue; 1063 } 1064 } 1065 1066 // LB 18 break after space 1067 if (fSP.contains(prevChar)) { 1068 break; 1069 } 1070 1071 // LB 19 1072 // x QU 1073 // QU x 1074 if (fQU.contains(thisChar) || 
fQU.contains(prevChar)) { 1075 continue; 1076 } 1077 1078 // LB 20 Break around a CB 1079 if (fCB.contains(thisChar) || fCB.contains(prevChar)) { 1080 break; 1081 } 1082 1083 // LB 21 1084 if (fBA.contains(thisChar) || 1085 fHY.contains(thisChar) || 1086 fNS.contains(thisChar) || 1087 fBB.contains(prevChar) ) { 1088 continue; 1089 } 1090 1091 // LB 21a, HL (HY | BA) x 1092 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { 1093 continue; 1094 } 1095 1096 // LB 21b, SY x HL 1097 if (fSY.contains(prevChar) && fHL.contains(thisChar)) { 1098 continue; 1099 } 1100 1101 // LB 22 1102 if (fAL.contains(prevChar) && fIN.contains(thisChar) || 1103 fEX.contains(prevChar) && fIN.contains(thisChar) || 1104 fHL.contains(prevChar) && fIN.contains(thisChar) || 1105 (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) || 1106 fIN.contains(prevChar) && fIN.contains(thisChar) || 1107 fNU.contains(prevChar) && fIN.contains(thisChar) ) { 1108 continue; 1109 } 1110 1111 // LB 23 (AL | HL) x NU 1112 // NU x (AL | HL) 1113 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { 1114 continue; 1115 } 1116 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1117 continue; 1118 } 1119 1120 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 1121 // PR x (ID | EB | EM) 1122 // (ID | EB | EM) x PO 1123 if (fPR.contains(prevChar) && 1124 (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { 1125 continue; 1126 } 1127 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && 1128 fPO.contains(thisChar)) { 1129 continue; 1130 } 1131 1132 // LB 24 Do not break between prefix and letters or ideographs. 
1133 // (PR | PO) x (AL | HL) 1134 // (AL | HL) x (PR | PO) 1135 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && 1136 (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1137 continue; 1138 } 1139 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && 1140 (fPR.contains(thisChar) || fPO.contains(thisChar))) { 1141 continue; 1142 } 1143 1144 1145 // LB 25 Numbers 1146 matchVals = LBNumberCheck(fText, prevPos, matchVals); 1147 if (matchVals[0] != -1) { 1148 // Matched a number. But could have been just a single digit, which would 1149 // not represent a "no break here" between prevChar and thisChar 1150 int numEndIdx = matchVals[1]; // idx of first char following num 1151 if (numEndIdx > pos) { 1152 // Number match includes at least the two chars being checked 1153 if (numEndIdx > nextPos) { 1154 // Number match includes additional chars. Update pos and nextPos 1155 // so that next loop iteration will continue at the end of the number, 1156 // checking for breaks between last char in number & whatever follows. 
1157 nextPos = numEndIdx; 1158 pos = numEndIdx; 1159 do { 1160 pos = moveIndex32(fText, pos, -1); 1161 thisChar = UTF16.charAt(fText, pos); 1162 } 1163 while (fCM.contains(thisChar)); 1164 } 1165 continue; 1166 } 1167 } 1168 1169 1170 // LB 26 Do not break Korean Syllables 1171 if (fJL.contains(prevChar) && (fJL.contains(thisChar) || 1172 fJV.contains(thisChar) || 1173 fH2.contains(thisChar) || 1174 fH3.contains(thisChar))) { 1175 continue; 1176 } 1177 1178 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && 1179 (fJV.contains(thisChar) || fJT.contains(thisChar))) { 1180 continue; 1181 } 1182 1183 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && 1184 fJT.contains(thisChar)) { 1185 continue; 1186 } 1187 1188 // LB 27 Treat a Korean Syllable Block the same as ID 1189 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1190 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1191 fIN.contains(thisChar)) { 1192 continue; 1193 } 1194 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1195 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1196 fPO.contains(thisChar)) { 1197 continue; 1198 } 1199 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || 1200 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { 1201 continue; 1202 } 1203 1204 1205 1206 // LB 28 Do not break between alphabetics 1207 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1208 continue; 1209 } 1210 1211 // LB 29 Do not break between numeric punctuation and alphabetics 1212 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1213 continue; 1214 } 1215 1216 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
1217 // (AL | NU) x OP 1218 // CP x (AL | NU) 1219 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) { 1220 continue; 1221 } 1222 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) { 1223 continue; 1224 } 1225 1226 // LB 30a Break between pairs of Regional Indicators. 1227 // RI RI <break> RI 1228 // RI x RI 1229 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { 1230 break; 1231 } 1232 if (fRI.contains(prevChar) && fRI.contains(thisChar)) { 1233 continue; 1234 } 1235 1236 // LB30b Emoji Base x Emoji Modifier 1237 if (fEB.contains(prevChar) && fEM.contains(thisChar)) { 1238 continue; 1239 } 1240 // LB 31 Break everywhere else 1241 break; 1242 } 1243 1244 return pos; 1245 } 1246 1247 1248 1249 // Match the following regular expression in the input text. 1250 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? (PR | PO) CM*)? 1251 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states) 1252 // retVals array [0] index of the start of the match, or -1 if no match 1253 // [1] index of first char following the match. 1254 // Can not use Java regex because need supplementary character support, 1255 // and because Unicode char properties version must be the same as in 1256 // the version of ICU being tested. 1257 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { 1258 if (retVals == null) { 1259 retVals = new int[2]; 1260 } 1261 retVals[0] = -1; // Indicates no match. 
1262 int matchState = 0; 1263 int idx = startIdx; 1264 1265 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){ 1266 int c = UTF16.charAt(s, idx); 1267 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); 1268 switch (matchState) { 1269 case 0: 1270 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC || 1271 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1272 matchState = 1; 1273 break; 1274 } 1275 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1276 matchState = 4; 1277 break; 1278 } 1279 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1280 matchState = 4; 1281 break; 1282 } 1283 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1284 matchState = 7; 1285 break; 1286 } 1287 break matchLoop; /* No Match */ 1288 1289 case 1: 1290 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1291 matchState = 1; 1292 break; 1293 } 1294 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1295 matchState = 4; 1296 break; 1297 } 1298 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1299 matchState = 4; 1300 break; 1301 } 1302 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1303 matchState = 7; 1304 break; 1305 } 1306 break matchLoop; /* No Match */ 1307 1308 1309 case 4: 1310 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1311 matchState = 4; 1312 break; 1313 } 1314 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1315 matchState = 7; 1316 break; 1317 } 1318 break matchLoop; /* No Match */ 1319 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)? 
1320 // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states) 1321 1322 case 7: 1323 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1324 matchState = 7; 1325 break; 1326 } 1327 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1328 matchState = 7; 1329 break; 1330 } 1331 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1332 matchState = 7; 1333 break; 1334 } 1335 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) { 1336 matchState = 7; 1337 break; 1338 } 1339 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) { 1340 matchState = 9; 1341 break; 1342 } 1343 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) { 1344 matchState = 9; 1345 break; 1346 } 1347 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1348 matchState = 11; 1349 break; 1350 } 1351 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1352 matchState = 11; 1353 break; 1354 } 1355 1356 break matchLoop; // Match Complete. 1357 case 9: 1358 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1359 matchState = 9; 1360 break; 1361 } 1362 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1363 matchState = 11; 1364 break; 1365 } 1366 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1367 matchState = 11; 1368 break; 1369 } 1370 break matchLoop; // Match Complete. 1371 case 11: 1372 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1373 matchState = 11; 1374 break; 1375 } 1376 break matchLoop; // Match Complete. 
1377 } 1378 } 1379 if (matchState > 4) { 1380 retVals[0] = startIdx; 1381 retVals[1] = idx; 1382 } 1383 return retVals; 1384 } 1385 1386 1387 @Override 1388 List charClasses() { 1389 return fSets; 1390 } 1391 1392 1393 1394 } 1395 1396 1397 /** 1398 * 1399 * Sentence Monkey Test Class 1400 * 1401 * 1402 * 1403 */ 1404 static class RBBISentenceMonkey extends RBBIMonkeyKind { 1405 List fSets; 1406 StringBuffer fText; 1407 1408 UnicodeSet fSepSet; 1409 UnicodeSet fFormatSet; 1410 UnicodeSet fSpSet; 1411 UnicodeSet fLowerSet; 1412 UnicodeSet fUpperSet; 1413 UnicodeSet fOLetterSet; 1414 UnicodeSet fNumericSet; 1415 UnicodeSet fATermSet; 1416 UnicodeSet fSContinueSet; 1417 UnicodeSet fSTermSet; 1418 UnicodeSet fCloseSet; 1419 UnicodeSet fOtherSet; 1420 UnicodeSet fExtendSet; 1421 1422 1423 1424 RBBISentenceMonkey() { 1425 fCharProperty = UProperty.SENTENCE_BREAK; 1426 1427 fSets = new ArrayList(); 1428 1429 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 1430 // set and made into character classes of their own. For the monkey impl, 1431 // they remain in SEP, since Sep always appears with CR and LF in the rules. 
1432 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"); 1433 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]"); 1434 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]"); 1435 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]"); 1436 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]"); 1437 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]"); 1438 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]"); 1439 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]"); 1440 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]"); 1441 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]"); 1442 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]"); 1443 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]"); 1444 fOtherSet = new UnicodeSet(); 1445 1446 1447 fOtherSet.complement(); 1448 fOtherSet.removeAll(fSepSet); 1449 fOtherSet.removeAll(fFormatSet); 1450 fOtherSet.removeAll(fSpSet); 1451 fOtherSet.removeAll(fLowerSet); 1452 fOtherSet.removeAll(fUpperSet); 1453 fOtherSet.removeAll(fOLetterSet); 1454 fOtherSet.removeAll(fNumericSet); 1455 fOtherSet.removeAll(fATermSet); 1456 fOtherSet.removeAll(fSContinueSet); 1457 fOtherSet.removeAll(fSTermSet); 1458 fOtherSet.removeAll(fCloseSet); 1459 fOtherSet.removeAll(fExtendSet); 1460 1461 fSets.add(fSepSet); 1462 fSets.add(fFormatSet); 1463 1464 fSets.add(fSpSet); 1465 fSets.add(fLowerSet); 1466 fSets.add(fUpperSet); 1467 fSets.add(fOLetterSet); 1468 fSets.add(fNumericSet); 1469 fSets.add(fATermSet); 1470 fSets.add(fSContinueSet); 1471 fSets.add(fSTermSet); 1472 fSets.add(fCloseSet); 1473 fSets.add(fOtherSet); 1474 fSets.add(fExtendSet); 1475 } 1476 1477 1478 @Override 1479 List charClasses() { 1480 return fSets; 1481 } 1482 1483 @Override 1484 void setText(StringBuffer s) { 1485 fText = s; 1486 } 1487 1488 1489 // moveBack() Find the "significant" code point preceding the index i. 
1490 // Skips over ($Extend | $Format)* 1491 // 1492 private int moveBack(int i) { 1493 1494 if (i <= 0) { 1495 return -1; 1496 } 1497 1498 int c; 1499 int j = i; 1500 do { 1501 j = moveIndex32(fText, j, -1); 1502 c = UTF16.charAt(fText, j); 1503 } 1504 while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c))); 1505 return j; 1506 } 1507 1508 1509 int moveForward(int i) { 1510 if (i>=fText.length()) { 1511 return fText.length(); 1512 } 1513 int c; 1514 int j = i; 1515 do { 1516 j = moveIndex32(fText, j, 1); 1517 c = cAt(j); 1518 } 1519 while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c))); 1520 return j; 1521 1522 } 1523 1524 int cAt(int pos) { 1525 if (pos<0 || pos>=fText.length()) { 1526 return -1; 1527 } 1528 return UTF16.charAt(fText, pos); 1529 } 1530 1531 @Override 1532 int next(int prevPos) { 1533 int /*p0,*/ p1, p2, p3; // Indices of the significant code points around the 1534 // break position being tested. The candidate break 1535 // location is before p2. 1536 int breakPos = -1; 1537 1538 int c0, c1, c2, c3; // The code points at p0, p1, p2 & p3. 1539 int c; 1540 1541 // Prev break at end of string. return DONE. 1542 if (prevPos >= fText.length()) { 1543 return -1; 1544 } 1545 /*p0 =*/ p1 = p2 = p3 = prevPos; 1546 c3 = UTF16.charAt(fText, prevPos); 1547 c0 = c1 = c2 = 0; 1548 1549 // Loop runs once per "significant" character position in the input text. 1550 for (;;) { 1551 // Move all of the positions forward in the input string. 1552 /*p0 = p1;*/ c0 = c1; 1553 p1 = p2; c1 = c2; 1554 p2 = p3; c2 = c3; 1555 1556 // Advancd p3 by X(Extend | Format)* Rule 4 1557 p3 = moveForward(p3); 1558 c3 = cAt(p3); 1559 1560 // Rule (3) CR x LF 1561 if (c1==0x0d && c2==0x0a && p2==(p1+1)) { 1562 continue; 1563 } 1564 1565 // Rule (4) Sep <break> 1566 if (fSepSet.contains(c1)) { 1567 p2 = p1+1; // Separators don't combine with Extend or Format 1568 break; 1569 } 1570 1571 if (p2 >= fText.length()) { 1572 // Reached end of string. 
Always a break position. 1573 break; 1574 } 1575 1576 if (p2 == prevPos) { 1577 // Still warming up the loop. (won't work with zero length strings, but we don't care) 1578 continue; 1579 } 1580 1581 // Rule (6). ATerm x Numeric 1582 if (fATermSet.contains(c1) && fNumericSet.contains(c2)) { 1583 continue; 1584 } 1585 1586 // Rule (7). (Upper | Lower) ATerm x Uppper 1587 if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) && 1588 fATermSet.contains(c1) && fUpperSet.contains(c2)) { 1589 continue; 1590 } 1591 1592 // Rule (8) ATerm Close* Sp* x (not (OLettter | Upper | Lower | Sep))* Lower 1593 // Note: Sterm | ATerm are added to the negated part of the expression by a 1594 // note to the Unicode 5.0 documents. 1595 int p8 = p1; 1596 while (p8>0 && fSpSet.contains(cAt(p8))) { 1597 p8 = moveBack(p8); 1598 } 1599 while (p8>0 && fCloseSet.contains(cAt(p8))) { 1600 p8 = moveBack(p8); 1601 } 1602 if (fATermSet.contains(cAt(p8))) { 1603 p8=p2; 1604 for (;;) { 1605 c = cAt(p8); 1606 if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) || 1607 fLowerSet.contains(c) || fSepSet.contains(c) || 1608 fATermSet.contains(c) || fSTermSet.contains(c)) 1609 { 1610 break; 1611 } 1612 p8 = moveForward(p8); 1613 } 1614 if (p8<fText.length() && fLowerSet.contains(cAt(p8))) { 1615 continue; 1616 } 1617 } 1618 1619 // Rule 8a (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm) 1620 if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) { 1621 p8 = p1; 1622 while (setContains(fSpSet, cAt(p8))) { 1623 p8 = moveBack(p8); 1624 } 1625 while (setContains(fCloseSet, cAt(p8))) { 1626 p8 = moveBack(p8); 1627 } 1628 c = cAt(p8); 1629 if (setContains(fSTermSet, c) || setContains(fATermSet, c)) { 1630 continue; 1631 } 1632 } 1633 1634 1635 // Rule (9) (STerm | ATerm) Close* x (Close | Sp | Sep | CR | LF) 1636 int p9 = p1; 1637 while (p9>0 && fCloseSet.contains(cAt(p9))) { 1638 p9 = moveBack(p9); 1639 } 1640 c = cAt(p9); 1641 if ((fSTermSet.contains(c) || 
fATermSet.contains(c))) { 1642 if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) { 1643 continue; 1644 } 1645 } 1646 1647 // Rule (10) (Sterm | ATerm) Close* Sp* x (Sp | Sep | CR | LF) 1648 int p10 = p1; 1649 while (p10>0 && fSpSet.contains(cAt(p10))) { 1650 p10 = moveBack(p10); 1651 } 1652 while (p10>0 && fCloseSet.contains(cAt(p10))) { 1653 p10 = moveBack(p10); 1654 } 1655 if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) { 1656 if (fSpSet.contains(c2) || fSepSet.contains(c2)) { 1657 continue; 1658 } 1659 } 1660 1661 // Rule (11) (STerm | ATerm) Close* Sp* <break> 1662 int p11 = p1; 1663 if (p11>0 && fSepSet.contains(cAt(p11))) { 1664 p11 = moveBack(p11); 1665 } 1666 while (p11>0 && fSpSet.contains(cAt(p11))) { 1667 p11 = moveBack(p11); 1668 } 1669 while (p11>0 && fCloseSet.contains(cAt(p11))) { 1670 p11 = moveBack(p11); 1671 } 1672 if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) { 1673 break; 1674 } 1675 1676 // Rule (12) Any x Any 1677 continue; 1678 } 1679 breakPos = p2; 1680 return breakPos; 1681 } 1682 1683 1684 1685 } 1686 1687 1688 /** 1689 * Move an index into a string by n code points. 1690 * Similar to UTF16.moveCodePointOffset, but without the exceptions, which were 1691 * complicating usage. 1692 * @param s a Text string 1693 * @param pos The starting code unit index into the text string 1694 * @param amt The amount to adjust the string by. 1695 * @return The adjusted code unit index, pinned to the string's length, or 1696 * unchanged if input index was outside of the string. 
1697 */ 1698 static int moveIndex32(StringBuffer s, int pos, int amt) { 1699 int i; 1700 char c; 1701 if (amt>0) { 1702 for (i=0; i<amt; i++) { 1703 if (pos >= s.length()) { 1704 return s.length(); 1705 } 1706 c = s.charAt(pos); 1707 pos++; 1708 if (UTF16.isLeadSurrogate(c) && pos < s.length()) { 1709 c = s.charAt(pos); 1710 if (UTF16.isTrailSurrogate(c)) { 1711 pos++; 1712 } 1713 } 1714 } 1715 } else { 1716 for (i=0; i>amt; i--) { 1717 if (pos <= 0) { 1718 return 0; 1719 } 1720 pos--; 1721 c = s.charAt(pos); 1722 if (UTF16.isTrailSurrogate(c) && pos >= 0) { 1723 c = s.charAt(pos); 1724 if (UTF16.isLeadSurrogate(c)) { 1725 pos--; 1726 } 1727 } 1728 } 1729 } 1730 return pos; 1731 } 1732 1733 /** 1734 * No-exceptions form of UnicodeSet.contains(c). 1735 * Simplifies loops that terminate with an end-of-input character value. 1736 * @param s A unicode set 1737 * @param c A code point value 1738 * @return true if the set contains c. 1739 */ 1740 static boolean setContains(UnicodeSet s, int c) { 1741 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) { 1742 return false; 1743 } 1744 return s.contains(c); 1745 } 1746 1747 1748 /** 1749 * return the index of the next code point in the input text. 1750 * @param i the preceding index 1751 */ 1752 static int nextCP(StringBuffer s, int i) { 1753 if (i == -1) { 1754 // End of Input indication. Continue to return end value. 1755 return -1; 1756 } 1757 int retVal = i + 1; 1758 if (retVal > s.length()) { 1759 return -1; 1760 } 1761 int c = UTF16.charAt(s, i); 1762 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) { 1763 retVal++; 1764 } 1765 return retVal; 1766 } 1767 1768 1769 /** 1770 * random number generator. Not using Java's built-in Randoms for two reasons: 1771 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. 1772 * 2. We need to get and restore the seed from values occurring in the middle 1773 * of a long sequence, to more easily reproduce failing cases. 
1774 */ 1775 private static int m_seed = 1; 1776 private static int m_rand() 1777 { 1778 m_seed = m_seed * 1103515245 + 12345; 1779 return (m_seed >>> 16) % 32768; 1780 } 1781 1782 // Helper function for formatting error output. 1783 // Append a string into a fixed-size field in a StringBuffer. 1784 // Blank-pad the string if it is shorter than the field. 1785 // Truncate the source string if it is too long. 1786 // 1787 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) { 1788 int appendLen = src.length(); 1789 if (appendLen >= fieldLen) { 1790 dest.append(src.substring(0, fieldLen)); 1791 } else { 1792 dest.append(src); 1793 while (appendLen < fieldLen) { 1794 dest.append(' '); 1795 appendLen++; 1796 } 1797 } 1798 } 1799 1800 // Helper function for formatting error output. 1801 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format 1802 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) { 1803 String hexChars = "0123456789abcdef"; 1804 if (c < 0x10000) { 1805 dest.append("\\u"); 1806 for (int bn=12; bn>=0; bn-=4) { 1807 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1808 } 1809 appendToBuf(dest, " ", fieldLen-6); 1810 } else { 1811 dest.append("\\U"); 1812 for (int bn=28; bn>=0; bn-=4) { 1813 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1814 } 1815 appendToBuf(dest, " ", fieldLen-10); 1816 1817 } 1818 } 1819 1820 /** 1821 * Run a RBBI monkey test. Common routine, for all break iterator types. 1822 * Parameters: 1823 * bi - the break iterator to use 1824 * mk - MonkeyKind, abstraction for obtaining expected results 1825 * name - Name of test (char, word, etc.) 
for use in error messages 1826 * seed - Seed for starting random number generator (parameter from user) 1827 * numIterations 1828 */ 1829 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { 1830 int TESTSTRINGLEN = 500; 1831 StringBuffer testText = new StringBuffer(); 1832 int numCharClasses; 1833 List chClasses; 1834 int[] expected = new int[TESTSTRINGLEN*2 + 1]; 1835 int expectedCount = 0; 1836 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1837 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1838 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1839 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1840 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1841 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1842 int i; 1843 int loopCount = 0; 1844 boolean printTestData = false; 1845 boolean printBreaksFromBI = false; 1846 1847 m_seed = seed; 1848 1849 numCharClasses = mk.charClasses().size(); 1850 chClasses = mk.charClasses(); 1851 1852 // Verify that the character classes all have at least one member. 1853 for (i=0; i<numCharClasses; i++) { 1854 UnicodeSet s = (UnicodeSet)chClasses.get(i); 1855 if (s == null || s.size() == 0) { 1856 errln("Character Class " + i + " is null or of zero size."); 1857 return; 1858 } 1859 } 1860 1861 //-------------------------------------------------------------------------------------------- 1862 // 1863 // Debugging settings. Comment out everything in the following block for normal operation 1864 // 1865 //-------------------------------------------------------------------------------------------- 1866 // numIterations = -1; 1867 // numIterations = 10000; // Same as exhaustive. 
1868 // RuleBasedBreakIterator_New.fTrace = true; 1869 // m_seed = 859056465; 1870 // TESTSTRINGLEN = 50; 1871 // printTestData = true; 1872 // printBreaksFromBI = true; 1873 // ((RuleBasedBreakIterator_New)bi).dump(); 1874 1875 //-------------------------------------------------------------------------------------------- 1876 // 1877 // End of Debugging settings. 1878 // 1879 //-------------------------------------------------------------------------------------------- 1880 1881 int dotsOnLine = 0; 1882 while (loopCount < numIterations || numIterations == -1) { 1883 if (numIterations == -1 && loopCount % 10 == 0) { 1884 // If test is running in an infinite loop, display a periodic tic so 1885 // we can tell that it is making progress. 1886 System.out.print("."); 1887 if (dotsOnLine++ >= 80){ 1888 System.out.println(); 1889 dotsOnLine = 0; 1890 } 1891 } 1892 // Save current random number seed, so that we can recreate the random numbers 1893 // for this loop iteration in event of an error. 1894 seed = m_seed; 1895 1896 testText.setLength(0); 1897 // Populate a test string with data. 1898 if (printTestData) { 1899 System.out.println("Test Data string ..."); 1900 } 1901 for (i=0; i<TESTSTRINGLEN; i++) { 1902 int aClassNum = m_rand() % numCharClasses; 1903 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); 1904 int charIdx = m_rand() % classSet.size(); 1905 int c = classSet.charAt(charIdx); 1906 if (c < 0) { // TODO: deal with sets containing strings. 
1907 errln("c < 0"); 1908 } 1909 UTF16.appendCodePoint(testText, c); 1910 if (printTestData) { 1911 System.out.print(Integer.toHexString(c) + " "); 1912 } 1913 } 1914 if (printTestData) { 1915 System.out.println(); 1916 } 1917 1918 Arrays.fill(expected, 0); 1919 Arrays.fill(expectedBreaks, false); 1920 Arrays.fill(forwardBreaks, false); 1921 Arrays.fill(reverseBreaks, false); 1922 Arrays.fill(isBoundaryBreaks, false); 1923 Arrays.fill(followingBreaks, false); 1924 Arrays.fill(precedingBreaks, false); 1925 1926 // Calculate the expected results for this test string. 1927 mk.setText(testText); 1928 expectedCount = 0; 1929 expectedBreaks[0] = true; 1930 expected[expectedCount ++] = 0; 1931 int breakPos = 0; 1932 int lastBreakPos = -1; 1933 for (;;) { 1934 lastBreakPos = breakPos; 1935 breakPos = mk.next(breakPos); 1936 if (breakPos == -1) { 1937 break; 1938 } 1939 if (breakPos > testText.length()) { 1940 errln("breakPos > testText.length()"); 1941 } 1942 if (lastBreakPos >= breakPos) { 1943 errln("Next() not increasing."); 1944 // break; 1945 } 1946 expectedBreaks[breakPos] = true; 1947 expected[expectedCount ++] = breakPos; 1948 } 1949 1950 // Find the break positions using forward iteration 1951 if (printBreaksFromBI) { 1952 System.out.println("Breaks from BI..."); 1953 } 1954 bi.setText(testText.toString()); 1955 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { 1956 if (i < 0 || i > testText.length()) { 1957 errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); 1958 break; 1959 } 1960 if (printBreaksFromBI) { 1961 System.out.print(Integer.toHexString(i) + " "); 1962 } 1963 forwardBreaks[i] = true; 1964 } 1965 if (printBreaksFromBI) { 1966 System.out.println(); 1967 } 1968 1969 // Find the break positions using reverse iteration 1970 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { 1971 if (i < 0 || i > testText.length()) { 1972 errln(name + " break monkey test: Out of range value returned by 
breakIterator.next()" + name); 1973 break; 1974 } 1975 reverseBreaks[i] = true; 1976 } 1977 1978 // Find the break positions using isBoundary() tests. 1979 for (i=0; i<=testText.length(); i++) { 1980 isBoundaryBreaks[i] = bi.isBoundary(i); 1981 } 1982 1983 // Find the break positions using the following() function. 1984 lastBreakPos = 0; 1985 followingBreaks[0] = true; 1986 for (i=0; i<testText.length(); i++) { 1987 breakPos = bi.following(i); 1988 if (breakPos <= i || 1989 breakPos < lastBreakPos || 1990 breakPos > testText.length() || 1991 breakPos > lastBreakPos && lastBreakPos > i ) { 1992 errln(name + " break monkey test: " + 1993 "Out of range value returned by BreakIterator::following().\n" + 1994 "index=" + i + "following returned=" + breakPos + 1995 "lastBreak=" + lastBreakPos); 1996 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 1997 } else { 1998 followingBreaks[breakPos] = true; 1999 lastBreakPos = breakPos; 2000 } 2001 } 2002 2003 // Find the break positions using the preceding() function. 2004 lastBreakPos = testText.length(); 2005 precedingBreaks[testText.length()] = true; 2006 for (i=testText.length(); i>0; i--) { 2007 breakPos = bi.preceding(i); 2008 if (breakPos >= i || 2009 breakPos > lastBreakPos || 2010 breakPos < 0 || 2011 breakPos < lastBreakPos && lastBreakPos < i ) { 2012 errln(name + " break monkey test: " + 2013 "Out of range value returned by BreakIterator::preceding().\n" + 2014 "index=" + i + "preceding returned=" + breakPos + 2015 "lastBreak=" + lastBreakPos); 2016 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2017 } else { 2018 precedingBreaks[breakPos] = true; 2019 lastBreakPos = breakPos; 2020 } 2021 } 2022 2023 2024 2025 // Compare the expected and actual results. 
2026 for (i=0; i<=testText.length(); i++) { 2027 String errorType = null; 2028 if (forwardBreaks[i] != expectedBreaks[i]) { 2029 errorType = "next()"; 2030 } else if (reverseBreaks[i] != forwardBreaks[i]) { 2031 errorType = "previous()"; 2032 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 2033 errorType = "isBoundary()"; 2034 } else if (followingBreaks[i] != expectedBreaks[i]) { 2035 errorType = "following()"; 2036 } else if (precedingBreaks[i] != expectedBreaks[i]) { 2037 errorType = "preceding()"; 2038 } 2039 2040 if (errorType != null) { 2041 // Format a range of the test text that includes the failure as 2042 // a data item that can be included in the rbbi test data file. 2043 2044 // Start of the range is the last point where expected and actual results 2045 // both agreed that there was a break position. 2046 int startContext = i; 2047 int count = 0; 2048 for (;;) { 2049 if (startContext==0) { break; } 2050 startContext --; 2051 if (expectedBreaks[startContext]) { 2052 if (count == 2) break; 2053 count ++; 2054 } 2055 } 2056 2057 // End of range is two expected breaks past the start position. 2058 int endContext = i + 1; 2059 int ci; 2060 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 2061 for (;;) { 2062 if (endContext >= testText.length()) {break;} 2063 if (expectedBreaks[endContext-1]) { 2064 if (count == 0) break; 2065 count --; 2066 } 2067 endContext ++; 2068 } 2069 } 2070 2071 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>" 2072 StringBuffer errorText = new StringBuffer(); 2073 2074 int c; // Char from test data 2075 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { 2076 if (ci == i) { 2077 // This is the location of the error. 2078 errorText.append("<?>---------------------------------\n"); 2079 } else if (expectedBreaks[ci]) { 2080 // This a non-error expected break position. 
2081 errorText.append("------------------------------------\n"); 2082 } 2083 if (ci < testText.length()) { 2084 c = UTF16.charAt(testText, ci); 2085 appendCharToBuf(errorText, c, 11); 2086 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT); 2087 appendToBuf(errorText, gc, 8); 2088 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty); 2089 String extraPropValue = 2090 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG); 2091 appendToBuf(errorText, extraPropValue, 20); 2092 2093 String charName = UCharacter.getExtendedName(c); 2094 appendToBuf(errorText, charName, 40); 2095 errorText.append('\n'); 2096 } 2097 } 2098 if (ci == testText.length() && ci != -1) { 2099 errorText.append("<>"); 2100 } 2101 errorText.append("</data>\n"); 2102 2103 // Output the error 2104 errln(name + " break monkey test error. " + 2105 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") + 2106 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" + 2107 errorText); 2108 break; 2109 } 2110 } 2111 2112 loopCount++; 2113 } 2114 } 2115 2116 @Test 2117 public void TestCharMonkey() { 2118 2119 int loopCount = 500; 2120 int seed = 1; 2121 2122 if (TestFmwk.getExhaustiveness() >= 9) { 2123 loopCount = 10000; 2124 } 2125 2126 RBBICharMonkey m = new RBBICharMonkey(); 2127 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2128 RunMonkey(bi, m, "char", seed, loopCount); 2129 } 2130 2131 @Test 2132 public void TestWordMonkey() { 2133 2134 int loopCount = 500; 2135 int seed = 1; 2136 2137 if (TestFmwk.getExhaustiveness() >= 9) { 2138 loopCount = 10000; 2139 } 2140 2141 logln("Word Break Monkey Test"); 2142 RBBIWordMonkey m = new RBBIWordMonkey(); 2143 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2144 RunMonkey(bi, m, "word", seed, loopCount); 2145 } 2146 2147 @Test 2148 public void 
TestLineMonkey() { 2149 int loopCount = 500; 2150 int seed = 1; 2151 2152 if (TestFmwk.getExhaustiveness() >= 9) { 2153 loopCount = 10000; 2154 } 2155 2156 logln("Line Break Monkey Test"); 2157 RBBILineMonkey m = new RBBILineMonkey(); 2158 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2159 RunMonkey(bi, m, "line", seed, loopCount); 2160 } 2161 2162 @Test 2163 public void TestSentMonkey() { 2164 2165 int loopCount = 500; 2166 int seed = 1; 2167 2168 if (TestFmwk.getExhaustiveness() >= 9) { 2169 loopCount = 3000; 2170 } 2171 2172 logln("Sentence Break Monkey Test"); 2173 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2174 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2175 RunMonkey(bi, m, "sent", seed, loopCount); 2176 } 2177 // 2178 // Round-trip monkey tests. 2179 // Verify that break iterators created from the rule source from the default 2180 // break iterators still pass the monkey test for the iterator type. 2181 // 2182 // This is a major test for the Rule Compiler. The default break iterators are built 2183 // from pre-compiled binary rule data that was created using ICU4C; these 2184 // round-trip rule recompile tests verify that the Java rule compiler can 2185 // rebuild break iterators from the original source rules. 
2186 // 2187 @Test 2188 public void TestRTCharMonkey() { 2189 2190 int loopCount = 200; 2191 int seed = 1; 2192 2193 if (TestFmwk.getExhaustiveness() >= 9) { 2194 loopCount = 2000; 2195 } 2196 2197 RBBICharMonkey m = new RBBICharMonkey(); 2198 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2199 String rules = bi.toString(); 2200 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2201 RunMonkey(rtbi, m, "char", seed, loopCount); 2202 } 2203 2204 @Test 2205 public void TestRTWordMonkey() { 2206 2207 int loopCount = 200; 2208 int seed = 1; 2209 2210 if (TestFmwk.getExhaustiveness() >= 9) { 2211 loopCount = 2000; 2212 } 2213 logln("Word Break Monkey Test"); 2214 RBBIWordMonkey m = new RBBIWordMonkey(); 2215 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2216 String rules = bi.toString(); 2217 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2218 RunMonkey(rtbi, m, "word", seed, loopCount); 2219 } 2220 2221 @Test 2222 public void TestRTLineMonkey() { 2223 int loopCount = 200; 2224 int seed = 1; 2225 2226 if (TestFmwk.getExhaustiveness() >= 9) { 2227 loopCount = 2000; 2228 } 2229 2230 logln("Line Break Monkey Test"); 2231 RBBILineMonkey m = new RBBILineMonkey(); 2232 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2233 String rules = bi.toString(); 2234 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2235 RunMonkey(rtbi, m, "line", seed, loopCount); 2236 } 2237 2238 @Test 2239 public void TestRTSentMonkey() { 2240 2241 int loopCount = 200; 2242 int seed = 1; 2243 2244 if (TestFmwk.getExhaustiveness() >= 9) { 2245 loopCount = 1000; 2246 } 2247 2248 logln("Sentence Break Monkey Test"); 2249 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2250 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2251 String rules = bi.toString(); 2252 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2253 RunMonkey(rtbi, m, "sent", seed, loopCount); 2254 } 2255} 2256 2257