/* GENERATED SOURCE. DO NOT MODIFY. */
// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
/*
 *******************************************************************************
 * Copyright (C) 2003-2016 International Business Machines Corporation and
 * others. All Rights Reserved.
 *******************************************************************************
 */
package android.icu.dev.test.rbbi;


// Monkey testing of RuleBasedBreakIterator
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;

import org.junit.Test;

import android.icu.dev.test.TestFmwk;
import android.icu.lang.UCharacter;
import android.icu.lang.UProperty;
import android.icu.text.BreakIterator;
import android.icu.text.RuleBasedBreakIterator;
import android.icu.text.UTF16;
import android.icu.text.UnicodeSet;


/**
 * Monkey tests for RBBI.  These tests have independent implementations of
 * the Unicode TR boundary rules, and compare results between these and ICU's
 * implementation, using random data.
 *
 * Tests cover Grapheme Cluster (char), Word and Line breaks
 *
 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
 *
 */
public class RBBITestMonkey extends TestFmwk {
    //
    //     class RBBIMonkeyKind
    //
    //        Monkey Test for Break Iteration
    //        Abstract interface class.   Concrete derived classes independently
    //        implement the break rules for different iterator types.
    //
    //        The Monkey Test itself doesn't know which type of break iterator it is
    //        testing, but works purely in terms of the interface defined here.
    //
    abstract static class RBBIMonkeyKind {

        // Return a List of UnicodeSets, representing the character classes used
        //   for this type of iterator.
        //   (Declared as a raw List; the subclasses in this file put only
        //   UnicodeSet instances into it.)
        abstract  List  charClasses();

        // Set the test text on which subsequent calls to next() will operate
        abstract  void   setText(StringBuffer text);

        // Find the next break position, starting from the specified position.
        // Return -1 after reaching end of string.
        abstract   int   next(int i);

        // A Character Property, one of the constants defined in class UProperty.
        //   The value of this property will be displayed for the characters
        //    near any test failure.
        //   Each concrete subclass assigns this in its constructor.
        int   fCharProperty;
    }

    //
    // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
    //
    // This is a UnicodeSet pattern: a bracketed list of single code points and
    // ranges.  The backslashes are doubled so that the \\u / \\U escapes are left
    // in the string for the UnicodeSet pattern parser instead of being consumed
    // by the Java compiler.  The monkey classes turn it into a set with
    // new UnicodeSet(gExtended_Pict).
    //
    static String gExtended_Pict = "[" +
            "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" +
            "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
            "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" +
            "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" +
            "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" +
            "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" +
            "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" +
            "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" +
            "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
            "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" +
            "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" +
            "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" +
            "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" +
            "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" +
            "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" +
            "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" +
            "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" +
            "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" +
            "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" +
            "]";


    /**
     * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
* Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
     * (the constructor adds fPrependSet to fSets only when it is non-empty).
     */
    static class RBBICharMonkey extends RBBIMonkeyKind {
        // The character classes handed out by charClasses(); filled by the constructor.
        List                      fSets;

        // One set per character class consulted by next(), or offered through fSets.
        UnicodeSet                fCRLFSet;
        UnicodeSet                fControlSet;
        UnicodeSet                fExtendSet;
        UnicodeSet                fRegionalIndicatorSet;
        UnicodeSet                fPrependSet;
        UnicodeSet                fSpacingSet;
        UnicodeSet                fLSet;
        UnicodeSet                fVSet;
        UnicodeSet                fTSet;
        UnicodeSet                fLVSet;
        UnicodeSet                fLVTSet;
        UnicodeSet                fHangulSet;           // union of L, V, T, LV and LVT
        UnicodeSet                fEmojiModifierSet;
        UnicodeSet                fEmojiBaseSet;
        UnicodeSet                fZWJSet;
        UnicodeSet                fExtendedPictSet;
        UnicodeSet                fEBGSet;
        UnicodeSet                fEmojiNRKSet;
        UnicodeSet                fAnySet;              // every code point, U+0000..U+10FFFF


        // The text being scanned; installed by setText(), null until then.
        StringBuffer              fText;


        /**
         * Builds all of the character class sets and the fSets list.
         * Most sets are taken directly from a Grapheme_Cluster_Break property
         * value; the exceptions are spelled out in the patterns below.
         */
        RBBICharMonkey() {
            fText       = null;
            fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
            fCRLFSet    = new UnicodeSet("[\\r\\n]");
            fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
            fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
            fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
            fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
            fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
            fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
            fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
            fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
            fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
            fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
            fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
            // All Hangul classes merged into one set; only this union goes into fSets.
            fHangulSet  = new UnicodeSet();
            fHangulSet.addAll(fLSet);
            fHangulSet.addAll(fVSet);
            fHangulSet.addAll(fTSet);
            fHangulSet.addAll(fLVSet);
            fHangulSet.addAll(fLVTSet);

            // GCB=EB plus an explicit list of additional code points.
            fEmojiBaseSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
            fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]");
            fExtendedPictSet  = new UnicodeSet(gExtended_Pict);
            fEBGSet           = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]");
            // \p{Emoji} with the Regional Indicators, '*', '#', the digits 0-9 and
            // the five listed symbols taken out.
            fEmojiNRKSet      = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");
            fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");


            // Note that the individual Hangul sets (L, V, T, LV, LVT) are not added
            // separately; they are represented by fHangulSet.
            fSets       = new ArrayList();
            fSets.add(fCRLFSet);
            fSets.add(fControlSet);
            fSets.add(fExtendSet);
            fSets.add(fRegionalIndicatorSet);
            if (!fPrependSet.isEmpty()) {
                fSets.add(fPrependSet);
            }
            fSets.add(fSpacingSet);
            fSets.add(fHangulSet);
            fSets.add(fAnySet);
            fSets.add(fEmojiBaseSet);
            fSets.add(fEmojiModifierSet);
            fSets.add(fZWJSet);
            fSets.add(fExtendedPictSet);
            fSets.add(fEBGSet);
            fSets.add(fEmojiNRKSet);
        }


        /** Installs the text that next() will scan.  The buffer is referenced, not copied. */
        @Override
        void setText(StringBuffer s) {
            fText = s;
        }

        /** Returns the list of character class sets built by the constructor. */
        @Override
        List charClasses() {
            return fSets;
        }

        /**
         * Finds the next grapheme cluster boundary using this class's own
         * implementation of the GB rules.  The rules are tried in order; a rule
         * that forbids a break moves on to the next position, a rule that
         * requires a break ends the scan.
         *
         * @param prevPos index into the text at which to start scanning
         * @return the index of the boundary found (the position called p2 below),
         *         or -1 if prevPos is at or past the end of the text
         */
        @Override
        int next(int prevPos) {
            int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
                                          //   break position being tested.  The candidate break
                                          //   location is before p2.

            int     breakPos = -1;

            int   c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
            int   cBase;            // for (X Extend*) patterns, the X character.

            // Previous break at end of string.  return DONE.
            if (prevPos >= fText.length()) {
                return -1;
            }
            /* p0 = */ p1 = p2 = p3 = prevPos;
            c3 =  UTF16.charAt(fText, prevPos);
            c0 = c1 = c2 = cBase = 0;

            // Loop runs once per "significant" character position in the input text.
            for (;;) {
                // Move all of the positions forward in the input string.
                /* p0 = p1;*/  c0 = c1;
                p1 = p2;  c1 = c2;
                p2 = p3;  c2 = c3;

                // Advance p3 by one codepoint
                // (moveIndex32 is a helper defined outside this view; presumably it
                //  steps the index by whole code points.)
                // c3 becomes -1 once p3 runs off the end; by the time that value has
                // shifted into c2, the "p2 == fText.length()" test below has already
                // ended the loop, so -1 is never looked up in a set here.
                p3 = moveIndex32(fText, p3, 1);
                c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);

                if (p1 == p2) {
                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                    continue;
                }
                if (p2 == fText.length()) {
                    // Reached end of string.  Always a break position.
                    break;
                }

                // Rule  GB3   CR x LF
                //     No Extend or Format characters may appear between the CR and LF,
                //     which requires the additional check for p2 immediately following p1.
                //
                if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
                    continue;
                }

                // Rule (GB4).   ( Control | CR | LF ) <break>
                if (fControlSet.contains(c1) ||
                        c1 == 0x0D ||
                        c1 == 0x0A)  {
                    break;
                }

                // Rule (GB5)    <break>  ( Control | CR | LF )
                //
                if (fControlSet.contains(c2) ||
                        c2 == 0x0D ||
                        c2 == 0x0A)  {
                    break;
                }


                // Rule (GB6)  L x ( L | V | LV | LVT )
                if (fLSet.contains(c1) &&
                        (fLSet.contains(c2)  ||
                                fVSet.contains(c2)  ||
                                fLVSet.contains(c2) ||
                                fLVTSet.contains(c2))) {
                    continue;
                }

                // Rule (GB7)    ( LV | V )  x  ( V | T )
                if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
                        (fVSet.contains(c2) || fTSet.contains(c2)))  {
                    continue;
                }

                // Rule (GB8)    ( LVT | T)  x T
                if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
                        fTSet.contains(c2))  {
                    continue;
                }

                // Rule (GB9)    x (Extend | ZWJ)
                //     While passing over the Extend/ZWJ, remember the most recent c1 that
                //     was not itself an Extend.  Rule GB10 below uses it as the base
                //     character of an (X Extend*) run.
                if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
                    if (!fExtendSet.contains(c1)) {
                        cBase = c1;
                    }
                    continue;
                }

                // Rule (GB9a)   x  SpacingMark
                if (fSpacingSet.contains(c2)) {
                    continue;
                }

                // Rule (GB9b)   Prepend x
                if (fPrependSet.contains(c1)) {
                    continue;
                }
                // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
                //     First test: the base immediately precedes the modifier.
                //     Second test: Extend characters intervene, so the base is the
                //     character remembered in cBase by rule GB9.
                if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
                    continue;
                }
                if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
                        fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
                    continue;
                }

                // Rule (GB11)   (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji)
                if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) &&
                        (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
                    continue;
                }

                // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
                //                  Note: The first if condition is a little tricky. We only need to force
                //                        a break if there are three or more contiguous RIs. If there are
                //                        only two, a break following will occur via other rules, and will include
                //                        any trailing extend characters, which is needed behavior.
                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
                        && fRegionalIndicatorSet.contains(c2)) {
                    break;
                }
                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
                    continue;
                }

                // Rule (GB999)  Any  <break>  Any
                break;
            }

            breakPos = p2;
            return breakPos;
        }
    }


    /**
     *
     * Word Monkey Test Class
     *
     *
     *
     */
    static class RBBIWordMonkey extends RBBIMonkeyKind {
        List                      fSets;
        StringBuffer              fText;

        UnicodeSet                fCRSet;
        UnicodeSet                fLFSet;
        UnicodeSet                fNewlineSet;
        UnicodeSet                fRegionalIndicatorSet;
        UnicodeSet                fKatakanaSet;
        UnicodeSet                fHebrew_LetterSet;
        UnicodeSet                fALetterSet;
        UnicodeSet                fSingle_QuoteSet;
        UnicodeSet                fDouble_QuoteSet;
        UnicodeSet                fMidNumLetSet;
        UnicodeSet                fMidLetterSet;
        UnicodeSet                fMidNumSet;
        UnicodeSet                fNumericSet;
        UnicodeSet                fFormatSet;
        UnicodeSet                fExtendSet;
        UnicodeSet                fExtendNumLetSet;
        UnicodeSet                fOtherSet;
        UnicodeSet                fDictionarySet;
        UnicodeSet                fEBaseSet;
        UnicodeSet                fEBGSet;
        UnicodeSet                fEModifierSet;
UnicodeSet                fZWJSet;
        UnicodeSet                fExtendedPictSet;
        UnicodeSet                fEmojiNRKSet;


        /**
         * Builds the Word_Break character class sets, the dictionary set, the
         * catch-all "other" set, and the fSets list returned by charClasses().
         */
        RBBIWordMonkey() {
            fCharProperty    = UProperty.WORD_BREAK;

            fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
            fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
            fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
            fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
            fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
            fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
            fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
            fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
            fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
            fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
            fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
            fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
            fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
            fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
            fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
            // Word_Break=EB plus an explicit list of additional code points.
            fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
            fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
            fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
            fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
            fExtendedPictSet = new UnicodeSet(gExtended_Pict);
            // \p{Emoji} with the Regional Indicators, '*', '#', the digits 0-9 and
            // the five listed symbols taken out.
            fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");

            // Dictionary characters: U+AC00..U+D7A3, Han, Hiragana, Katakana and
            // everything with LineBreak = Complex_Context.
            fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
            fDictionarySet.addAll(fKatakanaSet);
            fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));

            fALetterSet.removeAll(fDictionarySet);

            // "Other" starts as the complement of the empty set, i.e. every code
            // point, and then has the sets listed below removed from it.
            // NOTE(review): fMidNumLetSet is not removed here, unlike the other
            // Word_Break classes -- confirm whether that is intentional.
            fOtherSet        = new UnicodeSet();
            fOtherSet.complement();
            fOtherSet.removeAll(fCRSet);
            fOtherSet.removeAll(fLFSet);
            fOtherSet.removeAll(fNewlineSet);
            fOtherSet.removeAll(fALetterSet);
            fOtherSet.removeAll(fSingle_QuoteSet);
            fOtherSet.removeAll(fDouble_QuoteSet);
            fOtherSet.removeAll(fKatakanaSet);
            fOtherSet.removeAll(fHebrew_LetterSet);
            fOtherSet.removeAll(fMidLetterSet);
            fOtherSet.removeAll(fMidNumSet);
            fOtherSet.removeAll(fNumericSet);
            fOtherSet.removeAll(fFormatSet);
            fOtherSet.removeAll(fExtendSet);
            fOtherSet.removeAll(fExtendNumLetSet);
            fOtherSet.removeAll(fRegionalIndicatorSet);
            fOtherSet.removeAll(fEBaseSet);
            fOtherSet.removeAll(fEBGSet);
            fOtherSet.removeAll(fEModifierSet);
            fOtherSet.removeAll(fZWJSet);
            fOtherSet.removeAll(fExtendedPictSet);
            fOtherSet.removeAll(fEmojiNRKSet);

            // Inhibit dictionary characters from being tested at all.
            // remove surrogates so as to not generate higher CJK characters
            fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
            fOtherSet.removeAll(fDictionarySet);

            // NOTE(review): fRegionalIndicatorSet is added to fSets twice below --
            // confirm whether the duplicate entry is intentional.
            fSets            = new ArrayList();
            fSets.add(fCRSet);
            fSets.add(fLFSet);
            fSets.add(fNewlineSet);
            fSets.add(fRegionalIndicatorSet);
            fSets.add(fHebrew_LetterSet);
            fSets.add(fALetterSet);
            //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
                                        // from the test data. They are all in the dictionary set,
                                        // which this (old, to be retired) monkey test cannot handle.
            fSets.add(fSingle_QuoteSet);
            fSets.add(fDouble_QuoteSet);
            fSets.add(fMidLetterSet);
            fSets.add(fMidNumLetSet);
            fSets.add(fMidNumSet);
            fSets.add(fNumericSet);
            fSets.add(fFormatSet);
            fSets.add(fExtendSet);
            fSets.add(fExtendNumLetSet);
            fSets.add(fRegionalIndicatorSet);
            fSets.add(fEBaseSet);
            fSets.add(fEBGSet);
            fSets.add(fEModifierSet);
            fSets.add(fZWJSet);
            fSets.add(fExtendedPictSet);
            fSets.add(fEmojiNRKSet);
            fSets.add(fOtherSet);
        }


        /** Returns the list of character class sets built by the constructor. */
        @Override
        List  charClasses() {
            return fSets;
        }

        /** Installs the text that next() will scan.  The buffer is referenced, not copied. */
        @Override
        void   setText(StringBuffer s) {
            fText = s;
        }

        /**
         * Finds the next word boundary using this class's own implementation of
         * the word break rules.  The rules are tried in order; a rule that forbids
         * a break moves on to the next position, a rule that requires a break
         * ends the scan.
         *
         * @param prevPos index into the text at which to start scanning
         * @return the index of the boundary found (the position called p2 below),
         *         or -1 if prevPos is at or past the end of the text
         */
        @Override
        int   next(int prevPos) {
            int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
                                            //   break position being tested.  The candidate break
                                            //   location is before p2.
            int     breakPos = -1;

            int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.

            // Previous break at end of string.  return DONE.
            if (prevPos >= fText.length()) {
                return -1;
            }
            /*p0 =*/ p1 = p2 = p3 = prevPos;
            c3 = UTF16.charAt(fText, prevPos);
            c0 = c1 = c2 = 0;



            // Loop runs once per "significant" character position in the input text.
            for (;;) {
                // Move all of the positions forward in the input string.
                /*p0 = p1;*/  c0 = c1;
                p1 = p2;  c1 = c2;
                p2 = p3;  c2 = c3;

                // Advance p3 by  X(Extend | Format)*   Rule 4
                //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
                //    ZWJ is skipped along with Extend and Format.
                //    c3 is left at -1 when p3 runs off the end of the text, which is why
                //    c3 is only ever tested through setContains() -- a helper defined
                //    outside this view that presumably tolerates -1.
                do {
                    p3 = moveIndex32(fText, p3, 1);
                    c3 = -1;
                    if (p3>=fText.length()) {
                        break;
                    }
                    c3 = UTF16.charAt(fText, p3);
                    if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
                        break;
                    }
                }
                while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));

                if (p1 == p2) {
                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                    continue;
                }
                if (p2 == fText.length()) {
                    // Reached end of string.  Always a break position.
                    break;
                }

                // Rule (3)   CR x LF
                //     No Extend or Format characters may appear between the CR and LF,
                //     which requires the additional check for p2 immediately following p1.
                //     NOTE(review): no explicit adjacency check is made here.  The scan
                //     loop above stops after a single step whenever c2 is CR, LF or
                //     Newline, so nothing is skipped after a CR -- confirm that this is
                //     what makes the check unnecessary.
                //
                if (c1==0x0D && c2==0x0A) {
                    continue;
                }

                // Rule (3a)  Break before and after newlines (including CR and LF)
                //
                if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
                    break;
                }
                if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
                    break;
                }

                // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
                //              Not ignoring extend chars, so peek into input text to
                //              get the potential ZWJ, the character immediately preceding c2.
                if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
                    continue;
                }

                // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
                    continue;
                }

                // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
                //
                if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
                        (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
                        (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
                    continue;
                }

                // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
                if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
                        (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
                    continue;
                }

                // Rule (7a)     Hebrew_Letter x Single_Quote
                if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
                    continue;
                }

                // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
                if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
                    continue;
                }

                // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
                if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
                    continue;
                }

                //  Rule (8)    Numeric x Numeric
                if (fNumericSet.contains(c1) &&
                        fNumericSet.contains(c2))  {
                    continue;
                }

                // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
                        fNumericSet.contains(c2))  {
                    continue;
                }

                // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
                if (fNumericSet.contains(c1) &&
                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
                    continue;
                }

                // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
                if (fNumericSet.contains(c0) &&
                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
                        fNumericSet.contains(c2)) {
                    continue;
                }

                // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
                if (fNumericSet.contains(c1) &&
                        (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
                        setContains(fNumericSet, c3)) {
                    continue;
                }

                // Rule (13)  Katakana x Katakana
                //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
                //                  all Katakana are handled by the dictionary breaker.
                if (fKatakanaSet.contains(c1) &&
                        fKatakanaSet.contains(c2))  {
                    continue;
                }

                // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
                        fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
                        fExtendNumLetSet.contains(c2)) {
                    continue;
                }

                // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
                if (fExtendNumLetSet.contains(c1) &&
                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
                                fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
                    continue;
                }


                // Rule 14 (E_Base | EBG) x E_Modifier
                if ((fEBaseSet.contains(c1)  || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
                    continue;
                }

                // Rule 15 - 17   Group pairs of Regional Indicators
                //     Break after two RIs in a row (c0 and c1); otherwise keep an RI
                //     together with the RI that follows it.
                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
                    break;
                }
                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
                    continue;
                }

                // Rule 999.  Break found here.
                break;
            }

            breakPos = p2;
            return breakPos;
        }

    }


    static class RBBILineMonkey extends RBBIMonkeyKind {

        List        fSets;

        // UnicodeSets for each of the Line Breaking character classes.
        // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
        // to verify that they are all accounted for.

        UnicodeSet  fBK;
        UnicodeSet  fCR;
        UnicodeSet  fLF;
        UnicodeSet  fCM;
        UnicodeSet  fNL;
        UnicodeSet  fSG;
        UnicodeSet  fWJ;
        UnicodeSet  fZW;
        UnicodeSet  fGL;
        UnicodeSet  fSP;
        UnicodeSet  fB2;
        UnicodeSet  fBA;
        UnicodeSet  fBB;
        UnicodeSet  fHY;
        UnicodeSet  fCB;
        UnicodeSet  fCL;
        UnicodeSet  fCP;
        UnicodeSet  fEX;
        UnicodeSet  fIN;
        UnicodeSet  fNS;
        UnicodeSet  fOP;
        UnicodeSet  fQU;
        UnicodeSet  fIS;
        UnicodeSet  fNU;
        UnicodeSet  fPO;
        UnicodeSet  fPR;
        UnicodeSet  fSY;
        UnicodeSet  fAI;
        UnicodeSet  fAL;
        UnicodeSet  fCJ;
        UnicodeSet  fH2;
        UnicodeSet  fH3;
        UnicodeSet  fHL;
        UnicodeSet  fID;
        UnicodeSet  fJL;
        UnicodeSet  fJV;
        UnicodeSet  fJT;
        UnicodeSet  fRI;
        UnicodeSet  fXX;
        UnicodeSet  fEB;
        UnicodeSet  fEM;
        UnicodeSet  fZWJ;
        UnicodeSet  fExtendedPict;
        UnicodeSet  fEmojiNRK;

        StringBuffer  fText;
        int           fOrigPositions;



        RBBILineMonkey()
        {
            fCharProperty  = UProperty.LINE_BREAK;
            fSets          = new ArrayList();

            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
            fBA    = new
UnicodeSet("[\\p{Line_break=BA}]"); 728 fBB = new UnicodeSet("[\\p{Line_break=BB}]"); 729 fHY = new UnicodeSet("[\\p{Line_break=HY}]"); 730 fCB = new UnicodeSet("[\\p{Line_break=CB}]"); 731 fCL = new UnicodeSet("[\\p{Line_break=CL}]"); 732 fCP = new UnicodeSet("[\\p{Line_break=CP}]"); 733 fEX = new UnicodeSet("[\\p{Line_break=EX}]"); 734 fIN = new UnicodeSet("[\\p{Line_break=IN}]"); 735 fNS = new UnicodeSet("[\\p{Line_break=NS}]"); 736 fOP = new UnicodeSet("[\\p{Line_break=OP}]"); 737 fQU = new UnicodeSet("[\\p{Line_break=QU}]"); 738 fIS = new UnicodeSet("[\\p{Line_break=IS}]"); 739 fNU = new UnicodeSet("[\\p{Line_break=NU}]"); 740 fPO = new UnicodeSet("[\\p{Line_break=PO}]"); 741 fPR = new UnicodeSet("[\\p{Line_break=PR}]"); 742 fSY = new UnicodeSet("[\\p{Line_break=SY}]"); 743 fAI = new UnicodeSet("[\\p{Line_break=AI}]"); 744 fAL = new UnicodeSet("[\\p{Line_break=AL}]"); 745 fCJ = new UnicodeSet("[\\p{Line_break=CJ}]"); 746 fH2 = new UnicodeSet("[\\p{Line_break=H2}]"); 747 fH3 = new UnicodeSet("[\\p{Line_break=H3}]"); 748 fHL = new UnicodeSet("[\\p{Line_break=HL}]"); 749 fID = new UnicodeSet("[\\p{Line_break=ID}]"); 750 fJL = new UnicodeSet("[\\p{Line_break=JL}]"); 751 fJV = new UnicodeSet("[\\p{Line_break=JV}]"); 752 fJT = new UnicodeSet("[\\p{Line_break=JT}]"); 753 fRI = new UnicodeSet("[\\p{Line_break=RI}]"); 754 fXX = new UnicodeSet("[\\p{Line_break=XX}]"); 755 fEB = new UnicodeSet("[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]"); 756 fEM = new UnicodeSet("[\\p{Line_break=EM}]"); 757 fZWJ = new UnicodeSet("[\\p{Line_break=ZWJ}]"); 758 fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]"); 759 fExtendedPict = new UnicodeSet(gExtended_Pict); 760 761 762 // Remove dictionary characters. 763 // The monkey test reference implementation of line break does not replicate the dictionary behavior, 764 // so dictionary characters are omitted from the monkey test data. 
765 @SuppressWarnings("unused") 766 UnicodeSet dictionarySet = new UnicodeSet( 767 "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]"); 768 769 fAL.addAll(fXX); // Default behavior for XX is identical to AL 770 fAL.addAll(fAI); // Default behavior for AI is identical to AL 771 fAL.addAll(fSG); // Default behavior for SG (unpaired surrogates) is AL 772 773 fNS.addAll(fCJ); // Default behavior for CJ is identical to NS. 774 fCM.addAll(fZWJ); // ZWJ behaves as a CM. 775 776 fSets.add(fBK); 777 fSets.add(fCR); 778 fSets.add(fLF); 779 fSets.add(fCM); 780 fSets.add(fNL); 781 fSets.add(fWJ); 782 fSets.add(fZW); 783 fSets.add(fGL); 784 fSets.add(fSP); 785 fSets.add(fB2); 786 fSets.add(fBA); 787 fSets.add(fBB); 788 fSets.add(fHY); 789 fSets.add(fCB); 790 fSets.add(fCL); 791 fSets.add(fCP); 792 fSets.add(fEX); 793 fSets.add(fIN); 794 fSets.add(fJL); 795 fSets.add(fJT); 796 fSets.add(fJV); 797 fSets.add(fNS); 798 fSets.add(fOP); 799 fSets.add(fQU); 800 fSets.add(fIS); 801 fSets.add(fNU); 802 fSets.add(fPO); 803 fSets.add(fPR); 804 fSets.add(fSY); 805 fSets.add(fAI); 806 fSets.add(fAL); 807 fSets.add(fH2); 808 fSets.add(fH3); 809 fSets.add(fHL); 810 fSets.add(fID); 811 fSets.add(fWJ); 812 fSets.add(fRI); 813 fSets.add(fSG); 814 fSets.add(fEB); 815 fSets.add(fEM); 816 fSets.add(fZWJ); 817 fSets.add(fExtendedPict); 818 fSets.add(fEmojiNRK); 819 } 820 821 @Override 822 void setText(StringBuffer s) { 823 fText = s; 824 } 825 826 827 828 829 @Override 830 int next(int startPos) { 831 int pos; // Index of the char following a potential break position 832 int thisChar; // Character at above position "pos" 833 834 int prevPos; // Index of the char preceding a potential break position 835 int prevChar; // Character at above position. Note that prevChar 836 // and thisChar may not be adjacent because combining 837 // characters between them will be ignored. 
838 int prevCharX2; // Character before prevChar, more contex for LB 21a 839 840 int nextPos; // Index of the next character following pos. 841 // Usually skips over combining marks. 842 int tPos; // temp value. 843 int matchVals[] = null; // Number Expression Match Results 844 845 846 if (startPos >= fText.length()) { 847 return -1; 848 } 849 850 851 // Initial values for loop. Loop will run the first time without finding breaks, 852 // while the invalid values shift out and the "this" and 853 // "prev" positions are filled in with good values. 854 pos = prevPos = -1; // Invalid value, serves as flag for initial loop iteration. 855 thisChar = prevChar = prevCharX2 = 0; 856 nextPos = startPos; 857 858 859 // Loop runs once per position in the test text, until a break position 860 // is found. In each iteration, we are testing for a possible break 861 // just preceding the character at index "pos". The character preceding 862 // this char is at postion "prevPos"; because of combining sequences, 863 // "prevPos" can be arbitrarily far before "pos". 864 for (;;) { 865 // Advance to the next position to be tested. 866 prevCharX2 = prevChar; 867 prevPos = pos; 868 prevChar = thisChar; 869 pos = nextPos; 870 nextPos = moveIndex32(fText, pos, 1); 871 872 // Rule LB2 - Break at end of text. 873 if (pos >= fText.length()) { 874 break; 875 } 876 877 // Rule LB 9 - adjust for combining sequences. 878 // We do this rule out-of-order because the adjustment does 879 // not effect the way that rules LB 3 through LB 6 match, 880 // and doing it here rather than after LB 6 is substantially 881 // simpler when combining sequences do occur. 882 883 884 // LB 9 Keep combining sequences together. 885 // advance over any CM class chars at "pos", 886 // result is "nextPos" for the following loop iteration. 
887 thisChar = UTF16.charAt(fText, pos); 888 if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d || 889 thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) { 890 for (;;) { 891 if (nextPos == fText.length()) { 892 break; 893 } 894 int nextChar = UTF16.charAt(fText, nextPos); 895 if (!fCM.contains(nextChar)) { 896 break; 897 } 898 nextPos = moveIndex32(fText, nextPos, 1); 899 } 900 } 901 902 // LB 9 Treat X CM* as if it were X 903 // No explicit action required. 904 905 // LB 10 Treat any remaining combining mark as AL 906 if (fCM.contains(thisChar)) { 907 thisChar = 'A'; 908 } 909 910 911 // If the loop is still warming up - if we haven't shifted the initial 912 // -1 positions out of prevPos yet - loop back to advance the 913 // position in the input without any further looking for breaks. 914 if (prevPos == -1) { 915 continue; 916 } 917 918 // LB 4 Always break after hard line breaks, 919 if (fBK.contains(prevChar)) { 920 break; 921 } 922 923 // LB 5 Break after CR, LF, NL, but not inside CR LF 924 if (fCR.contains(prevChar) && fLF.contains(thisChar)) { 925 continue; 926 } 927 if (fCR.contains(prevChar) || 928 fLF.contains(prevChar) || 929 fNL.contains(prevChar)) { 930 break; 931 } 932 933 // LB 6 Don't break before hard line breaks 934 if (fBK.contains(thisChar) || fCR.contains(thisChar) || 935 fLF.contains(thisChar) || fNL.contains(thisChar) ) { 936 continue; 937 } 938 939 940 // LB 7 Don't break before spaces or zero-width space. 941 if (fSP.contains(thisChar)) { 942 continue; 943 } 944 945 if (fZW.contains(thisChar)) { 946 continue; 947 } 948 949 // LB 8 Break after zero width space 950 if (fZW.contains(prevChar)) { 951 break; 952 } 953 954 // LB 8a: ZWJ x (ID | Extended_Pictographic | Emoji) 955 // The monkey test's way of ignoring combining characters doesn't work 956 // for this rule. ZWJ is also a CM. Need to get the actual character 957 // preceding "thisChar", not ignoring combining marks, possibly ZWJ. 
958 { 959 int prevC = fText.codePointBefore(pos); 960 if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) { 961 continue; 962 } 963 } 964 965 // LB 9, 10 Already done, at top of loop. 966 // 967 968 969 // LB 11 970 // x WJ 971 // WJ x 972 if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) { 973 continue; 974 } 975 976 977 // LB 12 978 // GL x 979 if (fGL.contains(prevChar)) { 980 continue; 981 } 982 983 // LB 12a 984 // [^SP BA HY] x GL 985 if (!(fSP.contains(prevChar) || 986 fBA.contains(prevChar) || 987 fHY.contains(prevChar) ) && fGL.contains(thisChar)) { 988 continue; 989 } 990 991 992 993 // LB 13 Don't break before closings. 994 // NU x CL, NU x CP and NU x IS are not matched here so that they will 995 // fall into LB 17 and the more general number regular expression. 996 // 997 if (!fNU.contains(prevChar) && fCL.contains(thisChar) || 998 !fNU.contains(prevChar) && fCP.contains(thisChar) || 999 fEX.contains(thisChar) || 1000 !fNU.contains(prevChar) && fIS.contains(thisChar) || 1001 !fNU.contains(prevChar) && fSY.contains(thisChar)) { 1002 continue; 1003 } 1004 1005 // LB 14 Don't break after OP SP* 1006 // Scan backwards, checking for this sequence. 
1007 // The OP char could include combining marks, so we actually check for 1008 // OP CM* SP* x 1009 tPos = prevPos; 1010 if (fSP.contains(prevChar)) { 1011 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1012 tPos=moveIndex32(fText, tPos, -1); 1013 } 1014 } 1015 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1016 tPos=moveIndex32(fText, tPos, -1); 1017 } 1018 if (fOP.contains(UTF16.charAt(fText, tPos))) { 1019 continue; 1020 } 1021 1022 // LB 15 Do not break within "[ 1023 // QU CM* SP* x OP 1024 if (fOP.contains(thisChar)) { 1025 // Scan backwards from prevChar to see if it is preceded by QU CM* SP* 1026 tPos = prevPos; 1027 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1028 tPos = moveIndex32(fText, tPos, -1); 1029 } 1030 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1031 tPos = moveIndex32(fText, tPos, -1); 1032 } 1033 if (fQU.contains(UTF16.charAt(fText, tPos))) { 1034 continue; 1035 } 1036 } 1037 1038 // LB 16 (CL | CP) SP* x NS 1039 if (fNS.contains(thisChar)) { 1040 tPos = prevPos; 1041 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1042 tPos = moveIndex32(fText, tPos, -1); 1043 } 1044 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1045 tPos = moveIndex32(fText, tPos, -1); 1046 } 1047 if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) { 1048 continue; 1049 } 1050 } 1051 1052 1053 // LB 17 B2 SP* x B2 1054 if (fB2.contains(thisChar)) { 1055 tPos = prevPos; 1056 while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) { 1057 tPos = moveIndex32(fText, tPos, -1); 1058 } 1059 while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) { 1060 tPos = moveIndex32(fText, tPos, -1); 1061 } 1062 if (fB2.contains(UTF16.charAt(fText, tPos))) { 1063 continue; 1064 } 1065 } 1066 1067 // LB 18 break after space 1068 if (fSP.contains(prevChar)) { 1069 break; 1070 } 1071 1072 // LB 19 1073 // x QU 1074 // QU x 1075 if (fQU.contains(thisChar) || 
fQU.contains(prevChar)) { 1076 continue; 1077 } 1078 1079 // LB 20 Break around a CB 1080 if (fCB.contains(thisChar) || fCB.contains(prevChar)) { 1081 break; 1082 } 1083 1084 // LB 21 1085 if (fBA.contains(thisChar) || 1086 fHY.contains(thisChar) || 1087 fNS.contains(thisChar) || 1088 fBB.contains(prevChar) ) { 1089 continue; 1090 } 1091 1092 // LB 21a, HL (HY | BA) x 1093 if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) { 1094 continue; 1095 } 1096 1097 // LB 21b, SY x HL 1098 if (fSY.contains(prevChar) && fHL.contains(thisChar)) { 1099 continue; 1100 } 1101 1102 // LB 22 1103 if (fAL.contains(prevChar) && fIN.contains(thisChar) || 1104 fEX.contains(prevChar) && fIN.contains(thisChar) || 1105 fHL.contains(prevChar) && fIN.contains(thisChar) || 1106 (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) || 1107 fIN.contains(prevChar) && fIN.contains(thisChar) || 1108 fNU.contains(prevChar) && fIN.contains(thisChar) ) { 1109 continue; 1110 } 1111 1112 // LB 23 (AL | HL) x NU 1113 // NU x (AL | HL) 1114 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) { 1115 continue; 1116 } 1117 if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1118 continue; 1119 } 1120 1121 // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. 1122 // PR x (ID | EB | EM) 1123 // (ID | EB | EM) x PO 1124 if (fPR.contains(prevChar) && 1125 (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar))) { 1126 continue; 1127 } 1128 if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && 1129 fPO.contains(thisChar)) { 1130 continue; 1131 } 1132 1133 // LB 24 Do not break between prefix and letters or ideographs. 
1134 // (PR | PO) x (AL | HL) 1135 // (AL | HL) x (PR | PO) 1136 if ((fPR.contains(prevChar) || fPO.contains(prevChar)) && 1137 (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1138 continue; 1139 } 1140 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && 1141 (fPR.contains(thisChar) || fPO.contains(thisChar))) { 1142 continue; 1143 } 1144 1145 1146 // LB 25 Numbers 1147 matchVals = LBNumberCheck(fText, prevPos, matchVals); 1148 if (matchVals[0] != -1) { 1149 // Matched a number. But could have been just a single digit, which would 1150 // not represent a "no break here" between prevChar and thisChar 1151 int numEndIdx = matchVals[1]; // idx of first char following num 1152 if (numEndIdx > pos) { 1153 // Number match includes at least the two chars being checked 1154 if (numEndIdx > nextPos) { 1155 // Number match includes additional chars. Update pos and nextPos 1156 // so that next loop iteration will continue at the end of the number, 1157 // checking for breaks between last char in number & whatever follows. 
1158 nextPos = numEndIdx; 1159 pos = numEndIdx; 1160 do { 1161 pos = moveIndex32(fText, pos, -1); 1162 thisChar = UTF16.charAt(fText, pos); 1163 } 1164 while (fCM.contains(thisChar)); 1165 } 1166 continue; 1167 } 1168 } 1169 1170 1171 // LB 26 Do not break Korean Syllables 1172 if (fJL.contains(prevChar) && (fJL.contains(thisChar) || 1173 fJV.contains(thisChar) || 1174 fH2.contains(thisChar) || 1175 fH3.contains(thisChar))) { 1176 continue; 1177 } 1178 1179 if ((fJV.contains(prevChar) || fH2.contains(prevChar)) && 1180 (fJV.contains(thisChar) || fJT.contains(thisChar))) { 1181 continue; 1182 } 1183 1184 if ((fJT.contains(prevChar) || fH3.contains(prevChar)) && 1185 fJT.contains(thisChar)) { 1186 continue; 1187 } 1188 1189 // LB 27 Treat a Korean Syllable Block the same as ID 1190 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1191 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1192 fIN.contains(thisChar)) { 1193 continue; 1194 } 1195 if ((fJL.contains(prevChar) || fJV.contains(prevChar) || 1196 fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) && 1197 fPO.contains(thisChar)) { 1198 continue; 1199 } 1200 if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) || 1201 fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) { 1202 continue; 1203 } 1204 1205 1206 1207 // LB 28 Do not break between alphabetics 1208 if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1209 continue; 1210 } 1211 1212 // LB 29 Do not break between numeric punctuation and alphabetics 1213 if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) { 1214 continue; 1215 } 1216 1217 // LB 30 Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation. 
1218 // (AL | NU) x OP 1219 // CP x (AL | NU) 1220 if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) { 1221 continue; 1222 } 1223 if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) { 1224 continue; 1225 } 1226 1227 // LB 30a Break between pairs of Regional Indicators. 1228 // RI RI <break> RI 1229 // RI x RI 1230 if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) { 1231 break; 1232 } 1233 if (fRI.contains(prevChar) && fRI.contains(thisChar)) { 1234 continue; 1235 } 1236 1237 // LB30b Emoji Base x Emoji Modifier 1238 if (fEB.contains(prevChar) && fEM.contains(thisChar)) { 1239 continue; 1240 } 1241 // LB 31 Break everywhere else 1242 break; 1243 } 1244 1245 return pos; 1246 } 1247 1248 1249 1250 // Match the following regular expression in the input text. 1251 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)? (PR | PO) CM*)? 1252 // 0 0 1 3 3 4 7 7 7 7 9 9 9 11 11 (match states) 1253 // retVals array [0] index of the start of the match, or -1 if no match 1254 // [1] index of first char following the match. 1255 // Can not use Java regex because need supplementary character support, 1256 // and because Unicode char properties version must be the same as in 1257 // the version of ICU being tested. 1258 private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) { 1259 if (retVals == null) { 1260 retVals = new int[2]; 1261 } 1262 retVals[0] = -1; // Indicates no match. 
1263 int matchState = 0; 1264 int idx = startIdx; 1265 1266 matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){ 1267 int c = UTF16.charAt(s, idx); 1268 int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK); 1269 switch (matchState) { 1270 case 0: 1271 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC || 1272 cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1273 matchState = 1; 1274 break; 1275 } 1276 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1277 matchState = 4; 1278 break; 1279 } 1280 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1281 matchState = 4; 1282 break; 1283 } 1284 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1285 matchState = 7; 1286 break; 1287 } 1288 break matchLoop; /* No Match */ 1289 1290 case 1: 1291 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1292 matchState = 1; 1293 break; 1294 } 1295 if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) { 1296 matchState = 4; 1297 break; 1298 } 1299 if (cLBType == UCharacter.LineBreak.HYPHEN) { 1300 matchState = 4; 1301 break; 1302 } 1303 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1304 matchState = 7; 1305 break; 1306 } 1307 break matchLoop; /* No Match */ 1308 1309 1310 case 4: 1311 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1312 matchState = 4; 1313 break; 1314 } 1315 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1316 matchState = 7; 1317 break; 1318 } 1319 break matchLoop; /* No Match */ 1320 // ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)? (PR | PO) CM*)? 
1321 // 0 0 1 3 3 4 7 7 7 7 9 9 11 11 (match states) 1322 1323 case 7: 1324 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1325 matchState = 7; 1326 break; 1327 } 1328 if (cLBType == UCharacter.LineBreak.NUMERIC) { 1329 matchState = 7; 1330 break; 1331 } 1332 if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) { 1333 matchState = 7; 1334 break; 1335 } 1336 if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) { 1337 matchState = 7; 1338 break; 1339 } 1340 if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) { 1341 matchState = 9; 1342 break; 1343 } 1344 if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) { 1345 matchState = 9; 1346 break; 1347 } 1348 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1349 matchState = 11; 1350 break; 1351 } 1352 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1353 matchState = 11; 1354 break; 1355 } 1356 1357 break matchLoop; // Match Complete. 1358 case 9: 1359 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1360 matchState = 9; 1361 break; 1362 } 1363 if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) { 1364 matchState = 11; 1365 break; 1366 } 1367 if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) { 1368 matchState = 11; 1369 break; 1370 } 1371 break matchLoop; // Match Complete. 1372 case 11: 1373 if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) { 1374 matchState = 11; 1375 break; 1376 } 1377 break matchLoop; // Match Complete. 
1378 } 1379 } 1380 if (matchState > 4) { 1381 retVals[0] = startIdx; 1382 retVals[1] = idx; 1383 } 1384 return retVals; 1385 } 1386 1387 1388 @Override 1389 List charClasses() { 1390 return fSets; 1391 } 1392 1393 1394 1395 } 1396 1397 1398 /** 1399 * 1400 * Sentence Monkey Test Class 1401 * 1402 * 1403 * 1404 */ 1405 static class RBBISentenceMonkey extends RBBIMonkeyKind { 1406 List fSets; 1407 StringBuffer fText; 1408 1409 UnicodeSet fSepSet; 1410 UnicodeSet fFormatSet; 1411 UnicodeSet fSpSet; 1412 UnicodeSet fLowerSet; 1413 UnicodeSet fUpperSet; 1414 UnicodeSet fOLetterSet; 1415 UnicodeSet fNumericSet; 1416 UnicodeSet fATermSet; 1417 UnicodeSet fSContinueSet; 1418 UnicodeSet fSTermSet; 1419 UnicodeSet fCloseSet; 1420 UnicodeSet fOtherSet; 1421 UnicodeSet fExtendSet; 1422 1423 1424 1425 RBBISentenceMonkey() { 1426 fCharProperty = UProperty.SENTENCE_BREAK; 1427 1428 fSets = new ArrayList(); 1429 1430 // Separator Set Note: Beginning with Unicode 5.1, CR and LF were removed from the separator 1431 // set and made into character classes of their own. For the monkey impl, 1432 // they remain in SEP, since Sep always appears with CR and LF in the rules. 
1433 fSepSet = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]"); 1434 fFormatSet = new UnicodeSet("[\\p{Sentence_Break = Format}]"); 1435 fSpSet = new UnicodeSet("[\\p{Sentence_Break = Sp}]"); 1436 fLowerSet = new UnicodeSet("[\\p{Sentence_Break = Lower}]"); 1437 fUpperSet = new UnicodeSet("[\\p{Sentence_Break = Upper}]"); 1438 fOLetterSet = new UnicodeSet("[\\p{Sentence_Break = OLetter}]"); 1439 fNumericSet = new UnicodeSet("[\\p{Sentence_Break = Numeric}]"); 1440 fATermSet = new UnicodeSet("[\\p{Sentence_Break = ATerm}]"); 1441 fSContinueSet = new UnicodeSet("[\\p{Sentence_Break = SContinue}]"); 1442 fSTermSet = new UnicodeSet("[\\p{Sentence_Break = STerm}]"); 1443 fCloseSet = new UnicodeSet("[\\p{Sentence_Break = Close}]"); 1444 fExtendSet = new UnicodeSet("[\\p{Sentence_Break = Extend}]"); 1445 fOtherSet = new UnicodeSet(); 1446 1447 1448 fOtherSet.complement(); 1449 fOtherSet.removeAll(fSepSet); 1450 fOtherSet.removeAll(fFormatSet); 1451 fOtherSet.removeAll(fSpSet); 1452 fOtherSet.removeAll(fLowerSet); 1453 fOtherSet.removeAll(fUpperSet); 1454 fOtherSet.removeAll(fOLetterSet); 1455 fOtherSet.removeAll(fNumericSet); 1456 fOtherSet.removeAll(fATermSet); 1457 fOtherSet.removeAll(fSContinueSet); 1458 fOtherSet.removeAll(fSTermSet); 1459 fOtherSet.removeAll(fCloseSet); 1460 fOtherSet.removeAll(fExtendSet); 1461 1462 fSets.add(fSepSet); 1463 fSets.add(fFormatSet); 1464 1465 fSets.add(fSpSet); 1466 fSets.add(fLowerSet); 1467 fSets.add(fUpperSet); 1468 fSets.add(fOLetterSet); 1469 fSets.add(fNumericSet); 1470 fSets.add(fATermSet); 1471 fSets.add(fSContinueSet); 1472 fSets.add(fSTermSet); 1473 fSets.add(fCloseSet); 1474 fSets.add(fOtherSet); 1475 fSets.add(fExtendSet); 1476 } 1477 1478 1479 @Override 1480 List charClasses() { 1481 return fSets; 1482 } 1483 1484 @Override 1485 void setText(StringBuffer s) { 1486 fText = s; 1487 } 1488 1489 1490 // moveBack() Find the "significant" code point preceding the index i. 
        //  moveBack()   Find the "significant" code point preceding the index i.
        //      Skips over ($Extend | $Format)*
        //
        //  Returns -1 when i is already at (or before) the start of the text.
        //  Otherwise steps back one code point at a time until it reaches a code
        //  point that is neither Format nor Extend, or until index 0 is reached,
        //  and returns that index.
        //
        private int moveBack(int i) {

            if (i <= 0) {
                return -1;
            }

            int      c;
            int      j = i;
            do {
                j = moveIndex32(fText, j, -1);
                c = UTF16.charAt(fText, j);
            }
            while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
            return j;
        }


        //  moveForward()   Find the next "significant" code point following index i.
        //      Always advances by at least one code point, then keeps advancing
        //      while the code point reached is Format or Extend.
        //      Returns fText.length() when the end of the text is reached
        //      (cAt() reports -1 there, which ends the loop).
        int moveForward(int i) {
            if (i>=fText.length()) {
                return fText.length();
            }
            int   c;
            int   j = i;
            do {
                j = moveIndex32(fText, j, 1);
                c = cAt(j);
            }
            while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
            return j;

        }

        //  cAt()   Bounds-checked code point fetch.
        //      Returns -1 for any position outside of the text instead of throwing.
        int cAt(int pos) {
            if (pos<0 || pos>=fText.length()) {
                return -1;
            }
            return UTF16.charAt(fText, pos);
        }

        //  next()   Reference implementation of the sentence break rules.
        //      prevPos is the previous boundary; the return value is the index of the
        //      next boundary following it, or -1 if prevPos is already at or past the
        //      end of the text.
        //
        //      The loop slides a window of "significant" code points (c0, c1, c2, c3 at
        //      p1, p2, p3; p0 is not needed) over the text.  Each iteration tests for a
        //      boundary immediately before p2.  "continue" means "no break here, keep
        //      scanning"; "break" leaves the loop with p2 as the boundary.
        @Override
        int next(int prevPos) {
            int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
            //   break position being tested.  The candidate break
            //   location is before p2.
            int     breakPos = -1;

            int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
            int c;

            // Prev break at end of string.  return DONE.
            if (prevPos >= fText.length()) {
                return -1;
            }
            /*p0 =*/ p1 = p2 = p3 = prevPos;
            c3 = UTF16.charAt(fText, prevPos);
            c0 = c1 = c2 = 0;

            // Loop runs once per "significant" character position in the input text.
            for (;;) {
                // Move all of the positions forward in the input string.
                /*p0 = p1;*/  c0 = c1;
                p1 = p2;  c1 = c2;
                p2 = p3;  c2 = c3;

                // Advance p3 by  X(Extend | Format)*   Rule 4
                p3 = moveForward(p3);
                c3 = cAt(p3);

                // Rule (3) CR x LF
                //   The p2==(p1+1) test requires the LF to follow the CR directly,
                //   with no skipped Extend/Format characters in between.
                if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
                    continue;
                }

                // Rule (4)    Sep  <break>
                if (fSepSet.contains(c1)) {
                    p2 = p1+1;   // Separators don't combine with Extend or Format
                    break;
                }

                if (p2 >= fText.length()) {
                    // Reached end of string.  Always a break position.
                    break;
                }

                if (p2 == prevPos) {
                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
                    continue;
                }

                // Rule (6).   ATerm x Numeric
                if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
                    continue;
                }

                // Rule (7).  (Upper | Lower) ATerm  x  Upper
                if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
                        fATermSet.contains(c1) && fUpperSet.contains(c2)) {
                    continue;
                }

                // Rule (8)  ATerm Close* Sp*  x  (not (OLetter | Upper | Lower | Sep))* Lower
                //           Note:  Sterm | ATerm are added to the negated part of the expression by a
                //                  note to the Unicode 5.0 documents.
                //   First scan backwards from p1 over Sp* then Close* looking for the ATerm,
                //   then scan forwards from p2 over everything in the negated set.
                int p8 = p1;
                while (p8>0 && fSpSet.contains(cAt(p8))) {
                    p8 = moveBack(p8);
                }
                while (p8>0 && fCloseSet.contains(cAt(p8))) {
                    p8 = moveBack(p8);
                }
                if (fATermSet.contains(cAt(p8))) {
                    p8=p2;
                    for (;;) {
                        c = cAt(p8);
                        if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
                                fLowerSet.contains(c) || fSepSet.contains(c) ||
                                fATermSet.contains(c) || fSTermSet.contains(c))
                        {
                            break;
                        }
                        p8 = moveForward(p8);
                    }
                    if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
                        continue;
                    }
                }

                // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
                //   These backward scans have no p8>0 guard: moveBack() returns -1 at the
                //   start of the text, cAt(-1) is -1, and setContains() reports false for
                //   negative values, which ends the loops.
                if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
                    p8 = p1;
                    while (setContains(fSpSet, cAt(p8))) {
                        p8 = moveBack(p8);
                    }
                    while (setContains(fCloseSet, cAt(p8))) {
                        p8 = moveBack(p8);
                    }
                    c = cAt(p8);
                    if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
                        continue;
                    }
                }


                // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
                //   CR and LF need no separate test here; the constructor puts them in fSepSet.
                int p9 = p1;
                while (p9>0 && fCloseSet.contains(cAt(p9))) {
                    p9 = moveBack(p9);
                }
                c = cAt(p9);
                if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
                    if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
                        continue;
                    }
                }

                // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
                int p10 = p1;
                while (p10>0 && fSpSet.contains(cAt(p10))) {
                    p10 = moveBack(p10);
                }
                while (p10>0 && fCloseSet.contains(cAt(p10))) {
                    p10 = moveBack(p10);
                }
                if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
                    if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
                        continue;
                    }
                }

                // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
                //   At most one trailing separator is stepped over (an "if", not a loop)
                //   before scanning back over Sp* and Close*.
                int p11 = p1;
                if (p11>0 && fSepSet.contains(cAt(p11))) {
                    p11 = moveBack(p11);
                }
                while (p11>0 && fSpSet.contains(cAt(p11))) {
                    p11 = moveBack(p11);
                }
                while (p11>0 && fCloseSet.contains(cAt(p11))) {
                    p11 = moveBack(p11);
                }
                if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
                    break;
                }

                //  Rule (12)  Any x Any
                continue;
            }
            breakPos = p2;
            return breakPos;
        }



    }  // end of class RBBISentenceMonkey
1698 */ 1699 static int moveIndex32(StringBuffer s, int pos, int amt) { 1700 int i; 1701 char c; 1702 if (amt>0) { 1703 for (i=0; i<amt; i++) { 1704 if (pos >= s.length()) { 1705 return s.length(); 1706 } 1707 c = s.charAt(pos); 1708 pos++; 1709 if (UTF16.isLeadSurrogate(c) && pos < s.length()) { 1710 c = s.charAt(pos); 1711 if (UTF16.isTrailSurrogate(c)) { 1712 pos++; 1713 } 1714 } 1715 } 1716 } else { 1717 for (i=0; i>amt; i--) { 1718 if (pos <= 0) { 1719 return 0; 1720 } 1721 pos--; 1722 c = s.charAt(pos); 1723 if (UTF16.isTrailSurrogate(c) && pos >= 0) { 1724 c = s.charAt(pos); 1725 if (UTF16.isLeadSurrogate(c)) { 1726 pos--; 1727 } 1728 } 1729 } 1730 } 1731 return pos; 1732 } 1733 1734 /** 1735 * No-exceptions form of UnicodeSet.contains(c). 1736 * Simplifies loops that terminate with an end-of-input character value. 1737 * @param s A unicode set 1738 * @param c A code point value 1739 * @return true if the set contains c. 1740 */ 1741 static boolean setContains(UnicodeSet s, int c) { 1742 if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) { 1743 return false; 1744 } 1745 return s.contains(c); 1746 } 1747 1748 1749 /** 1750 * return the index of the next code point in the input text. 1751 * @param i the preceding index 1752 */ 1753 static int nextCP(StringBuffer s, int i) { 1754 if (i == -1) { 1755 // End of Input indication. Continue to return end value. 1756 return -1; 1757 } 1758 int retVal = i + 1; 1759 if (retVal > s.length()) { 1760 return -1; 1761 } 1762 int c = UTF16.charAt(s, i); 1763 if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) { 1764 retVal++; 1765 } 1766 return retVal; 1767 } 1768 1769 1770 /** 1771 * random number generator. Not using Java's built-in Randoms for two reasons: 1772 * 1. Using this code allows obtaining the same sequences as those from the ICU4C monkey test. 1773 * 2. We need to get and restore the seed from values occurring in the middle 1774 * of a long sequence, to more easily reproduce failing cases. 
1775 */ 1776 private static int m_seed = 1; 1777 private static int m_rand() 1778 { 1779 m_seed = m_seed * 1103515245 + 12345; 1780 return (m_seed >>> 16) % 32768; 1781 } 1782 1783 // Helper function for formatting error output. 1784 // Append a string into a fixed-size field in a StringBuffer. 1785 // Blank-pad the string if it is shorter than the field. 1786 // Truncate the source string if it is too long. 1787 // 1788 private static void appendToBuf(StringBuffer dest, String src, int fieldLen) { 1789 int appendLen = src.length(); 1790 if (appendLen >= fieldLen) { 1791 dest.append(src.substring(0, fieldLen)); 1792 } else { 1793 dest.append(src); 1794 while (appendLen < fieldLen) { 1795 dest.append(' '); 1796 appendLen++; 1797 } 1798 } 1799 } 1800 1801 // Helper function for formatting error output. 1802 // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format 1803 private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) { 1804 String hexChars = "0123456789abcdef"; 1805 if (c < 0x10000) { 1806 dest.append("\\u"); 1807 for (int bn=12; bn>=0; bn-=4) { 1808 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1809 } 1810 appendToBuf(dest, " ", fieldLen-6); 1811 } else { 1812 dest.append("\\U"); 1813 for (int bn=28; bn>=0; bn-=4) { 1814 dest.append(hexChars.charAt(((c)>>bn)&0xf)); 1815 } 1816 appendToBuf(dest, " ", fieldLen-10); 1817 1818 } 1819 } 1820 1821 /** 1822 * Run a RBBI monkey test. Common routine, for all break iterator types. 1823 * Parameters: 1824 * bi - the break iterator to use 1825 * mk - MonkeyKind, abstraction for obtaining expected results 1826 * name - Name of test (char, word, etc.) 
for use in error messages 1827 * seed - Seed for starting random number generator (parameter from user) 1828 * numIterations 1829 */ 1830 void RunMonkey(BreakIterator bi, RBBIMonkeyKind mk, String name, int seed, int numIterations) { 1831 int TESTSTRINGLEN = 500; 1832 StringBuffer testText = new StringBuffer(); 1833 int numCharClasses; 1834 List chClasses; 1835 int[] expected = new int[TESTSTRINGLEN*2 + 1]; 1836 int expectedCount = 0; 1837 boolean[] expectedBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1838 boolean[] forwardBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1839 boolean[] reverseBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1840 boolean[] isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1841 boolean[] followingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1842 boolean[] precedingBreaks = new boolean[TESTSTRINGLEN*2 + 1]; 1843 int i; 1844 int loopCount = 0; 1845 boolean printTestData = false; 1846 boolean printBreaksFromBI = false; 1847 1848 m_seed = seed; 1849 1850 numCharClasses = mk.charClasses().size(); 1851 chClasses = mk.charClasses(); 1852 1853 // Verify that the character classes all have at least one member. 1854 for (i=0; i<numCharClasses; i++) { 1855 UnicodeSet s = (UnicodeSet)chClasses.get(i); 1856 if (s == null || s.size() == 0) { 1857 errln("Character Class " + i + " is null or of zero size."); 1858 return; 1859 } 1860 } 1861 1862 //-------------------------------------------------------------------------------------------- 1863 // 1864 // Debugging settings. Comment out everything in the following block for normal operation 1865 // 1866 //-------------------------------------------------------------------------------------------- 1867 // numIterations = -1; 1868 // numIterations = 10000; // Same as exhaustive. 
1869 // RuleBasedBreakIterator_New.fTrace = true; 1870 // m_seed = 859056465; 1871 // TESTSTRINGLEN = 50; 1872 // printTestData = true; 1873 // printBreaksFromBI = true; 1874 // ((RuleBasedBreakIterator_New)bi).dump(); 1875 1876 //-------------------------------------------------------------------------------------------- 1877 // 1878 // End of Debugging settings. 1879 // 1880 //-------------------------------------------------------------------------------------------- 1881 1882 int dotsOnLine = 0; 1883 while (loopCount < numIterations || numIterations == -1) { 1884 if (numIterations == -1 && loopCount % 10 == 0) { 1885 // If test is running in an infinite loop, display a periodic tic so 1886 // we can tell that it is making progress. 1887 System.out.print("."); 1888 if (dotsOnLine++ >= 80){ 1889 System.out.println(); 1890 dotsOnLine = 0; 1891 } 1892 } 1893 // Save current random number seed, so that we can recreate the random numbers 1894 // for this loop iteration in event of an error. 1895 seed = m_seed; 1896 1897 testText.setLength(0); 1898 // Populate a test string with data. 1899 if (printTestData) { 1900 System.out.println("Test Data string ..."); 1901 } 1902 for (i=0; i<TESTSTRINGLEN; i++) { 1903 int aClassNum = m_rand() % numCharClasses; 1904 UnicodeSet classSet = (UnicodeSet)chClasses.get(aClassNum); 1905 int charIdx = m_rand() % classSet.size(); 1906 int c = classSet.charAt(charIdx); 1907 if (c < 0) { // TODO: deal with sets containing strings. 
1908 errln("c < 0"); 1909 } 1910 UTF16.appendCodePoint(testText, c); 1911 if (printTestData) { 1912 System.out.print(Integer.toHexString(c) + " "); 1913 } 1914 } 1915 if (printTestData) { 1916 System.out.println(); 1917 } 1918 1919 Arrays.fill(expected, 0); 1920 Arrays.fill(expectedBreaks, false); 1921 Arrays.fill(forwardBreaks, false); 1922 Arrays.fill(reverseBreaks, false); 1923 Arrays.fill(isBoundaryBreaks, false); 1924 Arrays.fill(followingBreaks, false); 1925 Arrays.fill(precedingBreaks, false); 1926 1927 // Calculate the expected results for this test string. 1928 mk.setText(testText); 1929 expectedCount = 0; 1930 expectedBreaks[0] = true; 1931 expected[expectedCount ++] = 0; 1932 int breakPos = 0; 1933 int lastBreakPos = -1; 1934 for (;;) { 1935 lastBreakPos = breakPos; 1936 breakPos = mk.next(breakPos); 1937 if (breakPos == -1) { 1938 break; 1939 } 1940 if (breakPos > testText.length()) { 1941 errln("breakPos > testText.length()"); 1942 } 1943 if (lastBreakPos >= breakPos) { 1944 errln("Next() not increasing."); 1945 // break; 1946 } 1947 expectedBreaks[breakPos] = true; 1948 expected[expectedCount ++] = breakPos; 1949 } 1950 1951 // Find the break positions using forward iteration 1952 if (printBreaksFromBI) { 1953 System.out.println("Breaks from BI..."); 1954 } 1955 bi.setText(testText.toString()); 1956 for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) { 1957 if (i < 0 || i > testText.length()) { 1958 errln(name + " break monkey test: Out of range value returned by breakIterator::next()"); 1959 break; 1960 } 1961 if (printBreaksFromBI) { 1962 System.out.print(Integer.toHexString(i) + " "); 1963 } 1964 forwardBreaks[i] = true; 1965 } 1966 if (printBreaksFromBI) { 1967 System.out.println(); 1968 } 1969 1970 // Find the break positions using reverse iteration 1971 for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) { 1972 if (i < 0 || i > testText.length()) { 1973 errln(name + " break monkey test: Out of range value returned by 
breakIterator.next()" + name); 1974 break; 1975 } 1976 reverseBreaks[i] = true; 1977 } 1978 1979 // Find the break positions using isBoundary() tests. 1980 for (i=0; i<=testText.length(); i++) { 1981 isBoundaryBreaks[i] = bi.isBoundary(i); 1982 } 1983 1984 // Find the break positions using the following() function. 1985 lastBreakPos = 0; 1986 followingBreaks[0] = true; 1987 for (i=0; i<testText.length(); i++) { 1988 breakPos = bi.following(i); 1989 if (breakPos <= i || 1990 breakPos < lastBreakPos || 1991 breakPos > testText.length() || 1992 breakPos > lastBreakPos && lastBreakPos > i ) { 1993 errln(name + " break monkey test: " + 1994 "Out of range value returned by BreakIterator::following().\n" + 1995 "index=" + i + "following returned=" + breakPos + 1996 "lastBreak=" + lastBreakPos); 1997 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 1998 } else { 1999 followingBreaks[breakPos] = true; 2000 lastBreakPos = breakPos; 2001 } 2002 } 2003 2004 // Find the break positions using the preceding() function. 2005 lastBreakPos = testText.length(); 2006 precedingBreaks[testText.length()] = true; 2007 for (i=testText.length(); i>0; i--) { 2008 breakPos = bi.preceding(i); 2009 if (breakPos >= i || 2010 breakPos > lastBreakPos || 2011 breakPos < 0 || 2012 breakPos < lastBreakPos && lastBreakPos < i ) { 2013 errln(name + " break monkey test: " + 2014 "Out of range value returned by BreakIterator::preceding().\n" + 2015 "index=" + i + "preceding returned=" + breakPos + 2016 "lastBreak=" + lastBreakPos); 2017 precedingBreaks[i] = !expectedBreaks[i]; // Forces an error. 2018 } else { 2019 precedingBreaks[breakPos] = true; 2020 lastBreakPos = breakPos; 2021 } 2022 } 2023 2024 2025 2026 // Compare the expected and actual results. 
2027 for (i=0; i<=testText.length(); i++) { 2028 String errorType = null; 2029 if (forwardBreaks[i] != expectedBreaks[i]) { 2030 errorType = "next()"; 2031 } else if (reverseBreaks[i] != forwardBreaks[i]) { 2032 errorType = "previous()"; 2033 } else if (isBoundaryBreaks[i] != expectedBreaks[i]) { 2034 errorType = "isBoundary()"; 2035 } else if (followingBreaks[i] != expectedBreaks[i]) { 2036 errorType = "following()"; 2037 } else if (precedingBreaks[i] != expectedBreaks[i]) { 2038 errorType = "preceding()"; 2039 } 2040 2041 if (errorType != null) { 2042 // Format a range of the test text that includes the failure as 2043 // a data item that can be included in the rbbi test data file. 2044 2045 // Start of the range is the last point where expected and actual results 2046 // both agreed that there was a break position. 2047 int startContext = i; 2048 int count = 0; 2049 for (;;) { 2050 if (startContext==0) { break; } 2051 startContext --; 2052 if (expectedBreaks[startContext]) { 2053 if (count == 2) break; 2054 count ++; 2055 } 2056 } 2057 2058 // End of range is two expected breaks past the start position. 2059 int endContext = i + 1; 2060 int ci; 2061 for (ci=0; ci<2; ci++) { // Number of items to include in error text. 2062 for (;;) { 2063 if (endContext >= testText.length()) {break;} 2064 if (expectedBreaks[endContext-1]) { 2065 if (count == 0) break; 2066 count --; 2067 } 2068 endContext ++; 2069 } 2070 } 2071 2072 // Format looks like "<data><>\uabcd\uabcd<>\U0001abcd...</data>" 2073 StringBuffer errorText = new StringBuffer(); 2074 2075 int c; // Char from test data 2076 for (ci = startContext; ci <= endContext && ci != -1; ci = nextCP(testText, ci)) { 2077 if (ci == i) { 2078 // This is the location of the error. 2079 errorText.append("<?>---------------------------------\n"); 2080 } else if (expectedBreaks[ci]) { 2081 // This a non-error expected break position. 
2082 errorText.append("------------------------------------\n"); 2083 } 2084 if (ci < testText.length()) { 2085 c = UTF16.charAt(testText, ci); 2086 appendCharToBuf(errorText, c, 11); 2087 String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT); 2088 appendToBuf(errorText, gc, 8); 2089 int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty); 2090 String extraPropValue = 2091 UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG); 2092 appendToBuf(errorText, extraPropValue, 20); 2093 2094 String charName = UCharacter.getExtendedName(c); 2095 appendToBuf(errorText, charName, 40); 2096 errorText.append('\n'); 2097 } 2098 } 2099 if (ci == testText.length() && ci != -1) { 2100 errorText.append("<>"); 2101 } 2102 errorText.append("</data>\n"); 2103 2104 // Output the error 2105 errln(name + " break monkey test error. " + 2106 (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") + 2107 "\nOperation = " + errorType + "; random seed = " + seed + "; buf Idx = " + i + "\n" + 2108 errorText); 2109 break; 2110 } 2111 } 2112 2113 loopCount++; 2114 } 2115 } 2116 2117 @Test 2118 public void TestCharMonkey() { 2119 2120 int loopCount = 500; 2121 int seed = 1; 2122 2123 if (TestFmwk.getExhaustiveness() >= 9) { 2124 loopCount = 10000; 2125 } 2126 2127 RBBICharMonkey m = new RBBICharMonkey(); 2128 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2129 RunMonkey(bi, m, "char", seed, loopCount); 2130 } 2131 2132 @Test 2133 public void TestWordMonkey() { 2134 2135 int loopCount = 500; 2136 int seed = 1; 2137 2138 if (TestFmwk.getExhaustiveness() >= 9) { 2139 loopCount = 10000; 2140 } 2141 2142 logln("Word Break Monkey Test"); 2143 RBBIWordMonkey m = new RBBIWordMonkey(); 2144 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2145 RunMonkey(bi, m, "word", seed, loopCount); 2146 } 2147 2148 @Test 2149 public void 
TestLineMonkey() { 2150 int loopCount = 500; 2151 int seed = 1; 2152 2153 if (TestFmwk.getExhaustiveness() >= 9) { 2154 loopCount = 10000; 2155 } 2156 2157 logln("Line Break Monkey Test"); 2158 RBBILineMonkey m = new RBBILineMonkey(); 2159 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2160 RunMonkey(bi, m, "line", seed, loopCount); 2161 } 2162 2163 @Test 2164 public void TestSentMonkey() { 2165 2166 int loopCount = 500; 2167 int seed = 1; 2168 2169 if (TestFmwk.getExhaustiveness() >= 9) { 2170 loopCount = 3000; 2171 } 2172 2173 logln("Sentence Break Monkey Test"); 2174 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2175 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2176 RunMonkey(bi, m, "sent", seed, loopCount); 2177 } 2178 // 2179 // Round-trip monkey tests. 2180 // Verify that break iterators created from the rule source from the default 2181 // break iterators still pass the monkey test for the iterator type. 2182 // 2183 // This is a major test for the Rule Compiler. The default break iterators are built 2184 // from pre-compiled binary rule data that was created using ICU4C; these 2185 // round-trip rule recompile tests verify that the Java rule compiler can 2186 // rebuild break iterators from the original source rules. 
2187 // 2188 @Test 2189 public void TestRTCharMonkey() { 2190 2191 int loopCount = 200; 2192 int seed = 1; 2193 2194 if (TestFmwk.getExhaustiveness() >= 9) { 2195 loopCount = 2000; 2196 } 2197 2198 RBBICharMonkey m = new RBBICharMonkey(); 2199 BreakIterator bi = BreakIterator.getCharacterInstance(Locale.US); 2200 String rules = bi.toString(); 2201 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2202 RunMonkey(rtbi, m, "char", seed, loopCount); 2203 } 2204 2205 @Test 2206 public void TestRTWordMonkey() { 2207 2208 int loopCount = 200; 2209 int seed = 1; 2210 2211 if (TestFmwk.getExhaustiveness() >= 9) { 2212 loopCount = 2000; 2213 } 2214 logln("Word Break Monkey Test"); 2215 RBBIWordMonkey m = new RBBIWordMonkey(); 2216 BreakIterator bi = BreakIterator.getWordInstance(Locale.US); 2217 String rules = bi.toString(); 2218 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2219 RunMonkey(rtbi, m, "word", seed, loopCount); 2220 } 2221 2222 @Test 2223 public void TestRTLineMonkey() { 2224 int loopCount = 200; 2225 int seed = 1; 2226 2227 if (TestFmwk.getExhaustiveness() >= 9) { 2228 loopCount = 2000; 2229 } 2230 2231 logln("Line Break Monkey Test"); 2232 RBBILineMonkey m = new RBBILineMonkey(); 2233 BreakIterator bi = BreakIterator.getLineInstance(Locale.US); 2234 String rules = bi.toString(); 2235 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2236 RunMonkey(rtbi, m, "line", seed, loopCount); 2237 } 2238 2239 @Test 2240 public void TestRTSentMonkey() { 2241 2242 int loopCount = 200; 2243 int seed = 1; 2244 2245 if (TestFmwk.getExhaustiveness() >= 9) { 2246 loopCount = 1000; 2247 } 2248 2249 logln("Sentence Break Monkey Test"); 2250 RBBISentenceMonkey m = new RBBISentenceMonkey(); 2251 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US); 2252 String rules = bi.toString(); 2253 BreakIterator rtbi = new RuleBasedBreakIterator(rules); 2254 RunMonkey(rtbi, m, "sent", seed, loopCount); 2255 } 2256} 2257 2258