1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 2003-2016 International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9package com.ibm.icu.dev.test.rbbi;
10
11
12// Monkey testing of RuleBasedBreakIterator
13import java.util.ArrayList;
14import java.util.Arrays;
15import java.util.List;
16import java.util.Locale;
17
18import org.junit.Test;
19
20import com.ibm.icu.dev.test.TestFmwk;
21import com.ibm.icu.lang.UCharacter;
22import com.ibm.icu.lang.UProperty;
23import com.ibm.icu.text.BreakIterator;
24import com.ibm.icu.text.RuleBasedBreakIterator;
25import com.ibm.icu.text.UTF16;
26import com.ibm.icu.text.UnicodeSet;
27
28
29/**
30 * Monkey tests for RBBI.  These tests have independent implementations of
31 * the Unicode TR boundary rules, and compare results between these and ICU's
32 * implementation, using random data.
33 *
34 * Tests cover Grapheme Cluster (char), Word and Line breaks
35 *
36 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
37 *
38 */
39public class RBBITestMonkey extends TestFmwk {
40    //
41    //     class RBBIMonkeyKind
42    //
43    //        Monkey Test for Break Iteration
44    //        Abstract interface class.   Concrete derived classes independently
45    //        implement the break rules for different iterator types.
46    //
47    //        The Monkey Test itself uses doesn't know which type of break iterator it is
48    //        testing, but works purely in terms of the interface defined here.
49    //
50    abstract static class RBBIMonkeyKind {
51
52        // Return a List of UnicodeSets, representing the character classes used
53        //   for this type of iterator.
54        abstract  List  charClasses();
55
56        // Set the test text on which subsequent calls to next() will operate
57        abstract  void   setText(StringBuffer text);
58
59        // Find the next break position, starting from the specified position.
60        // Return -1 after reaching end of string.
61        abstract   int   next(int i);
62
63        // A Character Property, one of the constants defined in class UProperty.
64        //   The value of this property will be displayed for the characters
65        //    near any test failure.
66        int   fCharProperty;
67    }
68
    //
    // UnicodeSet pattern listing the Extended Pictographic characters.
    // Data scraped from CLDR common/properties/ExtendedPictographic.txt, r12773.
    //
    // The pattern is handed to the UnicodeSet constructor by the character and word
    // monkey classes below to build their fExtendedPictSet. Backslashes are doubled
    // so that the \\u and \\U escapes reach the UnicodeSet pattern parser unchanged
    // instead of being consumed by the Java compiler.
    //
    static String gExtended_Pict = "[" +
            "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" +
            "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
            "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" +
            "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" +
            "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" +
            "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" +
            "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" +
            "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" +
            "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
            "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" +
            "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" +
            "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" +
            "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" +
            "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" +
            "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" +
            "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" +
            "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" +
            "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" +
            "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" +
            "]";
93
94
95    /**
96     * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
97     * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
98     */
99    static class RBBICharMonkey extends RBBIMonkeyKind {
100        List                      fSets;
101
102        UnicodeSet                fCRLFSet;
103        UnicodeSet                fControlSet;
104        UnicodeSet                fExtendSet;
105        UnicodeSet                fRegionalIndicatorSet;
106        UnicodeSet                fPrependSet;
107        UnicodeSet                fSpacingSet;
108        UnicodeSet                fLSet;
109        UnicodeSet                fVSet;
110        UnicodeSet                fTSet;
111        UnicodeSet                fLVSet;
112        UnicodeSet                fLVTSet;
113        UnicodeSet                fHangulSet;
114        UnicodeSet                fEmojiModifierSet;
115        UnicodeSet                fEmojiBaseSet;
116        UnicodeSet                fZWJSet;
117        UnicodeSet                fExtendedPictSet;
118        UnicodeSet                fEBGSet;
119        UnicodeSet                fEmojiNRKSet;
120        UnicodeSet                fAnySet;
121
122
123        StringBuffer              fText;
124
125
126        RBBICharMonkey() {
127            fText       = null;
128            fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
129            fCRLFSet    = new UnicodeSet("[\\r\\n]");
130            fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
131            fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
132            fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
133            fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
134            fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
135            fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
136            fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
137            fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
138            fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
139            fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
140            fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
141            fHangulSet  = new UnicodeSet();
142            fHangulSet.addAll(fLSet);
143            fHangulSet.addAll(fVSet);
144            fHangulSet.addAll(fTSet);
145            fHangulSet.addAll(fLVSet);
146            fHangulSet.addAll(fLVTSet);
147
148            fEmojiBaseSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
149            fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]");
150            fExtendedPictSet  = new UnicodeSet(gExtended_Pict);
151            fEBGSet           = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]");
152            fEmojiNRKSet      = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");
153            fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
154
155
156            fSets       = new ArrayList();
157            fSets.add(fCRLFSet);
158            fSets.add(fControlSet);
159            fSets.add(fExtendSet);
160            fSets.add(fRegionalIndicatorSet);
161            if (!fPrependSet.isEmpty()) {
162                fSets.add(fPrependSet);
163            }
164            fSets.add(fSpacingSet);
165            fSets.add(fHangulSet);
166            fSets.add(fAnySet);
167            fSets.add(fEmojiBaseSet);
168            fSets.add(fEmojiModifierSet);
169            fSets.add(fZWJSet);
170            fSets.add(fExtendedPictSet);
171            fSets.add(fEBGSet);
172            fSets.add(fEmojiNRKSet);
173        }
174
175
176        @Override
177        void setText(StringBuffer s) {
178            fText = s;
179        }
180
181        @Override
182        List charClasses() {
183            return fSets;
184        }
185
186        @Override
187        int next(int prevPos) {
188            int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
189            //   break position being tested.  The candidate break
190            //   location is before p2.
191
192            int     breakPos = -1;
193
194            int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
195            int   cBase;              // for (X Extend*) patterns, the X character.
196
197            // Previous break at end of string.  return DONE.
198            if (prevPos >= fText.length()) {
199                return -1;
200            }
201            /* p0 = */ p1 = p2 = p3 = prevPos;
202            c3 =  UTF16.charAt(fText, prevPos);
203            c0 = c1 = c2 = cBase = 0;
204
205            // Loop runs once per "significant" character position in the input text.
206            for (;;) {
207                // Move all of the positions forward in the input string.
208                /* p0 = p1;*/  c0 = c1;
209                p1 = p2;  c1 = c2;
210                p2 = p3;  c2 = c3;
211
212                // Advance p3 by one codepoint
213                p3 = moveIndex32(fText, p3, 1);
214                c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
215
216                if (p1 == p2) {
217                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
218                    continue;
219                }
220                if (p2 == fText.length()) {
221                    // Reached end of string.  Always a break position.
222                    break;
223                }
224
225                // Rule  GB3   CR x LF
226                //     No Extend or Format characters may appear between the CR and LF,
227                //     which requires the additional check for p2 immediately following p1.
228                //
229                if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
230                    continue;
231                }
232
233                // Rule (GB4).   ( Control | CR | LF ) <break>
234                if (fControlSet.contains(c1) ||
235                        c1 == 0x0D ||
236                        c1 == 0x0A)  {
237                    break;
238                }
239
240                // Rule (GB5)    <break>  ( Control | CR | LF )
241                //
242                if (fControlSet.contains(c2) ||
243                        c2 == 0x0D ||
244                        c2 == 0x0A)  {
245                    break;
246                }
247
248
249                // Rule (GB6)  L x ( L | V | LV | LVT )
250                if (fLSet.contains(c1) &&
251                        (fLSet.contains(c2)  ||
252                                fVSet.contains(c2)  ||
253                                fLVSet.contains(c2) ||
254                                fLVTSet.contains(c2))) {
255                    continue;
256                }
257
258                // Rule (GB7)    ( LV | V )  x  ( V | T )
259                if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
260                        (fVSet.contains(c2) || fTSet.contains(c2)))  {
261                    continue;
262                }
263
264                // Rule (GB8)    ( LVT | T)  x T
265                if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
266                        fTSet.contains(c2))  {
267                    continue;
268                }
269
270                // Rule (GB9)    x (Extend | ZWJ)
271                if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
272                    if (!fExtendSet.contains(c1)) {
273                        cBase = c1;
274                    }
275                    continue;
276                }
277
278                // Rule (GB9a)   x  SpacingMark
279                if (fSpacingSet.contains(c2)) {
280                    continue;
281                }
282
283                // Rule (GB9b)   Prepend x
284                if (fPrependSet.contains(c1)) {
285                    continue;
286                }
287                // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
288                if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
289                    continue;
290                }
291                if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
292                        fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
293                    continue;
294                }
295
296                // Rule (GB11)   (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji)
297                if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) &&
298                        (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
299                    continue;
300                }
301
302                // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
303                //                  Note: The first if condition is a little tricky. We only need to force
304                //                      a break if there are three or more contiguous RIs. If there are
305                //                      only two, a break following will occur via other rules, and will include
306                //                      any trailing extend characters, which is needed behavior.
307                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
308                        && fRegionalIndicatorSet.contains(c2)) {
309                    break;
310                }
311                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
312                    continue;
313                }
314
315                // Rule (GB999)  Any  <break>  Any
316                break;
317            }
318
319            breakPos = p2;
320            return breakPos;
321        }
322    }
323
324
325    /**
326     *
327     * Word Monkey Test Class
328     *
329     *
330     *
331     */
332    static class RBBIWordMonkey extends RBBIMonkeyKind {
333        List                      fSets;
334        StringBuffer              fText;
335
336        UnicodeSet                fCRSet;
337        UnicodeSet                fLFSet;
338        UnicodeSet                fNewlineSet;
339        UnicodeSet                fRegionalIndicatorSet;
340        UnicodeSet                fKatakanaSet;
341        UnicodeSet                fHebrew_LetterSet;
342        UnicodeSet                fALetterSet;
343        UnicodeSet                fSingle_QuoteSet;
344        UnicodeSet                fDouble_QuoteSet;
345        UnicodeSet                fMidNumLetSet;
346        UnicodeSet                fMidLetterSet;
347        UnicodeSet                fMidNumSet;
348        UnicodeSet                fNumericSet;
349        UnicodeSet                fFormatSet;
350        UnicodeSet                fExtendSet;
351        UnicodeSet                fExtendNumLetSet;
352        UnicodeSet                fOtherSet;
353        UnicodeSet                fDictionarySet;
354        UnicodeSet                fEBaseSet;
355        UnicodeSet                fEBGSet;
356        UnicodeSet                fEModifierSet;
357        UnicodeSet                fZWJSet;
358        UnicodeSet                fExtendedPictSet;
359        UnicodeSet                fEmojiNRKSet;
360
361
362        RBBIWordMonkey() {
363            fCharProperty    = UProperty.WORD_BREAK;
364
365            fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
366            fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
367            fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
368            fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
369            fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
370            fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
371            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
372            fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
373            fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
374            fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
375            fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
376            fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
377            fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
378            fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
379            fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
380            fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
381            fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
382            fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
383            fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
384            fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
385            fExtendedPictSet = new UnicodeSet(gExtended_Pict);
386            fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");
387
388            fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
389            fDictionarySet.addAll(fKatakanaSet);
390            fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
391
392            fALetterSet.removeAll(fDictionarySet);
393
394            fOtherSet        = new UnicodeSet();
395            fOtherSet.complement();
396            fOtherSet.removeAll(fCRSet);
397            fOtherSet.removeAll(fLFSet);
398            fOtherSet.removeAll(fNewlineSet);
399            fOtherSet.removeAll(fALetterSet);
400            fOtherSet.removeAll(fSingle_QuoteSet);
401            fOtherSet.removeAll(fDouble_QuoteSet);
402            fOtherSet.removeAll(fKatakanaSet);
403            fOtherSet.removeAll(fHebrew_LetterSet);
404            fOtherSet.removeAll(fMidLetterSet);
405            fOtherSet.removeAll(fMidNumSet);
406            fOtherSet.removeAll(fNumericSet);
407            fOtherSet.removeAll(fFormatSet);
408            fOtherSet.removeAll(fExtendSet);
409            fOtherSet.removeAll(fExtendNumLetSet);
410            fOtherSet.removeAll(fRegionalIndicatorSet);
411            fOtherSet.removeAll(fEBaseSet);
412            fOtherSet.removeAll(fEBGSet);
413            fOtherSet.removeAll(fEModifierSet);
414            fOtherSet.removeAll(fZWJSet);
415            fOtherSet.removeAll(fExtendedPictSet);
416            fOtherSet.removeAll(fEmojiNRKSet);
417
418            // Inhibit dictionary characters from being tested at all.
419            // remove surrogates so as to not generate higher CJK characters
420            fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
421            fOtherSet.removeAll(fDictionarySet);
422
423            fSets            = new ArrayList();
424            fSets.add(fCRSet);
425            fSets.add(fLFSet);
426            fSets.add(fNewlineSet);
427            fSets.add(fRegionalIndicatorSet);
428            fSets.add(fHebrew_LetterSet);
429            fSets.add(fALetterSet);
430            //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
431            // from the test data. They are all in the dictionary set,
432            // which this (old, to be retired) monkey test cannot handle.
433            fSets.add(fSingle_QuoteSet);
434            fSets.add(fDouble_QuoteSet);
435            fSets.add(fMidLetterSet);
436            fSets.add(fMidNumLetSet);
437            fSets.add(fMidNumSet);
438            fSets.add(fNumericSet);
439            fSets.add(fFormatSet);
440            fSets.add(fExtendSet);
441            fSets.add(fExtendNumLetSet);
442            fSets.add(fRegionalIndicatorSet);
443            fSets.add(fEBaseSet);
444            fSets.add(fEBGSet);
445            fSets.add(fEModifierSet);
446            fSets.add(fZWJSet);
447            fSets.add(fExtendedPictSet);
448            fSets.add(fEmojiNRKSet);
449            fSets.add(fOtherSet);
450        }
451
452
453        @Override
454        List  charClasses() {
455            return fSets;
456        }
457
458        @Override
459        void   setText(StringBuffer s) {
460            fText = s;
461        }
462
463        @Override
464        int   next(int prevPos) {
465            int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
466            //   break position being tested.  The candidate break
467            //   location is before p2.
468            int     breakPos = -1;
469
470            int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
471
472            // Previous break at end of string.  return DONE.
473            if (prevPos >= fText.length()) {
474                return -1;
475            }
476            /*p0 =*/ p1 = p2 = p3 = prevPos;
477            c3 = UTF16.charAt(fText, prevPos);
478            c0 = c1 = c2 = 0;
479
480
481
482            // Loop runs once per "significant" character position in the input text.
483            for (;;) {
484                // Move all of the positions forward in the input string.
485                /*p0 = p1;*/  c0 = c1;
486                p1 = p2;  c1 = c2;
487                p2 = p3;  c2 = c3;
488
489                // Advance p3 by    X(Extend | Format)*   Rule 4
490                //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
491                do {
492                    p3 = moveIndex32(fText, p3, 1);
493                    c3 = -1;
494                    if (p3>=fText.length()) {
495                        break;
496                    }
497                    c3 = UTF16.charAt(fText, p3);
498                    if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
499                        break;
500                    }
501                }
502                while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
503
504                if (p1 == p2) {
505                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
506                    continue;
507                }
508                if (p2 == fText.length()) {
509                    // Reached end of string.  Always a break position.
510                    break;
511                }
512
513                // Rule (3)   CR x LF
514                //     No Extend or Format characters may appear between the CR and LF,
515                //     which requires the additional check for p2 immediately following p1.
516                //
517                if (c1==0x0D && c2==0x0A) {
518                    continue;
519                }
520
521                // Rule (3a)  Break before and after newlines (including CR and LF)
522                //
523                if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
524                    break;
525                }
526                if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
527                    break;
528                }
529
530                // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
531                //              Not ignoring extend chars, so peek into input text to
532                //              get the potential ZWJ, the character immediately preceding c2.
533                if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
534                    continue;
535                }
536
537                // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
538                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
539                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
540                    continue;
541                }
542
543                // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
544                //
545                if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
546                        (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
547                        (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
548                    continue;
549                }
550
551                // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
552                if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
553                        (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
554                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
555                    continue;
556                }
557
558                // Rule (7a)     Hebrew_Letter x Single_Quote
559                if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
560                    continue;
561                }
562
563                // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
564                if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
565                    continue;
566                }
567
568                // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
569                if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
570                    continue;
571                }
572
573                //  Rule (8)    Numeric x Numeric
574                if (fNumericSet.contains(c1) &&
575                        fNumericSet.contains(c2))  {
576                    continue;
577                }
578
579                // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
580                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
581                        fNumericSet.contains(c2))  {
582                    continue;
583                }
584
585                // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
586                if (fNumericSet.contains(c1) &&
587                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
588                    continue;
589                }
590
591                // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
592                if (fNumericSet.contains(c0) &&
593                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
594                        fNumericSet.contains(c2)) {
595                    continue;
596                }
597
598                // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
599                if (fNumericSet.contains(c1) &&
600                        (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
601                        setContains(fNumericSet, c3)) {
602                    continue;
603                }
604
605                // Rule (13)  Katakana x Katakana
606                //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
607                //                  all Katakana are handled by the dictionary breaker.
608                if (fKatakanaSet.contains(c1) &&
609                        fKatakanaSet.contains(c2))  {
610                    continue;
611                }
612
613                // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
614                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
615                        fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
616                        fExtendNumLetSet.contains(c2)) {
617                    continue;
618                }
619
620                // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
621                if (fExtendNumLetSet.contains(c1) &&
622                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
623                                fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
624                    continue;
625                }
626
627
628                // Rule 14 (E_Base | EBG) x E_Modifier
629                if ((fEBaseSet.contains(c1)  || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
630                    continue;
631                }
632
633                // Rule 15 - 17   Group piars of Regional Indicators
634                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
635                    break;
636                }
637                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
638                    continue;
639                }
640
641                // Rule 999.  Break found here.
642                break;
643            }
644
645            breakPos = p2;
646            return breakPos;
647        }
648
649    }
650
651
652    static class RBBILineMonkey extends RBBIMonkeyKind {
653
        // Character class sets, in the order the constructor adds them.
        // This is the list handed back by charClasses().
        List        fSets;

        // UnicodeSets for each of the Line Breaking character classes.
        // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
        // to verify that they are all accounted for.
        //
        // Several of these are widened in the constructor after they are built:
        //   fAL also receives XX, AI and SG; fNS also receives CJ; fCM also receives ZWJ.

        UnicodeSet  fBK;
        UnicodeSet  fCR;
        UnicodeSet  fLF;
        UnicodeSet  fCM;
        UnicodeSet  fNL;
        UnicodeSet  fSG;     // Built from the explicit range U+D800..U+DFFF, not from a property.
        UnicodeSet  fWJ;
        UnicodeSet  fZW;
        UnicodeSet  fGL;
        UnicodeSet  fSP;
        UnicodeSet  fB2;
        UnicodeSet  fBA;
        UnicodeSet  fBB;
        UnicodeSet  fHY;
        UnicodeSet  fCB;
        UnicodeSet  fCL;
        UnicodeSet  fCP;
        UnicodeSet  fEX;
        UnicodeSet  fIN;
        UnicodeSet  fNS;
        UnicodeSet  fOP;
        UnicodeSet  fQU;
        UnicodeSet  fIS;
        UnicodeSet  fNU;
        UnicodeSet  fPO;
        UnicodeSet  fPR;
        UnicodeSet  fSY;
        UnicodeSet  fAI;
        UnicodeSet  fAL;
        UnicodeSet  fCJ;
        UnicodeSet  fH2;
        UnicodeSet  fH3;
        UnicodeSet  fHL;
        UnicodeSet  fID;
        UnicodeSet  fJL;
        UnicodeSet  fJV;
        UnicodeSet  fJT;
        UnicodeSet  fRI;
        UnicodeSet  fXX;
        UnicodeSet  fEB;     // Line_break=EB plus a handful of explicitly listed code points.
        UnicodeSet  fEM;
        UnicodeSet  fZWJ;
        UnicodeSet  fExtendedPict;   // Copy of gExtended_Pict.
        UnicodeSet  fEmojiNRK;       // \p{Emoji} minus RI, '*', '#', digits and a few symbols.

        // Text being scanned by next(); installed by setText().
        StringBuffer  fText;
        // NOTE(review): not referenced by any method of this class - confirm whether it is still needed.
        int           fOrigPositions;
707
708
709
710        RBBILineMonkey()
711        {
712            fCharProperty  = UProperty.LINE_BREAK;
713            fSets          = new ArrayList();
714
715            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
716            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
717            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
718            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
719            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
720            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
721            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
722            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
723            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
724            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
725            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
726            fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
727            fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
728            fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
729            fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
730            fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
731            fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
732            fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
733            fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
734            fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
735            fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
736            fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
737            fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
738            fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
739            fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
740            fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
741            fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
742            fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
743            fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
744            fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
745            fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
746            fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
747            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
748            fID    = new UnicodeSet("[\\p{Line_break=ID}]");
749            fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
750            fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
751            fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
752            fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
753            fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
754            fEB    = new UnicodeSet("[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
755            fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
756            fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
757            fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
758            fExtendedPict = new UnicodeSet(gExtended_Pict);
759
760
761            // Remove dictionary characters.
762            // The monkey test reference implementation of line break does not replicate the dictionary behavior,
763            // so dictionary characters are omitted from the monkey test data.
764            @SuppressWarnings("unused")
765            UnicodeSet dictionarySet = new UnicodeSet(
766                    "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
767
768            fAL.addAll(fXX);     // Default behavior for XX is identical to AL
769            fAL.addAll(fAI);     // Default behavior for AI is identical to AL
770            fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
771
772            fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
773            fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
774
775            fSets.add(fBK);
776            fSets.add(fCR);
777            fSets.add(fLF);
778            fSets.add(fCM);
779            fSets.add(fNL);
780            fSets.add(fWJ);
781            fSets.add(fZW);
782            fSets.add(fGL);
783            fSets.add(fSP);
784            fSets.add(fB2);
785            fSets.add(fBA);
786            fSets.add(fBB);
787            fSets.add(fHY);
788            fSets.add(fCB);
789            fSets.add(fCL);
790            fSets.add(fCP);
791            fSets.add(fEX);
792            fSets.add(fIN);
793            fSets.add(fJL);
794            fSets.add(fJT);
795            fSets.add(fJV);
796            fSets.add(fNS);
797            fSets.add(fOP);
798            fSets.add(fQU);
799            fSets.add(fIS);
800            fSets.add(fNU);
801            fSets.add(fPO);
802            fSets.add(fPR);
803            fSets.add(fSY);
804            fSets.add(fAI);
805            fSets.add(fAL);
806            fSets.add(fH2);
807            fSets.add(fH3);
808            fSets.add(fHL);
809            fSets.add(fID);
810            fSets.add(fWJ);
811            fSets.add(fRI);
812            fSets.add(fSG);
813            fSets.add(fEB);
814            fSets.add(fEM);
815            fSets.add(fZWJ);
816            fSets.add(fExtendedPict);
817            fSets.add(fEmojiNRK);
818        }
819
820        @Override
821        void setText(StringBuffer s) {
822            fText       = s;
823        }
824
825
826
827
        /**
         * Reference implementation of the line break rules.  Scans forward from
         * startPos through the text installed by setText(), testing each candidate
         * position against the rules in order.  A rule that forbids a break
         * "continue"s to the next candidate; a rule that requires a break "break"s
         * out of the loop.
         *
         * @param startPos index in fText at which scanning starts.  The first loop
         *                 iteration only primes the "this"/"prev" state, so no break
         *                 is reported at startPos itself.
         * @return -1 if startPos is at or beyond the end of the text; otherwise the
         *         index at which the scan stopped, which is either a break position
         *         found by the rules or a position at or beyond the end of the text.
         */
        @Override
        int next(int startPos) {
            int    pos;       //  Index of the char following a potential break position
            int    thisChar;  //  Character at above position "pos"

            int    prevPos;   //  Index of the char preceding a potential break position
            int    prevChar;  //  Character at above position.  Note that prevChar
            //   and thisChar may not be adjacent because combining
            //   characters between them will be ignored.
            int    prevCharX2; //  Character before prevChar, more context for LB 21a

            int    nextPos;   //  Index of the next character following pos.
            //     Usually skips over combining marks.
            int    tPos;      //  temp value.
            int    matchVals[]  = null;       // Number  Expression Match Results


            if (startPos >= fText.length()) {
                return -1;
            }


            // Initial values for loop.  Loop will run the first time without finding breaks,
            //                           while the invalid values shift out and the "this" and
            //                           "prev" positions are filled in with good values.
            pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
            thisChar = prevChar  = prevCharX2 = 0;
            nextPos  = startPos;


            // Loop runs once per position in the test text, until a break position
            //  is found.  In each iteration, we are testing for a possible break
            //  just preceding the character at index "pos".  The character preceding
            //  this char is at position "prevPos"; because of combining sequences,
            //  "prevPos" can be arbitrarily far before "pos".
            for (;;) {
                // Advance to the next position to be tested.
                prevCharX2 = prevChar;
                prevPos   = pos;
                prevChar  = thisChar;
                pos       = nextPos;
                nextPos   = moveIndex32(fText, pos, 1);

                // Rule LB2 - Break at end of text.
                if (pos >= fText.length()) {
                    break;
                }

                // Rule LB 9 - adjust for combining sequences.
                //             We do this rule out-of-order because the adjustment does
                //             not affect the way that rules LB 3 through LB 6 match,
                //             and doing it here rather than after LB 6 is substantially
                //             simpler when combining sequences do occur.


                // LB 9         Keep combining sequences together.
                //              advance over any CM class chars at "pos",
                //              result is "nextPos" for the following loop iteration.
                //              SP, BK, CR, LF, NL and ZW do not absorb following CMs.
                thisChar  = UTF16.charAt(fText, pos);
                if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
                        thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
                    for (;;) {
                        if (nextPos == fText.length()) {
                            break;
                        }
                        int nextChar = UTF16.charAt(fText, nextPos);
                        if (!fCM.contains(nextChar)) {
                            break;
                        }
                        nextPos = moveIndex32(fText, nextPos, 1);
                    }
                }

                // LB 9 Treat X CM* as if it were X
                //        No explicit action required.

                // LB 10     Treat any remaining combining mark as AL
                //           ('A' stands in for an arbitrary AL character.)
                if (fCM.contains(thisChar)) {
                    thisChar = 'A';
                }


                // If the loop is still warming up - if we haven't shifted the initial
                //   -1 positions out of prevPos yet - loop back to advance the
                //    position in the input without any further looking for breaks.
                if (prevPos == -1) {
                    continue;
                }

                // LB 4  Always break after hard line breaks,
                if (fBK.contains(prevChar)) {
                    break;
                }

                // LB 5  Break after CR, LF, NL, but not inside CR LF
                if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
                    continue;
                }
                if  (fCR.contains(prevChar) ||
                        fLF.contains(prevChar) ||
                        fNL.contains(prevChar))  {
                    break;
                }

                // LB 6  Don't break before hard line breaks
                if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
                        fLF.contains(thisChar) || fNL.contains(thisChar) ) {
                    continue;
                }


                // LB 7  Don't break before spaces or zero-width space.
                if (fSP.contains(thisChar)) {
                    continue;
                }

                if (fZW.contains(thisChar)) {
                    continue;
                }

                // LB 8  Break after zero width space
                if (fZW.contains(prevChar)) {
                    break;
                }

                // LB 8a:  ZWJ x (ID | Extended_Pictographic | Emoji)
                //       The monkey test's way of ignoring combining characters doesn't work
                //       for this rule. ZWJ is also a CM. Need to get the actual character
                //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
                {
                    int prevC = fText.codePointBefore(pos);
                    if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
                        continue;
                    }
                }

                //  LB 9, 10  Already done, at top of loop.
                //


                // LB 11
                //    x  WJ
                //    WJ  x
                if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
                    continue;
                }


                // LB 12
                //        GL x
                if (fGL.contains(prevChar)) {
                    continue;
                }

                // LB 12a
                //    [^SP BA HY] x GL
                if (!(fSP.contains(prevChar) ||
                        fBA.contains(prevChar) ||
                        fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
                    continue;
                }



                // LB 13  Don't break before closings.
                //       NU x CL, NU x CP, NU x IS and NU x SY are not matched here so that
                //       they will fall into LB 25 and the more general number regular expression.
                //
                if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
                        !fNU.contains(prevChar) && fCP.contains(thisChar) ||
                        fEX.contains(thisChar) ||
                        !fNU.contains(prevChar) && fIS.contains(thisChar) ||
                        !fNU.contains(prevChar) && fSY.contains(thisChar))    {
                    continue;
                }

                // LB 14  Don't break after OP SP*
                //       Scan backwards, checking for this sequence.
                //       The OP char could include combining marks, so we actually check for
                //           OP CM* SP* x
                tPos = prevPos;
                if (fSP.contains(prevChar)) {
                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                        tPos=moveIndex32(fText, tPos, -1);
                    }
                }
                while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                    tPos=moveIndex32(fText, tPos, -1);
                }
                if (fOP.contains(UTF16.charAt(fText, tPos))) {
                    continue;
                }

                // LB 15 Do not break within "[
                //       QU CM* SP* x OP
                if (fOP.contains(thisChar)) {
                    // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
                    tPos = prevPos;
                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    if (fQU.contains(UTF16.charAt(fText, tPos))) {
                        continue;
                    }
                }

                // LB 16   (CL | CP) SP* x NS
                //         Same backwards scan as LB 15, over SP* and then CM*.
                if (fNS.contains(thisChar)) {
                    tPos = prevPos;
                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
                        continue;
                    }
                }


                // LB 17        B2 SP* x B2
                //              Same backwards scan as LB 15, over SP* and then CM*.
                if (fB2.contains(thisChar)) {
                    tPos = prevPos;
                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
                        tPos = moveIndex32(fText, tPos, -1);
                    }
                    if (fB2.contains(UTF16.charAt(fText, tPos))) {
                        continue;
                    }
                }

                // LB 18    break after space
                if (fSP.contains(prevChar)) {
                    break;
                }

                // LB 19
                //    x   QU
                //    QU  x
                if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
                    continue;
                }

                // LB 20  Break around a CB
                if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
                    break;
                }

                // LB 21
                //    x  BA
                //    x  HY
                //    x  NS
                //    BB x
                if (fBA.contains(thisChar) ||
                        fHY.contains(thisChar) ||
                        fNS.contains(thisChar) ||
                        fBB.contains(prevChar) )   {
                    continue;
                }

                // LB 21a, HL (HY | BA) x
                if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
                    continue;
                }

                // LB 21b, SY x HL
                if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
                    continue;
                }

                // LB 22    (AL | EX | HL | ID | EB | EM | IN | NU) x IN
                if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
                        fEX.contains(prevChar) && fIN.contains(thisChar) ||
                        fHL.contains(prevChar) && fIN.contains(thisChar) ||
                        (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) ||
                        fIN.contains(prevChar) && fIN.contains(thisChar) ||
                        fNU.contains(prevChar) && fIN.contains(thisChar) )   {
                    continue;
                }

                // LB 23    (AL | HL) x NU
                //          NU x (AL | HL)
                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
                    continue;
                }
                if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                    continue;
                }

                // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
                //      PR x (ID | EB | EM)
                //     (ID | EB | EM) x PO
                if (fPR.contains(prevChar) &&
                        (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
                    continue;
                }
                if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
                        fPO.contains(thisChar)) {
                    continue;
                }

                // LB 24  Do not break between prefix and letters or ideographs.
                //         (PR | PO) x (AL | HL)
                //         (AL | HL) x (PR | PO)
                if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
                        (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                    continue;
                }
                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
                        (fPR.contains(thisChar) || fPO.contains(thisChar))) {
                    continue;
                }


                // LB 25    Numbers
                //          Try to match a complete number starting at prevPos.
                matchVals = LBNumberCheck(fText, prevPos, matchVals);
                if (matchVals[0] != -1) {
                    // Matched a number.  But could have been just a single digit, which would
                    //    not represent a "no break here" between prevChar and thisChar
                    int numEndIdx = matchVals[1];  // idx of first char following num
                    if (numEndIdx > pos) {
                        // Number match includes at least the two chars being checked
                        if (numEndIdx > nextPos) {
                            // Number match includes additional chars.  Update pos and nextPos
                            //   so that next loop iteration will continue at the end of the number,
                            //   checking for breaks between last char in number & whatever follows.
                            nextPos = numEndIdx;
                            pos     = numEndIdx;
                            // Back up to the last non-CM char of the number; it becomes
                            //   prevChar on the next iteration.
                            do {
                                pos = moveIndex32(fText, pos, -1);
                                thisChar = UTF16.charAt(fText, pos);
                            }
                            while (fCM.contains(thisChar));
                        }
                        continue;
                    }
                }


                // LB 26  Do not break Korean Syllables
                //          JL x (JL | JV | H2 | H3)
                //          (JV | H2) x (JV | JT)
                //          (JT | H3) x JT
                if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
                        fJV.contains(thisChar) ||
                        fH2.contains(thisChar) ||
                        fH3.contains(thisChar))) {
                    continue;
                }

                if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
                        (fJV.contains(thisChar) || fJT.contains(thisChar))) {
                    continue;
                }

                if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
                        fJT.contains(thisChar)) {
                    continue;
                }

                // LB 27 Treat a Korean Syllable Block the same as ID
                //          (JL | JV | JT | H2 | H3) x IN
                //          (JL | JV | JT | H2 | H3) x PO
                //          PR x (JL | JV | JT | H2 | H3)
                if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
                        fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
                        fIN.contains(thisChar)) {
                    continue;
                }
                if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
                        fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
                        fPO.contains(thisChar)) {
                    continue;
                }
                if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
                        fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
                    continue;
                }



                // LB 28 Do not break between alphabetics
                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                    continue;
                }

                // LB 29  Do not break between numeric punctuation and alphabetics
                if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
                    continue;
                }

                // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
                //          (AL | HL | NU) x OP
                //          CP x (AL | HL | NU)
                if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
                    continue;
                }
                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
                    continue;
                }

                // LB 30a   Break between pairs of Regional Indicators.
                //             RI RI <break> RI
                //             RI    x    RI
                //          prevCharX2 is reset to 0 at the start of each call, so only RIs
                //          seen during this call are taken into account.
                if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
                    break;
                }
                if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
                    continue;
                }

                // LB30b    Emoji Base x Emoji Modifier
                if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
                    continue;
                }
                // LB 31    Break everywhere else
                break;
            }

            return pos;
        }
1246
1247
1248
1249        // Match the following regular expression in the input text.
1250        //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
1251        //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
1252        //  retVals array  [0]  index of the start of the match, or -1 if no match
1253        //                 [1]  index of first char following the match.
1254        //  Can not use Java regex because need supplementary character support,
1255        //     and because Unicode char properties version must be the same as in
1256        //     the version of ICU being tested.
1257        private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1258            if (retVals == null) {
1259                retVals = new int[2];
1260            }
1261            retVals[0]     = -1;  // Indicates no match.
1262            int matchState = 0;
1263            int idx        = startIdx;
1264
1265            matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1266                int c = UTF16.charAt(s, idx);
1267                int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1268                switch (matchState) {
1269                case 0:
1270                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1271                    cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1272                        matchState = 1;
1273                        break;
1274                    }
1275                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1276                        matchState = 4;
1277                        break;
1278                    }
1279                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
1280                        matchState = 4;
1281                        break;
1282                    }
1283                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1284                        matchState = 7;
1285                        break;
1286                    }
1287                    break matchLoop;   /* No Match  */
1288
1289                case 1:
1290                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1291                        matchState = 1;
1292                        break;
1293                    }
1294                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1295                        matchState = 4;
1296                        break;
1297                    }
1298                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
1299                        matchState = 4;
1300                        break;
1301                    }
1302                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1303                        matchState = 7;
1304                        break;
1305                    }
1306                    break matchLoop;   /* No Match  */
1307
1308
1309                case 4:
1310                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1311                        matchState = 4;
1312                        break;
1313                    }
1314                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1315                        matchState = 7;
1316                        break;
1317                    }
1318                    break matchLoop;   /* No Match  */
1319                    //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
1320                    //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
1321
1322                case 7:
1323                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1324                        matchState = 7;
1325                        break;
1326                    }
1327                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1328                        matchState = 7;
1329                        break;
1330                    }
1331                    if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1332                        matchState = 7;
1333                        break;
1334                    }
1335                    if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1336                        matchState = 7;
1337                        break;
1338                    }
1339                    if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1340                        matchState = 9;
1341                        break;
1342                    }
1343                    if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1344                        matchState = 9;
1345                        break;
1346                    }
1347                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1348                        matchState = 11;
1349                        break;
1350                    }
1351                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1352                        matchState = 11;
1353                        break;
1354                    }
1355
1356                    break matchLoop;    // Match Complete.
1357                case 9:
1358                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1359                        matchState = 9;
1360                        break;
1361                    }
1362                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1363                        matchState = 11;
1364                        break;
1365                    }
1366                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1367                        matchState = 11;
1368                        break;
1369                    }
1370                    break matchLoop;    // Match Complete.
1371                case 11:
1372                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1373                        matchState = 11;
1374                        break;
1375                    }
1376                    break matchLoop;    // Match Complete.
1377                }
1378            }
1379            if (matchState > 4) {
1380                retVals[0] = startIdx;
1381                retVals[1] = idx;
1382            }
1383            return retVals;
1384        }
1385
1386
1387        @Override
1388        List  charClasses() {
1389            return fSets;
1390        }
1391
1392
1393
1394    }
1395
1396
1397    /**
1398     *
1399     * Sentence Monkey Test Class
1400     *
1401     *
1402     *
1403     */
1404    static class RBBISentenceMonkey extends RBBIMonkeyKind {
1405        List                 fSets;
1406        StringBuffer         fText;
1407
1408        UnicodeSet           fSepSet;
1409        UnicodeSet           fFormatSet;
1410        UnicodeSet           fSpSet;
1411        UnicodeSet           fLowerSet;
1412        UnicodeSet           fUpperSet;
1413        UnicodeSet           fOLetterSet;
1414        UnicodeSet           fNumericSet;
1415        UnicodeSet           fATermSet;
1416        UnicodeSet           fSContinueSet;
1417        UnicodeSet           fSTermSet;
1418        UnicodeSet           fCloseSet;
1419        UnicodeSet           fOtherSet;
1420        UnicodeSet           fExtendSet;
1421
1422
1423
1424        RBBISentenceMonkey() {
1425            fCharProperty  = UProperty.SENTENCE_BREAK;
1426
1427            fSets            = new ArrayList();
1428
1429            //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1430            //                       set and made into character classes of their own.  For the monkey impl,
1431            //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1432            fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1433            fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1434            fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1435            fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1436            fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1437            fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1438            fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1439            fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1440            fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1441            fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1442            fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1443            fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1444            fOtherSet        = new UnicodeSet();
1445
1446
1447            fOtherSet.complement();
1448            fOtherSet.removeAll(fSepSet);
1449            fOtherSet.removeAll(fFormatSet);
1450            fOtherSet.removeAll(fSpSet);
1451            fOtherSet.removeAll(fLowerSet);
1452            fOtherSet.removeAll(fUpperSet);
1453            fOtherSet.removeAll(fOLetterSet);
1454            fOtherSet.removeAll(fNumericSet);
1455            fOtherSet.removeAll(fATermSet);
1456            fOtherSet.removeAll(fSContinueSet);
1457            fOtherSet.removeAll(fSTermSet);
1458            fOtherSet.removeAll(fCloseSet);
1459            fOtherSet.removeAll(fExtendSet);
1460
1461            fSets.add(fSepSet);
1462            fSets.add(fFormatSet);
1463
1464            fSets.add(fSpSet);
1465            fSets.add(fLowerSet);
1466            fSets.add(fUpperSet);
1467            fSets.add(fOLetterSet);
1468            fSets.add(fNumericSet);
1469            fSets.add(fATermSet);
1470            fSets.add(fSContinueSet);
1471            fSets.add(fSTermSet);
1472            fSets.add(fCloseSet);
1473            fSets.add(fOtherSet);
1474            fSets.add(fExtendSet);
1475        }
1476
1477
1478        @Override
1479        List  charClasses() {
1480            return fSets;
1481        }
1482
1483        @Override
1484        void   setText(StringBuffer s) {
1485            fText = s;
1486        }
1487
1488
1489        //      moveBack()   Find the "significant" code point preceding the index i.
1490        //      Skips over ($Extend | $Format)*
1491        //
1492        private int moveBack(int i) {
1493
1494            if (i <= 0) {
1495                return -1;
1496            }
1497
1498            int      c;
1499            int      j = i;
1500            do {
1501                j = moveIndex32(fText, j, -1);
1502                c = UTF16.charAt(fText, j);
1503            }
1504            while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1505            return j;
1506        }
1507
1508
1509        int moveForward(int i) {
1510            if (i>=fText.length()) {
1511                return fText.length();
1512            }
1513            int   c;
1514            int   j = i;
1515            do {
1516                j = moveIndex32(fText, j, 1);
1517                c = cAt(j);
1518            }
1519            while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1520            return j;
1521
1522        }
1523
1524        int cAt(int pos) {
1525            if (pos<0 || pos>=fText.length()) {
1526                return -1;
1527            }
1528            return UTF16.charAt(fText, pos);
1529        }
1530
1531        @Override
1532        int   next(int prevPos) {
1533            int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
1534            //   break position being tested.  The candidate break
1535            //   location is before p2.
1536            int     breakPos = -1;
1537
1538            int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1539            int c;
1540
1541            // Prev break at end of string.  return DONE.
1542            if (prevPos >= fText.length()) {
1543                return -1;
1544            }
1545            /*p0 =*/ p1 = p2 = p3 = prevPos;
1546            c3 = UTF16.charAt(fText, prevPos);
1547            c0 = c1 = c2 = 0;
1548
1549            // Loop runs once per "significant" character position in the input text.
1550            for (;;) {
1551                // Move all of the positions forward in the input string.
1552                /*p0 = p1;*/  c0 = c1;
1553                p1 = p2;  c1 = c2;
1554                p2 = p3;  c2 = c3;
1555
1556                // Advancd p3 by  X(Extend | Format)*   Rule 4
1557                p3 = moveForward(p3);
1558                c3 = cAt(p3);
1559
1560                // Rule (3) CR x LF
1561                if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1562                    continue;
1563                }
1564
1565                // Rule (4)    Sep  <break>
1566                if (fSepSet.contains(c1)) {
1567                    p2 = p1+1;   // Separators don't combine with Extend or Format
1568                    break;
1569                }
1570
1571                if (p2 >= fText.length()) {
1572                    // Reached end of string.  Always a break position.
1573                    break;
1574                }
1575
1576                if (p2 == prevPos) {
1577                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1578                    continue;
1579                }
1580
1581                // Rule (6).   ATerm x Numeric
1582                if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1583                    continue;
1584                }
1585
1586                // Rule (7).  (Upper | Lower) ATerm  x  Uppper
1587                if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
1588                        fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1589                    continue;
1590                }
1591
1592                // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
1593                //           Note:  Sterm | ATerm are added to the negated part of the expression by a
1594                //                  note to the Unicode 5.0 documents.
1595                int p8 = p1;
1596                while (p8>0 && fSpSet.contains(cAt(p8))) {
1597                    p8 = moveBack(p8);
1598                }
1599                while (p8>0 && fCloseSet.contains(cAt(p8))) {
1600                    p8 = moveBack(p8);
1601                }
1602                if (fATermSet.contains(cAt(p8))) {
1603                    p8=p2;
1604                    for (;;) {
1605                        c = cAt(p8);
1606                        if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1607                                fLowerSet.contains(c) || fSepSet.contains(c) ||
1608                                fATermSet.contains(c) || fSTermSet.contains(c))
1609                        {
1610                            break;
1611                        }
1612                        p8 = moveForward(p8);
1613                    }
1614                    if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1615                        continue;
1616                    }
1617                }
1618
1619                // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
1620                if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1621                    p8 = p1;
1622                    while (setContains(fSpSet, cAt(p8))) {
1623                        p8 = moveBack(p8);
1624                    }
1625                    while (setContains(fCloseSet, cAt(p8))) {
1626                        p8 = moveBack(p8);
1627                    }
1628                    c = cAt(p8);
1629                    if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1630                        continue;
1631                    }
1632                }
1633
1634
1635                // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
1636                int p9 = p1;
1637                while (p9>0 && fCloseSet.contains(cAt(p9))) {
1638                    p9 = moveBack(p9);
1639                }
1640                c = cAt(p9);
1641                if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1642                    if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1643                        continue;
1644                    }
1645                }
1646
1647                // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
1648                int p10 = p1;
1649                while (p10>0 && fSpSet.contains(cAt(p10))) {
1650                    p10 = moveBack(p10);
1651                }
1652                while (p10>0 && fCloseSet.contains(cAt(p10))) {
1653                    p10 = moveBack(p10);
1654                }
1655                if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1656                    if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1657                        continue;
1658                    }
1659                }
1660
1661                // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
1662                int p11 = p1;
1663                if (p11>0 && fSepSet.contains(cAt(p11))) {
1664                    p11 = moveBack(p11);
1665                }
1666                while (p11>0 && fSpSet.contains(cAt(p11))) {
1667                    p11 = moveBack(p11);
1668                }
1669                while (p11>0 && fCloseSet.contains(cAt(p11))) {
1670                    p11 = moveBack(p11);
1671                }
1672                if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1673                    break;
1674                }
1675
1676                //  Rule (12)  Any x Any
1677                continue;
1678            }
1679            breakPos = p2;
1680            return breakPos;
1681        }
1682
1683
1684
1685    }
1686
1687
1688    /**
1689     * Move an index into a string by n code points.
1690     *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1691     *   complicating usage.
1692     * @param s   a Text string
1693     * @param pos The starting code unit index into the text string
1694     * @param amt The amount to adjust the string by.
1695     * @return    The adjusted code unit index, pinned to the string's length, or
1696     *            unchanged if input index was outside of the string.
1697     */
1698    static int moveIndex32(StringBuffer s, int pos, int amt) {
1699        int i;
1700        char  c;
1701        if (amt>0) {
1702            for (i=0; i<amt; i++) {
1703                if (pos >= s.length()) {
1704                    return s.length();
1705                }
1706                c = s.charAt(pos);
1707                pos++;
1708                if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1709                    c = s.charAt(pos);
1710                    if (UTF16.isTrailSurrogate(c)) {
1711                        pos++;
1712                    }
1713                }
1714            }
1715        } else {
1716            for (i=0; i>amt; i--) {
1717                if (pos <= 0) {
1718                    return 0;
1719                }
1720                pos--;
1721                c = s.charAt(pos);
1722                if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1723                    c = s.charAt(pos);
1724                    if (UTF16.isLeadSurrogate(c)) {
1725                        pos--;
1726                    }
1727                }
1728            }
1729        }
1730        return pos;
1731    }
1732
1733    /**
1734     * No-exceptions form of UnicodeSet.contains(c).
1735     *    Simplifies loops that terminate with an end-of-input character value.
1736     * @param s  A unicode set
1737     * @param c  A code point value
1738     * @return   true if the set contains c.
1739     */
1740    static boolean setContains(UnicodeSet s, int c) {
1741        if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1742            return false;
1743        }
1744        return s.contains(c);
1745    }
1746
1747
1748    /**
1749     * return the index of the next code point in the input text.
1750     * @param i the preceding index
1751     */
1752    static int  nextCP(StringBuffer s, int i) {
1753        if (i == -1) {
1754            // End of Input indication.  Continue to return end value.
1755            return -1;
1756        }
1757        int  retVal = i + 1;
1758        if (retVal > s.length()) {
1759            return -1;
1760        }
1761        int  c = UTF16.charAt(s, i);
1762        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1763            retVal++;
1764        }
1765        return retVal;
1766    }
1767
1768
1769    /**
1770     * random number generator.  Not using Java's built-in Randoms for two reasons:
1771     *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1772     *    2.  We need to get and restore the seed from values occurring in the middle
1773     *        of a long sequence, to more easily reproduce failing cases.
1774     */
1775    private static int m_seed = 1;
1776    private static int  m_rand()
1777    {
1778        m_seed = m_seed * 1103515245 + 12345;
1779        return (m_seed >>> 16) % 32768;
1780    }
1781
1782    // Helper function for formatting error output.
1783    //   Append a string into a fixed-size field in a StringBuffer.
1784    //   Blank-pad the string if it is shorter than the field.
1785    //   Truncate the source string if it is too long.
1786    //
1787    private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1788        int appendLen = src.length();
1789        if (appendLen >= fieldLen) {
1790            dest.append(src.substring(0, fieldLen));
1791        } else {
1792            dest.append(src);
1793            while (appendLen < fieldLen) {
1794                dest.append(' ');
1795                appendLen++;
1796            }
1797        }
1798    }
1799
1800    // Helper function for formatting error output.
1801    // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1802    private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1803        String hexChars = "0123456789abcdef";
1804        if (c < 0x10000) {
1805            dest.append("\\u");
1806            for (int bn=12; bn>=0; bn-=4) {
1807                dest.append(hexChars.charAt(((c)>>bn)&0xf));
1808            }
1809            appendToBuf(dest, " ", fieldLen-6);
1810        } else {
1811            dest.append("\\U");
1812            for (int bn=28; bn>=0; bn-=4) {
1813                dest.append(hexChars.charAt(((c)>>bn)&0xf));
1814            }
1815            appendToBuf(dest, " ", fieldLen-10);
1816
1817        }
1818    }
1819
1820    /**
1821     *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1822     *    Parameters:
1823     *       bi      - the break iterator to use
1824     *       mk      - MonkeyKind, abstraction for obtaining expected results
1825     *       name    - Name of test (char, word, etc.) for use in error messages
1826     *       seed    - Seed for starting random number generator (parameter from user)
1827     *       numIterations
1828     */
1829    void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1830        int              TESTSTRINGLEN = 500;
1831        StringBuffer     testText         = new StringBuffer();
1832        int              numCharClasses;
1833        List             chClasses;
1834        int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
1835        int              expectedCount    = 0;
1836        boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1837        boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1838        boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1839        boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1840        boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1841        boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1842        int              i;
1843        int              loopCount        = 0;
1844        boolean          printTestData    = false;
1845        boolean          printBreaksFromBI = false;
1846
1847        m_seed = seed;
1848
1849        numCharClasses = mk.charClasses().size();
1850        chClasses      = mk.charClasses();
1851
1852        // Verify that the character classes all have at least one member.
1853        for (i=0; i<numCharClasses; i++) {
1854            UnicodeSet s = (UnicodeSet)chClasses.get(i);
1855            if (s == null || s.size() == 0) {
1856                errln("Character Class " + i + " is null or of zero size.");
1857                return;
1858            }
1859        }
1860
1861        //--------------------------------------------------------------------------------------------
1862        //
1863        //  Debugging settings.  Comment out everything in the following block for normal operation
1864        //
1865        //--------------------------------------------------------------------------------------------
1866        // numIterations = -1;
1867        // numIterations = 10000;   // Same as exhaustive.
1868        // RuleBasedBreakIterator_New.fTrace = true;
1869        // m_seed = 859056465;
1870        // TESTSTRINGLEN = 50;
1871        // printTestData = true;
1872        // printBreaksFromBI = true;
1873        // ((RuleBasedBreakIterator_New)bi).dump();
1874
1875        //--------------------------------------------------------------------------------------------
1876        //
1877        //  End of Debugging settings.
1878        //
1879        //--------------------------------------------------------------------------------------------
1880
1881        int  dotsOnLine = 0;
1882        while (loopCount < numIterations || numIterations == -1) {
1883            if (numIterations == -1 && loopCount % 10 == 0) {
1884                // If test is running in an infinite loop, display a periodic tic so
1885                //   we can tell that it is making progress.
1886                System.out.print(".");
1887                if (dotsOnLine++ >= 80){
1888                    System.out.println();
1889                    dotsOnLine = 0;
1890                }
1891            }
1892            // Save current random number seed, so that we can recreate the random numbers
1893            //   for this loop iteration in event of an error.
1894            seed = m_seed;
1895
1896            testText.setLength(0);
1897            // Populate a test string with data.
1898            if (printTestData) {
1899                System.out.println("Test Data string ...");
1900            }
1901            for (i=0; i<TESTSTRINGLEN; i++) {
1902                int        aClassNum = m_rand() % numCharClasses;
1903                UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
1904                int        charIdx   = m_rand() % classSet.size();
1905                int        c         = classSet.charAt(charIdx);
1906                if (c < 0) {   // TODO:  deal with sets containing strings.
1907                    errln("c < 0");
1908                }
1909                UTF16.appendCodePoint(testText, c);
1910                if (printTestData) {
1911                    System.out.print(Integer.toHexString(c) + " ");
1912                }
1913            }
1914            if (printTestData) {
1915                System.out.println();
1916            }
1917
1918            Arrays.fill(expected, 0);
1919            Arrays.fill(expectedBreaks, false);
1920            Arrays.fill(forwardBreaks, false);
1921            Arrays.fill(reverseBreaks, false);
1922            Arrays.fill(isBoundaryBreaks, false);
1923            Arrays.fill(followingBreaks, false);
1924            Arrays.fill(precedingBreaks, false);
1925
1926            // Calculate the expected results for this test string.
1927            mk.setText(testText);
1928            expectedCount = 0;
1929            expectedBreaks[0] = true;
1930            expected[expectedCount ++] = 0;
1931            int breakPos = 0;
1932            int lastBreakPos = -1;
1933            for (;;) {
1934                lastBreakPos = breakPos;
1935                breakPos = mk.next(breakPos);
1936                if (breakPos == -1) {
1937                    break;
1938                }
1939                if (breakPos > testText.length()) {
1940                    errln("breakPos > testText.length()");
1941                }
1942                if (lastBreakPos >= breakPos) {
1943                    errln("Next() not increasing.");
1944                    // break;
1945                }
1946                expectedBreaks[breakPos] = true;
1947                expected[expectedCount ++] = breakPos;
1948            }
1949
1950            // Find the break positions using forward iteration
1951            if (printBreaksFromBI) {
1952                System.out.println("Breaks from BI...");
1953            }
1954            bi.setText(testText.toString());
1955            for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1956                if (i < 0 || i > testText.length()) {
1957                    errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1958                    break;
1959                }
1960                if (printBreaksFromBI) {
1961                    System.out.print(Integer.toHexString(i) + " ");
1962                }
1963                forwardBreaks[i] = true;
1964            }
1965            if (printBreaksFromBI) {
1966                System.out.println();
1967            }
1968
1969            // Find the break positions using reverse iteration
1970            for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1971                if (i < 0 || i > testText.length()) {
1972                    errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1973                    break;
1974                }
1975                reverseBreaks[i] = true;
1976            }
1977
1978            // Find the break positions using isBoundary() tests.
1979            for (i=0; i<=testText.length(); i++) {
1980                isBoundaryBreaks[i] = bi.isBoundary(i);
1981            }
1982
1983            // Find the break positions using the following() function.
1984            lastBreakPos = 0;
1985            followingBreaks[0] = true;
1986            for (i=0; i<testText.length(); i++) {
1987                breakPos = bi.following(i);
1988                if (breakPos <= i ||
1989                        breakPos < lastBreakPos ||
1990                        breakPos > testText.length() ||
1991                        breakPos > lastBreakPos && lastBreakPos > i ) {
1992                    errln(name + " break monkey test: " +
1993                            "Out of range value returned by BreakIterator::following().\n" +
1994                            "index=" + i + "following returned=" + breakPos +
1995                            "lastBreak=" + lastBreakPos);
1996                    precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1997                } else {
1998                    followingBreaks[breakPos] = true;
1999                    lastBreakPos = breakPos;
2000                }
2001            }
2002
2003            // Find the break positions using the preceding() function.
2004            lastBreakPos = testText.length();
2005            precedingBreaks[testText.length()] = true;
2006            for (i=testText.length(); i>0; i--) {
2007                breakPos = bi.preceding(i);
2008                if (breakPos >= i ||
2009                        breakPos > lastBreakPos ||
2010                        breakPos < 0 ||
2011                        breakPos < lastBreakPos && lastBreakPos < i ) {
2012                    errln(name + " break monkey test: " +
2013                            "Out of range value returned by BreakIterator::preceding().\n" +
2014                            "index=" + i + "preceding returned=" + breakPos +
2015                            "lastBreak=" + lastBreakPos);
2016                    precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2017                } else {
2018                    precedingBreaks[breakPos] = true;
2019                    lastBreakPos = breakPos;
2020                }
2021            }
2022
2023
2024
2025            // Compare the expected and actual results.
2026            for (i=0; i<=testText.length(); i++) {
2027                String errorType = null;
2028                if  (forwardBreaks[i] != expectedBreaks[i]) {
2029                    errorType = "next()";
2030                } else if (reverseBreaks[i] != forwardBreaks[i]) {
2031                    errorType = "previous()";
2032                } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
2033                    errorType = "isBoundary()";
2034                } else if (followingBreaks[i] != expectedBreaks[i]) {
2035                    errorType = "following()";
2036                } else if (precedingBreaks[i] != expectedBreaks[i]) {
2037                    errorType = "preceding()";
2038                }
2039
2040                if (errorType != null) {
2041                    // Format a range of the test text that includes the failure as
2042                    //  a data item that can be included in the rbbi test data file.
2043
2044                    // Start of the range is the last point where expected and actual results
2045                    //   both agreed that there was a break position.
2046                    int startContext = i;
2047                    int count = 0;
2048                    for (;;) {
2049                        if (startContext==0) { break; }
2050                        startContext --;
2051                        if (expectedBreaks[startContext]) {
2052                            if (count == 2) break;
2053                            count ++;
2054                        }
2055                    }
2056
2057                    // End of range is two expected breaks past the start position.
2058                    int endContext = i + 1;
2059                    int ci;
2060                    for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
2061                        for (;;) {
2062                            if (endContext >= testText.length()) {break;}
2063                            if (expectedBreaks[endContext-1]) {
2064                                if (count == 0) break;
2065                                count --;
2066                            }
2067                            endContext ++;
2068                        }
2069                    }
2070
2071                    // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
2072                    StringBuffer errorText = new StringBuffer();
2073
2074                    int      c;    // Char from test data
2075                    for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
2076                        if (ci == i) {
2077                            // This is the location of the error.
2078                            errorText.append("<?>---------------------------------\n");
2079                        } else if (expectedBreaks[ci]) {
2080                            // This a non-error expected break position.
2081                            errorText.append("------------------------------------\n");
2082                        }
2083                        if (ci < testText.length()) {
2084                            c = UTF16.charAt(testText, ci);
2085                            appendCharToBuf(errorText, c, 11);
2086                            String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
2087                            appendToBuf(errorText, gc, 8);
2088                            int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
2089                            String extraPropValue =
2090                                    UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
2091                            appendToBuf(errorText, extraPropValue, 20);
2092
2093                            String charName = UCharacter.getExtendedName(c);
2094                            appendToBuf(errorText, charName, 40);
2095                            errorText.append('\n');
2096                        }
2097                    }
2098                    if (ci == testText.length() && ci != -1) {
2099                        errorText.append("<>");
2100                    }
2101                    errorText.append("</data>\n");
2102
2103                    // Output the error
2104                    errln(name + " break monkey test error.  " +
2105                            (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
2106                            "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
2107                            errorText);
2108                    break;
2109                }
2110            }
2111
2112            loopCount++;
2113        }
2114    }
2115
2116    @Test
2117    public void TestCharMonkey() {
2118
2119        int        loopCount = 500;
2120        int        seed      = 1;
2121
2122        if (TestFmwk.getExhaustiveness() >= 9) {
2123            loopCount = 10000;
2124        }
2125
2126        RBBICharMonkey  m = new RBBICharMonkey();
2127        BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2128        RunMonkey(bi, m, "char", seed, loopCount);
2129    }
2130
2131    @Test
2132    public void TestWordMonkey() {
2133
2134        int        loopCount = 500;
2135        int        seed      = 1;
2136
2137        if (TestFmwk.getExhaustiveness() >= 9) {
2138            loopCount = 10000;
2139        }
2140
2141        logln("Word Break Monkey Test");
2142        RBBIWordMonkey  m = new RBBIWordMonkey();
2143        BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2144        RunMonkey(bi, m, "word", seed, loopCount);
2145    }
2146
2147    @Test
2148    public void TestLineMonkey() {
2149        int        loopCount = 500;
2150        int        seed      = 1;
2151
2152        if (TestFmwk.getExhaustiveness() >= 9) {
2153            loopCount = 10000;
2154        }
2155
2156        logln("Line Break Monkey Test");
2157        RBBILineMonkey  m = new RBBILineMonkey();
2158        BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2159        RunMonkey(bi, m, "line", seed, loopCount);
2160    }
2161
2162    @Test
2163    public void TestSentMonkey() {
2164
2165        int        loopCount = 500;
2166        int        seed      = 1;
2167
2168        if (TestFmwk.getExhaustiveness() >= 9) {
2169            loopCount = 3000;
2170        }
2171
2172        logln("Sentence Break Monkey Test");
2173        RBBISentenceMonkey  m = new RBBISentenceMonkey();
2174        BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2175        RunMonkey(bi, m, "sent", seed, loopCount);
2176    }
2177    //
2178    //  Round-trip monkey tests.
2179    //  Verify that break iterators created from the rule source from the default
2180    //    break iterators still pass the monkey test for the iterator type.
2181    //
2182    //  This is a major test for the Rule Compiler.  The default break iterators are built
2183    //  from pre-compiled binary rule data that was created using ICU4C; these
2184    //  round-trip rule recompile tests verify that the Java rule compiler can
2185    //  rebuild break iterators from the original source rules.
2186    //
2187    @Test
2188    public void TestRTCharMonkey() {
2189
2190        int        loopCount = 200;
2191        int        seed      = 1;
2192
2193        if (TestFmwk.getExhaustiveness() >= 9) {
2194            loopCount = 2000;
2195        }
2196
2197        RBBICharMonkey  m = new RBBICharMonkey();
2198        BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2199        String rules = bi.toString();
2200        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2201        RunMonkey(rtbi, m, "char", seed, loopCount);
2202    }
2203
2204    @Test
2205    public void TestRTWordMonkey() {
2206
2207        int        loopCount = 200;
2208        int        seed      = 1;
2209
2210        if (TestFmwk.getExhaustiveness() >= 9) {
2211            loopCount = 2000;
2212        }
2213        logln("Word Break Monkey Test");
2214        RBBIWordMonkey  m = new RBBIWordMonkey();
2215        BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2216        String rules = bi.toString();
2217        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2218        RunMonkey(rtbi, m, "word", seed, loopCount);
2219    }
2220
2221    @Test
2222    public void TestRTLineMonkey() {
2223        int        loopCount = 200;
2224        int        seed      = 1;
2225
2226        if (TestFmwk.getExhaustiveness() >= 9) {
2227            loopCount = 2000;
2228        }
2229
2230        logln("Line Break Monkey Test");
2231        RBBILineMonkey  m = new RBBILineMonkey();
2232        BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2233        String rules = bi.toString();
2234        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2235        RunMonkey(rtbi, m, "line", seed, loopCount);
2236    }
2237
2238    @Test
2239    public void TestRTSentMonkey() {
2240
2241        int        loopCount = 200;
2242        int        seed      = 1;
2243
2244        if (TestFmwk.getExhaustiveness() >= 9) {
2245            loopCount = 1000;
2246        }
2247
2248        logln("Sentence Break Monkey Test");
2249        RBBISentenceMonkey  m = new RBBISentenceMonkey();
2250        BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2251        String rules = bi.toString();
2252        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2253        RunMonkey(rtbi, m, "sent", seed, loopCount);
2254    }
2255}
2256
2257