1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5 *******************************************************************************
6 * Copyright (C) 2003-2016 International Business Machines Corporation and
7 * others. All Rights Reserved.
8 *******************************************************************************
9 */
10package android.icu.dev.test.rbbi;
11
12
13// Monkey testing of RuleBasedBreakIterator
14import java.util.ArrayList;
15import java.util.Arrays;
16import java.util.List;
17import java.util.Locale;
18
19import org.junit.Test;
20
21import android.icu.dev.test.TestFmwk;
22import android.icu.lang.UCharacter;
23import android.icu.lang.UProperty;
24import android.icu.text.BreakIterator;
25import android.icu.text.RuleBasedBreakIterator;
26import android.icu.text.UTF16;
27import android.icu.text.UnicodeSet;
28
29
30/**
31 * Monkey tests for RBBI.  These tests have independent implementations of
32 * the Unicode TR boundary rules, and compare results between these and ICU's
33 * implementation, using random data.
34 *
35 * Tests cover Grapheme Cluster (char), Word and Line breaks
36 *
37 * Ported from ICU4C, original code in file source/test/intltest/rbbitst.cpp
38 *
39 */
40public class RBBITestMonkey extends TestFmwk {
41    //
42    //     class RBBIMonkeyKind
43    //
44    //        Monkey Test for Break Iteration
45    //        Abstract interface class.   Concrete derived classes independently
46    //        implement the break rules for different iterator types.
47    //
    //        The Monkey Test itself doesn't know which type of break iterator it is
49    //        testing, but works purely in terms of the interface defined here.
50    //
51    abstract static class RBBIMonkeyKind {
52
53        // Return a List of UnicodeSets, representing the character classes used
54        //   for this type of iterator.
55        abstract  List  charClasses();
56
57        // Set the test text on which subsequent calls to next() will operate
58        abstract  void   setText(StringBuffer text);
59
60        // Find the next break position, starting from the specified position.
61        // Return -1 after reaching end of string.
62        abstract   int   next(int i);
63
64        // A Character Property, one of the constants defined in class UProperty.
65        //   The value of this property will be displayed for the characters
66        //    near any test failure.
67        int   fCharProperty;
68    }
69
70    //
71    // Data for Extended Pictographic scraped from CLDR common/properties/ExtendedPictographic.txt, r12773
72    //
    // The value is UnicodeSet pattern text: one bracketed list of escaped code points
    //   and code point ranges. Nothing parses it here; the monkey classes in this file
    //   turn it into a set with new UnicodeSet(gExtended_Pict).
    static String gExtended_Pict = "[" +
            "\\U0001F774-\\U0001F77F\\u2700-\\u2701\\u2703-\\u2704\\u270E\\u2710-\\u2711\\u2765-\\u2767\\U0001F030-\\U0001F093" +
            "\\U0001F094-\\U0001F09F\\U0001F10D-\\U0001F10F\\U0001F12F\\U0001F16C-\\U0001F16F\\U0001F1AD-\\U0001F1E5" +
            "\\U0001F203-\\U0001F20F\\U0001F23C-\\U0001F23F\\U0001F249-\\U0001F24F\\U0001F252-\\U0001F2FF\\U0001F7D5-\\U0001F7FF" +
            "\\U0001F000-\\U0001F003\\U0001F005-\\U0001F02B\\U0001F02C-\\U0001F02F\\U0001F322-\\U0001F323\\U0001F394-\\U0001F395" +
            "\\U0001F398\\U0001F39C-\\U0001F39D\\U0001F3F1-\\U0001F3F2\\U0001F3F6\\U0001F4FE\\U0001F53E-\\U0001F548" +
            "\\U0001F54F\\U0001F568-\\U0001F56E\\U0001F571-\\U0001F572\\U0001F57B-\\U0001F586\\U0001F588-\\U0001F589" +
            "\\U0001F58E-\\U0001F58F\\U0001F591-\\U0001F594\\U0001F597-\\U0001F5A3\\U0001F5A6-\\U0001F5A7\\U0001F5A9-\\U0001F5B0" +
            "\\U0001F5B3-\\U0001F5BB\\U0001F5BD-\\U0001F5C1\\U0001F5C5-\\U0001F5D0\\U0001F5D4-\\U0001F5DB\\U0001F5DF-\\U0001F5E0" +
            "\\U0001F5E2\\U0001F5E4-\\U0001F5E7\\U0001F5E9-\\U0001F5EE\\U0001F5F0-\\U0001F5F2\\U0001F5F4-\\U0001F5F9" +
            "\\u2605\\u2607-\\u260D\\u260F-\\u2610\\u2612\\u2616-\\u2617\\u2619-\\u261C\\u261E-\\u261F\\u2621\\u2624-\\u2625" +
            "\\u2627-\\u2629\\u262B-\\u262D\\u2630-\\u2637\\u263B-\\u2647\\u2654-\\u265F\\u2661-\\u2662\\u2664\\u2667" +
            "\\u2669-\\u267A\\u267C-\\u267E\\u2680-\\u2691\\u2695\\u2698\\u269A\\u269D-\\u269F\\u26A2-\\u26A9\\u26AC-\\u26AF" +
            "\\u26B2-\\u26BC\\u26BF-\\u26C3\\u26C6-\\u26C7\\u26C9-\\u26CD\\u26D0\\u26D2\\u26D5-\\u26E8\\u26EB-\\u26EF" +
            "\\u26F6\\u26FB-\\u26FC\\u26FE-\\u26FF\\u2388\\U0001FA00-\\U0001FFFD\\U0001F0A0-\\U0001F0AE\\U0001F0B1-\\U0001F0BF" +
            "\\U0001F0C1-\\U0001F0CF\\U0001F0D1-\\U0001F0F5\\U0001F0AF-\\U0001F0B0\\U0001F0C0\\U0001F0D0\\U0001F0F6-\\U0001F0FF" +
            "\\U0001F80C-\\U0001F80F\\U0001F848-\\U0001F84F\\U0001F85A-\\U0001F85F\\U0001F888-\\U0001F88F\\U0001F8AE-\\U0001F8FF" +
            "\\U0001F900-\\U0001F90F\\U0001F91F\\U0001F928-\\U0001F92F\\U0001F931-\\U0001F932\\U0001F93F\\U0001F94C-\\U0001F94F" +
            "\\U0001F95F-\\U0001F97F\\U0001F992-\\U0001F9BF\\U0001F9C1-\\U0001F9FF\\U0001F6C6-\\U0001F6CA\\U0001F6E6-\\U0001F6E8" +
            "\\U0001F6EA\\U0001F6F1-\\U0001F6F2\\U0001F6D3-\\U0001F6DF\\U0001F6ED-\\U0001F6EF\\U0001F6F7-\\U0001F6FF" +
            "]";
94
95
96    /**
97     * Monkey test subclass for testing Character (Grapheme Cluster) boundaries.
98     * Note: As of Unicode 6.1, fPrependSet is empty, so don't add it to fSets
99     */
100    static class RBBICharMonkey extends RBBIMonkeyKind {
101        List                      fSets;
102
103        UnicodeSet                fCRLFSet;
104        UnicodeSet                fControlSet;
105        UnicodeSet                fExtendSet;
106        UnicodeSet                fRegionalIndicatorSet;
107        UnicodeSet                fPrependSet;
108        UnicodeSet                fSpacingSet;
109        UnicodeSet                fLSet;
110        UnicodeSet                fVSet;
111        UnicodeSet                fTSet;
112        UnicodeSet                fLVSet;
113        UnicodeSet                fLVTSet;
114        UnicodeSet                fHangulSet;
115        UnicodeSet                fEmojiModifierSet;
116        UnicodeSet                fEmojiBaseSet;
117        UnicodeSet                fZWJSet;
118        UnicodeSet                fExtendedPictSet;
119        UnicodeSet                fEBGSet;
120        UnicodeSet                fEmojiNRKSet;
121        UnicodeSet                fAnySet;
122
123
124        StringBuffer              fText;
125
126
127        RBBICharMonkey() {
128            fText       = null;
129            fCharProperty = UProperty.GRAPHEME_CLUSTER_BREAK;
130            fCRLFSet    = new UnicodeSet("[\\r\\n]");
131            fControlSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Control}]");
132            fExtendSet  = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Extend}]");
133            fZWJSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = ZWJ}]");
134            fRegionalIndicatorSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Regional_Indicator}]");
135            fPrependSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = Prepend}]");
136            fSpacingSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = SpacingMark}]");
137            fLSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = L}]");
138            fVSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = V}]");
139            fTSet       = new UnicodeSet("[\\p{Grapheme_Cluster_Break = T}]");
140            fLVSet      = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LV}]");
141            fLVTSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = LVT}]");
142            fHangulSet  = new UnicodeSet();
143            fHangulSet.addAll(fLSet);
144            fHangulSet.addAll(fVSet);
145            fHangulSet.addAll(fTSet);
146            fHangulSet.addAll(fLVSet);
147            fHangulSet.addAll(fLVTSet);
148
149            fEmojiBaseSet     = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
150            fEmojiModifierSet = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EM}]");
151            fExtendedPictSet  = new UnicodeSet(gExtended_Pict);
152            fEBGSet           = new UnicodeSet("[\\p{Grapheme_Cluster_Break = EBG}]");
153            fEmojiNRKSet      = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");
154            fAnySet           = new UnicodeSet("[\\u0000-\\U0010ffff]");
155
156
157            fSets       = new ArrayList();
158            fSets.add(fCRLFSet);
159            fSets.add(fControlSet);
160            fSets.add(fExtendSet);
161            fSets.add(fRegionalIndicatorSet);
162            if (!fPrependSet.isEmpty()) {
163                fSets.add(fPrependSet);
164            }
165            fSets.add(fSpacingSet);
166            fSets.add(fHangulSet);
167            fSets.add(fAnySet);
168            fSets.add(fEmojiBaseSet);
169            fSets.add(fEmojiModifierSet);
170            fSets.add(fZWJSet);
171            fSets.add(fExtendedPictSet);
172            fSets.add(fEBGSet);
173            fSets.add(fEmojiNRKSet);
174        }
175
176
177        @Override
178        void setText(StringBuffer s) {
179            fText = s;
180        }
181
182        @Override
183        List charClasses() {
184            return fSets;
185        }
186
187        @Override
188        int next(int prevPos) {
189            int    /*p0,*/ p1, p2, p3;    // Indices of the significant code points around the
190            //   break position being tested.  The candidate break
191            //   location is before p2.
192
193            int     breakPos = -1;
194
195            int   c0, c1, c2, c3;     // The code points at p0, p1, p2 & p3.
196            int   cBase;              // for (X Extend*) patterns, the X character.
197
198            // Previous break at end of string.  return DONE.
199            if (prevPos >= fText.length()) {
200                return -1;
201            }
202            /* p0 = */ p1 = p2 = p3 = prevPos;
203            c3 =  UTF16.charAt(fText, prevPos);
204            c0 = c1 = c2 = cBase = 0;
205
206            // Loop runs once per "significant" character position in the input text.
207            for (;;) {
208                // Move all of the positions forward in the input string.
209                /* p0 = p1;*/  c0 = c1;
210                p1 = p2;  c1 = c2;
211                p2 = p3;  c2 = c3;
212
213                // Advance p3 by one codepoint
214                p3 = moveIndex32(fText, p3, 1);
215                c3 = (p3>=fText.length())? -1: UTF16.charAt(fText, p3);
216
217                if (p1 == p2) {
218                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
219                    continue;
220                }
221                if (p2 == fText.length()) {
222                    // Reached end of string.  Always a break position.
223                    break;
224                }
225
226                // Rule  GB3   CR x LF
227                //     No Extend or Format characters may appear between the CR and LF,
228                //     which requires the additional check for p2 immediately following p1.
229                //
230                if (c1==0x0D && c2==0x0A && p1==(p2-1)) {
231                    continue;
232                }
233
234                // Rule (GB4).   ( Control | CR | LF ) <break>
235                if (fControlSet.contains(c1) ||
236                        c1 == 0x0D ||
237                        c1 == 0x0A)  {
238                    break;
239                }
240
241                // Rule (GB5)    <break>  ( Control | CR | LF )
242                //
243                if (fControlSet.contains(c2) ||
244                        c2 == 0x0D ||
245                        c2 == 0x0A)  {
246                    break;
247                }
248
249
250                // Rule (GB6)  L x ( L | V | LV | LVT )
251                if (fLSet.contains(c1) &&
252                        (fLSet.contains(c2)  ||
253                                fVSet.contains(c2)  ||
254                                fLVSet.contains(c2) ||
255                                fLVTSet.contains(c2))) {
256                    continue;
257                }
258
259                // Rule (GB7)    ( LV | V )  x  ( V | T )
260                if ((fLVSet.contains(c1) || fVSet.contains(c1)) &&
261                        (fVSet.contains(c2) || fTSet.contains(c2)))  {
262                    continue;
263                }
264
265                // Rule (GB8)    ( LVT | T)  x T
266                if ((fLVTSet.contains(c1) || fTSet.contains(c1)) &&
267                        fTSet.contains(c2))  {
268                    continue;
269                }
270
271                // Rule (GB9)    x (Extend | ZWJ)
272                if (fExtendSet.contains(c2) || fZWJSet.contains(c2))  {
273                    if (!fExtendSet.contains(c1)) {
274                        cBase = c1;
275                    }
276                    continue;
277                }
278
279                // Rule (GB9a)   x  SpacingMark
280                if (fSpacingSet.contains(c2)) {
281                    continue;
282                }
283
284                // Rule (GB9b)   Prepend x
285                if (fPrependSet.contains(c1)) {
286                    continue;
287                }
288                // Rule (GB10)   (Emoji_Base | EBG) Extend* x Emoji_Modifier
289                if ((fEmojiBaseSet.contains(c1) || fEBGSet.contains(c1)) && fEmojiModifierSet.contains(c2)) {
290                    continue;
291                }
292                if ((fEmojiBaseSet.contains(cBase) || fEBGSet.contains(cBase)) &&
293                        fExtendSet.contains(c1) && fEmojiModifierSet.contains(c2)) {
294                    continue;
295                }
296
297                // Rule (GB11)   (Extended_Pictographic | Emoji) ZWJ x (Extended_Pictographic | Emoji)
298                if ((fExtendedPictSet.contains(c0) || fEmojiNRKSet.contains(c0)) && fZWJSet.contains(c1) &&
299                        (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
300                    continue;
301                }
302
303                // Rule (GB12-13)   Regional_Indicator x Regional_Indicator
304                //                  Note: The first if condition is a little tricky. We only need to force
305                //                      a break if there are three or more contiguous RIs. If there are
306                //                      only two, a break following will occur via other rules, and will include
307                //                      any trailing extend characters, which is needed behavior.
308                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)
309                        && fRegionalIndicatorSet.contains(c2)) {
310                    break;
311                }
312                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
313                    continue;
314                }
315
316                // Rule (GB999)  Any  <break>  Any
317                break;
318            }
319
320            breakPos = p2;
321            return breakPos;
322        }
323    }
324
325
326    /**
327     *
328     * Word Monkey Test Class
329     *
330     *
331     *
332     */
333    static class RBBIWordMonkey extends RBBIMonkeyKind {
334        List                      fSets;
335        StringBuffer              fText;
336
337        UnicodeSet                fCRSet;
338        UnicodeSet                fLFSet;
339        UnicodeSet                fNewlineSet;
340        UnicodeSet                fRegionalIndicatorSet;
341        UnicodeSet                fKatakanaSet;
342        UnicodeSet                fHebrew_LetterSet;
343        UnicodeSet                fALetterSet;
344        UnicodeSet                fSingle_QuoteSet;
345        UnicodeSet                fDouble_QuoteSet;
346        UnicodeSet                fMidNumLetSet;
347        UnicodeSet                fMidLetterSet;
348        UnicodeSet                fMidNumSet;
349        UnicodeSet                fNumericSet;
350        UnicodeSet                fFormatSet;
351        UnicodeSet                fExtendSet;
352        UnicodeSet                fExtendNumLetSet;
353        UnicodeSet                fOtherSet;
354        UnicodeSet                fDictionarySet;
355        UnicodeSet                fEBaseSet;
356        UnicodeSet                fEBGSet;
357        UnicodeSet                fEModifierSet;
358        UnicodeSet                fZWJSet;
359        UnicodeSet                fExtendedPictSet;
360        UnicodeSet                fEmojiNRKSet;
361
362
363        RBBIWordMonkey() {
364            fCharProperty    = UProperty.WORD_BREAK;
365
366            fCRSet           = new UnicodeSet("[\\p{Word_Break = CR}]");
367            fLFSet           = new UnicodeSet("[\\p{Word_Break = LF}]");
368            fNewlineSet      = new UnicodeSet("[\\p{Word_Break = Newline}]");
369            fRegionalIndicatorSet = new UnicodeSet("[\\p{Word_Break = Regional_Indicator}]");
370            fKatakanaSet     = new UnicodeSet("[\\p{Word_Break = Katakana}]");
371            fHebrew_LetterSet = new UnicodeSet("[\\p{Word_Break = Hebrew_Letter}]");
372            fALetterSet      = new UnicodeSet("[\\p{Word_Break = ALetter}]");
373            fSingle_QuoteSet = new UnicodeSet("[\\p{Word_Break = Single_Quote}]");
374            fDouble_QuoteSet = new UnicodeSet("[\\p{Word_Break = Double_Quote}]");
375            fMidNumLetSet    = new UnicodeSet("[\\p{Word_Break = MidNumLet}]");
376            fMidLetterSet    = new UnicodeSet("[\\p{Word_Break = MidLetter}]");
377            fMidNumSet       = new UnicodeSet("[\\p{Word_Break = MidNum}]");
378            fNumericSet      = new UnicodeSet("[\\p{Word_Break = Numeric}]");
379            fFormatSet       = new UnicodeSet("[\\p{Word_Break = Format}]");
380            fExtendNumLetSet = new UnicodeSet("[\\p{Word_Break = ExtendNumLet}]");
381            fExtendSet       = new UnicodeSet("[\\p{Word_Break = Extend}]");
382            fEBaseSet        = new UnicodeSet("[\\p{Word_Break = EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
383            fEBGSet          = new UnicodeSet("[\\p{Word_Break = EBG}]");
384            fEModifierSet    = new UnicodeSet("[\\p{Word_Break = EM}]");
385            fZWJSet          = new UnicodeSet("[\\p{Word_Break = ZWJ}]");
386            fExtendedPictSet = new UnicodeSet(gExtended_Pict);
387            fEmojiNRKSet     = new UnicodeSet("[[\\p{Emoji}]-[\\p{Grapheme_Cluster_Break = Regional_Indicator}*#0-9©®™〰〽]]");
388
389            fDictionarySet = new UnicodeSet("[[\\uac00-\\ud7a3][:Han:][:Hiragana:]]");
390            fDictionarySet.addAll(fKatakanaSet);
391            fDictionarySet.addAll(new UnicodeSet("[\\p{LineBreak = Complex_Context}]"));
392
393            fALetterSet.removeAll(fDictionarySet);
394
395            fOtherSet        = new UnicodeSet();
396            fOtherSet.complement();
397            fOtherSet.removeAll(fCRSet);
398            fOtherSet.removeAll(fLFSet);
399            fOtherSet.removeAll(fNewlineSet);
400            fOtherSet.removeAll(fALetterSet);
401            fOtherSet.removeAll(fSingle_QuoteSet);
402            fOtherSet.removeAll(fDouble_QuoteSet);
403            fOtherSet.removeAll(fKatakanaSet);
404            fOtherSet.removeAll(fHebrew_LetterSet);
405            fOtherSet.removeAll(fMidLetterSet);
406            fOtherSet.removeAll(fMidNumSet);
407            fOtherSet.removeAll(fNumericSet);
408            fOtherSet.removeAll(fFormatSet);
409            fOtherSet.removeAll(fExtendSet);
410            fOtherSet.removeAll(fExtendNumLetSet);
411            fOtherSet.removeAll(fRegionalIndicatorSet);
412            fOtherSet.removeAll(fEBaseSet);
413            fOtherSet.removeAll(fEBGSet);
414            fOtherSet.removeAll(fEModifierSet);
415            fOtherSet.removeAll(fZWJSet);
416            fOtherSet.removeAll(fExtendedPictSet);
417            fOtherSet.removeAll(fEmojiNRKSet);
418
419            // Inhibit dictionary characters from being tested at all.
420            // remove surrogates so as to not generate higher CJK characters
421            fOtherSet.removeAll(new UnicodeSet("[[\\p{LineBreak = Complex_Context}][:Line_Break=Surrogate:]]"));
422            fOtherSet.removeAll(fDictionarySet);
423
424            fSets            = new ArrayList();
425            fSets.add(fCRSet);
426            fSets.add(fLFSet);
427            fSets.add(fNewlineSet);
428            fSets.add(fRegionalIndicatorSet);
429            fSets.add(fHebrew_LetterSet);
430            fSets.add(fALetterSet);
431            //fSets.add(fKatakanaSet);  // Omit Katakana from fSets, which omits Katakana characters
432            // from the test data. They are all in the dictionary set,
433            // which this (old, to be retired) monkey test cannot handle.
434            fSets.add(fSingle_QuoteSet);
435            fSets.add(fDouble_QuoteSet);
436            fSets.add(fMidLetterSet);
437            fSets.add(fMidNumLetSet);
438            fSets.add(fMidNumSet);
439            fSets.add(fNumericSet);
440            fSets.add(fFormatSet);
441            fSets.add(fExtendSet);
442            fSets.add(fExtendNumLetSet);
443            fSets.add(fRegionalIndicatorSet);
444            fSets.add(fEBaseSet);
445            fSets.add(fEBGSet);
446            fSets.add(fEModifierSet);
447            fSets.add(fZWJSet);
448            fSets.add(fExtendedPictSet);
449            fSets.add(fEmojiNRKSet);
450            fSets.add(fOtherSet);
451        }
452
453
454        @Override
455        List  charClasses() {
456            return fSets;
457        }
458
459        @Override
460        void   setText(StringBuffer s) {
461            fText = s;
462        }
463
464        @Override
465        int   next(int prevPos) {
466            int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
467            //   break position being tested.  The candidate break
468            //   location is before p2.
469            int     breakPos = -1;
470
471            int c0, c1, c2, c3;   // The code points at p0, p1, p2 & p3.
472
473            // Previous break at end of string.  return DONE.
474            if (prevPos >= fText.length()) {
475                return -1;
476            }
477            /*p0 =*/ p1 = p2 = p3 = prevPos;
478            c3 = UTF16.charAt(fText, prevPos);
479            c0 = c1 = c2 = 0;
480
481
482
483            // Loop runs once per "significant" character position in the input text.
484            for (;;) {
485                // Move all of the positions forward in the input string.
486                /*p0 = p1;*/  c0 = c1;
487                p1 = p2;  c1 = c2;
488                p2 = p3;  c2 = c3;
489
490                // Advance p3 by    X(Extend | Format)*   Rule 4
491                //    But do not advance over Extend & Format following a new line. (Unicode 5.1 change)
492                do {
493                    p3 = moveIndex32(fText, p3, 1);
494                    c3 = -1;
495                    if (p3>=fText.length()) {
496                        break;
497                    }
498                    c3 = UTF16.charAt(fText, p3);
499                    if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
500                        break;
501                    }
502                }
503                while (setContains(fFormatSet, c3) || setContains(fExtendSet, c3) || setContains(fZWJSet, c3));
504
505                if (p1 == p2) {
506                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
507                    continue;
508                }
509                if (p2 == fText.length()) {
510                    // Reached end of string.  Always a break position.
511                    break;
512                }
513
514                // Rule (3)   CR x LF
515                //     No Extend or Format characters may appear between the CR and LF,
516                //     which requires the additional check for p2 immediately following p1.
517                //
518                if (c1==0x0D && c2==0x0A) {
519                    continue;
520                }
521
522                // Rule (3a)  Break before and after newlines (including CR and LF)
523                //
524                if (fCRSet.contains(c1) || fLFSet.contains(c1) || fNewlineSet.contains(c1)) {
525                    break;
526                }
527                if (fCRSet.contains(c2) || fLFSet.contains(c2) || fNewlineSet.contains(c2)) {
528                    break;
529                }
530
531                // Rule (3c)    ZWJ x (Extended_Pictographic | Emoji).
532                //              Not ignoring extend chars, so peek into input text to
533                //              get the potential ZWJ, the character immediately preceding c2.
534                if (fZWJSet.contains(fText.codePointBefore(p2)) && (fExtendedPictSet.contains(c2) || fEmojiNRKSet.contains(c2))) {
535                    continue;
536                }
537
538                // Rule (5).   (ALetter | Hebrew_Letter) x (ALetter | Hebrew_Letter)
539                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
540                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
541                    continue;
542                }
543
544                // Rule (6)  (ALetter | Hebrew_Letter)  x  (MidLetter | MidNumLet | Single_Quote) (ALetter | Hebrew_Letter)
545                //
546                if ( (fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1))   &&
547                        (fMidLetterSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2)) &&
548                        (setContains(fALetterSet, c3) || setContains(fHebrew_LetterSet, c3))) {
549                    continue;
550                }
551
552                // Rule (7)  (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | Single_Quote)  x  (ALetter | Hebrew_Letter)
553                if ((fALetterSet.contains(c0) || fHebrew_LetterSet.contains(c0)) &&
554                        (fMidLetterSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1)) &&
555                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2))) {
556                    continue;
557                }
558
559                // Rule (7a)     Hebrew_Letter x Single_Quote
560                if (fHebrew_LetterSet.contains(c1) && fSingle_QuoteSet.contains(c2)) {
561                    continue;
562                }
563
564                // Rule (7b)    Hebrew_Letter x Double_Quote Hebrew_Letter
565                if (fHebrew_LetterSet.contains(c1) && fDouble_QuoteSet.contains(c2) && setContains(fHebrew_LetterSet,c3)) {
566                    continue;
567                }
568
569                // Rule (7c)    Hebrew_Letter Double_Quote x Hebrew_Letter
570                if (fHebrew_LetterSet.contains(c0) && fDouble_QuoteSet.contains(c1) && fHebrew_LetterSet.contains(c2)) {
571                    continue;
572                }
573
574                //  Rule (8)    Numeric x Numeric
575                if (fNumericSet.contains(c1) &&
576                        fNumericSet.contains(c2))  {
577                    continue;
578                }
579
580                // Rule (9)    (ALetter | Hebrew_Letter) x Numeric
581                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1)) &&
582                        fNumericSet.contains(c2))  {
583                    continue;
584                }
585
586                // Rule (10)    Numeric x (ALetter | Hebrew_Letter)
587                if (fNumericSet.contains(c1) &&
588                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2)))  {
589                    continue;
590                }
591
592                // Rule (11)   Numeric (MidNum | MidNumLet | Single_Quote)  x  Numeric
593                if (fNumericSet.contains(c0) &&
594                        (fMidNumSet.contains(c1) || fMidNumLetSet.contains(c1) || fSingle_QuoteSet.contains(c1))  &&
595                        fNumericSet.contains(c2)) {
596                    continue;
597                }
598
599                // Rule (12)  Numeric x (MidNum | MidNumLet | SingleQuote) Numeric
600                if (fNumericSet.contains(c1) &&
601                        (fMidNumSet.contains(c2) || fMidNumLetSet.contains(c2) || fSingle_QuoteSet.contains(c2))  &&
602                        setContains(fNumericSet, c3)) {
603                    continue;
604                }
605
606                // Rule (13)  Katakana x Katakana
607                //            Note: matches UAX 29 rules, but doesn't come into play for ICU because
608                //                  all Katakana are handled by the dictionary breaker.
609                if (fKatakanaSet.contains(c1) &&
610                        fKatakanaSet.contains(c2))  {
611                    continue;
612                }
613
614                // Rule 13a    (ALetter | Hebrew_Letter | Numeric | KataKana | ExtendNumLet) x ExtendNumLet
615                if ((fALetterSet.contains(c1) || fHebrew_LetterSet.contains(c1) ||fNumericSet.contains(c1) ||
616                        fKatakanaSet.contains(c1) || fExtendNumLetSet.contains(c1)) &&
617                        fExtendNumLetSet.contains(c2)) {
618                    continue;
619                }
620
621                // Rule 13b   ExtendNumLet x (ALetter | Hebrew_Letter | Numeric | Katakana)
622                if (fExtendNumLetSet.contains(c1) &&
623                        (fALetterSet.contains(c2) || fHebrew_LetterSet.contains(c2) ||
624                                fNumericSet.contains(c2) || fKatakanaSet.contains(c2)))  {
625                    continue;
626                }
627
628
629                // Rule 14 (E_Base | EBG) x E_Modifier
630                if ((fEBaseSet.contains(c1)  || fEBGSet.contains(c1)) && fEModifierSet.contains(c2)) {
631                    continue;
632                }
633
634                // Rule 15 - 17   Group piars of Regional Indicators
635                if (fRegionalIndicatorSet.contains(c0) && fRegionalIndicatorSet.contains(c1)) {
636                    break;
637                }
638                if (fRegionalIndicatorSet.contains(c1) && fRegionalIndicatorSet.contains(c2)) {
639                    continue;
640                }
641
642                // Rule 999.  Break found here.
643                break;
644            }
645
646            breakPos = p2;
647            return breakPos;
648        }
649
650    }
651
652
653    static class RBBILineMonkey extends RBBIMonkeyKind {
654
655        List        fSets;
656
657        // UnicodeSets for each of the Line Breaking character classes.
658        // Order matches that of Unicode UAX 14, Table 1, which makes it a little easier
659        // to verify that they are all accounted for.
660
661        UnicodeSet  fBK;
662        UnicodeSet  fCR;
663        UnicodeSet  fLF;
664        UnicodeSet  fCM;
665        UnicodeSet  fNL;
666        UnicodeSet  fSG;
667        UnicodeSet  fWJ;
668        UnicodeSet  fZW;
669        UnicodeSet  fGL;
670        UnicodeSet  fSP;
671        UnicodeSet  fB2;
672        UnicodeSet  fBA;
673        UnicodeSet  fBB;
674        UnicodeSet  fHY;
675        UnicodeSet  fCB;
676        UnicodeSet  fCL;
677        UnicodeSet  fCP;
678        UnicodeSet  fEX;
679        UnicodeSet  fIN;
680        UnicodeSet  fNS;
681        UnicodeSet  fOP;
682        UnicodeSet  fQU;
683        UnicodeSet  fIS;
684        UnicodeSet  fNU;
685        UnicodeSet  fPO;
686        UnicodeSet  fPR;
687        UnicodeSet  fSY;
688        UnicodeSet  fAI;
689        UnicodeSet  fAL;
690        UnicodeSet  fCJ;
691        UnicodeSet  fH2;
692        UnicodeSet  fH3;
693        UnicodeSet  fHL;
694        UnicodeSet  fID;
695        UnicodeSet  fJL;
696        UnicodeSet  fJV;
697        UnicodeSet  fJT;
698        UnicodeSet  fRI;
699        UnicodeSet  fXX;
700        UnicodeSet  fEB;
701        UnicodeSet  fEM;
702        UnicodeSet  fZWJ;
703        UnicodeSet  fExtendedPict;
704        UnicodeSet  fEmojiNRK;
705
706        StringBuffer  fText;
707        int           fOrigPositions;
708
709
710
711        RBBILineMonkey()
712        {
713            fCharProperty  = UProperty.LINE_BREAK;
714            fSets          = new ArrayList();
715
716            fBK    = new UnicodeSet("[\\p{Line_Break=BK}]");
717            fCR    = new UnicodeSet("[\\p{Line_break=CR}]");
718            fLF    = new UnicodeSet("[\\p{Line_break=LF}]");
719            fCM    = new UnicodeSet("[\\p{Line_break=CM}]");
720            fNL    = new UnicodeSet("[\\p{Line_break=NL}]");
721            fSG    = new UnicodeSet("[\\ud800-\\udfff]");
722            fWJ    = new UnicodeSet("[\\p{Line_break=WJ}]");
723            fZW    = new UnicodeSet("[\\p{Line_break=ZW}]");
724            fGL    = new UnicodeSet("[\\p{Line_break=GL}]");
725            fSP    = new UnicodeSet("[\\p{Line_break=SP}]");
726            fB2    = new UnicodeSet("[\\p{Line_break=B2}]");
727            fBA    = new UnicodeSet("[\\p{Line_break=BA}]");
728            fBB    = new UnicodeSet("[\\p{Line_break=BB}]");
729            fHY    = new UnicodeSet("[\\p{Line_break=HY}]");
730            fCB    = new UnicodeSet("[\\p{Line_break=CB}]");
731            fCL    = new UnicodeSet("[\\p{Line_break=CL}]");
732            fCP    = new UnicodeSet("[\\p{Line_break=CP}]");
733            fEX    = new UnicodeSet("[\\p{Line_break=EX}]");
734            fIN    = new UnicodeSet("[\\p{Line_break=IN}]");
735            fNS    = new UnicodeSet("[\\p{Line_break=NS}]");
736            fOP    = new UnicodeSet("[\\p{Line_break=OP}]");
737            fQU    = new UnicodeSet("[\\p{Line_break=QU}]");
738            fIS    = new UnicodeSet("[\\p{Line_break=IS}]");
739            fNU    = new UnicodeSet("[\\p{Line_break=NU}]");
740            fPO    = new UnicodeSet("[\\p{Line_break=PO}]");
741            fPR    = new UnicodeSet("[\\p{Line_break=PR}]");
742            fSY    = new UnicodeSet("[\\p{Line_break=SY}]");
743            fAI    = new UnicodeSet("[\\p{Line_break=AI}]");
744            fAL    = new UnicodeSet("[\\p{Line_break=AL}]");
745            fCJ    = new UnicodeSet("[\\p{Line_break=CJ}]");
746            fH2    = new UnicodeSet("[\\p{Line_break=H2}]");
747            fH3    = new UnicodeSet("[\\p{Line_break=H3}]");
748            fHL    = new UnicodeSet("[\\p{Line_break=HL}]");
749            fID    = new UnicodeSet("[\\p{Line_break=ID}]");
750            fJL    = new UnicodeSet("[\\p{Line_break=JL}]");
751            fJV    = new UnicodeSet("[\\p{Line_break=JV}]");
752            fJT    = new UnicodeSet("[\\p{Line_break=JT}]");
753            fRI    = new UnicodeSet("[\\p{Line_break=RI}]");
754            fXX    = new UnicodeSet("[\\p{Line_break=XX}]");
755            fEB    = new UnicodeSet("[\\p{Line_break=EB}\\U0001F3C2\\U0001F3C7\\U0001F3CC\\U0001F46A-\\U0001F46D\\U0001F46F\\U0001F574\\U0001F6CC]");
756            fEM    = new UnicodeSet("[\\p{Line_break=EM}]");
757            fZWJ   = new UnicodeSet("[\\p{Line_break=ZWJ}]");
758            fEmojiNRK = new UnicodeSet("[[\\p{Emoji}]-[\\p{Line_break=RI}*#0-9©®™〰〽]]");
759            fExtendedPict = new UnicodeSet(gExtended_Pict);
760
761
762            // Remove dictionary characters.
763            // The monkey test reference implementation of line break does not replicate the dictionary behavior,
764            // so dictionary characters are omitted from the monkey test data.
765            @SuppressWarnings("unused")
766            UnicodeSet dictionarySet = new UnicodeSet(
767                    "[[:LineBreak = Complex_Context:] & [[:Script = Thai:][:Script = Lao:][:Script = Khmer:] [:script = Myanmar:]]]");
768
769            fAL.addAll(fXX);     // Default behavior for XX is identical to AL
770            fAL.addAll(fAI);     // Default behavior for AI is identical to AL
771            fAL.addAll(fSG);     // Default behavior for SG (unpaired surrogates) is AL
772
773            fNS.addAll(fCJ);     // Default behavior for CJ is identical to NS.
774            fCM.addAll(fZWJ);    // ZWJ behaves as a CM.
775
776            fSets.add(fBK);
777            fSets.add(fCR);
778            fSets.add(fLF);
779            fSets.add(fCM);
780            fSets.add(fNL);
781            fSets.add(fWJ);
782            fSets.add(fZW);
783            fSets.add(fGL);
784            fSets.add(fSP);
785            fSets.add(fB2);
786            fSets.add(fBA);
787            fSets.add(fBB);
788            fSets.add(fHY);
789            fSets.add(fCB);
790            fSets.add(fCL);
791            fSets.add(fCP);
792            fSets.add(fEX);
793            fSets.add(fIN);
794            fSets.add(fJL);
795            fSets.add(fJT);
796            fSets.add(fJV);
797            fSets.add(fNS);
798            fSets.add(fOP);
799            fSets.add(fQU);
800            fSets.add(fIS);
801            fSets.add(fNU);
802            fSets.add(fPO);
803            fSets.add(fPR);
804            fSets.add(fSY);
805            fSets.add(fAI);
806            fSets.add(fAL);
807            fSets.add(fH2);
808            fSets.add(fH3);
809            fSets.add(fHL);
810            fSets.add(fID);
811            fSets.add(fWJ);
812            fSets.add(fRI);
813            fSets.add(fSG);
814            fSets.add(fEB);
815            fSets.add(fEM);
816            fSets.add(fZWJ);
817            fSets.add(fExtendedPict);
818            fSets.add(fEmojiNRK);
819        }
820
821        @Override
822        void setText(StringBuffer s) {
823            fText       = s;
824        }
825
826
827
828
829        @Override
830        int next(int startPos) {
831            int    pos;       //  Index of the char following a potential break position
832            int    thisChar;  //  Character at above position "pos"
833
834            int    prevPos;   //  Index of the char preceding a potential break position
835            int    prevChar;  //  Character at above position.  Note that prevChar
836            //   and thisChar may not be adjacent because combining
837            //   characters between them will be ignored.
838            int    prevCharX2; //  Character before prevChar, more contex for LB 21a
839
840            int    nextPos;   //  Index of the next character following pos.
841            //     Usually skips over combining marks.
842            int    tPos;      //  temp value.
843            int    matchVals[]  = null;       // Number  Expression Match Results
844
845
846            if (startPos >= fText.length()) {
847                return -1;
848            }
849
850
851            // Initial values for loop.  Loop will run the first time without finding breaks,
852            //                           while the invalid values shift out and the "this" and
853            //                           "prev" positions are filled in with good values.
854            pos      = prevPos   = -1;    // Invalid value, serves as flag for initial loop iteration.
855            thisChar = prevChar  = prevCharX2 = 0;
856            nextPos  = startPos;
857
858
859            // Loop runs once per position in the test text, until a break position
860            //  is found.  In each iteration, we are testing for a possible break
861            //  just preceding the character at index "pos".  The character preceding
862            //  this char is at postion "prevPos"; because of combining sequences,
863            //  "prevPos" can be arbitrarily far before "pos".
864            for (;;) {
865                // Advance to the next position to be tested.
866                prevCharX2 = prevChar;
867                prevPos   = pos;
868                prevChar  = thisChar;
869                pos       = nextPos;
870                nextPos   = moveIndex32(fText, pos, 1);
871
872                // Rule LB2 - Break at end of text.
873                if (pos >= fText.length()) {
874                    break;
875                }
876
877                // Rule LB 9 - adjust for combining sequences.
878                //             We do this rule out-of-order because the adjustment does
879                //             not effect the way that rules LB 3 through LB 6 match,
880                //             and doing it here rather than after LB 6 is substantially
881                //             simpler when combining sequences do occur.
882
883
884                // LB 9         Keep combining sequences together.
885                //              advance over any CM class chars at "pos",
886                //              result is "nextPos" for the following loop iteration.
887                thisChar  = UTF16.charAt(fText, pos);
888                if (!(fSP.contains(thisChar) || fBK.contains(thisChar) || thisChar==0x0d ||
889                        thisChar==0x0a || fNL.contains(thisChar) || fZW.contains(thisChar) )) {
890                    for (;;) {
891                        if (nextPos == fText.length()) {
892                            break;
893                        }
894                        int nextChar = UTF16.charAt(fText, nextPos);
895                        if (!fCM.contains(nextChar)) {
896                            break;
897                        }
898                        nextPos = moveIndex32(fText, nextPos, 1);
899                    }
900                }
901
902                // LB 9 Treat X CM* as if it were X
903                //        No explicit action required.
904
905                // LB 10     Treat any remaining combining mark as AL
906                if (fCM.contains(thisChar)) {
907                    thisChar = 'A';
908                }
909
910
911                // If the loop is still warming up - if we haven't shifted the initial
912                //   -1 positions out of prevPos yet - loop back to advance the
913                //    position in the input without any further looking for breaks.
914                if (prevPos == -1) {
915                    continue;
916                }
917
918                // LB 4  Always break after hard line breaks,
919                if (fBK.contains(prevChar)) {
920                    break;
921                }
922
923                // LB 5  Break after CR, LF, NL, but not inside CR LF
924                if (fCR.contains(prevChar) && fLF.contains(thisChar)) {
925                    continue;
926                }
927                if  (fCR.contains(prevChar) ||
928                        fLF.contains(prevChar) ||
929                        fNL.contains(prevChar))  {
930                    break;
931                }
932
933                // LB 6  Don't break before hard line breaks
934                if (fBK.contains(thisChar) || fCR.contains(thisChar) ||
935                        fLF.contains(thisChar) || fNL.contains(thisChar) ) {
936                    continue;
937                }
938
939
940                // LB 7  Don't break before spaces or zero-width space.
941                if (fSP.contains(thisChar)) {
942                    continue;
943                }
944
945                if (fZW.contains(thisChar)) {
946                    continue;
947                }
948
949                // LB 8  Break after zero width space
950                if (fZW.contains(prevChar)) {
951                    break;
952                }
953
954                // LB 8a:  ZWJ x (ID | Extended_Pictographic | Emoji)
955                //       The monkey test's way of ignoring combining characters doesn't work
956                //       for this rule. ZWJ is also a CM. Need to get the actual character
957                //       preceding "thisChar", not ignoring combining marks, possibly ZWJ.
958                {
959                    int prevC = fText.codePointBefore(pos);
960                    if (fZWJ.contains(prevC) && (fID.contains(thisChar) || fExtendedPict.contains(thisChar) || fEmojiNRK.contains(thisChar))) {
961                        continue;
962                    }
963                }
964
965                //  LB 9, 10  Already done, at top of loop.
966                //
967
968
969                // LB 11
970                //    x  WJ
971                //    WJ  x
972                if (fWJ.contains(thisChar) || fWJ.contains(prevChar)) {
973                    continue;
974                }
975
976
977                // LB 12
978                //        GL x
979                if (fGL.contains(prevChar)) {
980                    continue;
981                }
982
983                // LB 12a
984                //    [^SP BA HY] x GL
985                if (!(fSP.contains(prevChar) ||
986                        fBA.contains(prevChar) ||
987                        fHY.contains(prevChar)     ) && fGL.contains(thisChar)) {
988                    continue;
989                }
990
991
992
993                // LB 13  Don't break before closings.
994                //       NU x CL, NU x CP  and NU x IS are not matched here so that they will
995                //       fall into LB 17 and the more general number regular expression.
996                //
997                if (!fNU.contains(prevChar) && fCL.contains(thisChar) ||
998                        !fNU.contains(prevChar) && fCP.contains(thisChar) ||
999                        fEX.contains(thisChar) ||
1000                        !fNU.contains(prevChar) && fIS.contains(thisChar) ||
1001                        !fNU.contains(prevChar) && fSY.contains(thisChar))    {
1002                    continue;
1003                }
1004
1005                // LB 14  Don't break after OP SP*
1006                //       Scan backwards, checking for this sequence.
1007                //       The OP char could include combining marks, so we actually check for
1008                //           OP CM* SP* x
1009                tPos = prevPos;
1010                if (fSP.contains(prevChar)) {
1011                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1012                        tPos=moveIndex32(fText, tPos, -1);
1013                    }
1014                }
1015                while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1016                    tPos=moveIndex32(fText, tPos, -1);
1017                }
1018                if (fOP.contains(UTF16.charAt(fText, tPos))) {
1019                    continue;
1020                }
1021
1022                // LB 15 Do not break within "[
1023                //       QU CM* SP* x OP
1024                if (fOP.contains(thisChar)) {
1025                    // Scan backwards from prevChar to see if it is preceded by QU CM* SP*
1026                    tPos = prevPos;
1027                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1028                        tPos = moveIndex32(fText, tPos, -1);
1029                    }
1030                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1031                        tPos = moveIndex32(fText, tPos, -1);
1032                    }
1033                    if (fQU.contains(UTF16.charAt(fText, tPos))) {
1034                        continue;
1035                    }
1036                }
1037
1038                // LB 16   (CL | CP) SP* x NS
1039                if (fNS.contains(thisChar)) {
1040                    tPos = prevPos;
1041                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1042                        tPos = moveIndex32(fText, tPos, -1);
1043                    }
1044                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1045                        tPos = moveIndex32(fText, tPos, -1);
1046                    }
1047                    if (fCL.contains(UTF16.charAt(fText, tPos)) || fCP.contains(UTF16.charAt(fText, tPos))) {
1048                        continue;
1049                    }
1050                }
1051
1052
1053                // LB 17        B2 SP* x B2
1054                if (fB2.contains(thisChar)) {
1055                    tPos = prevPos;
1056                    while (tPos > 0 && fSP.contains(UTF16.charAt(fText, tPos))) {
1057                        tPos = moveIndex32(fText, tPos, -1);
1058                    }
1059                    while (tPos > 0 && fCM.contains(UTF16.charAt(fText, tPos))) {
1060                        tPos = moveIndex32(fText, tPos, -1);
1061                    }
1062                    if (fB2.contains(UTF16.charAt(fText, tPos))) {
1063                        continue;
1064                    }
1065                }
1066
1067                // LB 18    break after space
1068                if (fSP.contains(prevChar)) {
1069                    break;
1070                }
1071
1072                // LB 19
1073                //    x   QU
1074                //    QU  x
1075                if (fQU.contains(thisChar) || fQU.contains(prevChar)) {
1076                    continue;
1077                }
1078
1079                // LB 20  Break around a CB
1080                if (fCB.contains(thisChar) || fCB.contains(prevChar)) {
1081                    break;
1082                }
1083
1084                // LB 21
1085                if (fBA.contains(thisChar) ||
1086                        fHY.contains(thisChar) ||
1087                        fNS.contains(thisChar) ||
1088                        fBB.contains(prevChar) )   {
1089                    continue;
1090                }
1091
1092                // LB 21a, HL (HY | BA) x
1093                if (fHL.contains(prevCharX2) && (fHY.contains(prevChar) || fBA.contains(prevChar))) {
1094                    continue;
1095                }
1096
1097                // LB 21b, SY x HL
1098                if (fSY.contains(prevChar) && fHL.contains(thisChar)) {
1099                    continue;
1100                }
1101
1102                // LB 22
1103                if (fAL.contains(prevChar) && fIN.contains(thisChar) ||
1104                        fEX.contains(prevChar) && fIN.contains(thisChar) ||
1105                        fHL.contains(prevChar) && fIN.contains(thisChar) ||
1106                        (fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) && fIN.contains(thisChar) ||
1107                        fIN.contains(prevChar) && fIN.contains(thisChar) ||
1108                        fNU.contains(prevChar) && fIN.contains(thisChar) )   {
1109                    continue;
1110                }
1111
1112                // LB 23    (AL | HL) x NU
1113                //          NU x (AL | HL)
1114                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && fNU.contains(thisChar)) {
1115                    continue;
1116                }
1117                if (fNU.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1118                    continue;
1119                }
1120
1121                // LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes.
1122                //      PR x (ID | EB | EM)
1123                //     (ID | EB | EM) x PO
1124                if (fPR.contains(prevChar) &&
1125                        (fID.contains(thisChar) || fEB.contains(thisChar) || fEM.contains(thisChar)))  {
1126                    continue;
1127                }
1128                if ((fID.contains(prevChar) || fEB.contains(prevChar) || fEM.contains(prevChar)) &&
1129                        fPO.contains(thisChar)) {
1130                    continue;
1131                }
1132
1133                // LB 24  Do not break between prefix and letters or ideographs.
1134                //         (PR | PO) x (AL | HL)
1135                //         (AL | HL) x (PR | PO)
1136                if ((fPR.contains(prevChar) || fPO.contains(prevChar)) &&
1137                        (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1138                    continue;
1139                }
1140                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) &&
1141                        (fPR.contains(thisChar) || fPO.contains(thisChar))) {
1142                    continue;
1143                }
1144
1145
1146                // LB 25    Numbers
1147                matchVals = LBNumberCheck(fText, prevPos, matchVals);
1148                if (matchVals[0] != -1) {
1149                    // Matched a number.  But could have been just a single digit, which would
1150                    //    not represent a "no break here" between prevChar and thisChar
1151                    int numEndIdx = matchVals[1];  // idx of first char following num
1152                    if (numEndIdx > pos) {
1153                        // Number match includes at least the two chars being checked
1154                        if (numEndIdx > nextPos) {
1155                            // Number match includes additional chars.  Update pos and nextPos
1156                            //   so that next loop iteration will continue at the end of the number,
1157                            //   checking for breaks between last char in number & whatever follows.
1158                            nextPos = numEndIdx;
1159                            pos     = numEndIdx;
1160                            do {
1161                                pos = moveIndex32(fText, pos, -1);
1162                                thisChar = UTF16.charAt(fText, pos);
1163                            }
1164                            while (fCM.contains(thisChar));
1165                        }
1166                        continue;
1167                    }
1168                }
1169
1170
1171                // LB 26  Do not break Korean Syllables
1172                if (fJL.contains(prevChar) && (fJL.contains(thisChar) ||
1173                        fJV.contains(thisChar) ||
1174                        fH2.contains(thisChar) ||
1175                        fH3.contains(thisChar))) {
1176                    continue;
1177                }
1178
1179                if ((fJV.contains(prevChar) || fH2.contains(prevChar))  &&
1180                        (fJV.contains(thisChar) || fJT.contains(thisChar))) {
1181                    continue;
1182                }
1183
1184                if ((fJT.contains(prevChar) || fH3.contains(prevChar)) &&
1185                        fJT.contains(thisChar)) {
1186                    continue;
1187                }
1188
1189                // LB 27 Treat a Korean Syllable Block the same as ID
1190                if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1191                        fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1192                        fIN.contains(thisChar)) {
1193                    continue;
1194                }
1195                if ((fJL.contains(prevChar) || fJV.contains(prevChar) ||
1196                        fJT.contains(prevChar) || fH2.contains(prevChar) || fH3.contains(prevChar)) &&
1197                        fPO.contains(thisChar)) {
1198                    continue;
1199                }
1200                if (fPR.contains(prevChar) && (fJL.contains(thisChar) || fJV.contains(thisChar) ||
1201                        fJT.contains(thisChar) || fH2.contains(thisChar) || fH3.contains(thisChar))) {
1202                    continue;
1203                }
1204
1205
1206
1207                // LB 28 Do not break between alphabetics
1208                if ((fAL.contains(prevChar) || fHL.contains(prevChar)) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1209                    continue;
1210                }
1211
1212                // LB 29  Do not break between numeric punctuation and alphabetics
1213                if (fIS.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar))) {
1214                    continue;
1215                }
1216
1217                // LB 30    Do not break between letters, numbers, or ordinary symbols and opening or closing punctuation.
1218                //          (AL | NU) x OP
1219                //          CP x (AL | NU)
1220                if ((fAL.contains(prevChar) || fHL.contains(prevChar) || fNU.contains(prevChar)) && fOP.contains(thisChar)) {
1221                    continue;
1222                }
1223                if (fCP.contains(prevChar) && (fAL.contains(thisChar) || fHL.contains(thisChar) || fNU.contains(thisChar))) {
1224                    continue;
1225                }
1226
1227                // LB 30a   Break between pairs of Regional Indicators.
1228                //             RI RI <break> RI
1229                //             RI    x    RI
1230                if (fRI.contains(prevCharX2) && fRI.contains(prevChar) && fRI.contains(thisChar)) {
1231                    break;
1232                }
1233                if (fRI.contains(prevChar) && fRI.contains(thisChar)) {
1234                    continue;
1235                }
1236
1237                // LB30b    Emoji Base x Emoji Modifier
1238                if (fEB.contains(prevChar) && fEM.contains(thisChar)) {
1239                    continue;
1240                }
1241                // LB 31    Break everywhere else
1242                break;
1243            }
1244
1245            return pos;
1246        }
1247
1248
1249
1250        // Match the following regular expression in the input text.
1251        //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * ((CL | CP) CM*)?  (PR | PO) CM*)?
1252        //      0    0   1       3    3    4              7    7    7    7      9    9    9     11   11    (match states)
1253        //  retVals array  [0]  index of the start of the match, or -1 if no match
1254        //                 [1]  index of first char following the match.
1255        //  Can not use Java regex because need supplementary character support,
1256        //     and because Unicode char properties version must be the same as in
1257        //     the version of ICU being tested.
1258        private int[] LBNumberCheck(StringBuffer s, int startIdx, int[] retVals) {
1259            if (retVals == null) {
1260                retVals = new int[2];
1261            }
1262            retVals[0]     = -1;  // Indicates no match.
1263            int matchState = 0;
1264            int idx        = startIdx;
1265
1266            matchLoop: for (idx = startIdx; idx<s.length(); idx = moveIndex32(s, idx, 1)){
1267                int c = UTF16.charAt(s, idx);
1268                int cLBType = UCharacter.getIntPropertyValue(c, UProperty.LINE_BREAK);
1269                switch (matchState) {
1270                case 0:
1271                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC ||
1272                    cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1273                        matchState = 1;
1274                        break;
1275                    }
1276                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1277                        matchState = 4;
1278                        break;
1279                    }
1280                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
1281                        matchState = 4;
1282                        break;
1283                    }
1284                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1285                        matchState = 7;
1286                        break;
1287                    }
1288                    break matchLoop;   /* No Match  */
1289
1290                case 1:
1291                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1292                        matchState = 1;
1293                        break;
1294                    }
1295                    if (cLBType == UCharacter.LineBreak.OPEN_PUNCTUATION) {
1296                        matchState = 4;
1297                        break;
1298                    }
1299                    if (cLBType == UCharacter.LineBreak.HYPHEN) {
1300                        matchState = 4;
1301                        break;
1302                    }
1303                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1304                        matchState = 7;
1305                        break;
1306                    }
1307                    break matchLoop;   /* No Match  */
1308
1309
1310                case 4:
1311                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1312                        matchState = 4;
1313                        break;
1314                    }
1315                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1316                        matchState = 7;
1317                        break;
1318                    }
1319                    break matchLoop;   /* No Match  */
1320                    //    ((PR | PO) CM*)? ((OP | HY) CM*)? NU CM* ((NU | IS | SY) CM*) * (CL CM*)?  (PR | PO) CM*)?
1321                    //      0    0   1       3    3    4              7    7    7    7      9   9     11   11    (match states)
1322
1323                case 7:
1324                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1325                        matchState = 7;
1326                        break;
1327                    }
1328                    if (cLBType == UCharacter.LineBreak.NUMERIC) {
1329                        matchState = 7;
1330                        break;
1331                    }
1332                    if (cLBType == UCharacter.LineBreak.INFIX_NUMERIC) {
1333                        matchState = 7;
1334                        break;
1335                    }
1336                    if (cLBType == UCharacter.LineBreak.BREAK_SYMBOLS) {
1337                        matchState = 7;
1338                        break;
1339                    }
1340                    if (cLBType == UCharacter.LineBreak.CLOSE_PUNCTUATION) {
1341                        matchState = 9;
1342                        break;
1343                    }
1344                    if (cLBType == UCharacter.LineBreak.CLOSE_PARENTHESIS) {
1345                        matchState = 9;
1346                        break;
1347                    }
1348                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1349                        matchState = 11;
1350                        break;
1351                    }
1352                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1353                        matchState = 11;
1354                        break;
1355                    }
1356
1357                    break matchLoop;    // Match Complete.
1358                case 9:
1359                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1360                        matchState = 9;
1361                        break;
1362                    }
1363                    if (cLBType == UCharacter.LineBreak.POSTFIX_NUMERIC) {
1364                        matchState = 11;
1365                        break;
1366                    }
1367                    if (cLBType == UCharacter.LineBreak.PREFIX_NUMERIC) {
1368                        matchState = 11;
1369                        break;
1370                    }
1371                    break matchLoop;    // Match Complete.
1372                case 11:
1373                    if (cLBType == UCharacter.LineBreak.COMBINING_MARK || cLBType == UCharacter.LineBreak.ZWJ) {
1374                        matchState = 11;
1375                        break;
1376                    }
1377                    break matchLoop;    // Match Complete.
1378                }
1379            }
1380            if (matchState > 4) {
1381                retVals[0] = startIdx;
1382                retVals[1] = idx;
1383            }
1384            return retVals;
1385        }
1386
1387
1388        @Override
1389        List  charClasses() {
1390            return fSets;
1391        }
1392
1393
1394
1395    }
1396
1397
1398    /**
1399     *
1400     * Sentence Monkey Test Class
1401     *
1402     *
1403     *
1404     */
1405    static class RBBISentenceMonkey extends RBBIMonkeyKind {
1406        List                 fSets;
1407        StringBuffer         fText;
1408
1409        UnicodeSet           fSepSet;
1410        UnicodeSet           fFormatSet;
1411        UnicodeSet           fSpSet;
1412        UnicodeSet           fLowerSet;
1413        UnicodeSet           fUpperSet;
1414        UnicodeSet           fOLetterSet;
1415        UnicodeSet           fNumericSet;
1416        UnicodeSet           fATermSet;
1417        UnicodeSet           fSContinueSet;
1418        UnicodeSet           fSTermSet;
1419        UnicodeSet           fCloseSet;
1420        UnicodeSet           fOtherSet;
1421        UnicodeSet           fExtendSet;
1422
1423
1424
1425        RBBISentenceMonkey() {
1426            fCharProperty  = UProperty.SENTENCE_BREAK;
1427
1428            fSets            = new ArrayList();
1429
1430            //  Separator Set Note:  Beginning with Unicode 5.1, CR and LF were removed from the separator
1431            //                       set and made into character classes of their own.  For the monkey impl,
1432            //                       they remain in SEP, since Sep always appears with CR and LF in the rules.
1433            fSepSet          = new UnicodeSet("[\\p{Sentence_Break = Sep} \\u000a \\u000d]");
1434            fFormatSet       = new UnicodeSet("[\\p{Sentence_Break = Format}]");
1435            fSpSet           = new UnicodeSet("[\\p{Sentence_Break = Sp}]");
1436            fLowerSet        = new UnicodeSet("[\\p{Sentence_Break = Lower}]");
1437            fUpperSet        = new UnicodeSet("[\\p{Sentence_Break = Upper}]");
1438            fOLetterSet      = new UnicodeSet("[\\p{Sentence_Break = OLetter}]");
1439            fNumericSet      = new UnicodeSet("[\\p{Sentence_Break = Numeric}]");
1440            fATermSet        = new UnicodeSet("[\\p{Sentence_Break = ATerm}]");
1441            fSContinueSet    = new UnicodeSet("[\\p{Sentence_Break = SContinue}]");
1442            fSTermSet        = new UnicodeSet("[\\p{Sentence_Break = STerm}]");
1443            fCloseSet        = new UnicodeSet("[\\p{Sentence_Break = Close}]");
1444            fExtendSet       = new UnicodeSet("[\\p{Sentence_Break = Extend}]");
1445            fOtherSet        = new UnicodeSet();
1446
1447
1448            fOtherSet.complement();
1449            fOtherSet.removeAll(fSepSet);
1450            fOtherSet.removeAll(fFormatSet);
1451            fOtherSet.removeAll(fSpSet);
1452            fOtherSet.removeAll(fLowerSet);
1453            fOtherSet.removeAll(fUpperSet);
1454            fOtherSet.removeAll(fOLetterSet);
1455            fOtherSet.removeAll(fNumericSet);
1456            fOtherSet.removeAll(fATermSet);
1457            fOtherSet.removeAll(fSContinueSet);
1458            fOtherSet.removeAll(fSTermSet);
1459            fOtherSet.removeAll(fCloseSet);
1460            fOtherSet.removeAll(fExtendSet);
1461
1462            fSets.add(fSepSet);
1463            fSets.add(fFormatSet);
1464
1465            fSets.add(fSpSet);
1466            fSets.add(fLowerSet);
1467            fSets.add(fUpperSet);
1468            fSets.add(fOLetterSet);
1469            fSets.add(fNumericSet);
1470            fSets.add(fATermSet);
1471            fSets.add(fSContinueSet);
1472            fSets.add(fSTermSet);
1473            fSets.add(fCloseSet);
1474            fSets.add(fOtherSet);
1475            fSets.add(fExtendSet);
1476        }
1477
1478
1479        @Override
1480        List  charClasses() {
1481            return fSets;
1482        }
1483
1484        @Override
1485        void   setText(StringBuffer s) {
1486            fText = s;
1487        }
1488
1489
1490        //      moveBack()   Find the "significant" code point preceding the index i.
1491        //      Skips over ($Extend | $Format)*
1492        //
1493        private int moveBack(int i) {
1494
1495            if (i <= 0) {
1496                return -1;
1497            }
1498
1499            int      c;
1500            int      j = i;
1501            do {
1502                j = moveIndex32(fText, j, -1);
1503                c = UTF16.charAt(fText, j);
1504            }
1505            while (j>0 &&(fFormatSet.contains(c) || fExtendSet.contains(c)));
1506            return j;
1507        }
1508
1509
1510        int moveForward(int i) {
1511            if (i>=fText.length()) {
1512                return fText.length();
1513            }
1514            int   c;
1515            int   j = i;
1516            do {
1517                j = moveIndex32(fText, j, 1);
1518                c = cAt(j);
1519            }
1520            while (c>=0 && (fFormatSet.contains(c) || fExtendSet.contains(c)));
1521            return j;
1522
1523        }
1524
1525        int cAt(int pos) {
1526            if (pos<0 || pos>=fText.length()) {
1527                return -1;
1528            }
1529            return UTF16.charAt(fText, pos);
1530        }
1531
1532        @Override
1533        int   next(int prevPos) {
1534            int    /*p0,*/ p1, p2, p3;      // Indices of the significant code points around the
1535            //   break position being tested.  The candidate break
1536            //   location is before p2.
1537            int     breakPos = -1;
1538
1539            int c0, c1, c2, c3;         // The code points at p0, p1, p2 & p3.
1540            int c;
1541
1542            // Prev break at end of string.  return DONE.
1543            if (prevPos >= fText.length()) {
1544                return -1;
1545            }
1546            /*p0 =*/ p1 = p2 = p3 = prevPos;
1547            c3 = UTF16.charAt(fText, prevPos);
1548            c0 = c1 = c2 = 0;
1549
1550            // Loop runs once per "significant" character position in the input text.
1551            for (;;) {
1552                // Move all of the positions forward in the input string.
1553                /*p0 = p1;*/  c0 = c1;
1554                p1 = p2;  c1 = c2;
1555                p2 = p3;  c2 = c3;
1556
1557                // Advancd p3 by  X(Extend | Format)*   Rule 4
1558                p3 = moveForward(p3);
1559                c3 = cAt(p3);
1560
1561                // Rule (3) CR x LF
1562                if (c1==0x0d && c2==0x0a && p2==(p1+1)) {
1563                    continue;
1564                }
1565
1566                // Rule (4)    Sep  <break>
1567                if (fSepSet.contains(c1)) {
1568                    p2 = p1+1;   // Separators don't combine with Extend or Format
1569                    break;
1570                }
1571
1572                if (p2 >= fText.length()) {
1573                    // Reached end of string.  Always a break position.
1574                    break;
1575                }
1576
1577                if (p2 == prevPos) {
1578                    // Still warming up the loop.  (won't work with zero length strings, but we don't care)
1579                    continue;
1580                }
1581
1582                // Rule (6).   ATerm x Numeric
1583                if (fATermSet.contains(c1) &&  fNumericSet.contains(c2))  {
1584                    continue;
1585                }
1586
1587                // Rule (7).  (Upper | Lower) ATerm  x  Uppper
1588                if ((fUpperSet.contains(c0) || fLowerSet.contains(c0)) &&
1589                        fATermSet.contains(c1) && fUpperSet.contains(c2)) {
1590                    continue;
1591                }
1592
1593                // Rule (8)  ATerm Close* Sp*  x  (not (OLettter | Upper | Lower | Sep))* Lower
1594                //           Note:  Sterm | ATerm are added to the negated part of the expression by a
1595                //                  note to the Unicode 5.0 documents.
1596                int p8 = p1;
1597                while (p8>0 && fSpSet.contains(cAt(p8))) {
1598                    p8 = moveBack(p8);
1599                }
1600                while (p8>0 && fCloseSet.contains(cAt(p8))) {
1601                    p8 = moveBack(p8);
1602                }
1603                if (fATermSet.contains(cAt(p8))) {
1604                    p8=p2;
1605                    for (;;) {
1606                        c = cAt(p8);
1607                        if (c==-1 || fOLetterSet.contains(c) || fUpperSet.contains(c) ||
1608                                fLowerSet.contains(c) || fSepSet.contains(c) ||
1609                                fATermSet.contains(c) || fSTermSet.contains(c))
1610                        {
1611                            break;
1612                        }
1613                        p8 = moveForward(p8);
1614                    }
1615                    if (p8<fText.length() && fLowerSet.contains(cAt(p8))) {
1616                        continue;
1617                    }
1618                }
1619
1620                // Rule 8a  (STerm | ATerm) Close* Sp* x (SContinue | Sterm | ATerm)
1621                if (fSContinueSet.contains(c2) || fSTermSet.contains(c2) || fATermSet.contains(c2)) {
1622                    p8 = p1;
1623                    while (setContains(fSpSet, cAt(p8))) {
1624                        p8 = moveBack(p8);
1625                    }
1626                    while (setContains(fCloseSet, cAt(p8))) {
1627                        p8 = moveBack(p8);
1628                    }
1629                    c = cAt(p8);
1630                    if (setContains(fSTermSet, c) || setContains(fATermSet, c)) {
1631                        continue;
1632                    }
1633                }
1634
1635
1636                // Rule (9)  (STerm | ATerm) Close*  x  (Close | Sp | Sep | CR | LF)
1637                int p9 = p1;
1638                while (p9>0 && fCloseSet.contains(cAt(p9))) {
1639                    p9 = moveBack(p9);
1640                }
1641                c = cAt(p9);
1642                if ((fSTermSet.contains(c) || fATermSet.contains(c))) {
1643                    if (fCloseSet.contains(c2) || fSpSet.contains(c2) || fSepSet.contains(c2)) {
1644                        continue;
1645                    }
1646                }
1647
1648                // Rule (10)  (Sterm | ATerm) Close* Sp*  x  (Sp | Sep | CR | LF)
1649                int p10 = p1;
1650                while (p10>0 && fSpSet.contains(cAt(p10))) {
1651                    p10 = moveBack(p10);
1652                }
1653                while (p10>0 && fCloseSet.contains(cAt(p10))) {
1654                    p10 = moveBack(p10);
1655                }
1656                if (fSTermSet.contains(cAt(p10)) || fATermSet.contains(cAt(p10))) {
1657                    if (fSpSet.contains(c2) || fSepSet.contains(c2)) {
1658                        continue;
1659                    }
1660                }
1661
1662                // Rule (11)  (STerm | ATerm) Close* Sp*   <break>
1663                int p11 = p1;
1664                if (p11>0 && fSepSet.contains(cAt(p11))) {
1665                    p11 = moveBack(p11);
1666                }
1667                while (p11>0 && fSpSet.contains(cAt(p11))) {
1668                    p11 = moveBack(p11);
1669                }
1670                while (p11>0 && fCloseSet.contains(cAt(p11))) {
1671                    p11 = moveBack(p11);
1672                }
1673                if (fSTermSet.contains(cAt(p11)) || fATermSet.contains(cAt(p11))) {
1674                    break;
1675                }
1676
1677                //  Rule (12)  Any x Any
1678                continue;
1679            }
1680            breakPos = p2;
1681            return breakPos;
1682        }
1683
1684
1685
1686    }
1687
1688
1689    /**
1690     * Move an index into a string by n code points.
1691     *   Similar to UTF16.moveCodePointOffset, but without the exceptions, which were
1692     *   complicating usage.
1693     * @param s   a Text string
1694     * @param pos The starting code unit index into the text string
1695     * @param amt The amount to adjust the string by.
1696     * @return    The adjusted code unit index, pinned to the string's length, or
1697     *            unchanged if input index was outside of the string.
1698     */
1699    static int moveIndex32(StringBuffer s, int pos, int amt) {
1700        int i;
1701        char  c;
1702        if (amt>0) {
1703            for (i=0; i<amt; i++) {
1704                if (pos >= s.length()) {
1705                    return s.length();
1706                }
1707                c = s.charAt(pos);
1708                pos++;
1709                if (UTF16.isLeadSurrogate(c) && pos < s.length()) {
1710                    c = s.charAt(pos);
1711                    if (UTF16.isTrailSurrogate(c)) {
1712                        pos++;
1713                    }
1714                }
1715            }
1716        } else {
1717            for (i=0; i>amt; i--) {
1718                if (pos <= 0) {
1719                    return 0;
1720                }
1721                pos--;
1722                c = s.charAt(pos);
1723                if (UTF16.isTrailSurrogate(c) && pos >= 0) {
1724                    c = s.charAt(pos);
1725                    if (UTF16.isLeadSurrogate(c)) {
1726                        pos--;
1727                    }
1728                }
1729            }
1730        }
1731        return pos;
1732    }
1733
1734    /**
1735     * No-exceptions form of UnicodeSet.contains(c).
1736     *    Simplifies loops that terminate with an end-of-input character value.
1737     * @param s  A unicode set
1738     * @param c  A code point value
1739     * @return   true if the set contains c.
1740     */
1741    static boolean setContains(UnicodeSet s, int c) {
1742        if (c<0 || c>UTF16.CODEPOINT_MAX_VALUE ) {
1743            return false;
1744        }
1745        return s.contains(c);
1746    }
1747
1748
1749    /**
1750     * return the index of the next code point in the input text.
1751     * @param i the preceding index
1752     */
1753    static int  nextCP(StringBuffer s, int i) {
1754        if (i == -1) {
1755            // End of Input indication.  Continue to return end value.
1756            return -1;
1757        }
1758        int  retVal = i + 1;
1759        if (retVal > s.length()) {
1760            return -1;
1761        }
1762        int  c = UTF16.charAt(s, i);
1763        if (c >= UTF16.SUPPLEMENTARY_MIN_VALUE && UTF16.isLeadSurrogate(s.charAt(i))) {
1764            retVal++;
1765        }
1766        return retVal;
1767    }
1768
1769
1770    /**
1771     * random number generator.  Not using Java's built-in Randoms for two reasons:
1772     *    1.  Using this code allows obtaining the same sequences as those from the ICU4C monkey test.
1773     *    2.  We need to get and restore the seed from values occurring in the middle
1774     *        of a long sequence, to more easily reproduce failing cases.
1775     */
1776    private static int m_seed = 1;
1777    private static int  m_rand()
1778    {
1779        m_seed = m_seed * 1103515245 + 12345;
1780        return (m_seed >>> 16) % 32768;
1781    }
1782
1783    // Helper function for formatting error output.
1784    //   Append a string into a fixed-size field in a StringBuffer.
1785    //   Blank-pad the string if it is shorter than the field.
1786    //   Truncate the source string if it is too long.
1787    //
1788    private static void appendToBuf(StringBuffer dest, String src, int fieldLen) {
1789        int appendLen = src.length();
1790        if (appendLen >= fieldLen) {
1791            dest.append(src.substring(0, fieldLen));
1792        } else {
1793            dest.append(src);
1794            while (appendLen < fieldLen) {
1795                dest.append(' ');
1796                appendLen++;
1797            }
1798        }
1799    }
1800
1801    // Helper function for formatting error output.
1802    // Display a code point in "\\uxxxx" or "\Uxxxxxxxx" format
1803    private static void appendCharToBuf(StringBuffer dest, int c, int fieldLen) {
1804        String hexChars = "0123456789abcdef";
1805        if (c < 0x10000) {
1806            dest.append("\\u");
1807            for (int bn=12; bn>=0; bn-=4) {
1808                dest.append(hexChars.charAt(((c)>>bn)&0xf));
1809            }
1810            appendToBuf(dest, " ", fieldLen-6);
1811        } else {
1812            dest.append("\\U");
1813            for (int bn=28; bn>=0; bn-=4) {
1814                dest.append(hexChars.charAt(((c)>>bn)&0xf));
1815            }
1816            appendToBuf(dest, " ", fieldLen-10);
1817
1818        }
1819    }
1820
1821    /**
1822     *  Run a RBBI monkey test.  Common routine, for all break iterator types.
1823     *    Parameters:
1824     *       bi      - the break iterator to use
1825     *       mk      - MonkeyKind, abstraction for obtaining expected results
1826     *       name    - Name of test (char, word, etc.) for use in error messages
1827     *       seed    - Seed for starting random number generator (parameter from user)
1828     *       numIterations
1829     */
1830    void RunMonkey(BreakIterator  bi, RBBIMonkeyKind mk, String name, int  seed, int numIterations) {
1831        int              TESTSTRINGLEN = 500;
1832        StringBuffer     testText         = new StringBuffer();
1833        int              numCharClasses;
1834        List             chClasses;
1835        int[]            expected         = new int[TESTSTRINGLEN*2 + 1];
1836        int              expectedCount    = 0;
1837        boolean[]        expectedBreaks   = new boolean[TESTSTRINGLEN*2 + 1];
1838        boolean[]        forwardBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1839        boolean[]        reverseBreaks    = new boolean[TESTSTRINGLEN*2 + 1];
1840        boolean[]        isBoundaryBreaks = new boolean[TESTSTRINGLEN*2 + 1];
1841        boolean[]        followingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1842        boolean[]        precedingBreaks  = new boolean[TESTSTRINGLEN*2 + 1];
1843        int              i;
1844        int              loopCount        = 0;
1845        boolean          printTestData    = false;
1846        boolean          printBreaksFromBI = false;
1847
1848        m_seed = seed;
1849
1850        numCharClasses = mk.charClasses().size();
1851        chClasses      = mk.charClasses();
1852
1853        // Verify that the character classes all have at least one member.
1854        for (i=0; i<numCharClasses; i++) {
1855            UnicodeSet s = (UnicodeSet)chClasses.get(i);
1856            if (s == null || s.size() == 0) {
1857                errln("Character Class " + i + " is null or of zero size.");
1858                return;
1859            }
1860        }
1861
1862        //--------------------------------------------------------------------------------------------
1863        //
1864        //  Debugging settings.  Comment out everything in the following block for normal operation
1865        //
1866        //--------------------------------------------------------------------------------------------
1867        // numIterations = -1;
1868        // numIterations = 10000;   // Same as exhaustive.
1869        // RuleBasedBreakIterator_New.fTrace = true;
1870        // m_seed = 859056465;
1871        // TESTSTRINGLEN = 50;
1872        // printTestData = true;
1873        // printBreaksFromBI = true;
1874        // ((RuleBasedBreakIterator_New)bi).dump();
1875
1876        //--------------------------------------------------------------------------------------------
1877        //
1878        //  End of Debugging settings.
1879        //
1880        //--------------------------------------------------------------------------------------------
1881
1882        int  dotsOnLine = 0;
1883        while (loopCount < numIterations || numIterations == -1) {
1884            if (numIterations == -1 && loopCount % 10 == 0) {
1885                // If test is running in an infinite loop, display a periodic tic so
1886                //   we can tell that it is making progress.
1887                System.out.print(".");
1888                if (dotsOnLine++ >= 80){
1889                    System.out.println();
1890                    dotsOnLine = 0;
1891                }
1892            }
1893            // Save current random number seed, so that we can recreate the random numbers
1894            //   for this loop iteration in event of an error.
1895            seed = m_seed;
1896
1897            testText.setLength(0);
1898            // Populate a test string with data.
1899            if (printTestData) {
1900                System.out.println("Test Data string ...");
1901            }
1902            for (i=0; i<TESTSTRINGLEN; i++) {
1903                int        aClassNum = m_rand() % numCharClasses;
1904                UnicodeSet classSet  = (UnicodeSet)chClasses.get(aClassNum);
1905                int        charIdx   = m_rand() % classSet.size();
1906                int        c         = classSet.charAt(charIdx);
1907                if (c < 0) {   // TODO:  deal with sets containing strings.
1908                    errln("c < 0");
1909                }
1910                UTF16.appendCodePoint(testText, c);
1911                if (printTestData) {
1912                    System.out.print(Integer.toHexString(c) + " ");
1913                }
1914            }
1915            if (printTestData) {
1916                System.out.println();
1917            }
1918
1919            Arrays.fill(expected, 0);
1920            Arrays.fill(expectedBreaks, false);
1921            Arrays.fill(forwardBreaks, false);
1922            Arrays.fill(reverseBreaks, false);
1923            Arrays.fill(isBoundaryBreaks, false);
1924            Arrays.fill(followingBreaks, false);
1925            Arrays.fill(precedingBreaks, false);
1926
1927            // Calculate the expected results for this test string.
1928            mk.setText(testText);
1929            expectedCount = 0;
1930            expectedBreaks[0] = true;
1931            expected[expectedCount ++] = 0;
1932            int breakPos = 0;
1933            int lastBreakPos = -1;
1934            for (;;) {
1935                lastBreakPos = breakPos;
1936                breakPos = mk.next(breakPos);
1937                if (breakPos == -1) {
1938                    break;
1939                }
1940                if (breakPos > testText.length()) {
1941                    errln("breakPos > testText.length()");
1942                }
1943                if (lastBreakPos >= breakPos) {
1944                    errln("Next() not increasing.");
1945                    // break;
1946                }
1947                expectedBreaks[breakPos] = true;
1948                expected[expectedCount ++] = breakPos;
1949            }
1950
1951            // Find the break positions using forward iteration
1952            if (printBreaksFromBI) {
1953                System.out.println("Breaks from BI...");
1954            }
1955            bi.setText(testText.toString());
1956            for (i=bi.first(); i != BreakIterator.DONE; i=bi.next()) {
1957                if (i < 0 || i > testText.length()) {
1958                    errln(name + " break monkey test: Out of range value returned by breakIterator::next()");
1959                    break;
1960                }
1961                if (printBreaksFromBI) {
1962                    System.out.print(Integer.toHexString(i) + " ");
1963                }
1964                forwardBreaks[i] = true;
1965            }
1966            if (printBreaksFromBI) {
1967                System.out.println();
1968            }
1969
1970            // Find the break positions using reverse iteration
1971            for (i=bi.last(); i != BreakIterator.DONE; i=bi.previous()) {
1972                if (i < 0 || i > testText.length()) {
1973                    errln(name + " break monkey test: Out of range value returned by breakIterator.next()" + name);
1974                    break;
1975                }
1976                reverseBreaks[i] = true;
1977            }
1978
1979            // Find the break positions using isBoundary() tests.
1980            for (i=0; i<=testText.length(); i++) {
1981                isBoundaryBreaks[i] = bi.isBoundary(i);
1982            }
1983
1984            // Find the break positions using the following() function.
1985            lastBreakPos = 0;
1986            followingBreaks[0] = true;
1987            for (i=0; i<testText.length(); i++) {
1988                breakPos = bi.following(i);
1989                if (breakPos <= i ||
1990                        breakPos < lastBreakPos ||
1991                        breakPos > testText.length() ||
1992                        breakPos > lastBreakPos && lastBreakPos > i ) {
1993                    errln(name + " break monkey test: " +
1994                            "Out of range value returned by BreakIterator::following().\n" +
1995                            "index=" + i + "following returned=" + breakPos +
1996                            "lastBreak=" + lastBreakPos);
1997                    precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
1998                } else {
1999                    followingBreaks[breakPos] = true;
2000                    lastBreakPos = breakPos;
2001                }
2002            }
2003
2004            // Find the break positions using the preceding() function.
2005            lastBreakPos = testText.length();
2006            precedingBreaks[testText.length()] = true;
2007            for (i=testText.length(); i>0; i--) {
2008                breakPos = bi.preceding(i);
2009                if (breakPos >= i ||
2010                        breakPos > lastBreakPos ||
2011                        breakPos < 0 ||
2012                        breakPos < lastBreakPos && lastBreakPos < i ) {
2013                    errln(name + " break monkey test: " +
2014                            "Out of range value returned by BreakIterator::preceding().\n" +
2015                            "index=" + i + "preceding returned=" + breakPos +
2016                            "lastBreak=" + lastBreakPos);
2017                    precedingBreaks[i] = !expectedBreaks[i];   // Forces an error.
2018                } else {
2019                    precedingBreaks[breakPos] = true;
2020                    lastBreakPos = breakPos;
2021                }
2022            }
2023
2024
2025
2026            // Compare the expected and actual results.
2027            for (i=0; i<=testText.length(); i++) {
2028                String errorType = null;
2029                if  (forwardBreaks[i] != expectedBreaks[i]) {
2030                    errorType = "next()";
2031                } else if (reverseBreaks[i] != forwardBreaks[i]) {
2032                    errorType = "previous()";
2033                } else if (isBoundaryBreaks[i] != expectedBreaks[i]) {
2034                    errorType = "isBoundary()";
2035                } else if (followingBreaks[i] != expectedBreaks[i]) {
2036                    errorType = "following()";
2037                } else if (precedingBreaks[i] != expectedBreaks[i]) {
2038                    errorType = "preceding()";
2039                }
2040
2041                if (errorType != null) {
2042                    // Format a range of the test text that includes the failure as
2043                    //  a data item that can be included in the rbbi test data file.
2044
2045                    // Start of the range is the last point where expected and actual results
2046                    //   both agreed that there was a break position.
2047                    int startContext = i;
2048                    int count = 0;
2049                    for (;;) {
2050                        if (startContext==0) { break; }
2051                        startContext --;
2052                        if (expectedBreaks[startContext]) {
2053                            if (count == 2) break;
2054                            count ++;
2055                        }
2056                    }
2057
2058                    // End of range is two expected breaks past the start position.
2059                    int endContext = i + 1;
2060                    int ci;
2061                    for (ci=0; ci<2; ci++) {  // Number of items to include in error text.
2062                        for (;;) {
2063                            if (endContext >= testText.length()) {break;}
2064                            if (expectedBreaks[endContext-1]) {
2065                                if (count == 0) break;
2066                                count --;
2067                            }
2068                            endContext ++;
2069                        }
2070                    }
2071
2072                    // Format looks like   "<data><>\uabcd\uabcd<>\U0001abcd...</data>"
2073                    StringBuffer errorText = new StringBuffer();
2074
2075                    int      c;    // Char from test data
2076                    for (ci = startContext;  ci <= endContext && ci != -1;  ci = nextCP(testText, ci)) {
2077                        if (ci == i) {
2078                            // This is the location of the error.
2079                            errorText.append("<?>---------------------------------\n");
2080                        } else if (expectedBreaks[ci]) {
2081                            // This a non-error expected break position.
2082                            errorText.append("------------------------------------\n");
2083                        }
2084                        if (ci < testText.length()) {
2085                            c = UTF16.charAt(testText, ci);
2086                            appendCharToBuf(errorText, c, 11);
2087                            String gc = UCharacter.getPropertyValueName(UProperty.GENERAL_CATEGORY, UCharacter.getType(c), UProperty.NameChoice.SHORT);
2088                            appendToBuf(errorText, gc, 8);
2089                            int extraProp = UCharacter.getIntPropertyValue(c, mk.fCharProperty);
2090                            String extraPropValue =
2091                                    UCharacter.getPropertyValueName(mk.fCharProperty, extraProp, UProperty.NameChoice.LONG);
2092                            appendToBuf(errorText, extraPropValue, 20);
2093
2094                            String charName = UCharacter.getExtendedName(c);
2095                            appendToBuf(errorText, charName, 40);
2096                            errorText.append('\n');
2097                        }
2098                    }
2099                    if (ci == testText.length() && ci != -1) {
2100                        errorText.append("<>");
2101                    }
2102                    errorText.append("</data>\n");
2103
2104                    // Output the error
2105                    errln(name + " break monkey test error.  " +
2106                            (expectedBreaks[i]? "Break expected but not found." : "Break found but not expected.") +
2107                            "\nOperation = " + errorType + "; random seed = " + seed + ";  buf Idx = " + i + "\n" +
2108                            errorText);
2109                    break;
2110                }
2111            }
2112
2113            loopCount++;
2114        }
2115    }
2116
2117    @Test
2118    public void TestCharMonkey() {
2119
2120        int        loopCount = 500;
2121        int        seed      = 1;
2122
2123        if (TestFmwk.getExhaustiveness() >= 9) {
2124            loopCount = 10000;
2125        }
2126
2127        RBBICharMonkey  m = new RBBICharMonkey();
2128        BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2129        RunMonkey(bi, m, "char", seed, loopCount);
2130    }
2131
2132    @Test
2133    public void TestWordMonkey() {
2134
2135        int        loopCount = 500;
2136        int        seed      = 1;
2137
2138        if (TestFmwk.getExhaustiveness() >= 9) {
2139            loopCount = 10000;
2140        }
2141
2142        logln("Word Break Monkey Test");
2143        RBBIWordMonkey  m = new RBBIWordMonkey();
2144        BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2145        RunMonkey(bi, m, "word", seed, loopCount);
2146    }
2147
2148    @Test
2149    public void TestLineMonkey() {
2150        int        loopCount = 500;
2151        int        seed      = 1;
2152
2153        if (TestFmwk.getExhaustiveness() >= 9) {
2154            loopCount = 10000;
2155        }
2156
2157        logln("Line Break Monkey Test");
2158        RBBILineMonkey  m = new RBBILineMonkey();
2159        BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2160        RunMonkey(bi, m, "line", seed, loopCount);
2161    }
2162
2163    @Test
2164    public void TestSentMonkey() {
2165
2166        int        loopCount = 500;
2167        int        seed      = 1;
2168
2169        if (TestFmwk.getExhaustiveness() >= 9) {
2170            loopCount = 3000;
2171        }
2172
2173        logln("Sentence Break Monkey Test");
2174        RBBISentenceMonkey  m = new RBBISentenceMonkey();
2175        BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2176        RunMonkey(bi, m, "sent", seed, loopCount);
2177    }
2178    //
2179    //  Round-trip monkey tests.
2180    //  Verify that break iterators created from the rule source from the default
2181    //    break iterators still pass the monkey test for the iterator type.
2182    //
2183    //  This is a major test for the Rule Compiler.  The default break iterators are built
2184    //  from pre-compiled binary rule data that was created using ICU4C; these
2185    //  round-trip rule recompile tests verify that the Java rule compiler can
2186    //  rebuild break iterators from the original source rules.
2187    //
2188    @Test
2189    public void TestRTCharMonkey() {
2190
2191        int        loopCount = 200;
2192        int        seed      = 1;
2193
2194        if (TestFmwk.getExhaustiveness() >= 9) {
2195            loopCount = 2000;
2196        }
2197
2198        RBBICharMonkey  m = new RBBICharMonkey();
2199        BreakIterator   bi = BreakIterator.getCharacterInstance(Locale.US);
2200        String rules = bi.toString();
2201        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2202        RunMonkey(rtbi, m, "char", seed, loopCount);
2203    }
2204
2205    @Test
2206    public void TestRTWordMonkey() {
2207
2208        int        loopCount = 200;
2209        int        seed      = 1;
2210
2211        if (TestFmwk.getExhaustiveness() >= 9) {
2212            loopCount = 2000;
2213        }
2214        logln("Word Break Monkey Test");
2215        RBBIWordMonkey  m = new RBBIWordMonkey();
2216        BreakIterator   bi = BreakIterator.getWordInstance(Locale.US);
2217        String rules = bi.toString();
2218        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2219        RunMonkey(rtbi, m, "word", seed, loopCount);
2220    }
2221
2222    @Test
2223    public void TestRTLineMonkey() {
2224        int        loopCount = 200;
2225        int        seed      = 1;
2226
2227        if (TestFmwk.getExhaustiveness() >= 9) {
2228            loopCount = 2000;
2229        }
2230
2231        logln("Line Break Monkey Test");
2232        RBBILineMonkey  m = new RBBILineMonkey();
2233        BreakIterator   bi = BreakIterator.getLineInstance(Locale.US);
2234        String rules = bi.toString();
2235        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2236        RunMonkey(rtbi, m, "line", seed, loopCount);
2237    }
2238
2239    @Test
2240    public void TestRTSentMonkey() {
2241
2242        int        loopCount = 200;
2243        int        seed      = 1;
2244
2245        if (TestFmwk.getExhaustiveness() >= 9) {
2246            loopCount = 1000;
2247        }
2248
2249        logln("Sentence Break Monkey Test");
2250        RBBISentenceMonkey  m = new RBBISentenceMonkey();
2251        BreakIterator   bi = BreakIterator.getSentenceInstance(Locale.US);
2252        String rules = bi.toString();
2253        BreakIterator rtbi = new RuleBasedBreakIterator(rules);
2254        RunMonkey(rtbi, m, "sent", seed, loopCount);
2255    }
2256}
2257
2258