1/* GENERATED SOURCE. DO NOT MODIFY. */
2// © 2016 and later: Unicode, Inc. and others.
3// License & terms of use: http://www.unicode.org/copyright.html#License
4/*
5*******************************************************************************
6* Copyright (C) 2003-2011, International Business Machines Corporation and    *
7* others. All Rights Reserved.                                                *
8*******************************************************************************
9*/
10package android.icu.text;
11
12import java.io.IOException;
13import java.io.OutputStream;
14import java.util.ArrayList;
15import java.util.List;
16
17import android.icu.impl.Assert;
18import android.icu.impl.IntTrieBuilder;
19
20//
21//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
22//                   (part of the rule building process.)
23//
24//      Starting with the rules parse tree from the scanner,
25//
26//                   -  Enumerate the set of UnicodeSets that are referenced
27//                      by the RBBI rules.
28//                   -  compute a set of non-overlapping character ranges
29//                      with all characters within a range belonging to the same
30//                      set of input uniocde sets.
31//                   -  Derive a set of non-overlapping UnicodeSet (like things)
32//                      that will correspond to columns in the state table for
33//                      the RBBI execution engine.  All characters within one
34//                      of these sets belong to the same set of the original
35//                      UnicodeSets from the user's rules.
36//                   -  construct the trie table that maps input characters
37//                      to the index of the matching non-overlapping set of set from
38//                      the previous step.
39//
40class RBBISetBuilder {
41    static class RangeDescriptor  {
42           int                fStartChar;      // Start of range, unicode 32 bit value.
43           int                fEndChar;        // End of range, unicode 32 bit value.
44           int                fNum;            // runtime-mapped input value for this range.
45           List<RBBINode>     fIncludesSets;    // vector of the the original
46                                                 //   Unicode sets that include this range.
47                                                //    (Contains ptrs to uset nodes)
48            RangeDescriptor   fNext;           // Next RangeDescriptor in the linked list.
49
50            RangeDescriptor() {
51                fIncludesSets = new ArrayList<RBBINode>();
52            }
53
54            RangeDescriptor(RangeDescriptor other) {
55                fStartChar = other.fStartChar;
56                fEndChar   = other.fEndChar;
57                fNum       = other.fNum;
58                fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
59            }
60
61            //-------------------------------------------------------------------------------------
62            //
63            //          RangeDesriptor::split()
64            //
65            //-------------------------------------------------------------------------------------
66            void split(int where) {
67                Assert.assrt(where>fStartChar && where<=fEndChar);
68                RangeDescriptor nr = new RangeDescriptor(this);
69
70                //  RangeDescriptor copy constructor copies all fields.
71                //  Only need to update those that are different after the split.
72                nr.fStartChar = where;
73                this.fEndChar = where-1;
74                nr.fNext      = this.fNext;
75                this.fNext    = nr;
76
77                // TODO:  fIncludesSets is not updated.  Check it out.
78                //         Probably because they haven't been populated yet,
79                //         but still sloppy.
80            }
81
82
83            //-------------------------------------------------------------------------------------
84            //
85            //          RangeDescriptor::setDictionaryFlag
86            //
87            //          Character Category Numbers that include characters from
88            //          the original Unicode Set named "dictionary" have bit 14
89            //          set to 1.  The RBBI runtime engine uses this to trigger
90            //          use of the word dictionary.
91            //
92            //          This function looks through the Unicode Sets that it
93            //          (the range) includes, and sets the bit in fNum when
94            //          "dictionary" is among them.
95            //
96            //          TODO:  a faster way would be to find the set node for
97            //          "dictionary" just once, rather than looking it
98            //          up by name every time.
99            //
100            // -------------------------------------------------------------------------------------
101            void setDictionaryFlag() {
102                int i;
103
104                for (i=0; i<this.fIncludesSets.size(); i++) {
105                    RBBINode        usetNode    = fIncludesSets.get(i);
106                    String          setName = "";
107                    RBBINode        setRef = usetNode.fParent;
108                    if (setRef != null) {
109                        RBBINode varRef = setRef.fParent;
110                        if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
111                            setName = varRef.fText;
112                        }
113                    }
114                    if (setName.equals("dictionary")) {
115                        this.fNum |= 0x4000;
116                        break;
117                    }
118                }
119
120        }
121    }
122
123
124    RBBIRuleBuilder       fRB;             // The RBBI Rule Compiler that owns us.
125    RangeDescriptor       fRangeList;      // Head of the linked list of RangeDescriptors
126
127    IntTrieBuilder        fTrie;           // The mapping TRIE that is the end result of processing
128                                           //  the Unicode Sets.
129
130    // Groups correspond to character categories -
131    //       groups of ranges that are in the same original UnicodeSets.
132    //       fGroupCount is the index of the last used group.
133    //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
134    //       State table column 0 is not used.  Column 1 is for end-of-input.
135    //       column 2 is for group 0.  Funny counting.
136    int                fGroupCount;
137
138    boolean             fSawBOF;
139
140
141    //------------------------------------------------------------------------
142    //
143    //       RBBISetBuilder Constructor
144    //
145    //------------------------------------------------------------------------
146    RBBISetBuilder(RBBIRuleBuilder rb)
147    {
148        fRB             = rb;
149    }
150
151
152    //------------------------------------------------------------------------
153    //
154    //           build          Build the list of non-overlapping character ranges
155    //                          from the Unicode Sets.
156    //
157    //------------------------------------------------------------------------
158    void build() {
159        RangeDescriptor rlRange;
160
161        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
162
163        //  Initialize the process by creating a single range encompassing all characters
164        //  that is in no sets.
165        //
166        fRangeList               = new RangeDescriptor();
167        fRangeList.fStartChar    = 0;
168        fRangeList.fEndChar      = 0x10ffff;
169
170        //
171        //  Find the set of non-overlapping ranges of characters
172        //
173        for (RBBINode usetNode : fRB.fUSetNodes) {
174            UnicodeSet      inputSet             = usetNode.fInputSet;
175            int            inputSetRangeCount   = inputSet.getRangeCount();
176            int            inputSetRangeIndex   = 0;
177            rlRange              = fRangeList;
178
179            for (;;) {
180                if (inputSetRangeIndex >= inputSetRangeCount) {
181                    break;
182                }
183                int      inputSetRangeBegin  = inputSet.getRangeStart(inputSetRangeIndex);
184                int      inputSetRangeEnd    = inputSet.getRangeEnd(inputSetRangeIndex);
185
186                // skip over ranges from the range list that are completely
187                //   below the current range from the input unicode set.
188                while (rlRange.fEndChar < inputSetRangeBegin) {
189                    rlRange = rlRange.fNext;
190                }
191
192                // If the start of the range from the range list is before with
193                //   the start of the range from the unicode set, split the range list range
194                //   in two, with one part being before (wholly outside of) the unicode set
195                //   and the other containing the rest.
196                //   Then continue the loop; the post-split current range will then be skipped
197                //     over
198                if (rlRange.fStartChar < inputSetRangeBegin) {
199                    rlRange.split(inputSetRangeBegin);
200                     continue;
201                }
202
203                // Same thing at the end of the ranges...
204                // If the end of the range from the range list doesn't coincide with
205                //   the end of the range from the unicode set, split the range list
206                //   range in two.  The first part of the split range will be
207                //   wholly inside the Unicode set.
208                if (rlRange.fEndChar > inputSetRangeEnd) {
209                    rlRange.split(inputSetRangeEnd+1);
210                 }
211
212                // The current rlRange is now entirely within the UnicodeSet range.
213                // Add this unicode set to the list of sets for this rlRange
214                if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
215                    rlRange.fIncludesSets.add(usetNode);
216                }
217
218                // Advance over ranges that we are finished with.
219                if (inputSetRangeEnd == rlRange.fEndChar) {
220                    inputSetRangeIndex++;
221                }
222                rlRange = rlRange.fNext;
223            }
224        }
225
226        if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();}
227
228        //
229        //  Group the above ranges, with each group consisting of one or more
230        //    ranges that are in exactly the same set of original UnicodeSets.
231        //    The groups are numbered, and these group numbers are the set of
232        //    input symbols recognized by the run-time state machine.
233        //
234        //    Numbering: # 0  (state table column 0) is unused.
235        //               # 1  is reserved - table column 1 is for end-of-input
236        //               # 2  is reserved - table column 2 is for beginning-in-input
237        //               # 3  is the first range list.
238        //
239        RangeDescriptor rlSearchRange;
240        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
241            for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
242                if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
243                    rlRange.fNum = rlSearchRange.fNum;
244                    break;
245                }
246            }
247            if (rlRange.fNum == 0) {
248                fGroupCount ++;
249                rlRange.fNum = fGroupCount+2;
250                rlRange.setDictionaryFlag();
251                addValToSets(rlRange.fIncludesSets, fGroupCount+2);
252            }
253        }
254
255        // Handle input sets that contain the special string {eof}.
256        //   Column 1 of the state table is reserved for EOF on input.
257        //   Column 2 is reserved for before-the-start-input.
258        //            (This column can be optimized away later if there are no rule
259        //             references to {bof}.)
260        //   Add this column value (1 or 2) to the equivalent expression
261        //     subtree for each UnicodeSet that contains the string {eof}
262        //   Because {bof} and {eof} are not a characters in the normal sense,
263        //   they doesn't affect the computation of ranges or TRIE.
264
265        String eofString = "eof";
266        String bofString = "bof";
267
268        for (RBBINode usetNode : fRB.fUSetNodes) {
269            UnicodeSet      inputSet = usetNode.fInputSet;
270            if (inputSet.contains(eofString)) {
271                addValToSet(usetNode, 1);
272            }
273            if (inputSet.contains(bofString)) {
274                addValToSet(usetNode, 2);
275                fSawBOF = true;
276            }
277        }
278
279
280        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
281        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
282
283
284        //IntTrieBuilder(int aliasdata[], int maxdatalength,
285        //        int initialvalue, int leadunitvalue,
286        //        boolean latin1linear)
287
288        fTrie = new IntTrieBuilder(null,   //   Data array  (utrie will allocate one)
289                                   100000,  //   Max Data Length
290                                   0,       //   Initial value for all code points
291                                   0,       //   Lead Surrogate unit value,
292                                   true);   //   Keep Latin 1 in separately.
293
294        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
295            fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
296        }
297    }
298
299
300
301    //-----------------------------------------------------------------------------------
302    //
303    //   RBBIDataManipulate  A little internal class needed only to wrap of the
304    //                       getFoldedValue() function needed for Trie table creation.
305    //
306    //-----------------------------------------------------------------------------------
307   class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
308        public int getFoldedValue(int start, int offset) {
309            int  value;
310            int  limit;
311            boolean [] inBlockZero = new boolean[1];
312
313            limit = start + 0x400;
314            while(start<limit) {
315                value = fTrie.getValue(start, inBlockZero);
316                if (inBlockZero[0]) {
317                    start += IntTrieBuilder.DATA_BLOCK_LENGTH;
318                } else if (value != 0) {
319                    return offset | 0x08000;
320                } else {
321                    ++start;
322                }
323            }
324            return 0;
325         }
326    }
327    RBBIDataManipulate dm = new RBBIDataManipulate();
328
329    //-----------------------------------------------------------------------------------
330    //
331    //          getTrieSize()    Return the size that will be required to serialize the Trie.
332    //
333    //-----------------------------------------------------------------------------------
334    int getTrieSize()  {
335        int size = 0;
336        try {
337            // The trie serialize function returns the size of the data written.
338            //    null output stream says give size only, don't actually write anything.
339            size = fTrie.serialize(null, true, dm );
340        } catch (IOException e) {
341            Assert.assrt (false);
342        }
343        return size;
344    }
345
346
347    //-----------------------------------------------------------------------------------
348    //
349    //          serializeTrie()   Write the serialized trie to an output stream
350    //
351    //-----------------------------------------------------------------------------------
352    void serializeTrie(OutputStream os) throws IOException {
353        fTrie.serialize(os, true, dm );
354   }
355
356    //------------------------------------------------------------------------
357    //
358    //      addValToSets     Add a runtime-mapped input value to each uset from a
359    //      list of uset nodes. (val corresponds to a state table column.)
360    //      For each of the original Unicode sets - which correspond
361    //      directly to uset nodes - a logically equivalent expression
362    //      is constructed in terms of the remapped runtime input
363    //      symbol set.  This function adds one runtime input symbol to
364    //      a list of sets.
365    //
366    //      The "logically equivalent expression" is the tree for an
367    //      or-ing together of all of the symbols that go into the set.
368    //
369    //------------------------------------------------------------------------
370    void  addValToSets(List<RBBINode> sets, int val) {
371        for (RBBINode usetNode : sets) {
372            addValToSet(usetNode, val);
373        }
374    }
375
376    void  addValToSet(RBBINode usetNode, int val) {
377        RBBINode leafNode = new RBBINode(RBBINode.leafChar);
378        leafNode.fVal = val;
379        if (usetNode.fLeftChild == null) {
380            usetNode.fLeftChild = leafNode;
381            leafNode.fParent    = usetNode;
382        } else {
383            // There are already input symbols present for this set.
384            // Set up an OR node, with the previous stuff as the left child
385            //   and the new value as the right child.
386            RBBINode orNode = new RBBINode(RBBINode.opOr);
387            orNode.fLeftChild  = usetNode.fLeftChild;
388            orNode.fRightChild = leafNode;
389            orNode.fLeftChild.fParent  = orNode;
390            orNode.fRightChild.fParent = orNode;
391            usetNode.fLeftChild = orNode;
392            orNode.fParent = usetNode;
393        }
394    }
395
396
397    //------------------------------------------------------------------------
398    //
399    //           getNumCharCategories
400    //
401    //------------------------------------------------------------------------
402    int  getNumCharCategories()  {
403        return fGroupCount + 3;
404    }
405
406
407    //------------------------------------------------------------------------
408    //
409    //           sawBOF
410    //
411    //------------------------------------------------------------------------
412    boolean  sawBOF()  {
413        return fSawBOF;
414    }
415
416
417    //------------------------------------------------------------------------
418    //
419    //           getFirstChar      Given a runtime RBBI character category, find
420    //                             the first UChar32 that is in the set of chars
421    //                             in the category.
422    //------------------------------------------------------------------------
423    int  getFirstChar(int category)  {
424        RangeDescriptor   rlRange;
425        int            retVal = -1;
426        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
427            if (rlRange.fNum == category) {
428                retVal = rlRange.fStartChar;
429                break;
430            }
431        }
432        return retVal;
433    }
434
435
436
437    //------------------------------------------------------------------------
438    //
439    //           printRanges        A debugging function.
440    //                              dump out all of the range definitions.
441    //
442    //------------------------------------------------------------------------
443    ///CLOVER:OFF
444    void printRanges() {
445        RangeDescriptor       rlRange;
446        int                    i;
447
448        System.out.print("\n\n Nonoverlapping Ranges ...\n");
449        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
450            System.out.print(" " + rlRange.fNum + "   " + rlRange.fStartChar + "-" + rlRange.fEndChar);
451
452            for (i=0; i<rlRange.fIncludesSets.size(); i++) {
453                RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
454                String         setName = "anon";
455                RBBINode       setRef = usetNode.fParent;
456                if (setRef != null) {
457                    RBBINode varRef = setRef.fParent;
458                    if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
459                        setName = varRef.fText;
460                    }
461                }
462                System.out.print(setName); System.out.print("  ");
463            }
464            System.out.println("");
465        }
466    }
467    ///CLOVER:ON
468
469
470    //------------------------------------------------------------------------
471    //
472    //           printRangeGroups     A debugging function.
473    //                                dump out all of the range groups.
474    //
475    //------------------------------------------------------------------------
476    ///CLOVER:OFF
477    void printRangeGroups() {
478        RangeDescriptor       rlRange;
479        RangeDescriptor       tRange;
480        int                    i;
481        int                    lastPrintedGroupNum = 0;
482
483        System.out.print("\nRanges grouped by Unicode Set Membership...\n");
484        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
485            int groupNum = rlRange.fNum & 0xbfff;
486            if (groupNum > lastPrintedGroupNum) {
487                lastPrintedGroupNum = groupNum;
488                if (groupNum<10) {System.out.print(" ");}
489                System.out.print(groupNum + " ");
490
491                if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
492
493                for (i=0; i<rlRange.fIncludesSets.size(); i++) {
494                    RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
495                    String         setName = "anon";
496                    RBBINode       setRef = usetNode.fParent;
497                    if (setRef != null) {
498                        RBBINode varRef = setRef.fParent;
499                        if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
500                            setName = varRef.fText;
501                        }
502                    }
503                    System.out.print(setName); System.out.print(" ");
504                }
505
506                i = 0;
507                for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
508                    if (tRange.fNum == rlRange.fNum) {
509                        if (i++ % 5 == 0) {
510                            System.out.print("\n    ");
511                        }
512                        RBBINode.printHex(tRange.fStartChar, -1);
513                        System.out.print("-");
514                        RBBINode.printHex(tRange.fEndChar, 0);
515                    }
516                }
517                System.out.print("\n");
518            }
519        }
520        System.out.print("\n");
521    }
522    ///CLOVER:ON
523
524
525    //------------------------------------------------------------------------
526    //
527    //           printSets          A debugging function.
528    //                              dump out all of the set definitions.
529    //
530    //------------------------------------------------------------------------
531    ///CLOVER:OFF
532    void printSets() {
533        int                   i;
534        System.out.print("\n\nUnicode Sets List\n------------------\n");
535        for (i=0; i<fRB.fUSetNodes.size(); i++) {
536            RBBINode        usetNode;
537            RBBINode        setRef;
538            RBBINode        varRef;
539            String          setName;
540
541            usetNode = fRB.fUSetNodes.get(i);
542
543            //System.out.print(" " + i + "   ");
544            RBBINode.printInt(2, i);
545            setName = "anonymous";
546            setRef = usetNode.fParent;
547            if (setRef != null) {
548                varRef = setRef.fParent;
549                if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
550                    setName = varRef.fText;
551                }
552            }
553            System.out.print("  " + setName);
554            System.out.print("   ");
555            System.out.print(usetNode.fText);
556            System.out.print("\n");
557            if (usetNode.fLeftChild != null) {
558                usetNode.fLeftChild.printTree(true);
559            }
560        }
561        System.out.print("\n");
562    }
563    ///CLOVER:ON
564}
565