1/*
2*******************************************************************************
3* Copyright (C) 2003-2011, International Business Machines Corporation and    *
4* others. All Rights Reserved.                                                *
5*******************************************************************************
6*/
7package com.ibm.icu.text;
8
9import java.io.IOException;
10import java.io.OutputStream;
11import java.util.ArrayList;
12import java.util.List;
13
14import com.ibm.icu.impl.Assert;
15import com.ibm.icu.impl.IntTrieBuilder;
16
17//
18//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
19//                   (part of the rule building process.)
20//
21//      Starting with the rules parse tree from the scanner,
22//
23//                   -  Enumerate the set of UnicodeSets that are referenced
24//                      by the RBBI rules.
25//                   -  compute a set of non-overlapping character ranges
26//                      with all characters within a range belonging to the same
27//                      set of input uniocde sets.
28//                   -  Derive a set of non-overlapping UnicodeSet (like things)
29//                      that will correspond to columns in the state table for
30//                      the RBBI execution engine.  All characters within one
31//                      of these sets belong to the same set of the original
32//                      UnicodeSets from the user's rules.
33//                   -  construct the trie table that maps input characters
34//                      to the index of the matching non-overlapping set of set from
35//                      the previous step.
36//
37class RBBISetBuilder {
38    static class RangeDescriptor  {
39           int                fStartChar;      // Start of range, unicode 32 bit value.
40           int                fEndChar;        // End of range, unicode 32 bit value.
41           int                fNum;            // runtime-mapped input value for this range.
42           List<RBBINode>     fIncludesSets;    // vector of the the original
43                                                 //   Unicode sets that include this range.
44                                                //    (Contains ptrs to uset nodes)
45            RangeDescriptor   fNext;           // Next RangeDescriptor in the linked list.
46
47            RangeDescriptor() {
48                fIncludesSets = new ArrayList<RBBINode>();
49            }
50
51            RangeDescriptor(RangeDescriptor other) {
52                fStartChar = other.fStartChar;
53                fEndChar   = other.fEndChar;
54                fNum       = other.fNum;
55                fIncludesSets = new ArrayList<RBBINode>(other.fIncludesSets);
56            }
57
58            //-------------------------------------------------------------------------------------
59            //
60            //          RangeDesriptor::split()
61            //
62            //-------------------------------------------------------------------------------------
63            void split(int where) {
64                Assert.assrt(where>fStartChar && where<=fEndChar);
65                RangeDescriptor nr = new RangeDescriptor(this);
66
67                //  RangeDescriptor copy constructor copies all fields.
68                //  Only need to update those that are different after the split.
69                nr.fStartChar = where;
70                this.fEndChar = where-1;
71                nr.fNext      = this.fNext;
72                this.fNext    = nr;
73
74                // TODO:  fIncludesSets is not updated.  Check it out.
75                //         Probably because they haven't been populated yet,
76                //         but still sloppy.
77            }
78
79
80            //-------------------------------------------------------------------------------------
81            //
82            //          RangeDescriptor::setDictionaryFlag
83            //
84            //          Character Category Numbers that include characters from
85            //          the original Unicode Set named "dictionary" have bit 14
86            //          set to 1.  The RBBI runtime engine uses this to trigger
87            //          use of the word dictionary.
88            //
89            //          This function looks through the Unicode Sets that it
90            //          (the range) includes, and sets the bit in fNum when
91            //          "dictionary" is among them.
92            //
93            //          TODO:  a faster way would be to find the set node for
94            //          "dictionary" just once, rather than looking it
95            //          up by name every time.
96            //
97            // -------------------------------------------------------------------------------------
98            void setDictionaryFlag() {
99                int i;
100
101                for (i=0; i<this.fIncludesSets.size(); i++) {
102                    RBBINode        usetNode    = fIncludesSets.get(i);
103                    String          setName = "";
104                    RBBINode        setRef = usetNode.fParent;
105                    if (setRef != null) {
106                        RBBINode varRef = setRef.fParent;
107                        if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
108                            setName = varRef.fText;
109                        }
110                    }
111                    if (setName.equals("dictionary")) {
112                        this.fNum |= 0x4000;
113                        break;
114                    }
115                }
116
117        }
118    }
119
120
121    RBBIRuleBuilder       fRB;             // The RBBI Rule Compiler that owns us.
122    RangeDescriptor       fRangeList;      // Head of the linked list of RangeDescriptors
123
124    IntTrieBuilder        fTrie;           // The mapping TRIE that is the end result of processing
125    int                  fTrieSize;        //  the Unicode Sets.
126
127    // Groups correspond to character categories -
128    //       groups of ranges that are in the same original UnicodeSets.
129    //       fGroupCount is the index of the last used group.
130    //       fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
131    //       State table column 0 is not used.  Column 1 is for end-of-input.
132    //       column 2 is for group 0.  Funny counting.
133    int                fGroupCount;
134
135    boolean             fSawBOF;
136
137
138    //------------------------------------------------------------------------
139    //
140    //       RBBISetBuilder Constructor
141    //
142    //------------------------------------------------------------------------
143    RBBISetBuilder(RBBIRuleBuilder rb)
144    {
145        fRB             = rb;
146    }
147
148
149    //------------------------------------------------------------------------
150    //
151    //           build          Build the list of non-overlapping character ranges
152    //                          from the Unicode Sets.
153    //
154    //------------------------------------------------------------------------
155    void build() {
156        RangeDescriptor rlRange;
157
158        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("usets")>=0) {printSets();}
159
160        //  Initialize the process by creating a single range encompassing all characters
161        //  that is in no sets.
162        //
163        fRangeList               = new RangeDescriptor();
164        fRangeList.fStartChar    = 0;
165        fRangeList.fEndChar      = 0x10ffff;
166
167        //
168        //  Find the set of non-overlapping ranges of characters
169        //
170        for (RBBINode usetNode : fRB.fUSetNodes) {
171            UnicodeSet      inputSet             = usetNode.fInputSet;
172            int            inputSetRangeCount   = inputSet.getRangeCount();
173            int            inputSetRangeIndex   = 0;
174            rlRange              = fRangeList;
175
176            for (;;) {
177                if (inputSetRangeIndex >= inputSetRangeCount) {
178                    break;
179                }
180                int      inputSetRangeBegin  = inputSet.getRangeStart(inputSetRangeIndex);
181                int      inputSetRangeEnd    = inputSet.getRangeEnd(inputSetRangeIndex);
182
183                // skip over ranges from the range list that are completely
184                //   below the current range from the input unicode set.
185                while (rlRange.fEndChar < inputSetRangeBegin) {
186                    rlRange = rlRange.fNext;
187                }
188
189                // If the start of the range from the range list is before with
190                //   the start of the range from the unicode set, split the range list range
191                //   in two, with one part being before (wholly outside of) the unicode set
192                //   and the other containing the rest.
193                //   Then continue the loop; the post-split current range will then be skipped
194                //     over
195                if (rlRange.fStartChar < inputSetRangeBegin) {
196                    rlRange.split(inputSetRangeBegin);
197                     continue;
198                }
199
200                // Same thing at the end of the ranges...
201                // If the end of the range from the range list doesn't coincide with
202                //   the end of the range from the unicode set, split the range list
203                //   range in two.  The first part of the split range will be
204                //   wholly inside the Unicode set.
205                if (rlRange.fEndChar > inputSetRangeEnd) {
206                    rlRange.split(inputSetRangeEnd+1);
207                 }
208
209                // The current rlRange is now entirely within the UnicodeSet range.
210                // Add this unicode set to the list of sets for this rlRange
211                if (rlRange.fIncludesSets.indexOf(usetNode) == -1) {
212                    rlRange.fIncludesSets.add(usetNode);
213                }
214
215                // Advance over ranges that we are finished with.
216                if (inputSetRangeEnd == rlRange.fEndChar) {
217                    inputSetRangeIndex++;
218                }
219                rlRange = rlRange.fNext;
220            }
221        }
222
223        if (fRB.fDebugEnv!=null && fRB.fDebugEnv.indexOf("range")>=0) { printRanges();}
224
225        //
226        //  Group the above ranges, with each group consisting of one or more
227        //    ranges that are in exactly the same set of original UnicodeSets.
228        //    The groups are numbered, and these group numbers are the set of
229        //    input symbols recognized by the run-time state machine.
230        //
231        //    Numbering: # 0  (state table column 0) is unused.
232        //               # 1  is reserved - table column 1 is for end-of-input
233        //               # 2  is reserved - table column 2 is for beginning-in-input
234        //               # 3  is the first range list.
235        //
236        RangeDescriptor rlSearchRange;
237        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
238            for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange.fNext) {
239                if (rlRange.fIncludesSets.equals(rlSearchRange.fIncludesSets)) {
240                    rlRange.fNum = rlSearchRange.fNum;
241                    break;
242                }
243            }
244            if (rlRange.fNum == 0) {
245                fGroupCount ++;
246                rlRange.fNum = fGroupCount+2;
247                rlRange.setDictionaryFlag();
248                addValToSets(rlRange.fIncludesSets, fGroupCount+2);
249            }
250        }
251
252        // Handle input sets that contain the special string {eof}.
253        //   Column 1 of the state table is reserved for EOF on input.
254        //   Column 2 is reserved for before-the-start-input.
255        //            (This column can be optimized away later if there are no rule
256        //             references to {bof}.)
257        //   Add this column value (1 or 2) to the equivalent expression
258        //     subtree for each UnicodeSet that contains the string {eof}
259        //   Because {bof} and {eof} are not a characters in the normal sense,
260        //   they doesn't affect the computation of ranges or TRIE.
261
262        String eofString = "eof";
263        String bofString = "bof";
264
265        for (RBBINode usetNode : fRB.fUSetNodes) {
266            UnicodeSet      inputSet = usetNode.fInputSet;
267            if (inputSet.contains(eofString)) {
268                addValToSet(usetNode, 1);
269            }
270            if (inputSet.contains(bofString)) {
271                addValToSet(usetNode, 2);
272                fSawBOF = true;
273            }
274        }
275
276
277        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("rgroup")>=0) {printRangeGroups();}
278        if (fRB.fDebugEnv!=null  && fRB.fDebugEnv.indexOf("esets")>=0) {printSets();}
279
280
281        //IntTrieBuilder(int aliasdata[], int maxdatalength,
282        //        int initialvalue, int leadunitvalue,
283        //        boolean latin1linear)
284
285        fTrie = new IntTrieBuilder(null,   //   Data array  (utrie will allocate one)
286                                   100000,  //   Max Data Length
287                                   0,       //   Initial value for all code points
288                                   0,       //   Lead Surrogate unit value,
289                                   true);   //   Keep Latin 1 in separately.
290
291        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
292            fTrie.setRange(rlRange.fStartChar, rlRange.fEndChar+1, rlRange.fNum, true);
293        }
294    }
295
296
297
298    //-----------------------------------------------------------------------------------
299    //
300    //   RBBIDataManipulate  A little internal class needed only to wrap of the
301    //                       getFoldedValue() function needed for Trie table creation.
302    //
303    //-----------------------------------------------------------------------------------
304   class RBBIDataManipulate implements IntTrieBuilder.DataManipulate {
305        public int getFoldedValue(int start, int offset) {
306            int  value;
307            int  limit;
308            boolean [] inBlockZero = new boolean[1];
309
310            limit = start + 0x400;
311            while(start<limit) {
312                value = fTrie.getValue(start, inBlockZero);
313                if (inBlockZero[0]) {
314                    start += IntTrieBuilder.DATA_BLOCK_LENGTH;
315                } else if (value != 0) {
316                    return offset | 0x08000;
317                } else {
318                    ++start;
319                }
320            }
321            return 0;
322         }
323    }
324    RBBIDataManipulate dm = new RBBIDataManipulate();
325
326    //-----------------------------------------------------------------------------------
327    //
328    //          getTrieSize()    Return the size that will be required to serialize the Trie.
329    //
330    //-----------------------------------------------------------------------------------
331    int getTrieSize()  {
332        int size = 0;
333        try {
334            // The trie serialize function returns the size of the data written.
335            //    null output stream says give size only, don't actually write anything.
336            size = fTrie.serialize(null, true, dm );
337        } catch (IOException e) {
338            Assert.assrt (false);
339        }
340        return size;
341    }
342
343
344    //-----------------------------------------------------------------------------------
345    //
346    //          serializeTrie()   Write the serialized trie to an output stream
347    //
348    //-----------------------------------------------------------------------------------
349    void serializeTrie(OutputStream os) throws IOException {
350        fTrie.serialize(os, true, dm );
351   }
352
353    //------------------------------------------------------------------------
354    //
355    //      addValToSets     Add a runtime-mapped input value to each uset from a
356    //      list of uset nodes. (val corresponds to a state table column.)
357    //      For each of the original Unicode sets - which correspond
358    //      directly to uset nodes - a logically equivalent expression
359    //      is constructed in terms of the remapped runtime input
360    //      symbol set.  This function adds one runtime input symbol to
361    //      a list of sets.
362    //
363    //      The "logically equivalent expression" is the tree for an
364    //      or-ing together of all of the symbols that go into the set.
365    //
366    //------------------------------------------------------------------------
367    void  addValToSets(List<RBBINode> sets, int val) {
368        for (RBBINode usetNode : sets) {
369            addValToSet(usetNode, val);
370        }
371    }
372
373    void  addValToSet(RBBINode usetNode, int val) {
374        RBBINode leafNode = new RBBINode(RBBINode.leafChar);
375        leafNode.fVal = val;
376        if (usetNode.fLeftChild == null) {
377            usetNode.fLeftChild = leafNode;
378            leafNode.fParent    = usetNode;
379        } else {
380            // There are already input symbols present for this set.
381            // Set up an OR node, with the previous stuff as the left child
382            //   and the new value as the right child.
383            RBBINode orNode = new RBBINode(RBBINode.opOr);
384            orNode.fLeftChild  = usetNode.fLeftChild;
385            orNode.fRightChild = leafNode;
386            orNode.fLeftChild.fParent  = orNode;
387            orNode.fRightChild.fParent = orNode;
388            usetNode.fLeftChild = orNode;
389            orNode.fParent = usetNode;
390        }
391    }
392
393
394    //------------------------------------------------------------------------
395    //
396    //           getNumCharCategories
397    //
398    //------------------------------------------------------------------------
399    int  getNumCharCategories()  {
400        return fGroupCount + 3;
401    }
402
403
404    //------------------------------------------------------------------------
405    //
406    //           sawBOF
407    //
408    //------------------------------------------------------------------------
409    boolean  sawBOF()  {
410        return fSawBOF;
411    }
412
413
414    //------------------------------------------------------------------------
415    //
416    //           getFirstChar      Given a runtime RBBI character category, find
417    //                             the first UChar32 that is in the set of chars
418    //                             in the category.
419    //------------------------------------------------------------------------
420    int  getFirstChar(int category)  {
421        RangeDescriptor   rlRange;
422        int            retVal = -1;
423        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
424            if (rlRange.fNum == category) {
425                retVal = rlRange.fStartChar;
426                break;
427            }
428        }
429        return retVal;
430    }
431
432
433
434    //------------------------------------------------------------------------
435    //
436    //           printRanges        A debugging function.
437    //                              dump out all of the range definitions.
438    //
439    //------------------------------------------------------------------------
440    ///CLOVER:OFF
441    void printRanges() {
442        RangeDescriptor       rlRange;
443        int                    i;
444
445        System.out.print("\n\n Nonoverlapping Ranges ...\n");
446        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
447            System.out.print(" " + rlRange.fNum + "   " + rlRange.fStartChar + "-" + rlRange.fEndChar);
448
449            for (i=0; i<rlRange.fIncludesSets.size(); i++) {
450                RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
451                String         setName = "anon";
452                RBBINode       setRef = usetNode.fParent;
453                if (setRef != null) {
454                    RBBINode varRef = setRef.fParent;
455                    if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
456                        setName = varRef.fText;
457                    }
458                }
459                System.out.print(setName); System.out.print("  ");
460            }
461            System.out.println("");
462        }
463    }
464    ///CLOVER:ON
465
466
467    //------------------------------------------------------------------------
468    //
469    //           printRangeGroups     A debugging function.
470    //                                dump out all of the range groups.
471    //
472    //------------------------------------------------------------------------
473    ///CLOVER:OFF
474    void printRangeGroups() {
475        RangeDescriptor       rlRange;
476        RangeDescriptor       tRange;
477        int                    i;
478        int                    lastPrintedGroupNum = 0;
479
480        System.out.print("\nRanges grouped by Unicode Set Membership...\n");
481        for (rlRange = fRangeList; rlRange!=null; rlRange=rlRange.fNext) {
482            int groupNum = rlRange.fNum & 0xbfff;
483            if (groupNum > lastPrintedGroupNum) {
484                lastPrintedGroupNum = groupNum;
485                if (groupNum<10) {System.out.print(" ");}
486                System.out.print(groupNum + " ");
487
488                if ((rlRange.fNum & 0x4000) != 0) { System.out.print(" <DICT> ");}
489
490                for (i=0; i<rlRange.fIncludesSets.size(); i++) {
491                    RBBINode       usetNode    = rlRange.fIncludesSets.get(i);
492                    String         setName = "anon";
493                    RBBINode       setRef = usetNode.fParent;
494                    if (setRef != null) {
495                        RBBINode varRef = setRef.fParent;
496                        if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
497                            setName = varRef.fText;
498                        }
499                    }
500                    System.out.print(setName); System.out.print(" ");
501                }
502
503                i = 0;
504                for (tRange = rlRange; tRange != null; tRange = tRange.fNext) {
505                    if (tRange.fNum == rlRange.fNum) {
506                        if (i++ % 5 == 0) {
507                            System.out.print("\n    ");
508                        }
509                        RBBINode.printHex(tRange.fStartChar, -1);
510                        System.out.print("-");
511                        RBBINode.printHex(tRange.fEndChar, 0);
512                    }
513                }
514                System.out.print("\n");
515            }
516        }
517        System.out.print("\n");
518    }
519    ///CLOVER:ON
520
521
522    //------------------------------------------------------------------------
523    //
524    //           printSets          A debugging function.
525    //                              dump out all of the set definitions.
526    //
527    //------------------------------------------------------------------------
528    ///CLOVER:OFF
529    void printSets() {
530        int                   i;
531        System.out.print("\n\nUnicode Sets List\n------------------\n");
532        for (i=0; i<fRB.fUSetNodes.size(); i++) {
533            RBBINode        usetNode;
534            RBBINode        setRef;
535            RBBINode        varRef;
536            String          setName;
537
538            usetNode = fRB.fUSetNodes.get(i);
539
540            //System.out.print(" " + i + "   ");
541            RBBINode.printInt(2, i);
542            setName = "anonymous";
543            setRef = usetNode.fParent;
544            if (setRef != null) {
545                varRef = setRef.fParent;
546                if (varRef != null  &&  varRef.fType == RBBINode.varRef) {
547                    setName = varRef.fText;
548                }
549            }
550            System.out.print("  " + setName);
551            System.out.print("   ");
552            System.out.print(usetNode.fText);
553            System.out.print("\n");
554            if (usetNode.fLeftChild != null) {
555                usetNode.fLeftChild.printTree(true);
556            }
557        }
558        System.out.print("\n");
559    }
560    ///CLOVER:ON
561}
562