1//
2//  rbbisetb.cpp
3//
4/*
5***************************************************************************
6*   Copyright (C) 2002-2008 International Business Machines Corporation   *
7*   and others. All rights reserved.                                      *
8***************************************************************************
9*/
10//
11//  RBBISetBuilder   Handles processing of Unicode Sets from RBBI rules
12//                   (part of the rule building process.)
13//
14//      Starting with the rules parse tree from the scanner,
15//
16//                   -  Enumerate the set of UnicodeSets that are referenced
17//                      by the RBBI rules.
18//                   -  compute a set of non-overlapping character ranges
19//                      with all characters within a range belonging to the same
20//                      set of input uniocde sets.
21//                   -  Derive a set of non-overlapping UnicodeSet (like things)
22//                      that will correspond to columns in the state table for
23//                      the RBBI execution engine.  All characters within one
24//                      of these sets belong to the same set of the original
25//                      UnicodeSets from the user's rules.
26//                   -  construct the trie table that maps input characters
27//                      to the index of the matching non-overlapping set of set from
28//                      the previous step.
29//
30
31#include "unicode/utypes.h"
32
33#if !UCONFIG_NO_BREAK_ITERATION
34
35#include "unicode/uniset.h"
36#include "utrie.h"
37#include "uvector.h"
38#include "uassert.h"
39#include "cmemory.h"
40#include "cstring.h"
41
42#include "rbbisetb.h"
43#include "rbbinode.h"
44
45
46//------------------------------------------------------------------------
47//
48//   getFoldedRBBIValue        Call-back function used during building of Trie table.
49//                             Folding value: just store the offset (16 bits)
50//                             if there is any non-0 entry.
51//                             (It'd really be nice if the Trie builder would provide a
52//                             simple default, so this function could go away from here.)
53//
54//------------------------------------------------------------------------
55/* folding value: just store the offset (16 bits) if there is any non-0 entry */
56U_CDECL_BEGIN
57static uint32_t U_CALLCONV
58getFoldedRBBIValue(UNewTrie *trie, UChar32 start, int32_t offset) {
59    uint32_t value;
60    UChar32 limit;
61    UBool inBlockZero;
62
63    limit=start+0x400;
64    while(start<limit) {
65        value=utrie_get32(trie, start, &inBlockZero);
66        if(inBlockZero) {
67            start+=UTRIE_DATA_BLOCK_LENGTH;
68        } else if(value!=0) {
69            return (uint32_t)(offset|0x8000);
70        } else {
71            ++start;
72        }
73    }
74    return 0;
75}
76
77
78U_CDECL_END
79
80
81
82U_NAMESPACE_BEGIN
83
84//------------------------------------------------------------------------
85//
86//   Constructor
87//
88//------------------------------------------------------------------------
89RBBISetBuilder::RBBISetBuilder(RBBIRuleBuilder *rb)
90{
91    fRB             = rb;
92    fStatus         = rb->fStatus;
93    fRangeList      = 0;
94    fTrie           = 0;
95    fTrieSize       = 0;
96    fGroupCount     = 0;
97    fSawBOF         = FALSE;
98}
99
100
101//------------------------------------------------------------------------
102//
103//   Destructor
104//
105//------------------------------------------------------------------------
106RBBISetBuilder::~RBBISetBuilder()
107{
108    RangeDescriptor   *nextRangeDesc;
109
110    // Walk through & delete the linked list of RangeDescriptors
111    for (nextRangeDesc = fRangeList; nextRangeDesc!=NULL;) {
112        RangeDescriptor *r = nextRangeDesc;
113        nextRangeDesc      = r->fNext;
114        delete r;
115    }
116
117    utrie_close(fTrie);
118}
119
120
121
122
123//------------------------------------------------------------------------
124//
125//   build          Build the list of non-overlapping character ranges
126//                  from the Unicode Sets.
127//
128//------------------------------------------------------------------------
129void RBBISetBuilder::build() {
130    RBBINode        *usetNode;
131    RangeDescriptor *rlRange;
132
133    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "usets")) {printSets();}
134
135    //
136    //  Initialize the process by creating a single range encompassing all characters
137    //  that is in no sets.
138    //
139    fRangeList                = new RangeDescriptor(*fStatus); // will check for status here
140    if (fRangeList == NULL) {
141        *fStatus = U_MEMORY_ALLOCATION_ERROR;
142        return;
143    }
144    fRangeList->fStartChar    = 0;
145    fRangeList->fEndChar      = 0x10ffff;
146
147    if (U_FAILURE(*fStatus)) {
148        return;
149    }
150
151    //
152    //  Find the set of non-overlapping ranges of characters
153    //
154    int  ni;
155    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
156        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
157        if (usetNode==NULL) {
158            break;
159        }
160
161        UnicodeSet      *inputSet             = usetNode->fInputSet;
162        int32_t          inputSetRangeCount   = inputSet->getRangeCount();
163        int              inputSetRangeIndex   = 0;
164                         rlRange              = fRangeList;
165
166        for (;;) {
167            if (inputSetRangeIndex >= inputSetRangeCount) {
168                break;
169            }
170            UChar32      inputSetRangeBegin  = inputSet->getRangeStart(inputSetRangeIndex);
171            UChar32      inputSetRangeEnd    = inputSet->getRangeEnd(inputSetRangeIndex);
172
173            // skip over ranges from the range list that are completely
174            //   below the current range from the input unicode set.
175            while (rlRange->fEndChar < inputSetRangeBegin) {
176                rlRange = rlRange->fNext;
177            }
178
179            // If the start of the range from the range list is before with
180            //   the start of the range from the unicode set, split the range list range
181            //   in two, with one part being before (wholly outside of) the unicode set
182            //   and the other containing the rest.
183            //   Then continue the loop; the post-split current range will then be skipped
184            //     over
185            if (rlRange->fStartChar < inputSetRangeBegin) {
186                rlRange->split(inputSetRangeBegin, *fStatus);
187                if (U_FAILURE(*fStatus)) {
188                    return;
189                }
190                continue;
191            }
192
193            // Same thing at the end of the ranges...
194            // If the end of the range from the range list doesn't coincide with
195            //   the end of the range from the unicode set, split the range list
196            //   range in two.  The first part of the split range will be
197            //   wholly inside the Unicode set.
198            if (rlRange->fEndChar > inputSetRangeEnd) {
199                rlRange->split(inputSetRangeEnd+1, *fStatus);
200                if (U_FAILURE(*fStatus)) {
201                    return;
202                }
203            }
204
205            // The current rlRange is now entirely within the UnicodeSet range.
206            // Add this unicode set to the list of sets for this rlRange
207            if (rlRange->fIncludesSets->indexOf(usetNode) == -1) {
208                rlRange->fIncludesSets->addElement(usetNode, *fStatus);
209                if (U_FAILURE(*fStatus)) {
210                    return;
211                }
212            }
213
214            // Advance over ranges that we are finished with.
215            if (inputSetRangeEnd == rlRange->fEndChar) {
216                inputSetRangeIndex++;
217            }
218            rlRange = rlRange->fNext;
219        }
220    }
221
222    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "range")) { printRanges();}
223
224    //
225    //  Group the above ranges, with each group consisting of one or more
226    //    ranges that are in exactly the same set of original UnicodeSets.
227    //    The groups are numbered, and these group numbers are the set of
228    //    input symbols recognized by the run-time state machine.
229    //
230    //    Numbering: # 0  (state table column 0) is unused.
231    //               # 1  is reserved - table column 1 is for end-of-input
232    //               # 2  is reserved - table column 2 is for beginning-in-input
233    //               # 3  is the first range list.
234    //
235    RangeDescriptor *rlSearchRange;
236    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
237        for (rlSearchRange=fRangeList; rlSearchRange != rlRange; rlSearchRange=rlSearchRange->fNext) {
238            if (rlRange->fIncludesSets->equals(*rlSearchRange->fIncludesSets)) {
239                rlRange->fNum = rlSearchRange->fNum;
240                break;
241            }
242        }
243        if (rlRange->fNum == 0) {
244            fGroupCount ++;
245            rlRange->fNum = fGroupCount+2;
246            rlRange->setDictionaryFlag();
247            addValToSets(rlRange->fIncludesSets, fGroupCount+2);
248        }
249    }
250
251    // Handle input sets that contain the special string {eof}.
252    //   Column 1 of the state table is reserved for EOF on input.
253    //   Column 2 is reserved for before-the-start-input.
254    //            (This column can be optimized away later if there are no rule
255    //             references to {bof}.)
256    //   Add this column value (1 or 2) to the equivalent expression
257    //     subtree for each UnicodeSet that contains the string {eof}
258    //   Because {bof} and {eof} are not a characters in the normal sense,
259    //   they doesn't affect the computation of ranges or TRIE.
260    static const UChar eofUString[] = {0x65, 0x6f, 0x66, 0};
261    static const UChar bofUString[] = {0x62, 0x6f, 0x66, 0};
262
263    UnicodeString eofString(eofUString);
264    UnicodeString bofString(bofUString);
265    for (ni=0; ; ni++) {        // Loop over each of the UnicodeSets encountered in the input rules
266        usetNode = (RBBINode *)this->fRB->fUSetNodes->elementAt(ni);
267        if (usetNode==NULL) {
268            break;
269        }
270        UnicodeSet      *inputSet = usetNode->fInputSet;
271        if (inputSet->contains(eofString)) {
272            addValToSet(usetNode, 1);
273        }
274        if (inputSet->contains(bofString)) {
275            addValToSet(usetNode, 2);
276            fSawBOF = TRUE;
277        }
278    }
279
280
281    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rgroup")) {printRangeGroups();}
282    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "esets")) {printSets();}
283
284    //
285    // Build the Trie table for mapping UChar32 values to the corresponding
286    //   range group number
287    //
288    fTrie = utrie_open(NULL,    //  Pre-existing trie to be filled in
289                      NULL,    //  Data array  (utrie will allocate one)
290                      100000,  //  Max Data Length
291                      0,       //  Initial value for all code points
292                      0,       //  Lead surrogate unit value
293                      TRUE);   //  Keep Latin 1 in separately
294
295
296    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
297        utrie_setRange32(fTrie, rlRange->fStartChar, rlRange->fEndChar+1, rlRange->fNum, TRUE);
298    }
299}
300
301
302
303//-----------------------------------------------------------------------------------
304//
305//  getTrieSize()    Return the size that will be required to serialize the Trie.
306//
307//-----------------------------------------------------------------------------------
308int32_t RBBISetBuilder::getTrieSize() /*const*/ {
309    fTrieSize  = utrie_serialize(fTrie,
310                                    NULL,                // Buffer
311                                    0,                   // Capacity
312                                    getFoldedRBBIValue,
313                                    TRUE,                // Reduce to 16 bits
314                                    fStatus);
315    // RBBIDebugPrintf("Trie table size is %d\n", trieSize);
316    return fTrieSize;
317}
318
319
320//-----------------------------------------------------------------------------------
321//
322//  serializeTrie()   Put the serialized trie at the specified address.
323//                    Trust the caller to have given us enough memory.
324//                    getTrieSize() MUST be called first.
325//
326//-----------------------------------------------------------------------------------
327void RBBISetBuilder::serializeTrie(uint8_t *where) {
328    utrie_serialize(fTrie,
329                    where,                   // Buffer
330                    fTrieSize,               // Capacity
331                    getFoldedRBBIValue,
332                    TRUE,                    // Reduce to 16 bits
333                    fStatus);
334}
335
336//------------------------------------------------------------------------
337//
338//  addValToSets     Add a runtime-mapped input value to each uset from a
339//                   list of uset nodes. (val corresponds to a state table column.)
340//                   For each of the original Unicode sets - which correspond
341//                   directly to uset nodes - a logically equivalent expression
342//                   is constructed in terms of the remapped runtime input
343//                   symbol set.  This function adds one runtime input symbol to
344//                   a list of sets.
345//
346//                   The "logically equivalent expression" is the tree for an
347//                   or-ing together of all of the symbols that go into the set.
348//
349//------------------------------------------------------------------------
350void  RBBISetBuilder::addValToSets(UVector *sets, uint32_t val) {
351    int32_t       ix;
352
353    for (ix=0; ix<sets->size(); ix++) {
354        RBBINode *usetNode = (RBBINode *)sets->elementAt(ix);
355        addValToSet(usetNode, val);
356    }
357}
358
359void  RBBISetBuilder::addValToSet(RBBINode *usetNode, uint32_t val) {
360    RBBINode *leafNode = new RBBINode(RBBINode::leafChar);
361    if (leafNode == NULL) {
362        *fStatus = U_MEMORY_ALLOCATION_ERROR;
363        return;
364    }
365    leafNode->fVal = (unsigned short)val;
366    if (usetNode->fLeftChild == NULL) {
367        usetNode->fLeftChild = leafNode;
368        leafNode->fParent    = usetNode;
369    } else {
370        // There are already input symbols present for this set.
371        // Set up an OR node, with the previous stuff as the left child
372        //   and the new value as the right child.
373        RBBINode *orNode = new RBBINode(RBBINode::opOr);
374        if (orNode == NULL) {
375            *fStatus = U_MEMORY_ALLOCATION_ERROR;
376            return;
377        }
378        orNode->fLeftChild  = usetNode->fLeftChild;
379        orNode->fRightChild = leafNode;
380        orNode->fLeftChild->fParent  = orNode;
381        orNode->fRightChild->fParent = orNode;
382        usetNode->fLeftChild = orNode;
383        orNode->fParent = usetNode;
384    }
385}
386
387
388//------------------------------------------------------------------------
389//
390//   getNumCharCategories
391//
392//------------------------------------------------------------------------
393int32_t  RBBISetBuilder::getNumCharCategories() const {
394    return fGroupCount + 3;
395}
396
397
398//------------------------------------------------------------------------
399//
400//   sawBOF
401//
402//------------------------------------------------------------------------
403UBool  RBBISetBuilder::sawBOF() const {
404    return fSawBOF;
405}
406
407
408//------------------------------------------------------------------------
409//
410//   getFirstChar      Given a runtime RBBI character category, find
411//                     the first UChar32 that is in the set of chars
412//                     in the category.
413//------------------------------------------------------------------------
414UChar32  RBBISetBuilder::getFirstChar(int32_t category) const {
415    RangeDescriptor   *rlRange;
416    UChar32            retVal = (UChar32)-1;
417    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
418        if (rlRange->fNum == category) {
419            retVal = rlRange->fStartChar;
420            break;
421        }
422    }
423    return retVal;
424}
425
426
427
428//------------------------------------------------------------------------
429//
430//   printRanges        A debugging function.
431//                      dump out all of the range definitions.
432//
433//------------------------------------------------------------------------
434#ifdef RBBI_DEBUG
435void RBBISetBuilder::printRanges() {
436    RangeDescriptor       *rlRange;
437    int                    i;
438
439    RBBIDebugPrintf("\n\n Nonoverlapping Ranges ...\n");
440    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
441        RBBIDebugPrintf("%2i  %4x-%4x  ", rlRange->fNum, rlRange->fStartChar, rlRange->fEndChar);
442
443        for (i=0; i<rlRange->fIncludesSets->size(); i++) {
444            RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
445            UnicodeString   setName = UNICODE_STRING("anon", 4);
446            RBBINode       *setRef = usetNode->fParent;
447            if (setRef != NULL) {
448                RBBINode *varRef = setRef->fParent;
449                if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
450                    setName = varRef->fText;
451                }
452            }
453            RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf("  ");
454        }
455        RBBIDebugPrintf("\n");
456    }
457}
458#endif
459
460
461//------------------------------------------------------------------------
462//
463//   printRangeGroups     A debugging function.
464//                        dump out all of the range groups.
465//
466//------------------------------------------------------------------------
467#ifdef RBBI_DEBUG
468void RBBISetBuilder::printRangeGroups() {
469    RangeDescriptor       *rlRange;
470    RangeDescriptor       *tRange;
471    int                    i;
472    int                    lastPrintedGroupNum = 0;
473
474    RBBIDebugPrintf("\nRanges grouped by Unicode Set Membership...\n");
475    for (rlRange = fRangeList; rlRange!=0; rlRange=rlRange->fNext) {
476        int groupNum = rlRange->fNum & 0xbfff;
477        if (groupNum > lastPrintedGroupNum) {
478            lastPrintedGroupNum = groupNum;
479            RBBIDebugPrintf("%2i  ", groupNum);
480
481            if (rlRange->fNum & 0x4000) { RBBIDebugPrintf(" <DICT> ");}
482
483            for (i=0; i<rlRange->fIncludesSets->size(); i++) {
484                RBBINode       *usetNode    = (RBBINode *)rlRange->fIncludesSets->elementAt(i);
485                UnicodeString   setName = UNICODE_STRING("anon", 4);
486                RBBINode       *setRef = usetNode->fParent;
487                if (setRef != NULL) {
488                    RBBINode *varRef = setRef->fParent;
489                    if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
490                        setName = varRef->fText;
491                    }
492                }
493                RBBI_DEBUG_printUnicodeString(setName); RBBIDebugPrintf(" ");
494            }
495
496            i = 0;
497            for (tRange = rlRange; tRange != 0; tRange = tRange->fNext) {
498                if (tRange->fNum == rlRange->fNum) {
499                    if (i++ % 5 == 0) {
500                        RBBIDebugPrintf("\n    ");
501                    }
502                    RBBIDebugPrintf("  %05x-%05x", tRange->fStartChar, tRange->fEndChar);
503                }
504            }
505            RBBIDebugPrintf("\n");
506        }
507    }
508    RBBIDebugPrintf("\n");
509}
510#endif
511
512
513//------------------------------------------------------------------------
514//
515//   printSets          A debugging function.
516//                      dump out all of the set definitions.
517//
518//------------------------------------------------------------------------
519#ifdef RBBI_DEBUG
520void RBBISetBuilder::printSets() {
521    int                   i;
522
523    RBBIDebugPrintf("\n\nUnicode Sets List\n------------------\n");
524    for (i=0; ; i++) {
525        RBBINode        *usetNode;
526        RBBINode        *setRef;
527        RBBINode        *varRef;
528        UnicodeString    setName;
529
530        usetNode = (RBBINode *)fRB->fUSetNodes->elementAt(i);
531        if (usetNode == NULL) {
532            break;
533        }
534
535        RBBIDebugPrintf("%3d    ", i);
536        setName = UNICODE_STRING("anonymous", 9);
537        setRef = usetNode->fParent;
538        if (setRef != NULL) {
539            varRef = setRef->fParent;
540            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
541                setName = varRef->fText;
542            }
543        }
544        RBBI_DEBUG_printUnicodeString(setName);
545        RBBIDebugPrintf("   ");
546        RBBI_DEBUG_printUnicodeString(usetNode->fText);
547        RBBIDebugPrintf("\n");
548        if (usetNode->fLeftChild != NULL) {
549            usetNode->fLeftChild->printTree(TRUE);
550        }
551    }
552    RBBIDebugPrintf("\n");
553}
554#endif
555
556
557
558//-------------------------------------------------------------------------------------
559//
560//  RangeDescriptor copy constructor
561//
562//-------------------------------------------------------------------------------------
563
564RangeDescriptor::RangeDescriptor(const RangeDescriptor &other, UErrorCode &status) {
565    int  i;
566
567    this->fStartChar    = other.fStartChar;
568    this->fEndChar      = other.fEndChar;
569    this->fNum          = other.fNum;
570    this->fNext         = NULL;
571    UErrorCode oldstatus = status;
572    this->fIncludesSets = new UVector(status);
573    if (U_FAILURE(oldstatus)) {
574        status = oldstatus;
575    }
576    if (U_FAILURE(status)) {
577        return;
578    }
579    /* test for NULL */
580    if (this->fIncludesSets == 0) {
581        status = U_MEMORY_ALLOCATION_ERROR;
582        return;
583    }
584
585    for (i=0; i<other.fIncludesSets->size(); i++) {
586        this->fIncludesSets->addElement(other.fIncludesSets->elementAt(i), status);
587    }
588}
589
590
591//-------------------------------------------------------------------------------------
592//
593//  RangeDesriptor default constructor
594//
595//-------------------------------------------------------------------------------------
596RangeDescriptor::RangeDescriptor(UErrorCode &status) {
597    this->fStartChar    = 0;
598    this->fEndChar      = 0;
599    this->fNum          = 0;
600    this->fNext         = NULL;
601    UErrorCode oldstatus = status;
602    this->fIncludesSets = new UVector(status);
603    if (U_FAILURE(oldstatus)) {
604        status = oldstatus;
605    }
606    if (U_FAILURE(status)) {
607        return;
608    }
609    /* test for NULL */
610    if(this->fIncludesSets == 0) {
611        status = U_MEMORY_ALLOCATION_ERROR;
612        return;
613    }
614
615}
616
617
618//-------------------------------------------------------------------------------------
619//
620//  RangeDesriptor Destructor
621//
622//-------------------------------------------------------------------------------------
623RangeDescriptor::~RangeDescriptor() {
624    delete  fIncludesSets;
625    fIncludesSets = NULL;
626}
627
628//-------------------------------------------------------------------------------------
629//
630//  RangeDesriptor::split()
631//
632//-------------------------------------------------------------------------------------
633void RangeDescriptor::split(UChar32 where, UErrorCode &status) {
634    U_ASSERT(where>fStartChar && where<=fEndChar);
635    RangeDescriptor *nr = new RangeDescriptor(*this, status);
636    if(nr == 0) {
637        status = U_MEMORY_ALLOCATION_ERROR;
638        return;
639    }
640    if (U_FAILURE(status)) {
641        delete nr;
642        return;
643    }
644    //  RangeDescriptor copy constructor copies all fields.
645    //  Only need to update those that are different after the split.
646    nr->fStartChar = where;
647    this->fEndChar = where-1;
648    nr->fNext      = this->fNext;
649    this->fNext    = nr;
650}
651
652
653//-------------------------------------------------------------------------------------
654//
655//   RangeDescriptor::setDictionaryFlag
656//
657//            Character Category Numbers that include characters from
658//            the original Unicode Set named "dictionary" have bit 14
659//            set to 1.  The RBBI runtime engine uses this to trigger
660//            use of the word dictionary.
661//
662//            This function looks through the Unicode Sets that it
663//            (the range) includes, and sets the bit in fNum when
664//            "dictionary" is among them.
665//
666//            TODO:  a faster way would be to find the set node for
667//                   "dictionary" just once, rather than looking it
668//                   up by name every time.
669//
670//-------------------------------------------------------------------------------------
671void RangeDescriptor::setDictionaryFlag() {
672    int i;
673
674    for (i=0; i<this->fIncludesSets->size(); i++) {
675        RBBINode       *usetNode    = (RBBINode *)fIncludesSets->elementAt(i);
676        UnicodeString   setName;
677        RBBINode       *setRef = usetNode->fParent;
678        if (setRef != NULL) {
679            RBBINode *varRef = setRef->fParent;
680            if (varRef != NULL  &&  varRef->fType == RBBINode::varRef) {
681                setName = varRef->fText;
682            }
683        }
684        if (setName.compare(UNICODE_STRING("dictionary", 10)) == 0) {   // TODO:  no string literals.
685            this->fNum |= 0x4000;
686            break;
687        }
688    }
689}
690
691
692
693U_NAMESPACE_END
694
695#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
696