rbbitblb.cpp revision c73f511526464f8e56c242df80552e9b0d94ae3d
1/*
2**********************************************************************
3*   Copyright (c) 2002-2009, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5**********************************************************************
6*/
7//
8//  rbbitblb.cpp
9//
10
11
12#include "unicode/utypes.h"
13
14#if !UCONFIG_NO_BREAK_ITERATION
15
16#include "unicode/unistr.h"
17#include "rbbitblb.h"
18#include "rbbirb.h"
19#include "rbbisetb.h"
20#include "rbbidata.h"
21#include "cstring.h"
22#include "uassert.h"
23#include "cmemory.h"
24
25U_NAMESPACE_BEGIN
26
27RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
28 fTree(*rootNode) {
29    fRB                 = rb;
30    fStatus             = fRB->fStatus;
31    UErrorCode status   = U_ZERO_ERROR;
32    fDStates            = new UVector(status);
33    if (U_FAILURE(*fStatus)) {
34        return;
35    }
36    if (U_FAILURE(status)) {
37        *fStatus = status;
38        return;
39    }
40    if (fDStates == NULL) {
41        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
42    }
43}
44
45
46
47RBBITableBuilder::~RBBITableBuilder() {
48    int i;
49    for (i=0; i<fDStates->size(); i++) {
50        delete (RBBIStateDescriptor *)fDStates->elementAt(i);
51    }
52    delete   fDStates;
53}
54
55
56//-----------------------------------------------------------------------------
57//
58//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
59//                               table from the RBBI rules parse tree.
60//
61//-----------------------------------------------------------------------------
62void  RBBITableBuilder::build() {
63
64    if (U_FAILURE(*fStatus)) {
65        return;
66    }
67
68    // If there were no rules, just return.  This situation can easily arise
69    //   for the reverse rules.
70    if (fTree==NULL) {
71        return;
72    }
73
74    //
75    // Walk through the tree, replacing any references to $variables with a copy of the
76    //   parse tree for the substition expression.
77    //
78    fTree = fTree->flattenVariables();
79#ifdef RBBI_DEBUG
80    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
81        RBBIDebugPuts("Parse tree after flattening variable references.");
82        fTree->printTree(TRUE);
83    }
84#endif
85
86    //
87    // If the rules contained any references to {bof}
88    //   add a {bof} <cat> <former root of tree> to the
89    //   tree.  Means that all matches must start out with the
90    //   {bof} fake character.
91    //
92    if (fRB->fSetBuilder->sawBOF()) {
93        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
94        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
95        // Delete and exit if memory allocation failed.
96        if (bofTop == NULL || bofLeaf == NULL) {
97            *fStatus = U_MEMORY_ALLOCATION_ERROR;
98            delete bofTop;
99            delete bofLeaf;
100            return;
101        }
102        bofTop->fLeftChild  = bofLeaf;
103        bofTop->fRightChild = fTree;
104        bofLeaf->fParent    = bofTop;
105        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
106        fTree               = bofTop;
107    }
108
109    //
110    // Add a unique right-end marker to the expression.
111    //   Appears as a cat-node, left child being the original tree,
112    //   right child being the end marker.
113    //
114    RBBINode *cn = new RBBINode(RBBINode::opCat);
115    // Exit if memory allocation failed.
116    if (cn == NULL) {
117        *fStatus = U_MEMORY_ALLOCATION_ERROR;
118        return;
119    }
120    cn->fLeftChild = fTree;
121    fTree->fParent = cn;
122    cn->fRightChild = new RBBINode(RBBINode::endMark);
123    // Delete and exit if memory allocation failed.
124    if (cn->fRightChild == NULL) {
125        *fStatus = U_MEMORY_ALLOCATION_ERROR;
126        delete cn;
127        return;
128    }
129    cn->fRightChild->fParent = cn;
130    fTree = cn;
131
132    //
133    //  Replace all references to UnicodeSets with the tree for the equivalent
134    //      expression.
135    //
136    fTree->flattenSets();
137#ifdef RBBI_DEBUG
138    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
139        RBBIDebugPuts("Parse tree after flattening Unicode Set references.");
140        fTree->printTree(TRUE);
141    }
142#endif
143
144
145    //
146    // calculate the functions nullable, firstpos, lastpos and followpos on
147    // nodes in the parse tree.
148    //    See the alogrithm description in Aho.
149    //    Understanding how this works by looking at the code alone will be
150    //       nearly impossible.
151    //
152    calcNullable(fTree);
153    calcFirstPos(fTree);
154    calcLastPos(fTree);
155    calcFollowPos(fTree);
156    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
157        RBBIDebugPuts("\n");
158        printPosSets(fTree);
159    }
160
161    //
162    //  For "chained" rules, modify the followPos sets
163    //
164    if (fRB->fChainRules) {
165        calcChainedFollowPos(fTree);
166    }
167
168    //
169    //  BOF (start of input) test fixup.
170    //
171    if (fRB->fSetBuilder->sawBOF()) {
172        bofFixup();
173    }
174
175    //
176    // Build the DFA state transition tables.
177    //
178    buildStateTable();
179    flagAcceptingStates();
180    flagLookAheadStates();
181    flagTaggedStates();
182
183    //
184    // Update the global table of rule status {tag} values
185    // The rule builder has a global vector of status values that are common
186    //    for all tables.  Merge the ones from this table into the global set.
187    //
188    mergeRuleStatusVals();
189
190    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
191}
192
193
194
195//-----------------------------------------------------------------------------
196//
197//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
198//
199//-----------------------------------------------------------------------------
200void RBBITableBuilder::calcNullable(RBBINode *n) {
201    if (n == NULL) {
202        return;
203    }
204    if (n->fType == RBBINode::setRef ||
205        n->fType == RBBINode::endMark ) {
206        // These are non-empty leaf node types.
207        n->fNullable = FALSE;
208        return;
209    }
210
211    if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
212        // Lookahead marker node.  It's a leaf, so no recursion on children.
213        // It's nullable because it does not match any literal text from the input stream.
214        n->fNullable = TRUE;
215        return;
216    }
217
218
219    // The node is not a leaf.
220    //  Calculate nullable on its children.
221    calcNullable(n->fLeftChild);
222    calcNullable(n->fRightChild);
223
224    // Apply functions from table 3.40 in Aho
225    if (n->fType == RBBINode::opOr) {
226        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
227    }
228    else if (n->fType == RBBINode::opCat) {
229        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
230    }
231    else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
232        n->fNullable = TRUE;
233    }
234    else {
235        n->fNullable = FALSE;
236    }
237}
238
239
240
241
242//-----------------------------------------------------------------------------
243//
244//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
245//
246//-----------------------------------------------------------------------------
247void RBBITableBuilder::calcFirstPos(RBBINode *n) {
248    if (n == NULL) {
249        return;
250    }
251    if (n->fType == RBBINode::leafChar  ||
252        n->fType == RBBINode::endMark   ||
253        n->fType == RBBINode::lookAhead ||
254        n->fType == RBBINode::tag) {
255        // These are non-empty leaf node types.
256        // Note: In order to maintain the sort invariant on the set,
257        // this function should only be called on a node whose set is
258        // empty to start with.
259        n->fFirstPosSet->addElement(n, *fStatus);
260        return;
261    }
262
263    // The node is not a leaf.
264    //  Calculate firstPos on its children.
265    calcFirstPos(n->fLeftChild);
266    calcFirstPos(n->fRightChild);
267
268    // Apply functions from table 3.40 in Aho
269    if (n->fType == RBBINode::opOr) {
270        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
271        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
272    }
273    else if (n->fType == RBBINode::opCat) {
274        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
275        if (n->fLeftChild->fNullable) {
276            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
277        }
278    }
279    else if (n->fType == RBBINode::opStar ||
280             n->fType == RBBINode::opQuestion ||
281             n->fType == RBBINode::opPlus) {
282        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
283    }
284}
285
286
287
288//-----------------------------------------------------------------------------
289//
290//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
291//
292//-----------------------------------------------------------------------------
293void RBBITableBuilder::calcLastPos(RBBINode *n) {
294    if (n == NULL) {
295        return;
296    }
297    if (n->fType == RBBINode::leafChar  ||
298        n->fType == RBBINode::endMark   ||
299        n->fType == RBBINode::lookAhead ||
300        n->fType == RBBINode::tag) {
301        // These are non-empty leaf node types.
302        // Note: In order to maintain the sort invariant on the set,
303        // this function should only be called on a node whose set is
304        // empty to start with.
305        n->fLastPosSet->addElement(n, *fStatus);
306        return;
307    }
308
309    // The node is not a leaf.
310    //  Calculate lastPos on its children.
311    calcLastPos(n->fLeftChild);
312    calcLastPos(n->fRightChild);
313
314    // Apply functions from table 3.40 in Aho
315    if (n->fType == RBBINode::opOr) {
316        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
317        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
318    }
319    else if (n->fType == RBBINode::opCat) {
320        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
321        if (n->fRightChild->fNullable) {
322            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
323        }
324    }
325    else if (n->fType == RBBINode::opStar     ||
326             n->fType == RBBINode::opQuestion ||
327             n->fType == RBBINode::opPlus) {
328        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
329    }
330}
331
332
333
334//-----------------------------------------------------------------------------
335//
336//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
337//
338//-----------------------------------------------------------------------------
339void RBBITableBuilder::calcFollowPos(RBBINode *n) {
340    if (n == NULL ||
341        n->fType == RBBINode::leafChar ||
342        n->fType == RBBINode::endMark) {
343        return;
344    }
345
346    calcFollowPos(n->fLeftChild);
347    calcFollowPos(n->fRightChild);
348
349    // Aho rule #1
350    if (n->fType == RBBINode::opCat) {
351        RBBINode *i;   // is 'i' in Aho's description
352        uint32_t     ix;
353
354        UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
355
356        for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
357            i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
358            setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
359        }
360    }
361
362    // Aho rule #2
363    if (n->fType == RBBINode::opStar ||
364        n->fType == RBBINode::opPlus) {
365        RBBINode   *i;  // again, n and i are the names from Aho's description.
366        uint32_t    ix;
367
368        for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
369            i = (RBBINode *)n->fLastPosSet->elementAt(ix);
370            setAdd(i->fFollowPos, n->fFirstPosSet);
371        }
372    }
373
374
375
376}
377
378
379//-----------------------------------------------------------------------------
380//
381//   calcChainedFollowPos.    Modify the previously calculated followPos sets
382//                            to implement rule chaining.  NOT described by Aho
383//
384//-----------------------------------------------------------------------------
385void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
386
387    UVector         endMarkerNodes(*fStatus);
388    UVector         leafNodes(*fStatus);
389    int32_t         i;
390
391    if (U_FAILURE(*fStatus)) {
392        return;
393    }
394
395    // get a list of all endmarker nodes.
396    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
397
398    // get a list all leaf nodes
399    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
400    if (U_FAILURE(*fStatus)) {
401        return;
402    }
403
404    // Get all nodes that can be the start a match, which is FirstPosition()
405    // of the portion of the tree corresponding to user-written rules.
406    // See the tree description in bofFixup().
407    RBBINode *userRuleRoot = tree;
408    if (fRB->fSetBuilder->sawBOF()) {
409        userRuleRoot = tree->fLeftChild->fRightChild;
410    }
411    U_ASSERT(userRuleRoot != NULL);
412    UVector *matchStartNodes = userRuleRoot->fFirstPosSet;
413
414
415    // Iteratate over all leaf nodes,
416    //
417    int32_t  endNodeIx;
418    int32_t  startNodeIx;
419
420    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
421        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
422        RBBINode *endNode = NULL;
423
424        // Identify leaf nodes that correspond to overall rule match positions.
425        //   These include an endMarkerNode in their followPos sets.
426        for (i=0; i<endMarkerNodes.size(); i++) {
427            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
428                endNode = tNode;
429                break;
430            }
431        }
432        if (endNode == NULL) {
433            // node wasn't an end node.  Try again with the next.
434            continue;
435        }
436
437        // We've got a node that can end a match.
438
439        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
440        //                            don't chain from it.
441        // TODO:  Add rule syntax for this behavior, get specifics out of here and
442        //        into the rule file.
443        if (fRB->fLBCMNoChain) {
444            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
445            if (c != -1) {
446                // c == -1 occurs with sets containing only the {eof} marker string.
447                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
448                if (cLBProp == U_LB_COMBINING_MARK) {
449                    continue;
450                }
451            }
452        }
453
454
455        // Now iterate over the nodes that can start a match, looking for ones
456        //   with the same char class as our ending node.
457        RBBINode *startNode;
458        for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
459            startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
460            if (startNode->fType != RBBINode::leafChar) {
461                continue;
462            }
463
464            if (endNode->fVal == startNode->fVal) {
465                // The end val (character class) of one possible match is the
466                //   same as the start of another.
467
468                // Add all nodes from the followPos of the start node to the
469                //  followPos set of the end node, which will have the effect of
470                //  letting matches transition from a match state at endNode
471                //  to the second char of a match starting with startNode.
472                setAdd(endNode->fFollowPos, startNode->fFollowPos);
473            }
474        }
475    }
476}
477
478
479//-----------------------------------------------------------------------------
480//
481//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
482//                Do an swizzle similar to chaining, modifying the followPos set of
483//                the bofNode to include the followPos nodes from other {bot} nodes
484//                scattered through the tree.
485//
486//                This function has much in common with calcChainedFollowPos().
487//
488//-----------------------------------------------------------------------------
489void RBBITableBuilder::bofFixup() {
490
491    if (U_FAILURE(*fStatus)) {
492        return;
493    }
494
495    //   The parse tree looks like this ...
496    //         fTree root  --->       <cat>
497    //                               /     \       .
498    //                            <cat>   <#end node>
499    //                           /     \  .
500    //                     <bofNode>   rest
501    //                               of tree
502    //
503    //    We will be adding things to the followPos set of the <bofNode>
504    //
505    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
506    U_ASSERT(bofNode->fType == RBBINode::leafChar);
507    U_ASSERT(bofNode->fVal == 2);
508
509    // Get all nodes that can be the start a match of the user-written rules
510    //  (excluding the fake bofNode)
511    //  We want the nodes that can start a match in the
512    //     part labeled "rest of tree"
513    //
514    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
515
516    RBBINode *startNode;
517    int       startNodeIx;
518    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
519        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
520        if (startNode->fType != RBBINode::leafChar) {
521            continue;
522        }
523
524        if (startNode->fVal == bofNode->fVal) {
525            //  We found a leaf node corresponding to a {bof} that was
526            //    explicitly written into a rule.
527            //  Add everything from the followPos set of this node to the
528            //    followPos set of the fake bofNode at the start of the tree.
529            //
530            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
531        }
532    }
533}
534
535//-----------------------------------------------------------------------------
536//
537//   buildStateTable()    Determine the set of runtime DFA states and the
538//                        transition tables for these states, by the algorithm
539//                        of fig. 3.44 in Aho.
540//
541//                        Most of the comments are quotes of Aho's psuedo-code.
542//
543//-----------------------------------------------------------------------------
544void RBBITableBuilder::buildStateTable() {
545    if (U_FAILURE(*fStatus)) {
546        return;
547    }
548    RBBIStateDescriptor *failState;
549    // Set it to NULL to avoid uninitialized warning
550    RBBIStateDescriptor *initialState = NULL;
551    //
552    // Add a dummy state 0 - the stop state.  Not from Aho.
553    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
554    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
555    if (failState == NULL) {
556        *fStatus = U_MEMORY_ALLOCATION_ERROR;
557        goto ExitBuildSTdeleteall;
558    }
559    failState->fPositions = new UVector(*fStatus);
560    if (failState->fPositions == NULL) {
561        *fStatus = U_MEMORY_ALLOCATION_ERROR;
562    }
563    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
564        goto ExitBuildSTdeleteall;
565    }
566    fDStates->addElement(failState, *fStatus);
567    if (U_FAILURE(*fStatus)) {
568        goto ExitBuildSTdeleteall;
569    }
570
571    // initially, the only unmarked state in Dstates is firstpos(root),
572    //       where toot is the root of the syntax tree for (r)#;
573    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
574    if (initialState == NULL) {
575        *fStatus = U_MEMORY_ALLOCATION_ERROR;
576    }
577    if (U_FAILURE(*fStatus)) {
578        goto ExitBuildSTdeleteall;
579    }
580    initialState->fPositions = new UVector(*fStatus);
581    if (initialState->fPositions == NULL) {
582        *fStatus = U_MEMORY_ALLOCATION_ERROR;
583    }
584    if (U_FAILURE(*fStatus)) {
585        goto ExitBuildSTdeleteall;
586    }
587    setAdd(initialState->fPositions, fTree->fFirstPosSet);
588    fDStates->addElement(initialState, *fStatus);
589    if (U_FAILURE(*fStatus)) {
590        goto ExitBuildSTdeleteall;
591    }
592
593    // while there is an unmarked state T in Dstates do begin
594    for (;;) {
595        RBBIStateDescriptor *T = NULL;
596        int32_t              tx;
597        for (tx=1; tx<fDStates->size(); tx++) {
598            RBBIStateDescriptor *temp;
599            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
600            if (temp->fMarked == FALSE) {
601                T = temp;
602                break;
603            }
604        }
605        if (T == NULL) {
606            break;
607        }
608
609        // mark T;
610        T->fMarked = TRUE;
611
612        // for each input symbol a do begin
613        int32_t  a;
614        for (a = 1; a<=lastInputSymbol; a++) {
615            // let U be the set of positions that are in followpos(p)
616            //    for some position p in T
617            //    such that the symbol at position p is a;
618            UVector    *U = NULL;
619            RBBINode   *p;
620            int32_t     px;
621            for (px=0; px<T->fPositions->size(); px++) {
622                p = (RBBINode *)T->fPositions->elementAt(px);
623                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
624                    if (U == NULL) {
625                        U = new UVector(*fStatus);
626                        if (U == NULL) {
627                        	*fStatus = U_MEMORY_ALLOCATION_ERROR;
628                        	goto ExitBuildSTdeleteall;
629                        }
630                    }
631                    setAdd(U, p->fFollowPos);
632                }
633            }
634
635            // if U is not empty and not in DStates then
636            int32_t  ux = 0;
637            UBool    UinDstates = FALSE;
638            if (U != NULL) {
639                U_ASSERT(U->size() > 0);
640                int  ix;
641                for (ix=0; ix<fDStates->size(); ix++) {
642                    RBBIStateDescriptor *temp2;
643                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
644                    if (setEquals(U, temp2->fPositions)) {
645                        delete U;
646                        U  = temp2->fPositions;
647                        ux = ix;
648                        UinDstates = TRUE;
649                        break;
650                    }
651                }
652
653                // Add U as an unmarked state to Dstates
654                if (!UinDstates)
655                {
656                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
657                    if (newState == NULL) {
658                    	*fStatus = U_MEMORY_ALLOCATION_ERROR;
659                    }
660                    if (U_FAILURE(*fStatus)) {
661                        goto ExitBuildSTdeleteall;
662                    }
663                    newState->fPositions = U;
664                    fDStates->addElement(newState, *fStatus);
665                    if (U_FAILURE(*fStatus)) {
666                        return;
667                    }
668                    ux = fDStates->size()-1;
669                }
670
671                // Dtran[T, a] := U;
672                T->fDtran->setElementAt(ux, a);
673            }
674        }
675    }
676    return;
677    // delete local pointers only if error occured.
678ExitBuildSTdeleteall:
679    delete initialState;
680    delete failState;
681}
682
683
684
685//-----------------------------------------------------------------------------
686//
687//   flagAcceptingStates    Identify accepting states.
688//                          First get a list of all of the end marker nodes.
689//                          Then, for each state s,
690//                              if s contains one of the end marker nodes in its list of tree positions then
691//                                  s is an accepting state.
692//
693//-----------------------------------------------------------------------------
694void     RBBITableBuilder::flagAcceptingStates() {
695    if (U_FAILURE(*fStatus)) {
696        return;
697    }
698    UVector     endMarkerNodes(*fStatus);
699    RBBINode    *endMarker;
700    int32_t     i;
701    int32_t     n;
702
703    if (U_FAILURE(*fStatus)) {
704        return;
705    }
706
707    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
708    if (U_FAILURE(*fStatus)) {
709        return;
710    }
711
712    for (i=0; i<endMarkerNodes.size(); i++) {
713        endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
714        for (n=0; n<fDStates->size(); n++) {
715            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
716            if (sd->fPositions->indexOf(endMarker) >= 0) {
717                // Any non-zero value for fAccepting means this is an accepting node.
718                // The value is what will be returned to the user as the break status.
719                // If no other value was specified, force it to -1.
720
721                if (sd->fAccepting==0) {
722                    // State hasn't been marked as accepting yet.  Do it now.
723                    sd->fAccepting = endMarker->fVal;
724                    if (sd->fAccepting == 0) {
725                        sd->fAccepting = -1;
726                    }
727                }
728                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
729                    // Both lookahead and non-lookahead accepting for this state.
730                    // Favor the look-ahead.  Expedient for line break.
731                    // TODO:  need a more elegant resolution for conflicting rules.
732                    sd->fAccepting = endMarker->fVal;
733                }
734                // implicit else:
735                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
736
737                // If the end marker node is from a look-ahead rule, set
738                //   the fLookAhead field or this state also.
739                if (endMarker->fLookAheadEnd) {
740                    // TODO:  don't change value if already set?
741                    // TODO:  allow for more than one active look-ahead rule in engine.
742                    //        Make value here an index to a side array in engine?
743                    sd->fLookAhead = sd->fAccepting;
744                }
745            }
746        }
747    }
748}
749
750
751//-----------------------------------------------------------------------------
752//
753//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
754//
755//-----------------------------------------------------------------------------
756void     RBBITableBuilder::flagLookAheadStates() {
757    if (U_FAILURE(*fStatus)) {
758        return;
759    }
760    UVector     lookAheadNodes(*fStatus);
761    RBBINode    *lookAheadNode;
762    int32_t     i;
763    int32_t     n;
764
765    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
766    if (U_FAILURE(*fStatus)) {
767        return;
768    }
769    for (i=0; i<lookAheadNodes.size(); i++) {
770        lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
771
772        for (n=0; n<fDStates->size(); n++) {
773            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
774            if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
775                sd->fLookAhead = lookAheadNode->fVal;
776            }
777        }
778    }
779}
780
781
782
783
784//-----------------------------------------------------------------------------
785//
786//    flagTaggedStates
787//
788//-----------------------------------------------------------------------------
789void     RBBITableBuilder::flagTaggedStates() {
790    if (U_FAILURE(*fStatus)) {
791        return;
792    }
793    UVector     tagNodes(*fStatus);
794    RBBINode    *tagNode;
795    int32_t     i;
796    int32_t     n;
797
798    if (U_FAILURE(*fStatus)) {
799        return;
800    }
801    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
802    if (U_FAILURE(*fStatus)) {
803        return;
804    }
805    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
806        tagNode = (RBBINode *)tagNodes.elementAt(i);
807
808        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
809            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
810            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
811                sortedAdd(&sd->fTagVals, tagNode->fVal);
812            }
813        }
814    }
815}
816
817
818
819
820//-----------------------------------------------------------------------------
821//
822//  mergeRuleStatusVals
823//
824//      Update the global table of rule status {tag} values
825//      The rule builder has a global vector of status values that are common
826//      for all tables.  Merge the ones from this table into the global set.
827//
828//-----------------------------------------------------------------------------
829void  RBBITableBuilder::mergeRuleStatusVals() {
830    //
831    //  The basic outline of what happens here is this...
832    //
833    //    for each state in this state table
834    //       if the status tag list for this state is in the global statuses list
835    //           record where and
836    //           continue with the next state
837    //       else
838    //           add the tag list for this state to the global list.
839    //
840    int i;
841    int n;
842
843    // Pre-set a single tag of {0} into the table.
844    //   We will need this as a default, for rule sets with no explicit tagging.
845    if (fRB->fRuleStatusVals->size() == 0) {
846        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
847        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
848    }
849
850    //    For each state
851    for (n=0; n<fDStates->size(); n++) {
852        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
853        UVector *thisStatesTagValues = sd->fTagVals;
854        if (thisStatesTagValues == NULL) {
855            // No tag values are explicitly associated with this state.
856            //   Set the default tag value.
857            sd->fTagsIdx = 0;
858            continue;
859        }
860
861        // There are tag(s) associated with this state.
862        //   fTagsIdx will be the index into the global tag list for this state's tag values.
863        //   Initial value of -1 flags that we haven't got it set yet.
864        sd->fTagsIdx = -1;
865        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
866        int32_t  nextTagGroupStart = 0;
867
868        // Loop runs once per group of tags in the global list
869        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
870            thisTagGroupStart = nextTagGroupStart;
871            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
872            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
873                // The number of tags for this state is different from
874                //    the number of tags in this group from the global list.
875                //    Continue with the next group from the global list.
876                continue;
877            }
878            // The lengths match, go ahead and compare the actual tag values
879            //    between this state and the group from the global list.
880            for (i=0; i<thisStatesTagValues->size(); i++) {
881                if (thisStatesTagValues->elementAti(i) !=
882                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
883                    // Mismatch.
884                    break;
885                }
886            }
887
888            if (i == thisStatesTagValues->size()) {
889                // We found a set of tag values in the global list that match
890                //   those for this state.  Use them.
891                sd->fTagsIdx = thisTagGroupStart;
892                break;
893            }
894        }
895
896        if (sd->fTagsIdx == -1) {
897            // No suitable entry in the global tag list already.  Add one
898            sd->fTagsIdx = fRB->fRuleStatusVals->size();
899            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
900            for (i=0; i<thisStatesTagValues->size(); i++) {
901                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
902            }
903        }
904    }
905}
906
907
908
909
910
911
912
913//-----------------------------------------------------------------------------
914//
915//  sortedAdd  Add a value to a vector of sorted values (ints).
916//             Do not replicate entries; if the value is already there, do not
917//                add a second one.
918//             Lazily create the vector if it does not already exist.
919//
920//-----------------------------------------------------------------------------
921void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
922    int32_t i;
923
924    if (*vector == NULL) {
925        *vector = new UVector(*fStatus);
926    }
927    if (*vector == NULL || U_FAILURE(*fStatus)) {
928        return;
929    }
930    UVector *vec = *vector;
931    int32_t  vSize = vec->size();
932    for (i=0; i<vSize; i++) {
933        int32_t valAtI = vec->elementAti(i);
934        if (valAtI == val) {
935            // The value is already in the vector.  Don't add it again.
936            return;
937        }
938        if (valAtI > val) {
939            break;
940        }
941    }
942    vec->insertElementAt(val, i, *fStatus);
943}
944
945
946
947//-----------------------------------------------------------------------------
948//
949//  setAdd     Set operation on UVector
950//             dest = dest union source
951//             Elements may only appear once and must be sorted.
952//
953//-----------------------------------------------------------------------------
954void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
955    int32_t destOriginalSize = dest->size();
956    int32_t sourceSize       = source->size();
957    int32_t di           = 0;
958    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
959    void **destPtr, **sourcePtr;
960    void **destLim, **sourceLim;
961
962    if (destOriginalSize > destArray.getCapacity()) {
963        if (destArray.resize(destOriginalSize) == NULL) {
964            return;
965        }
966    }
967    destPtr = destArray.getAlias();
968    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
969
970    if (sourceSize > sourceArray.getCapacity()) {
971        if (sourceArray.resize(sourceSize) == NULL) {
972            return;
973        }
974    }
975    sourcePtr = sourceArray.getAlias();
976    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
977
978    // Avoid multiple "get element" calls by getting the contents into arrays
979    (void) dest->toArray(destPtr);
980    (void) source->toArray(sourcePtr);
981
982    dest->setSize(sourceSize+destOriginalSize, *fStatus);
983
984    while (sourcePtr < sourceLim && destPtr < destLim) {
985        if (*destPtr == *sourcePtr) {
986            dest->setElementAt(*sourcePtr++, di++);
987            destPtr++;
988        }
989        // This check is required for machines with segmented memory, like i5/OS.
990        // Direct pointer comparison is not recommended.
991        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
992            dest->setElementAt(*destPtr++, di++);
993        }
994        else { /* *sourcePtr < *destPtr */
995            dest->setElementAt(*sourcePtr++, di++);
996        }
997    }
998
999    // At most one of these two cleanup loops will execute
1000    while (destPtr < destLim) {
1001        dest->setElementAt(*destPtr++, di++);
1002    }
1003    while (sourcePtr < sourceLim) {
1004        dest->setElementAt(*sourcePtr++, di++);
1005    }
1006
1007    dest->setSize(di, *fStatus);
1008}
1009
1010
1011
1012//-----------------------------------------------------------------------------
1013//
1014//  setEqual    Set operation on UVector.
1015//              Compare for equality.
1016//              Elements must be sorted.
1017//
1018//-----------------------------------------------------------------------------
1019UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1020    return a->equals(*b);
1021}
1022
1023
1024//-----------------------------------------------------------------------------
1025//
1026//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1027//                 for each node in the tree.
1028//
1029//-----------------------------------------------------------------------------
1030#ifdef RBBI_DEBUG
1031void RBBITableBuilder::printPosSets(RBBINode *n) {
1032    if (n==NULL) {
1033        return;
1034    }
1035    n->printNode();
1036    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1037
1038    RBBIDebugPrintf("         firstpos:  ");
1039    printSet(n->fFirstPosSet);
1040
1041    RBBIDebugPrintf("         lastpos:   ");
1042    printSet(n->fLastPosSet);
1043
1044    RBBIDebugPrintf("         followpos: ");
1045    printSet(n->fFollowPos);
1046
1047    printPosSets(n->fLeftChild);
1048    printPosSets(n->fRightChild);
1049}
1050#endif
1051
1052
1053
1054//-----------------------------------------------------------------------------
1055//
1056//   getTableSize()    Calculate the size of the runtime form of this
1057//                     state transition table.
1058//
1059//-----------------------------------------------------------------------------
1060int32_t  RBBITableBuilder::getTableSize() const {
1061    int32_t    size = 0;
1062    int32_t    numRows;
1063    int32_t    numCols;
1064    int32_t    rowSize;
1065
1066    if (fTree == NULL) {
1067        return 0;
1068    }
1069
1070    size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
1071
1072    numRows = fDStates->size();
1073    numCols = fRB->fSetBuilder->getNumCharCategories();
1074
1075    //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
1076    //        Therefore we subtract two from numCols when determining
1077    //        how much storage to add to a row for the total columns.
1078    rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1079    size   += numRows * rowSize;
1080    return size;
1081}
1082
1083
1084
1085//-----------------------------------------------------------------------------
1086//
1087//   exportTable()    export the state transition table in the format required
1088//                    by the runtime engine.  getTableSize() bytes of memory
1089//                    must be available at the output address "where".
1090//
1091//-----------------------------------------------------------------------------
1092void RBBITableBuilder::exportTable(void *where) {
1093    RBBIStateTable    *table = (RBBIStateTable *)where;
1094    uint32_t           state;
1095    int                col;
1096
1097    if (U_FAILURE(*fStatus) || fTree == NULL) {
1098        return;
1099    }
1100
1101    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1102        fDStates->size() > 0x7fff) {
1103        *fStatus = U_BRK_INTERNAL_ERROR;
1104        return;
1105    }
1106
1107    table->fRowLen    = sizeof(RBBIStateTableRow) +
1108                            sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1109    table->fNumStates = fDStates->size();
1110    table->fFlags     = 0;
1111    if (fRB->fLookAheadHardBreak) {
1112        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1113    }
1114    if (fRB->fSetBuilder->sawBOF()) {
1115        table->fFlags  |= RBBI_BOF_REQUIRED;
1116    }
1117    table->fReserved  = 0;
1118
1119    for (state=0; state<table->fNumStates; state++) {
1120        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1121        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1122        U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1123        U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1124        row->fAccepting = (int16_t)sd->fAccepting;
1125        row->fLookAhead = (int16_t)sd->fLookAhead;
1126        row->fTagIdx    = (int16_t)sd->fTagsIdx;
1127        for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1128            row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1129        }
1130    }
1131}
1132
1133
1134
1135//-----------------------------------------------------------------------------
1136//
1137//   printSet    Debug function.   Print the contents of a UVector
1138//
1139//-----------------------------------------------------------------------------
1140#ifdef RBBI_DEBUG
1141void RBBITableBuilder::printSet(UVector *s) {
1142    int32_t  i;
1143    for (i=0; i<s->size(); i++) {
1144        void *v = s->elementAt(i);
1145        RBBIDebugPrintf("%10p", v);
1146    }
1147    RBBIDebugPrintf("\n");
1148}
1149#endif
1150
1151
1152//-----------------------------------------------------------------------------
1153//
1154//   printStates    Debug Function.  Dump the fully constructed state transition table.
1155//
1156//-----------------------------------------------------------------------------
1157#ifdef RBBI_DEBUG
1158void RBBITableBuilder::printStates() {
1159    int     c;    // input "character"
1160    int     n;    // state number
1161
1162    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1163    RBBIDebugPrintf("      | Acc  LA    Tag");
1164    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1165        RBBIDebugPrintf(" %2d", c);
1166    }
1167    RBBIDebugPrintf("\n");
1168    RBBIDebugPrintf("      |---------------");
1169    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1170        RBBIDebugPrintf("---");
1171    }
1172    RBBIDebugPrintf("\n");
1173
1174    for (n=0; n<fDStates->size(); n++) {
1175        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1176        RBBIDebugPrintf("  %3d | " , n);
1177        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1178        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1179            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1180        }
1181        RBBIDebugPrintf("\n");
1182    }
1183    RBBIDebugPrintf("\n\n");
1184}
1185#endif
1186
1187
1188
1189//-----------------------------------------------------------------------------
1190//
1191//   printRuleStatusTable    Debug Function.  Dump the common rule status table
1192//
1193//-----------------------------------------------------------------------------
1194#ifdef RBBI_DEBUG
1195void RBBITableBuilder::printRuleStatusTable() {
1196    int32_t  thisRecord = 0;
1197    int32_t  nextRecord = 0;
1198    int      i;
1199    UVector  *tbl = fRB->fRuleStatusVals;
1200
1201    RBBIDebugPrintf("index |  tags \n");
1202    RBBIDebugPrintf("-------------------\n");
1203
1204    while (nextRecord < tbl->size()) {
1205        thisRecord = nextRecord;
1206        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1207        RBBIDebugPrintf("%4d   ", thisRecord);
1208        for (i=thisRecord+1; i<nextRecord; i++) {
1209            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1210        }
1211        RBBIDebugPrintf("\n");
1212    }
1213    RBBIDebugPrintf("\n\n");
1214}
1215#endif
1216
1217
1218//-----------------------------------------------------------------------------
1219//
1220//   RBBIStateDescriptor     Methods.  This is a very struct-like class
1221//                           Most access is directly to the fields.
1222//
1223//-----------------------------------------------------------------------------
1224
1225RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1226    fMarked    = FALSE;
1227    fAccepting = 0;
1228    fLookAhead = 0;
1229    fTagsIdx   = 0;
1230    fTagVals   = NULL;
1231    fPositions = NULL;
1232    fDtran     = NULL;
1233
1234    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
1235    if (U_FAILURE(*fStatus)) {
1236        return;
1237    }
1238    if (fDtran == NULL) {
1239        *fStatus = U_MEMORY_ALLOCATION_ERROR;
1240        return;
1241    }
1242    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
1243                                           //   It is indexed by input symbols, and will
1244                                           //   hold  the next state number for each
1245                                           //   symbol.
1246}
1247
1248
1249RBBIStateDescriptor::~RBBIStateDescriptor() {
1250    delete       fPositions;
1251    delete       fDtran;
1252    delete       fTagVals;
1253    fPositions = NULL;
1254    fDtran     = NULL;
1255    fTagVals   = NULL;
1256}
1257
1258U_NAMESPACE_END
1259
1260#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1261