1// Copyright (C) 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4**********************************************************************
5*   Copyright (c) 2002-2016, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7**********************************************************************
8*/
9//
10//  rbbitblb.cpp
11//
12
13
14#include "unicode/utypes.h"
15
16#if !UCONFIG_NO_BREAK_ITERATION
17
18#include "unicode/unistr.h"
19#include "rbbitblb.h"
20#include "rbbirb.h"
21#include "rbbisetb.h"
22#include "rbbidata.h"
23#include "cstring.h"
24#include "uassert.h"
25#include "cmemory.h"
26
27U_NAMESPACE_BEGIN
28
29RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode) :
30 fTree(*rootNode) {
31    fRB                 = rb;
32    fStatus             = fRB->fStatus;
33    UErrorCode status   = U_ZERO_ERROR;
34    fDStates            = new UVector(status);
35    if (U_FAILURE(*fStatus)) {
36        return;
37    }
38    if (U_FAILURE(status)) {
39        *fStatus = status;
40        return;
41    }
42    if (fDStates == NULL) {
43        *fStatus = U_MEMORY_ALLOCATION_ERROR;;
44    }
45}
46
47
48
49RBBITableBuilder::~RBBITableBuilder() {
50    int i;
51    for (i=0; i<fDStates->size(); i++) {
52        delete (RBBIStateDescriptor *)fDStates->elementAt(i);
53    }
54    delete   fDStates;
55}
56
57
58//-----------------------------------------------------------------------------
59//
60//   RBBITableBuilder::build  -  This is the main function for building the DFA state transtion
61//                               table from the RBBI rules parse tree.
62//
63//-----------------------------------------------------------------------------
64void  RBBITableBuilder::build() {
65
66    if (U_FAILURE(*fStatus)) {
67        return;
68    }
69
70    // If there were no rules, just return.  This situation can easily arise
71    //   for the reverse rules.
72    if (fTree==NULL) {
73        return;
74    }
75
76    //
77    // Walk through the tree, replacing any references to $variables with a copy of the
78    //   parse tree for the substition expression.
79    //
80    fTree = fTree->flattenVariables();
81#ifdef RBBI_DEBUG
82    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83        RBBIDebugPuts("\nParse tree after flattening variable references.");
84        RBBINode::printTree(fTree, TRUE);
85    }
86#endif
87
88    //
89    // If the rules contained any references to {bof}
90    //   add a {bof} <cat> <former root of tree> to the
91    //   tree.  Means that all matches must start out with the
92    //   {bof} fake character.
93    //
94    if (fRB->fSetBuilder->sawBOF()) {
95        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
96        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
97        // Delete and exit if memory allocation failed.
98        if (bofTop == NULL || bofLeaf == NULL) {
99            *fStatus = U_MEMORY_ALLOCATION_ERROR;
100            delete bofTop;
101            delete bofLeaf;
102            return;
103        }
104        bofTop->fLeftChild  = bofLeaf;
105        bofTop->fRightChild = fTree;
106        bofLeaf->fParent    = bofTop;
107        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
108        fTree               = bofTop;
109    }
110
111    //
112    // Add a unique right-end marker to the expression.
113    //   Appears as a cat-node, left child being the original tree,
114    //   right child being the end marker.
115    //
116    RBBINode *cn = new RBBINode(RBBINode::opCat);
117    // Exit if memory allocation failed.
118    if (cn == NULL) {
119        *fStatus = U_MEMORY_ALLOCATION_ERROR;
120        return;
121    }
122    cn->fLeftChild = fTree;
123    fTree->fParent = cn;
124    cn->fRightChild = new RBBINode(RBBINode::endMark);
125    // Delete and exit if memory allocation failed.
126    if (cn->fRightChild == NULL) {
127        *fStatus = U_MEMORY_ALLOCATION_ERROR;
128        delete cn;
129        return;
130    }
131    cn->fRightChild->fParent = cn;
132    fTree = cn;
133
134    //
135    //  Replace all references to UnicodeSets with the tree for the equivalent
136    //      expression.
137    //
138    fTree->flattenSets();
139#ifdef RBBI_DEBUG
140    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141        RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142        RBBINode::printTree(fTree, TRUE);
143    }
144#endif
145
146
147    //
148    // calculate the functions nullable, firstpos, lastpos and followpos on
149    // nodes in the parse tree.
150    //    See the alogrithm description in Aho.
151    //    Understanding how this works by looking at the code alone will be
152    //       nearly impossible.
153    //
154    calcNullable(fTree);
155    calcFirstPos(fTree);
156    calcLastPos(fTree);
157    calcFollowPos(fTree);
158    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159        RBBIDebugPuts("\n");
160        printPosSets(fTree);
161    }
162
163    //
164    //  For "chained" rules, modify the followPos sets
165    //
166    if (fRB->fChainRules) {
167        calcChainedFollowPos(fTree);
168    }
169
170    //
171    //  BOF (start of input) test fixup.
172    //
173    if (fRB->fSetBuilder->sawBOF()) {
174        bofFixup();
175    }
176
177    //
178    // Build the DFA state transition tables.
179    //
180    buildStateTable();
181    flagAcceptingStates();
182    flagLookAheadStates();
183    flagTaggedStates();
184
185    //
186    // Update the global table of rule status {tag} values
187    // The rule builder has a global vector of status values that are common
188    //    for all tables.  Merge the ones from this table into the global set.
189    //
190    mergeRuleStatusVals();
191
192    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "states")) {printStates();};
193}
194
195
196
197//-----------------------------------------------------------------------------
198//
199//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
200//
201//-----------------------------------------------------------------------------
202void RBBITableBuilder::calcNullable(RBBINode *n) {
203    if (n == NULL) {
204        return;
205    }
206    if (n->fType == RBBINode::setRef ||
207        n->fType == RBBINode::endMark ) {
208        // These are non-empty leaf node types.
209        n->fNullable = FALSE;
210        return;
211    }
212
213    if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
214        // Lookahead marker node.  It's a leaf, so no recursion on children.
215        // It's nullable because it does not match any literal text from the input stream.
216        n->fNullable = TRUE;
217        return;
218    }
219
220
221    // The node is not a leaf.
222    //  Calculate nullable on its children.
223    calcNullable(n->fLeftChild);
224    calcNullable(n->fRightChild);
225
226    // Apply functions from table 3.40 in Aho
227    if (n->fType == RBBINode::opOr) {
228        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
229    }
230    else if (n->fType == RBBINode::opCat) {
231        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
232    }
233    else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
234        n->fNullable = TRUE;
235    }
236    else {
237        n->fNullable = FALSE;
238    }
239}
240
241
242
243
244//-----------------------------------------------------------------------------
245//
246//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
247//
248//-----------------------------------------------------------------------------
249void RBBITableBuilder::calcFirstPos(RBBINode *n) {
250    if (n == NULL) {
251        return;
252    }
253    if (n->fType == RBBINode::leafChar  ||
254        n->fType == RBBINode::endMark   ||
255        n->fType == RBBINode::lookAhead ||
256        n->fType == RBBINode::tag) {
257        // These are non-empty leaf node types.
258        // Note: In order to maintain the sort invariant on the set,
259        // this function should only be called on a node whose set is
260        // empty to start with.
261        n->fFirstPosSet->addElement(n, *fStatus);
262        return;
263    }
264
265    // The node is not a leaf.
266    //  Calculate firstPos on its children.
267    calcFirstPos(n->fLeftChild);
268    calcFirstPos(n->fRightChild);
269
270    // Apply functions from table 3.40 in Aho
271    if (n->fType == RBBINode::opOr) {
272        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
273        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
274    }
275    else if (n->fType == RBBINode::opCat) {
276        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
277        if (n->fLeftChild->fNullable) {
278            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
279        }
280    }
281    else if (n->fType == RBBINode::opStar ||
282             n->fType == RBBINode::opQuestion ||
283             n->fType == RBBINode::opPlus) {
284        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
285    }
286}
287
288
289
290//-----------------------------------------------------------------------------
291//
292//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
293//
294//-----------------------------------------------------------------------------
295void RBBITableBuilder::calcLastPos(RBBINode *n) {
296    if (n == NULL) {
297        return;
298    }
299    if (n->fType == RBBINode::leafChar  ||
300        n->fType == RBBINode::endMark   ||
301        n->fType == RBBINode::lookAhead ||
302        n->fType == RBBINode::tag) {
303        // These are non-empty leaf node types.
304        // Note: In order to maintain the sort invariant on the set,
305        // this function should only be called on a node whose set is
306        // empty to start with.
307        n->fLastPosSet->addElement(n, *fStatus);
308        return;
309    }
310
311    // The node is not a leaf.
312    //  Calculate lastPos on its children.
313    calcLastPos(n->fLeftChild);
314    calcLastPos(n->fRightChild);
315
316    // Apply functions from table 3.40 in Aho
317    if (n->fType == RBBINode::opOr) {
318        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
319        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
320    }
321    else if (n->fType == RBBINode::opCat) {
322        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
323        if (n->fRightChild->fNullable) {
324            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
325        }
326    }
327    else if (n->fType == RBBINode::opStar     ||
328             n->fType == RBBINode::opQuestion ||
329             n->fType == RBBINode::opPlus) {
330        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
331    }
332}
333
334
335
336//-----------------------------------------------------------------------------
337//
338//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
339//
340//-----------------------------------------------------------------------------
341void RBBITableBuilder::calcFollowPos(RBBINode *n) {
342    if (n == NULL ||
343        n->fType == RBBINode::leafChar ||
344        n->fType == RBBINode::endMark) {
345        return;
346    }
347
348    calcFollowPos(n->fLeftChild);
349    calcFollowPos(n->fRightChild);
350
351    // Aho rule #1
352    if (n->fType == RBBINode::opCat) {
353        RBBINode *i;   // is 'i' in Aho's description
354        uint32_t     ix;
355
356        UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
357
358        for (ix=0; ix<(uint32_t)LastPosOfLeftChild->size(); ix++) {
359            i = (RBBINode *)LastPosOfLeftChild->elementAt(ix);
360            setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
361        }
362    }
363
364    // Aho rule #2
365    if (n->fType == RBBINode::opStar ||
366        n->fType == RBBINode::opPlus) {
367        RBBINode   *i;  // again, n and i are the names from Aho's description.
368        uint32_t    ix;
369
370        for (ix=0; ix<(uint32_t)n->fLastPosSet->size(); ix++) {
371            i = (RBBINode *)n->fLastPosSet->elementAt(ix);
372            setAdd(i->fFollowPos, n->fFirstPosSet);
373        }
374    }
375
376
377
378}
379
380//-----------------------------------------------------------------------------
381//
382//    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
383//                        as roots of a rule to a destination vector.
384//
385//-----------------------------------------------------------------------------
386void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
387    if (node == NULL || U_FAILURE(*fStatus)) {
388        return;
389    }
390    if (node->fRuleRoot) {
391        dest->addElement(node, *fStatus);
392        // Note: rules cannot nest. If we found a rule start node,
393        //       no child node can also be a start node.
394        return;
395    }
396    addRuleRootNodes(dest, node->fLeftChild);
397    addRuleRootNodes(dest, node->fRightChild);
398}
399
400//-----------------------------------------------------------------------------
401//
402//   calcChainedFollowPos.    Modify the previously calculated followPos sets
403//                            to implement rule chaining.  NOT described by Aho
404//
405//-----------------------------------------------------------------------------
406void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {
407
408    UVector         endMarkerNodes(*fStatus);
409    UVector         leafNodes(*fStatus);
410    int32_t         i;
411
412    if (U_FAILURE(*fStatus)) {
413        return;
414    }
415
416    // get a list of all endmarker nodes.
417    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
418
419    // get a list all leaf nodes
420    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
421    if (U_FAILURE(*fStatus)) {
422        return;
423    }
424
425    // Collect all leaf nodes that can start matches for rules
426    // with inbound chaining enabled, which is the union of the
427    // firstPosition sets from each of the rule root nodes.
428
429    UVector ruleRootNodes(*fStatus);
430    addRuleRootNodes(&ruleRootNodes, tree);
431
432    UVector matchStartNodes(*fStatus);
433    for (int i=0; i<ruleRootNodes.size(); ++i) {
434        RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(i));
435        if (node->fChainIn) {
436            setAdd(&matchStartNodes, node->fFirstPosSet);
437        }
438    }
439    if (U_FAILURE(*fStatus)) {
440        return;
441    }
442
443    int32_t  endNodeIx;
444    int32_t  startNodeIx;
445
446    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
447        RBBINode *tNode   = (RBBINode *)leafNodes.elementAt(endNodeIx);
448        RBBINode *endNode = NULL;
449
450        // Identify leaf nodes that correspond to overall rule match positions.
451        //   These include an endMarkerNode in their followPos sets.
452        for (i=0; i<endMarkerNodes.size(); i++) {
453            if (tNode->fFollowPos->contains(endMarkerNodes.elementAt(i))) {
454                endNode = tNode;
455                break;
456            }
457        }
458        if (endNode == NULL) {
459            // node wasn't an end node.  Try again with the next.
460            continue;
461        }
462
463        // We've got a node that can end a match.
464
465        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
466        //                            don't chain from it.
467        // TODO:  Add rule syntax for this behavior, get specifics out of here and
468        //        into the rule file.
469        if (fRB->fLBCMNoChain) {
470            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
471            if (c != -1) {
472                // c == -1 occurs with sets containing only the {eof} marker string.
473                ULineBreak cLBProp = (ULineBreak)u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
474                if (cLBProp == U_LB_COMBINING_MARK) {
475                    continue;
476                }
477            }
478        }
479
480
481        // Now iterate over the nodes that can start a match, looking for ones
482        //   with the same char class as our ending node.
483        RBBINode *startNode;
484        for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
485            startNode = (RBBINode *)matchStartNodes.elementAt(startNodeIx);
486            if (startNode->fType != RBBINode::leafChar) {
487                continue;
488            }
489
490            if (endNode->fVal == startNode->fVal) {
491                // The end val (character class) of one possible match is the
492                //   same as the start of another.
493
494                // Add all nodes from the followPos of the start node to the
495                //  followPos set of the end node, which will have the effect of
496                //  letting matches transition from a match state at endNode
497                //  to the second char of a match starting with startNode.
498                setAdd(endNode->fFollowPos, startNode->fFollowPos);
499            }
500        }
501    }
502}
503
504
505//-----------------------------------------------------------------------------
506//
507//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
508//                Do an swizzle similar to chaining, modifying the followPos set of
509//                the bofNode to include the followPos nodes from other {bot} nodes
510//                scattered through the tree.
511//
512//                This function has much in common with calcChainedFollowPos().
513//
514//-----------------------------------------------------------------------------
515void RBBITableBuilder::bofFixup() {
516
517    if (U_FAILURE(*fStatus)) {
518        return;
519    }
520
521    //   The parse tree looks like this ...
522    //         fTree root  --->       <cat>
523    //                               /     \       .
524    //                            <cat>   <#end node>
525    //                           /     \  .
526    //                     <bofNode>   rest
527    //                               of tree
528    //
529    //    We will be adding things to the followPos set of the <bofNode>
530    //
531    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
532    U_ASSERT(bofNode->fType == RBBINode::leafChar);
533    U_ASSERT(bofNode->fVal == 2);
534
535    // Get all nodes that can be the start a match of the user-written rules
536    //  (excluding the fake bofNode)
537    //  We want the nodes that can start a match in the
538    //     part labeled "rest of tree"
539    //
540    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
541
542    RBBINode *startNode;
543    int       startNodeIx;
544    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
545        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
546        if (startNode->fType != RBBINode::leafChar) {
547            continue;
548        }
549
550        if (startNode->fVal == bofNode->fVal) {
551            //  We found a leaf node corresponding to a {bof} that was
552            //    explicitly written into a rule.
553            //  Add everything from the followPos set of this node to the
554            //    followPos set of the fake bofNode at the start of the tree.
555            //
556            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
557        }
558    }
559}
560
561//-----------------------------------------------------------------------------
562//
563//   buildStateTable()    Determine the set of runtime DFA states and the
564//                        transition tables for these states, by the algorithm
565//                        of fig. 3.44 in Aho.
566//
567//                        Most of the comments are quotes of Aho's psuedo-code.
568//
569//-----------------------------------------------------------------------------
570void RBBITableBuilder::buildStateTable() {
571    if (U_FAILURE(*fStatus)) {
572        return;
573    }
574    RBBIStateDescriptor *failState;
575    // Set it to NULL to avoid uninitialized warning
576    RBBIStateDescriptor *initialState = NULL;
577    //
578    // Add a dummy state 0 - the stop state.  Not from Aho.
579    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
580    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
581    if (failState == NULL) {
582        *fStatus = U_MEMORY_ALLOCATION_ERROR;
583        goto ExitBuildSTdeleteall;
584    }
585    failState->fPositions = new UVector(*fStatus);
586    if (failState->fPositions == NULL) {
587        *fStatus = U_MEMORY_ALLOCATION_ERROR;
588    }
589    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
590        goto ExitBuildSTdeleteall;
591    }
592    fDStates->addElement(failState, *fStatus);
593    if (U_FAILURE(*fStatus)) {
594        goto ExitBuildSTdeleteall;
595    }
596
597    // initially, the only unmarked state in Dstates is firstpos(root),
598    //       where toot is the root of the syntax tree for (r)#;
599    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
600    if (initialState == NULL) {
601        *fStatus = U_MEMORY_ALLOCATION_ERROR;
602    }
603    if (U_FAILURE(*fStatus)) {
604        goto ExitBuildSTdeleteall;
605    }
606    initialState->fPositions = new UVector(*fStatus);
607    if (initialState->fPositions == NULL) {
608        *fStatus = U_MEMORY_ALLOCATION_ERROR;
609    }
610    if (U_FAILURE(*fStatus)) {
611        goto ExitBuildSTdeleteall;
612    }
613    setAdd(initialState->fPositions, fTree->fFirstPosSet);
614    fDStates->addElement(initialState, *fStatus);
615    if (U_FAILURE(*fStatus)) {
616        goto ExitBuildSTdeleteall;
617    }
618
619    // while there is an unmarked state T in Dstates do begin
620    for (;;) {
621        RBBIStateDescriptor *T = NULL;
622        int32_t              tx;
623        for (tx=1; tx<fDStates->size(); tx++) {
624            RBBIStateDescriptor *temp;
625            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
626            if (temp->fMarked == FALSE) {
627                T = temp;
628                break;
629            }
630        }
631        if (T == NULL) {
632            break;
633        }
634
635        // mark T;
636        T->fMarked = TRUE;
637
638        // for each input symbol a do begin
639        int32_t  a;
640        for (a = 1; a<=lastInputSymbol; a++) {
641            // let U be the set of positions that are in followpos(p)
642            //    for some position p in T
643            //    such that the symbol at position p is a;
644            UVector    *U = NULL;
645            RBBINode   *p;
646            int32_t     px;
647            for (px=0; px<T->fPositions->size(); px++) {
648                p = (RBBINode *)T->fPositions->elementAt(px);
649                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
650                    if (U == NULL) {
651                        U = new UVector(*fStatus);
652                        if (U == NULL) {
653                        	*fStatus = U_MEMORY_ALLOCATION_ERROR;
654                        	goto ExitBuildSTdeleteall;
655                        }
656                    }
657                    setAdd(U, p->fFollowPos);
658                }
659            }
660
661            // if U is not empty and not in DStates then
662            int32_t  ux = 0;
663            UBool    UinDstates = FALSE;
664            if (U != NULL) {
665                U_ASSERT(U->size() > 0);
666                int  ix;
667                for (ix=0; ix<fDStates->size(); ix++) {
668                    RBBIStateDescriptor *temp2;
669                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
670                    if (setEquals(U, temp2->fPositions)) {
671                        delete U;
672                        U  = temp2->fPositions;
673                        ux = ix;
674                        UinDstates = TRUE;
675                        break;
676                    }
677                }
678
679                // Add U as an unmarked state to Dstates
680                if (!UinDstates)
681                {
682                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
683                    if (newState == NULL) {
684                    	*fStatus = U_MEMORY_ALLOCATION_ERROR;
685                    }
686                    if (U_FAILURE(*fStatus)) {
687                        goto ExitBuildSTdeleteall;
688                    }
689                    newState->fPositions = U;
690                    fDStates->addElement(newState, *fStatus);
691                    if (U_FAILURE(*fStatus)) {
692                        return;
693                    }
694                    ux = fDStates->size()-1;
695                }
696
697                // Dtran[T, a] := U;
698                T->fDtran->setElementAt(ux, a);
699            }
700        }
701    }
702    return;
703    // delete local pointers only if error occured.
704ExitBuildSTdeleteall:
705    delete initialState;
706    delete failState;
707}
708
709
710
711//-----------------------------------------------------------------------------
712//
713//   flagAcceptingStates    Identify accepting states.
714//                          First get a list of all of the end marker nodes.
715//                          Then, for each state s,
716//                              if s contains one of the end marker nodes in its list of tree positions then
717//                                  s is an accepting state.
718//
719//-----------------------------------------------------------------------------
720void     RBBITableBuilder::flagAcceptingStates() {
721    if (U_FAILURE(*fStatus)) {
722        return;
723    }
724    UVector     endMarkerNodes(*fStatus);
725    RBBINode    *endMarker;
726    int32_t     i;
727    int32_t     n;
728
729    if (U_FAILURE(*fStatus)) {
730        return;
731    }
732
733    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
734    if (U_FAILURE(*fStatus)) {
735        return;
736    }
737
738    for (i=0; i<endMarkerNodes.size(); i++) {
739        endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
740        for (n=0; n<fDStates->size(); n++) {
741            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
742            if (sd->fPositions->indexOf(endMarker) >= 0) {
743                // Any non-zero value for fAccepting means this is an accepting node.
744                // The value is what will be returned to the user as the break status.
745                // If no other value was specified, force it to -1.
746
747                if (sd->fAccepting==0) {
748                    // State hasn't been marked as accepting yet.  Do it now.
749                    sd->fAccepting = endMarker->fVal;
750                    if (sd->fAccepting == 0) {
751                        sd->fAccepting = -1;
752                    }
753                }
754                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
755                    // Both lookahead and non-lookahead accepting for this state.
756                    // Favor the look-ahead.  Expedient for line break.
757                    // TODO:  need a more elegant resolution for conflicting rules.
758                    sd->fAccepting = endMarker->fVal;
759                }
760                // implicit else:
761                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
762
763                // If the end marker node is from a look-ahead rule, set
764                //   the fLookAhead field or this state also.
765                if (endMarker->fLookAheadEnd) {
766                    // TODO:  don't change value if already set?
767                    // TODO:  allow for more than one active look-ahead rule in engine.
768                    //        Make value here an index to a side array in engine?
769                    sd->fLookAhead = sd->fAccepting;
770                }
771            }
772        }
773    }
774}
775
776
777//-----------------------------------------------------------------------------
778//
779//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
780//
781//-----------------------------------------------------------------------------
782void     RBBITableBuilder::flagLookAheadStates() {
783    if (U_FAILURE(*fStatus)) {
784        return;
785    }
786    UVector     lookAheadNodes(*fStatus);
787    RBBINode    *lookAheadNode;
788    int32_t     i;
789    int32_t     n;
790
791    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
792    if (U_FAILURE(*fStatus)) {
793        return;
794    }
795    for (i=0; i<lookAheadNodes.size(); i++) {
796        lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
797
798        for (n=0; n<fDStates->size(); n++) {
799            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
800            if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
801                sd->fLookAhead = lookAheadNode->fVal;
802            }
803        }
804    }
805}
806
807
808
809
810//-----------------------------------------------------------------------------
811//
812//    flagTaggedStates
813//
814//-----------------------------------------------------------------------------
815void     RBBITableBuilder::flagTaggedStates() {
816    if (U_FAILURE(*fStatus)) {
817        return;
818    }
819    UVector     tagNodes(*fStatus);
820    RBBINode    *tagNode;
821    int32_t     i;
822    int32_t     n;
823
824    if (U_FAILURE(*fStatus)) {
825        return;
826    }
827    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
828    if (U_FAILURE(*fStatus)) {
829        return;
830    }
831    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
832        tagNode = (RBBINode *)tagNodes.elementAt(i);
833
834        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
835            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
836            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
837                sortedAdd(&sd->fTagVals, tagNode->fVal);
838            }
839        }
840    }
841}
842
843
844
845
846//-----------------------------------------------------------------------------
847//
848//  mergeRuleStatusVals
849//
850//      Update the global table of rule status {tag} values
851//      The rule builder has a global vector of status values that are common
852//      for all tables.  Merge the ones from this table into the global set.
853//
854//-----------------------------------------------------------------------------
855void  RBBITableBuilder::mergeRuleStatusVals() {
856    //
857    //  The basic outline of what happens here is this...
858    //
859    //    for each state in this state table
860    //       if the status tag list for this state is in the global statuses list
861    //           record where and
862    //           continue with the next state
863    //       else
864    //           add the tag list for this state to the global list.
865    //
866    int i;
867    int n;
868
869    // Pre-set a single tag of {0} into the table.
870    //   We will need this as a default, for rule sets with no explicit tagging.
871    if (fRB->fRuleStatusVals->size() == 0) {
872        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
873        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
874    }
875
876    //    For each state
877    for (n=0; n<fDStates->size(); n++) {
878        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
879        UVector *thisStatesTagValues = sd->fTagVals;
880        if (thisStatesTagValues == NULL) {
881            // No tag values are explicitly associated with this state.
882            //   Set the default tag value.
883            sd->fTagsIdx = 0;
884            continue;
885        }
886
887        // There are tag(s) associated with this state.
888        //   fTagsIdx will be the index into the global tag list for this state's tag values.
889        //   Initial value of -1 flags that we haven't got it set yet.
890        sd->fTagsIdx = -1;
891        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
892        int32_t  nextTagGroupStart = 0;
893
894        // Loop runs once per group of tags in the global list
895        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
896            thisTagGroupStart = nextTagGroupStart;
897            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
898            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
899                // The number of tags for this state is different from
900                //    the number of tags in this group from the global list.
901                //    Continue with the next group from the global list.
902                continue;
903            }
904            // The lengths match, go ahead and compare the actual tag values
905            //    between this state and the group from the global list.
906            for (i=0; i<thisStatesTagValues->size(); i++) {
907                if (thisStatesTagValues->elementAti(i) !=
908                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
909                    // Mismatch.
910                    break;
911                }
912            }
913
914            if (i == thisStatesTagValues->size()) {
915                // We found a set of tag values in the global list that match
916                //   those for this state.  Use them.
917                sd->fTagsIdx = thisTagGroupStart;
918                break;
919            }
920        }
921
922        if (sd->fTagsIdx == -1) {
923            // No suitable entry in the global tag list already.  Add one
924            sd->fTagsIdx = fRB->fRuleStatusVals->size();
925            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
926            for (i=0; i<thisStatesTagValues->size(); i++) {
927                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
928            }
929        }
930    }
931}
932
933
934
935
936
937
938
939//-----------------------------------------------------------------------------
940//
941//  sortedAdd  Add a value to a vector of sorted values (ints).
942//             Do not replicate entries; if the value is already there, do not
943//                add a second one.
944//             Lazily create the vector if it does not already exist.
945//
946//-----------------------------------------------------------------------------
947void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
948    int32_t i;
949
950    if (*vector == NULL) {
951        *vector = new UVector(*fStatus);
952    }
953    if (*vector == NULL || U_FAILURE(*fStatus)) {
954        return;
955    }
956    UVector *vec = *vector;
957    int32_t  vSize = vec->size();
958    for (i=0; i<vSize; i++) {
959        int32_t valAtI = vec->elementAti(i);
960        if (valAtI == val) {
961            // The value is already in the vector.  Don't add it again.
962            return;
963        }
964        if (valAtI > val) {
965            break;
966        }
967    }
968    vec->insertElementAt(val, i, *fStatus);
969}
970
971
972
973//-----------------------------------------------------------------------------
974//
975//  setAdd     Set operation on UVector
976//             dest = dest union source
977//             Elements may only appear once and must be sorted.
978//
979//-----------------------------------------------------------------------------
980void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
981    int32_t destOriginalSize = dest->size();
982    int32_t sourceSize       = source->size();
983    int32_t di           = 0;
984    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
985    void **destPtr, **sourcePtr;
986    void **destLim, **sourceLim;
987
988    if (destOriginalSize > destArray.getCapacity()) {
989        if (destArray.resize(destOriginalSize) == NULL) {
990            return;
991        }
992    }
993    destPtr = destArray.getAlias();
994    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
995
996    if (sourceSize > sourceArray.getCapacity()) {
997        if (sourceArray.resize(sourceSize) == NULL) {
998            return;
999        }
1000    }
1001    sourcePtr = sourceArray.getAlias();
1002    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1003
1004    // Avoid multiple "get element" calls by getting the contents into arrays
1005    (void) dest->toArray(destPtr);
1006    (void) source->toArray(sourcePtr);
1007
1008    dest->setSize(sourceSize+destOriginalSize, *fStatus);
1009
1010    while (sourcePtr < sourceLim && destPtr < destLim) {
1011        if (*destPtr == *sourcePtr) {
1012            dest->setElementAt(*sourcePtr++, di++);
1013            destPtr++;
1014        }
1015        // This check is required for machines with segmented memory, like i5/OS.
1016        // Direct pointer comparison is not recommended.
1017        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1018            dest->setElementAt(*destPtr++, di++);
1019        }
1020        else { /* *sourcePtr < *destPtr */
1021            dest->setElementAt(*sourcePtr++, di++);
1022        }
1023    }
1024
1025    // At most one of these two cleanup loops will execute
1026    while (destPtr < destLim) {
1027        dest->setElementAt(*destPtr++, di++);
1028    }
1029    while (sourcePtr < sourceLim) {
1030        dest->setElementAt(*sourcePtr++, di++);
1031    }
1032
1033    dest->setSize(di, *fStatus);
1034}
1035
1036
1037
1038//-----------------------------------------------------------------------------
1039//
1040//  setEqual    Set operation on UVector.
1041//              Compare for equality.
1042//              Elements must be sorted.
1043//
1044//-----------------------------------------------------------------------------
1045UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1046    return a->equals(*b);
1047}
1048
1049
1050//-----------------------------------------------------------------------------
1051//
1052//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1053//                 for each node in the tree.
1054//
1055//-----------------------------------------------------------------------------
1056#ifdef RBBI_DEBUG
1057void RBBITableBuilder::printPosSets(RBBINode *n) {
1058    if (n==NULL) {
1059        return;
1060    }
1061    printf("\n");
1062    RBBINode::printNodeHeader();
1063    RBBINode::printNode(n);
1064    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"TRUE":"FALSE");
1065
1066    RBBIDebugPrintf("         firstpos:  ");
1067    printSet(n->fFirstPosSet);
1068
1069    RBBIDebugPrintf("         lastpos:   ");
1070    printSet(n->fLastPosSet);
1071
1072    RBBIDebugPrintf("         followpos: ");
1073    printSet(n->fFollowPos);
1074
1075    printPosSets(n->fLeftChild);
1076    printPosSets(n->fRightChild);
1077}
1078#endif
1079
1080
1081
1082//-----------------------------------------------------------------------------
1083//
1084//   getTableSize()    Calculate the size of the runtime form of this
1085//                     state transition table.
1086//
1087//-----------------------------------------------------------------------------
1088int32_t  RBBITableBuilder::getTableSize() const {
1089    int32_t    size = 0;
1090    int32_t    numRows;
1091    int32_t    numCols;
1092    int32_t    rowSize;
1093
1094    if (fTree == NULL) {
1095        return 0;
1096    }
1097
1098    size    = sizeof(RBBIStateTable) - 4;    // The header, with no rows to the table.
1099
1100    numRows = fDStates->size();
1101    numCols = fRB->fSetBuilder->getNumCharCategories();
1102
1103    //  Note  The declaration of RBBIStateTableRow is for a table of two columns.
1104    //        Therefore we subtract two from numCols when determining
1105    //        how much storage to add to a row for the total columns.
1106    rowSize = sizeof(RBBIStateTableRow) + sizeof(uint16_t)*(numCols-2);
1107    size   += numRows * rowSize;
1108    return size;
1109}
1110
1111
1112
1113//-----------------------------------------------------------------------------
1114//
1115//   exportTable()    export the state transition table in the format required
1116//                    by the runtime engine.  getTableSize() bytes of memory
1117//                    must be available at the output address "where".
1118//
1119//-----------------------------------------------------------------------------
1120void RBBITableBuilder::exportTable(void *where) {
1121    RBBIStateTable    *table = (RBBIStateTable *)where;
1122    uint32_t           state;
1123    int                col;
1124
1125    if (U_FAILURE(*fStatus) || fTree == NULL) {
1126        return;
1127    }
1128
1129    if (fRB->fSetBuilder->getNumCharCategories() > 0x7fff ||
1130        fDStates->size() > 0x7fff) {
1131        *fStatus = U_BRK_INTERNAL_ERROR;
1132        return;
1133    }
1134
1135    table->fRowLen    = sizeof(RBBIStateTableRow) +
1136                            sizeof(uint16_t) * (fRB->fSetBuilder->getNumCharCategories() - 2);
1137    table->fNumStates = fDStates->size();
1138    table->fFlags     = 0;
1139    if (fRB->fLookAheadHardBreak) {
1140        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1141    }
1142    if (fRB->fSetBuilder->sawBOF()) {
1143        table->fFlags  |= RBBI_BOF_REQUIRED;
1144    }
1145    table->fReserved  = 0;
1146
1147    for (state=0; state<table->fNumStates; state++) {
1148        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1149        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1150        U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1151        U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1152        row->fAccepting = (int16_t)sd->fAccepting;
1153        row->fLookAhead = (int16_t)sd->fLookAhead;
1154        row->fTagIdx    = (int16_t)sd->fTagsIdx;
1155        for (col=0; col<fRB->fSetBuilder->getNumCharCategories(); col++) {
1156            row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1157        }
1158    }
1159}
1160
1161
1162
1163//-----------------------------------------------------------------------------
1164//
1165//   printSet    Debug function.   Print the contents of a UVector
1166//
1167//-----------------------------------------------------------------------------
1168#ifdef RBBI_DEBUG
1169void RBBITableBuilder::printSet(UVector *s) {
1170    int32_t  i;
1171    for (i=0; i<s->size(); i++) {
1172        const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1173        RBBIDebugPrintf("%5d", v==NULL? -1 : v->fSerialNum);
1174    }
1175    RBBIDebugPrintf("\n");
1176}
1177#endif
1178
1179
1180//-----------------------------------------------------------------------------
1181//
1182//   printStates    Debug Function.  Dump the fully constructed state transition table.
1183//
1184//-----------------------------------------------------------------------------
1185#ifdef RBBI_DEBUG
1186void RBBITableBuilder::printStates() {
1187    int     c;    // input "character"
1188    int     n;    // state number
1189
1190    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1191    RBBIDebugPrintf("      | Acc  LA    Tag");
1192    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1193        RBBIDebugPrintf(" %2d", c);
1194    }
1195    RBBIDebugPrintf("\n");
1196    RBBIDebugPrintf("      |---------------");
1197    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1198        RBBIDebugPrintf("---");
1199    }
1200    RBBIDebugPrintf("\n");
1201
1202    for (n=0; n<fDStates->size(); n++) {
1203        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1204        RBBIDebugPrintf("  %3d | " , n);
1205        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1206        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1207            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1208        }
1209        RBBIDebugPrintf("\n");
1210    }
1211    RBBIDebugPrintf("\n\n");
1212}
1213#endif
1214
1215
1216
1217//-----------------------------------------------------------------------------
1218//
1219//   printRuleStatusTable    Debug Function.  Dump the common rule status table
1220//
1221//-----------------------------------------------------------------------------
1222#ifdef RBBI_DEBUG
1223void RBBITableBuilder::printRuleStatusTable() {
1224    int32_t  thisRecord = 0;
1225    int32_t  nextRecord = 0;
1226    int      i;
1227    UVector  *tbl = fRB->fRuleStatusVals;
1228
1229    RBBIDebugPrintf("index |  tags \n");
1230    RBBIDebugPrintf("-------------------\n");
1231
1232    while (nextRecord < tbl->size()) {
1233        thisRecord = nextRecord;
1234        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1235        RBBIDebugPrintf("%4d   ", thisRecord);
1236        for (i=thisRecord+1; i<nextRecord; i++) {
1237            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1238        }
1239        RBBIDebugPrintf("\n");
1240    }
1241    RBBIDebugPrintf("\n\n");
1242}
1243#endif
1244
1245
1246//-----------------------------------------------------------------------------
1247//
1248//   RBBIStateDescriptor     Methods.  This is a very struct-like class
1249//                           Most access is directly to the fields.
1250//
1251//-----------------------------------------------------------------------------
1252
1253RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1254    fMarked    = FALSE;
1255    fAccepting = 0;
1256    fLookAhead = 0;
1257    fTagsIdx   = 0;
1258    fTagVals   = NULL;
1259    fPositions = NULL;
1260    fDtran     = NULL;
1261
1262    fDtran     = new UVector(lastInputSymbol+1, *fStatus);
1263    if (U_FAILURE(*fStatus)) {
1264        return;
1265    }
1266    if (fDtran == NULL) {
1267        *fStatus = U_MEMORY_ALLOCATION_ERROR;
1268        return;
1269    }
1270    fDtran->setSize(lastInputSymbol+1, *fStatus);    // fDtran needs to be pre-sized.
1271                                           //   It is indexed by input symbols, and will
1272                                           //   hold  the next state number for each
1273                                           //   symbol.
1274}
1275
1276
1277RBBIStateDescriptor::~RBBIStateDescriptor() {
1278    delete       fPositions;
1279    delete       fDtran;
1280    delete       fTagVals;
1281    fPositions = NULL;
1282    fDtran     = NULL;
1283    fTagVals   = NULL;
1284}
1285
1286U_NAMESPACE_END
1287
1288#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1289