1/*
2*******************************************************************************
3*   Copyright (C) 2010-2011, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  stringtriebuilder.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010dec24
12*   created by: Markus W. Scherer
13*/
14
15#include <typeinfo>  // for 'typeid' to work
16#include "unicode/utypes.h"
17#include "unicode/stringtriebuilder.h"
18#include "uassert.h"
19#include "uhash.h"
20
21U_CDECL_BEGIN
22
23static int32_t U_CALLCONV
24hashStringTrieNode(const UHashTok key) {
25    return U_NAMESPACE_QUALIFIER StringTrieBuilder::hashNode(key.pointer);
26}
27
28static UBool U_CALLCONV
29equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
30    return U_NAMESPACE_QUALIFIER StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
31}
32
33U_CDECL_END
34
35U_NAMESPACE_BEGIN
36
37StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
38
39StringTrieBuilder::~StringTrieBuilder() {
40    deleteCompactBuilder();
41}
42
43void
44StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
45    if(U_FAILURE(errorCode)) {
46        return;
47    }
48    nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
49                         sizeGuess, &errorCode);
50    if(U_SUCCESS(errorCode) && nodes==NULL) {
51        errorCode=U_MEMORY_ALLOCATION_ERROR;
52    }
53    if(U_SUCCESS(errorCode)) {
54        uhash_setKeyDeleter(nodes, uhash_deleteUObject);
55    }
56}
57
58void
59StringTrieBuilder::deleteCompactBuilder() {
60    uhash_close(nodes);
61    nodes=NULL;
62}
63
64void
65StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
66                       UErrorCode &errorCode) {
67    if(buildOption==USTRINGTRIE_BUILD_FAST) {
68        writeNode(0, elementsLength, 0);
69    } else /* USTRINGTRIE_BUILD_SMALL */ {
70        createCompactBuilder(2*elementsLength, errorCode);
71        Node *root=makeNode(0, elementsLength, 0, errorCode);
72        if(U_SUCCESS(errorCode)) {
73            root->markRightEdgesFirst(-1);
74            root->write(*this);
75        }
76        deleteCompactBuilder();
77    }
78}
79
80// Requires start<limit,
81// and all strings of the [start..limit[ elements must be sorted and
82// have a common prefix of length unitIndex.
83int32_t
84StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
85    UBool hasValue=FALSE;
86    int32_t value=0;
87    int32_t type;
88    if(unitIndex==getElementStringLength(start)) {
89        // An intermediate or final value.
90        value=getElementValue(start++);
91        if(start==limit) {
92            return writeValueAndFinal(value, TRUE);  // final-value node
93        }
94        hasValue=TRUE;
95    }
96    // Now all [start..limit[ strings are longer than unitIndex.
97    int32_t minUnit=getElementUnit(start, unitIndex);
98    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
99    if(minUnit==maxUnit) {
100        // Linear-match node: All strings have the same character at unitIndex.
101        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
102        writeNode(start, limit, lastUnitIndex);
103        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
104        int32_t length=lastUnitIndex-unitIndex;
105        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
106        while(length>maxLinearMatchLength) {
107            lastUnitIndex-=maxLinearMatchLength;
108            length-=maxLinearMatchLength;
109            writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
110            write(getMinLinearMatch()+maxLinearMatchLength-1);
111        }
112        writeElementUnits(start, unitIndex, length);
113        type=getMinLinearMatch()+length-1;
114    } else {
115        // Branch node.
116        int32_t length=countElementUnits(start, limit, unitIndex);
117        // length>=2 because minUnit!=maxUnit.
118        writeBranchSubNode(start, limit, unitIndex, length);
119        if(--length<getMinLinearMatch()) {
120            type=length;
121        } else {
122            write(length);
123            type=0;
124        }
125    }
126    return writeValueAndType(hasValue, value, type);
127}
128
129// start<limit && all strings longer than unitIndex &&
130// length different units at unitIndex
131int32_t
132StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
133    UChar middleUnits[kMaxSplitBranchLevels];
134    int32_t lessThan[kMaxSplitBranchLevels];
135    int32_t ltLength=0;
136    while(length>getMaxBranchLinearSubNodeLength()) {
137        // Branch on the middle unit.
138        // First, find the middle unit.
139        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
140        // Encode the less-than branch first.
141        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
142        lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
143        ++ltLength;
144        // Continue for the greater-or-equal branch.
145        start=i;
146        length=length-length/2;
147    }
148    // For each unit, find its elements array start and whether it has a final value.
149    int32_t starts[kMaxBranchLinearSubNodeLength];
150    UBool isFinal[kMaxBranchLinearSubNodeLength-1];
151    int32_t unitNumber=0;
152    do {
153        int32_t i=starts[unitNumber]=start;
154        UChar unit=getElementUnit(i++, unitIndex);
155        i=indexOfElementWithNextUnit(i, unitIndex, unit);
156        isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
157        start=i;
158    } while(++unitNumber<length-1);
159    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
160    starts[unitNumber]=start;
161
162    // Write the sub-nodes in reverse order: The jump lengths are deltas from
163    // after their own positions, so if we wrote the minUnit sub-node first,
164    // then its jump delta would be larger.
165    // Instead we write the minUnit sub-node last, for a shorter delta.
166    int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
167    do {
168        --unitNumber;
169        if(!isFinal[unitNumber]) {
170            jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
171        }
172    } while(unitNumber>0);
173    // The maxUnit sub-node is written as the very last one because we do
174    // not jump for it at all.
175    unitNumber=length-1;
176    writeNode(start, limit, unitIndex+1);
177    int32_t offset=write(getElementUnit(start, unitIndex));
178    // Write the rest of this node's unit-value pairs.
179    while(--unitNumber>=0) {
180        start=starts[unitNumber];
181        int32_t value;
182        if(isFinal[unitNumber]) {
183            // Write the final value for the one string ending with this unit.
184            value=getElementValue(start);
185        } else {
186            // Write the delta to the start position of the sub-node.
187            value=offset-jumpTargets[unitNumber];
188        }
189        writeValueAndFinal(value, isFinal[unitNumber]);
190        offset=write(getElementUnit(start, unitIndex));
191    }
192    // Write the split-branch nodes.
193    while(ltLength>0) {
194        --ltLength;
195        writeDeltaTo(lessThan[ltLength]);
196        offset=write(middleUnits[ltLength]);
197    }
198    return offset;
199}
200
201// Requires start<limit,
202// and all strings of the [start..limit[ elements must be sorted and
203// have a common prefix of length unitIndex.
204StringTrieBuilder::Node *
205StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
206    if(U_FAILURE(errorCode)) {
207        return NULL;
208    }
209    UBool hasValue=FALSE;
210    int32_t value=0;
211    if(unitIndex==getElementStringLength(start)) {
212        // An intermediate or final value.
213        value=getElementValue(start++);
214        if(start==limit) {
215            return registerFinalValue(value, errorCode);
216        }
217        hasValue=TRUE;
218    }
219    Node *node;
220    // Now all [start..limit[ strings are longer than unitIndex.
221    int32_t minUnit=getElementUnit(start, unitIndex);
222    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
223    if(minUnit==maxUnit) {
224        // Linear-match node: All strings have the same character at unitIndex.
225        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
226        Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
227        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
228        int32_t length=lastUnitIndex-unitIndex;
229        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
230        while(length>maxLinearMatchLength) {
231            lastUnitIndex-=maxLinearMatchLength;
232            length-=maxLinearMatchLength;
233            node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
234            nextNode=registerNode(node, errorCode);
235        }
236        node=createLinearMatchNode(start, unitIndex, length, nextNode);
237    } else {
238        // Branch node.
239        int32_t length=countElementUnits(start, limit, unitIndex);
240        // length>=2 because minUnit!=maxUnit.
241        Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
242        node=new BranchHeadNode(length, subNode);
243    }
244    if(hasValue && node!=NULL) {
245        if(matchNodesCanHaveValues()) {
246            ((ValueNode *)node)->setValue(value);
247        } else {
248            node=new IntermediateValueNode(value, registerNode(node, errorCode));
249        }
250    }
251    return registerNode(node, errorCode);
252}
253
254// start<limit && all strings longer than unitIndex &&
255// length different units at unitIndex
256StringTrieBuilder::Node *
257StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
258                                   int32_t length, UErrorCode &errorCode) {
259    if(U_FAILURE(errorCode)) {
260        return NULL;
261    }
262    UChar middleUnits[kMaxSplitBranchLevels];
263    Node *lessThan[kMaxSplitBranchLevels];
264    int32_t ltLength=0;
265    while(length>getMaxBranchLinearSubNodeLength()) {
266        // Branch on the middle unit.
267        // First, find the middle unit.
268        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
269        // Create the less-than branch.
270        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
271        lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
272        ++ltLength;
273        // Continue for the greater-or-equal branch.
274        start=i;
275        length=length-length/2;
276    }
277    if(U_FAILURE(errorCode)) {
278        return NULL;
279    }
280    ListBranchNode *listNode=new ListBranchNode();
281    if(listNode==NULL) {
282        errorCode=U_MEMORY_ALLOCATION_ERROR;
283        return NULL;
284    }
285    // For each unit, find its elements array start and whether it has a final value.
286    int32_t unitNumber=0;
287    do {
288        int32_t i=start;
289        UChar unit=getElementUnit(i++, unitIndex);
290        i=indexOfElementWithNextUnit(i, unitIndex, unit);
291        if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
292            listNode->add(unit, getElementValue(start));
293        } else {
294            listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
295        }
296        start=i;
297    } while(++unitNumber<length-1);
298    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
299    UChar unit=getElementUnit(start, unitIndex);
300    if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
301        listNode->add(unit, getElementValue(start));
302    } else {
303        listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
304    }
305    Node *node=registerNode(listNode, errorCode);
306    // Create the split-branch nodes.
307    while(ltLength>0) {
308        --ltLength;
309        node=registerNode(
310            new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
311    }
312    return node;
313}
314
315StringTrieBuilder::Node *
316StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
317    if(U_FAILURE(errorCode)) {
318        delete newNode;
319        return NULL;
320    }
321    if(newNode==NULL) {
322        errorCode=U_MEMORY_ALLOCATION_ERROR;
323        return NULL;
324    }
325    const UHashElement *old=uhash_find(nodes, newNode);
326    if(old!=NULL) {
327        delete newNode;
328        return (Node *)old->key.pointer;
329    }
330    // If uhash_puti() returns a non-zero value from an equivalent, previously
331    // registered node, then uhash_find() failed to find that and we will leak newNode.
332#if !U_RELEASE
333    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
334#endif
335    uhash_puti(nodes, newNode, 1, &errorCode);
336    U_ASSERT(oldValue==0);
337    if(U_FAILURE(errorCode)) {
338        delete newNode;
339        return NULL;
340    }
341    return newNode;
342}
343
344StringTrieBuilder::Node *
345StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
346    if(U_FAILURE(errorCode)) {
347        return NULL;
348    }
349    FinalValueNode key(value);
350    const UHashElement *old=uhash_find(nodes, &key);
351    if(old!=NULL) {
352        return (Node *)old->key.pointer;
353    }
354    Node *newNode=new FinalValueNode(value);
355    if(newNode==NULL) {
356        errorCode=U_MEMORY_ALLOCATION_ERROR;
357        return NULL;
358    }
359    // If uhash_puti() returns a non-zero value from an equivalent, previously
360    // registered node, then uhash_find() failed to find that and we will leak newNode.
361#if !U_RELEASE
362    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
363#endif
364    uhash_puti(nodes, newNode, 1, &errorCode);
365    U_ASSERT(oldValue==0);
366    if(U_FAILURE(errorCode)) {
367        delete newNode;
368        return NULL;
369    }
370    return newNode;
371}
372
373UBool
374StringTrieBuilder::hashNode(const void *node) {
375    return ((const Node *)node)->hashCode();
376}
377
378UBool
379StringTrieBuilder::equalNodes(const void *left, const void *right) {
380    return *(const Node *)left==*(const Node *)right;
381}
382
383UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder)
384
385UBool
386StringTrieBuilder::Node::operator==(const Node &other) const {
387    return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
388}
389
390int32_t
391StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
392    if(offset==0) {
393        offset=edgeNumber;
394    }
395    return edgeNumber;
396}
397
398UOBJECT_DEFINE_NO_RTTI_IMPLEMENTATION(StringTrieBuilder::Node)
399
400UBool
401StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
402    if(this==&other) {
403        return TRUE;
404    }
405    if(!Node::operator==(other)) {
406        return FALSE;
407    }
408    const FinalValueNode &o=(const FinalValueNode &)other;
409    return value==o.value;
410}
411
412void
413StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
414    offset=builder.writeValueAndFinal(value, TRUE);
415}
416
417UBool
418StringTrieBuilder::ValueNode::operator==(const Node &other) const {
419    if(this==&other) {
420        return TRUE;
421    }
422    if(!Node::operator==(other)) {
423        return FALSE;
424    }
425    const ValueNode &o=(const ValueNode &)other;
426    return hasValue==o.hasValue && (!hasValue || value==o.value);
427}
428
429UBool
430StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
431    if(this==&other) {
432        return TRUE;
433    }
434    if(!ValueNode::operator==(other)) {
435        return FALSE;
436    }
437    const IntermediateValueNode &o=(const IntermediateValueNode &)other;
438    return next==o.next;
439}
440
441int32_t
442StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
443    if(offset==0) {
444        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
445    }
446    return edgeNumber;
447}
448
449void
450StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
451    next->write(builder);
452    offset=builder.writeValueAndFinal(value, FALSE);
453}
454
455UBool
456StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
457    if(this==&other) {
458        return TRUE;
459    }
460    if(!ValueNode::operator==(other)) {
461        return FALSE;
462    }
463    const LinearMatchNode &o=(const LinearMatchNode &)other;
464    return length==o.length && next==o.next;
465}
466
467int32_t
468StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
469    if(offset==0) {
470        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
471    }
472    return edgeNumber;
473}
474
475UBool
476StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
477    if(this==&other) {
478        return TRUE;
479    }
480    if(!Node::operator==(other)) {
481        return FALSE;
482    }
483    const ListBranchNode &o=(const ListBranchNode &)other;
484    for(int32_t i=0; i<length; ++i) {
485        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
486            return FALSE;
487        }
488    }
489    return TRUE;
490}
491
492int32_t
493StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
494    if(offset==0) {
495        firstEdgeNumber=edgeNumber;
496        int32_t step=0;
497        int32_t i=length;
498        do {
499            Node *edge=equal[--i];
500            if(edge!=NULL) {
501                edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
502            }
503            // For all but the rightmost edge, decrement the edge number.
504            step=1;
505        } while(i>0);
506        offset=edgeNumber;
507    }
508    return edgeNumber;
509}
510
511void
512StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
513    // Write the sub-nodes in reverse order: The jump lengths are deltas from
514    // after their own positions, so if we wrote the minUnit sub-node first,
515    // then its jump delta would be larger.
516    // Instead we write the minUnit sub-node last, for a shorter delta.
517    int32_t unitNumber=length-1;
518    Node *rightEdge=equal[unitNumber];
519    int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
520    do {
521        --unitNumber;
522        if(equal[unitNumber]!=NULL) {
523            equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
524        }
525    } while(unitNumber>0);
526    // The maxUnit sub-node is written as the very last one because we do
527    // not jump for it at all.
528    unitNumber=length-1;
529    if(rightEdge==NULL) {
530        builder.writeValueAndFinal(values[unitNumber], TRUE);
531    } else {
532        rightEdge->write(builder);
533    }
534    offset=builder.write(units[unitNumber]);
535    // Write the rest of this node's unit-value pairs.
536    while(--unitNumber>=0) {
537        int32_t value;
538        UBool isFinal;
539        if(equal[unitNumber]==NULL) {
540            // Write the final value for the one string ending with this unit.
541            value=values[unitNumber];
542            isFinal=TRUE;
543        } else {
544            // Write the delta to the start position of the sub-node.
545            U_ASSERT(equal[unitNumber]->getOffset()>0);
546            value=offset-equal[unitNumber]->getOffset();
547            isFinal=FALSE;
548        }
549        builder.writeValueAndFinal(value, isFinal);
550        offset=builder.write(units[unitNumber]);
551    }
552}
553
554UBool
555StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
556    if(this==&other) {
557        return TRUE;
558    }
559    if(!Node::operator==(other)) {
560        return FALSE;
561    }
562    const SplitBranchNode &o=(const SplitBranchNode &)other;
563    return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
564}
565
566int32_t
567StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
568    if(offset==0) {
569        firstEdgeNumber=edgeNumber;
570        edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
571        offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
572    }
573    return edgeNumber;
574}
575
576void
577StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
578    // Encode the less-than branch first.
579    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
580    // Encode the greater-or-equal branch last because we do not jump for it at all.
581    greaterOrEqual->write(builder);
582    // Write this node.
583    U_ASSERT(lessThan->getOffset()>0);
584    builder.writeDeltaTo(lessThan->getOffset());  // less-than
585    offset=builder.write(unit);
586}
587
588UBool
589StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
590    if(this==&other) {
591        return TRUE;
592    }
593    if(!ValueNode::operator==(other)) {
594        return FALSE;
595    }
596    const BranchHeadNode &o=(const BranchHeadNode &)other;
597    return length==o.length && next==o.next;
598}
599
600int32_t
601StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
602    if(offset==0) {
603        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
604    }
605    return edgeNumber;
606}
607
608void
609StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
610    next->write(builder);
611    if(length<=builder.getMinLinearMatch()) {
612        offset=builder.writeValueAndType(hasValue, value, length-1);
613    } else {
614        builder.write(length-1);
615        offset=builder.writeValueAndType(hasValue, value, 0);
616    }
617}
618
619U_NAMESPACE_END
620