1/*
2*******************************************************************************
3*   Copyright (C) 2010-2012, International Business Machines
4*   Corporation and others.  All Rights Reserved.
5*******************************************************************************
6*   file name:  stringtriebuilder.cpp
7*   encoding:   US-ASCII
8*   tab size:   8 (not used)
9*   indentation:4
10*
11*   created on: 2010dec24
12*   created by: Markus W. Scherer
13*/
14
15#include "utypeinfo.h"  // for 'typeid' to work
16#include "unicode/utypes.h"
17#include "unicode/stringtriebuilder.h"
18#include "uassert.h"
19#include "uhash.h"
20
21U_CDECL_BEGIN
22
23static int32_t U_CALLCONV
24hashStringTrieNode(const UHashTok key) {
25    return icu::StringTrieBuilder::hashNode(key.pointer);
26}
27
28static UBool U_CALLCONV
29equalStringTrieNodes(const UHashTok key1, const UHashTok key2) {
30    return icu::StringTrieBuilder::equalNodes(key1.pointer, key2.pointer);
31}
32
33U_CDECL_END
34
35U_NAMESPACE_BEGIN
36
37StringTrieBuilder::StringTrieBuilder() : nodes(NULL) {}
38
39StringTrieBuilder::~StringTrieBuilder() {
40    deleteCompactBuilder();
41}
42
43void
44StringTrieBuilder::createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode) {
45    if(U_FAILURE(errorCode)) {
46        return;
47    }
48    nodes=uhash_openSize(hashStringTrieNode, equalStringTrieNodes, NULL,
49                         sizeGuess, &errorCode);
50    if(U_SUCCESS(errorCode)) {
51        if(nodes==NULL) {
52          errorCode=U_MEMORY_ALLOCATION_ERROR;
53        } else {
54          uhash_setKeyDeleter(nodes, uprv_deleteUObject);
55        }
56    }
57}
58
59void
60StringTrieBuilder::deleteCompactBuilder() {
61    uhash_close(nodes);
62    nodes=NULL;
63}
64
65void
66StringTrieBuilder::build(UStringTrieBuildOption buildOption, int32_t elementsLength,
67                       UErrorCode &errorCode) {
68    if(buildOption==USTRINGTRIE_BUILD_FAST) {
69        writeNode(0, elementsLength, 0);
70    } else /* USTRINGTRIE_BUILD_SMALL */ {
71        createCompactBuilder(2*elementsLength, errorCode);
72        Node *root=makeNode(0, elementsLength, 0, errorCode);
73        if(U_SUCCESS(errorCode)) {
74            root->markRightEdgesFirst(-1);
75            root->write(*this);
76        }
77        deleteCompactBuilder();
78    }
79}
80
81// Requires start<limit,
82// and all strings of the [start..limit[ elements must be sorted and
83// have a common prefix of length unitIndex.
84int32_t
85StringTrieBuilder::writeNode(int32_t start, int32_t limit, int32_t unitIndex) {
86    UBool hasValue=FALSE;
87    int32_t value=0;
88    int32_t type;
89    if(unitIndex==getElementStringLength(start)) {
90        // An intermediate or final value.
91        value=getElementValue(start++);
92        if(start==limit) {
93            return writeValueAndFinal(value, TRUE);  // final-value node
94        }
95        hasValue=TRUE;
96    }
97    // Now all [start..limit[ strings are longer than unitIndex.
98    int32_t minUnit=getElementUnit(start, unitIndex);
99    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
100    if(minUnit==maxUnit) {
101        // Linear-match node: All strings have the same character at unitIndex.
102        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
103        writeNode(start, limit, lastUnitIndex);
104        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
105        int32_t length=lastUnitIndex-unitIndex;
106        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
107        while(length>maxLinearMatchLength) {
108            lastUnitIndex-=maxLinearMatchLength;
109            length-=maxLinearMatchLength;
110            writeElementUnits(start, lastUnitIndex, maxLinearMatchLength);
111            write(getMinLinearMatch()+maxLinearMatchLength-1);
112        }
113        writeElementUnits(start, unitIndex, length);
114        type=getMinLinearMatch()+length-1;
115    } else {
116        // Branch node.
117        int32_t length=countElementUnits(start, limit, unitIndex);
118        // length>=2 because minUnit!=maxUnit.
119        writeBranchSubNode(start, limit, unitIndex, length);
120        if(--length<getMinLinearMatch()) {
121            type=length;
122        } else {
123            write(length);
124            type=0;
125        }
126    }
127    return writeValueAndType(hasValue, value, type);
128}
129
130// start<limit && all strings longer than unitIndex &&
131// length different units at unitIndex
132int32_t
133StringTrieBuilder::writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length) {
134    UChar middleUnits[kMaxSplitBranchLevels];
135    int32_t lessThan[kMaxSplitBranchLevels];
136    int32_t ltLength=0;
137    while(length>getMaxBranchLinearSubNodeLength()) {
138        // Branch on the middle unit.
139        // First, find the middle unit.
140        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
141        // Encode the less-than branch first.
142        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
143        lessThan[ltLength]=writeBranchSubNode(start, i, unitIndex, length/2);
144        ++ltLength;
145        // Continue for the greater-or-equal branch.
146        start=i;
147        length=length-length/2;
148    }
149    // For each unit, find its elements array start and whether it has a final value.
150    int32_t starts[kMaxBranchLinearSubNodeLength];
151    UBool isFinal[kMaxBranchLinearSubNodeLength-1];
152    int32_t unitNumber=0;
153    do {
154        int32_t i=starts[unitNumber]=start;
155        UChar unit=getElementUnit(i++, unitIndex);
156        i=indexOfElementWithNextUnit(i, unitIndex, unit);
157        isFinal[unitNumber]= start==i-1 && unitIndex+1==getElementStringLength(start);
158        start=i;
159    } while(++unitNumber<length-1);
160    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
161    starts[unitNumber]=start;
162
163    // Write the sub-nodes in reverse order: The jump lengths are deltas from
164    // after their own positions, so if we wrote the minUnit sub-node first,
165    // then its jump delta would be larger.
166    // Instead we write the minUnit sub-node last, for a shorter delta.
167    int32_t jumpTargets[kMaxBranchLinearSubNodeLength-1];
168    do {
169        --unitNumber;
170        if(!isFinal[unitNumber]) {
171            jumpTargets[unitNumber]=writeNode(starts[unitNumber], starts[unitNumber+1], unitIndex+1);
172        }
173    } while(unitNumber>0);
174    // The maxUnit sub-node is written as the very last one because we do
175    // not jump for it at all.
176    unitNumber=length-1;
177    writeNode(start, limit, unitIndex+1);
178    int32_t offset=write(getElementUnit(start, unitIndex));
179    // Write the rest of this node's unit-value pairs.
180    while(--unitNumber>=0) {
181        start=starts[unitNumber];
182        int32_t value;
183        if(isFinal[unitNumber]) {
184            // Write the final value for the one string ending with this unit.
185            value=getElementValue(start);
186        } else {
187            // Write the delta to the start position of the sub-node.
188            value=offset-jumpTargets[unitNumber];
189        }
190        writeValueAndFinal(value, isFinal[unitNumber]);
191        offset=write(getElementUnit(start, unitIndex));
192    }
193    // Write the split-branch nodes.
194    while(ltLength>0) {
195        --ltLength;
196        writeDeltaTo(lessThan[ltLength]);
197        offset=write(middleUnits[ltLength]);
198    }
199    return offset;
200}
201
202// Requires start<limit,
203// and all strings of the [start..limit[ elements must be sorted and
204// have a common prefix of length unitIndex.
205StringTrieBuilder::Node *
206StringTrieBuilder::makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode) {
207    if(U_FAILURE(errorCode)) {
208        return NULL;
209    }
210    UBool hasValue=FALSE;
211    int32_t value=0;
212    if(unitIndex==getElementStringLength(start)) {
213        // An intermediate or final value.
214        value=getElementValue(start++);
215        if(start==limit) {
216            return registerFinalValue(value, errorCode);
217        }
218        hasValue=TRUE;
219    }
220    Node *node;
221    // Now all [start..limit[ strings are longer than unitIndex.
222    int32_t minUnit=getElementUnit(start, unitIndex);
223    int32_t maxUnit=getElementUnit(limit-1, unitIndex);
224    if(minUnit==maxUnit) {
225        // Linear-match node: All strings have the same character at unitIndex.
226        int32_t lastUnitIndex=getLimitOfLinearMatch(start, limit-1, unitIndex);
227        Node *nextNode=makeNode(start, limit, lastUnitIndex, errorCode);
228        // Break the linear-match sequence into chunks of at most kMaxLinearMatchLength.
229        int32_t length=lastUnitIndex-unitIndex;
230        int32_t maxLinearMatchLength=getMaxLinearMatchLength();
231        while(length>maxLinearMatchLength) {
232            lastUnitIndex-=maxLinearMatchLength;
233            length-=maxLinearMatchLength;
234            node=createLinearMatchNode(start, lastUnitIndex, maxLinearMatchLength, nextNode);
235            nextNode=registerNode(node, errorCode);
236        }
237        node=createLinearMatchNode(start, unitIndex, length, nextNode);
238    } else {
239        // Branch node.
240        int32_t length=countElementUnits(start, limit, unitIndex);
241        // length>=2 because minUnit!=maxUnit.
242        Node *subNode=makeBranchSubNode(start, limit, unitIndex, length, errorCode);
243        node=new BranchHeadNode(length, subNode);
244    }
245    if(hasValue && node!=NULL) {
246        if(matchNodesCanHaveValues()) {
247            ((ValueNode *)node)->setValue(value);
248        } else {
249            node=new IntermediateValueNode(value, registerNode(node, errorCode));
250        }
251    }
252    return registerNode(node, errorCode);
253}
254
255// start<limit && all strings longer than unitIndex &&
256// length different units at unitIndex
257StringTrieBuilder::Node *
258StringTrieBuilder::makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
259                                   int32_t length, UErrorCode &errorCode) {
260    if(U_FAILURE(errorCode)) {
261        return NULL;
262    }
263    UChar middleUnits[kMaxSplitBranchLevels];
264    Node *lessThan[kMaxSplitBranchLevels];
265    int32_t ltLength=0;
266    while(length>getMaxBranchLinearSubNodeLength()) {
267        // Branch on the middle unit.
268        // First, find the middle unit.
269        int32_t i=skipElementsBySomeUnits(start, unitIndex, length/2);
270        // Create the less-than branch.
271        middleUnits[ltLength]=getElementUnit(i, unitIndex);  // middle unit
272        lessThan[ltLength]=makeBranchSubNode(start, i, unitIndex, length/2, errorCode);
273        ++ltLength;
274        // Continue for the greater-or-equal branch.
275        start=i;
276        length=length-length/2;
277    }
278    if(U_FAILURE(errorCode)) {
279        return NULL;
280    }
281    ListBranchNode *listNode=new ListBranchNode();
282    if(listNode==NULL) {
283        errorCode=U_MEMORY_ALLOCATION_ERROR;
284        return NULL;
285    }
286    // For each unit, find its elements array start and whether it has a final value.
287    int32_t unitNumber=0;
288    do {
289        int32_t i=start;
290        UChar unit=getElementUnit(i++, unitIndex);
291        i=indexOfElementWithNextUnit(i, unitIndex, unit);
292        if(start==i-1 && unitIndex+1==getElementStringLength(start)) {
293            listNode->add(unit, getElementValue(start));
294        } else {
295            listNode->add(unit, makeNode(start, i, unitIndex+1, errorCode));
296        }
297        start=i;
298    } while(++unitNumber<length-1);
299    // unitNumber==length-1, and the maxUnit elements range is [start..limit[
300    UChar unit=getElementUnit(start, unitIndex);
301    if(start==limit-1 && unitIndex+1==getElementStringLength(start)) {
302        listNode->add(unit, getElementValue(start));
303    } else {
304        listNode->add(unit, makeNode(start, limit, unitIndex+1, errorCode));
305    }
306    Node *node=registerNode(listNode, errorCode);
307    // Create the split-branch nodes.
308    while(ltLength>0) {
309        --ltLength;
310        node=registerNode(
311            new SplitBranchNode(middleUnits[ltLength], lessThan[ltLength], node), errorCode);
312    }
313    return node;
314}
315
316StringTrieBuilder::Node *
317StringTrieBuilder::registerNode(Node *newNode, UErrorCode &errorCode) {
318    if(U_FAILURE(errorCode)) {
319        delete newNode;
320        return NULL;
321    }
322    if(newNode==NULL) {
323        errorCode=U_MEMORY_ALLOCATION_ERROR;
324        return NULL;
325    }
326    const UHashElement *old=uhash_find(nodes, newNode);
327    if(old!=NULL) {
328        delete newNode;
329        return (Node *)old->key.pointer;
330    }
331    // If uhash_puti() returns a non-zero value from an equivalent, previously
332    // registered node, then uhash_find() failed to find that and we will leak newNode.
333#if U_DEBUG
334    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
335#endif
336    uhash_puti(nodes, newNode, 1, &errorCode);
337    U_ASSERT(oldValue==0);
338    if(U_FAILURE(errorCode)) {
339        delete newNode;
340        return NULL;
341    }
342    return newNode;
343}
344
345StringTrieBuilder::Node *
346StringTrieBuilder::registerFinalValue(int32_t value, UErrorCode &errorCode) {
347    if(U_FAILURE(errorCode)) {
348        return NULL;
349    }
350    FinalValueNode key(value);
351    const UHashElement *old=uhash_find(nodes, &key);
352    if(old!=NULL) {
353        return (Node *)old->key.pointer;
354    }
355    Node *newNode=new FinalValueNode(value);
356    if(newNode==NULL) {
357        errorCode=U_MEMORY_ALLOCATION_ERROR;
358        return NULL;
359    }
360    // If uhash_puti() returns a non-zero value from an equivalent, previously
361    // registered node, then uhash_find() failed to find that and we will leak newNode.
362#if U_DEBUG
363    int32_t oldValue=  // Only in debug mode to avoid a compiler warning about unused oldValue.
364#endif
365    uhash_puti(nodes, newNode, 1, &errorCode);
366    U_ASSERT(oldValue==0);
367    if(U_FAILURE(errorCode)) {
368        delete newNode;
369        return NULL;
370    }
371    return newNode;
372}
373
374UBool
375StringTrieBuilder::hashNode(const void *node) {
376    return ((const Node *)node)->hashCode();
377}
378
379UBool
380StringTrieBuilder::equalNodes(const void *left, const void *right) {
381    return *(const Node *)left==*(const Node *)right;
382}
383
384UBool
385StringTrieBuilder::Node::operator==(const Node &other) const {
386    return this==&other || (typeid(*this)==typeid(other) && hash==other.hash);
387}
388
389int32_t
390StringTrieBuilder::Node::markRightEdgesFirst(int32_t edgeNumber) {
391    if(offset==0) {
392        offset=edgeNumber;
393    }
394    return edgeNumber;
395}
396
397UBool
398StringTrieBuilder::FinalValueNode::operator==(const Node &other) const {
399    if(this==&other) {
400        return TRUE;
401    }
402    if(!Node::operator==(other)) {
403        return FALSE;
404    }
405    const FinalValueNode &o=(const FinalValueNode &)other;
406    return value==o.value;
407}
408
409void
410StringTrieBuilder::FinalValueNode::write(StringTrieBuilder &builder) {
411    offset=builder.writeValueAndFinal(value, TRUE);
412}
413
414UBool
415StringTrieBuilder::ValueNode::operator==(const Node &other) const {
416    if(this==&other) {
417        return TRUE;
418    }
419    if(!Node::operator==(other)) {
420        return FALSE;
421    }
422    const ValueNode &o=(const ValueNode &)other;
423    return hasValue==o.hasValue && (!hasValue || value==o.value);
424}
425
426UBool
427StringTrieBuilder::IntermediateValueNode::operator==(const Node &other) const {
428    if(this==&other) {
429        return TRUE;
430    }
431    if(!ValueNode::operator==(other)) {
432        return FALSE;
433    }
434    const IntermediateValueNode &o=(const IntermediateValueNode &)other;
435    return next==o.next;
436}
437
438int32_t
439StringTrieBuilder::IntermediateValueNode::markRightEdgesFirst(int32_t edgeNumber) {
440    if(offset==0) {
441        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
442    }
443    return edgeNumber;
444}
445
446void
447StringTrieBuilder::IntermediateValueNode::write(StringTrieBuilder &builder) {
448    next->write(builder);
449    offset=builder.writeValueAndFinal(value, FALSE);
450}
451
452UBool
453StringTrieBuilder::LinearMatchNode::operator==(const Node &other) const {
454    if(this==&other) {
455        return TRUE;
456    }
457    if(!ValueNode::operator==(other)) {
458        return FALSE;
459    }
460    const LinearMatchNode &o=(const LinearMatchNode &)other;
461    return length==o.length && next==o.next;
462}
463
464int32_t
465StringTrieBuilder::LinearMatchNode::markRightEdgesFirst(int32_t edgeNumber) {
466    if(offset==0) {
467        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
468    }
469    return edgeNumber;
470}
471
472UBool
473StringTrieBuilder::ListBranchNode::operator==(const Node &other) const {
474    if(this==&other) {
475        return TRUE;
476    }
477    if(!Node::operator==(other)) {
478        return FALSE;
479    }
480    const ListBranchNode &o=(const ListBranchNode &)other;
481    for(int32_t i=0; i<length; ++i) {
482        if(units[i]!=o.units[i] || values[i]!=o.values[i] || equal[i]!=o.equal[i]) {
483            return FALSE;
484        }
485    }
486    return TRUE;
487}
488
489int32_t
490StringTrieBuilder::ListBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
491    if(offset==0) {
492        firstEdgeNumber=edgeNumber;
493        int32_t step=0;
494        int32_t i=length;
495        do {
496            Node *edge=equal[--i];
497            if(edge!=NULL) {
498                edgeNumber=edge->markRightEdgesFirst(edgeNumber-step);
499            }
500            // For all but the rightmost edge, decrement the edge number.
501            step=1;
502        } while(i>0);
503        offset=edgeNumber;
504    }
505    return edgeNumber;
506}
507
508void
509StringTrieBuilder::ListBranchNode::write(StringTrieBuilder &builder) {
510    // Write the sub-nodes in reverse order: The jump lengths are deltas from
511    // after their own positions, so if we wrote the minUnit sub-node first,
512    // then its jump delta would be larger.
513    // Instead we write the minUnit sub-node last, for a shorter delta.
514    int32_t unitNumber=length-1;
515    Node *rightEdge=equal[unitNumber];
516    int32_t rightEdgeNumber= rightEdge==NULL ? firstEdgeNumber : rightEdge->getOffset();
517    do {
518        --unitNumber;
519        if(equal[unitNumber]!=NULL) {
520            equal[unitNumber]->writeUnlessInsideRightEdge(firstEdgeNumber, rightEdgeNumber, builder);
521        }
522    } while(unitNumber>0);
523    // The maxUnit sub-node is written as the very last one because we do
524    // not jump for it at all.
525    unitNumber=length-1;
526    if(rightEdge==NULL) {
527        builder.writeValueAndFinal(values[unitNumber], TRUE);
528    } else {
529        rightEdge->write(builder);
530    }
531    offset=builder.write(units[unitNumber]);
532    // Write the rest of this node's unit-value pairs.
533    while(--unitNumber>=0) {
534        int32_t value;
535        UBool isFinal;
536        if(equal[unitNumber]==NULL) {
537            // Write the final value for the one string ending with this unit.
538            value=values[unitNumber];
539            isFinal=TRUE;
540        } else {
541            // Write the delta to the start position of the sub-node.
542            U_ASSERT(equal[unitNumber]->getOffset()>0);
543            value=offset-equal[unitNumber]->getOffset();
544            isFinal=FALSE;
545        }
546        builder.writeValueAndFinal(value, isFinal);
547        offset=builder.write(units[unitNumber]);
548    }
549}
550
551UBool
552StringTrieBuilder::SplitBranchNode::operator==(const Node &other) const {
553    if(this==&other) {
554        return TRUE;
555    }
556    if(!Node::operator==(other)) {
557        return FALSE;
558    }
559    const SplitBranchNode &o=(const SplitBranchNode &)other;
560    return unit==o.unit && lessThan==o.lessThan && greaterOrEqual==o.greaterOrEqual;
561}
562
563int32_t
564StringTrieBuilder::SplitBranchNode::markRightEdgesFirst(int32_t edgeNumber) {
565    if(offset==0) {
566        firstEdgeNumber=edgeNumber;
567        edgeNumber=greaterOrEqual->markRightEdgesFirst(edgeNumber);
568        offset=edgeNumber=lessThan->markRightEdgesFirst(edgeNumber-1);
569    }
570    return edgeNumber;
571}
572
573void
574StringTrieBuilder::SplitBranchNode::write(StringTrieBuilder &builder) {
575    // Encode the less-than branch first.
576    lessThan->writeUnlessInsideRightEdge(firstEdgeNumber, greaterOrEqual->getOffset(), builder);
577    // Encode the greater-or-equal branch last because we do not jump for it at all.
578    greaterOrEqual->write(builder);
579    // Write this node.
580    U_ASSERT(lessThan->getOffset()>0);
581    builder.writeDeltaTo(lessThan->getOffset());  // less-than
582    offset=builder.write(unit);
583}
584
585UBool
586StringTrieBuilder::BranchHeadNode::operator==(const Node &other) const {
587    if(this==&other) {
588        return TRUE;
589    }
590    if(!ValueNode::operator==(other)) {
591        return FALSE;
592    }
593    const BranchHeadNode &o=(const BranchHeadNode &)other;
594    return length==o.length && next==o.next;
595}
596
597int32_t
598StringTrieBuilder::BranchHeadNode::markRightEdgesFirst(int32_t edgeNumber) {
599    if(offset==0) {
600        offset=edgeNumber=next->markRightEdgesFirst(edgeNumber);
601    }
602    return edgeNumber;
603}
604
605void
606StringTrieBuilder::BranchHeadNode::write(StringTrieBuilder &builder) {
607    next->write(builder);
608    if(length<=builder.getMinLinearMatch()) {
609        offset=builder.writeValueAndType(hasValue, value, length-1);
610    } else {
611        builder.write(length-1);
612        offset=builder.writeValueAndType(hasValue, value, 0);
613    }
614}
615
616U_NAMESPACE_END
617